def _method_GATB(self, *args, **kwargs):
    """Write every record of the input bank to ``self.outfile`` as FASTA.

    Each record obtained by iterating ``Bank(self.infile)`` is emitted as a
    ``>`` header line holding the record's comment, followed by one line
    holding its sequence. Both fields are decoded as UTF-8 before writing.

    :param args: accepted but not used by this method.
    :param kwargs: accepted but not used by this method.
    """
    # The fields are decoded as UTF-8, so the file is written as UTF-8 too
    # instead of in the platform's locale encoding, which may be unable to
    # represent a decoded header.
    with open(self.outfile, "w", encoding="utf-8") as fasta:
        for record in Bank(self.infile):
            header = record.comment.decode("utf-8")
            residues = record.sequence.decode("utf-8")
            fasta.write(">{}\n{}\n".format(header, residues))
    # NOTE(review): looks like a leftover debug statement; kept so the
    # console output is unchanged -- confirm and remove.
    print("test")
# NOTE(review): the line below is whitespace-collapsed; the nesting described
# here is inferred from statement order, not from indentation -- confirm.
#
# Helper tail (the enclosing `def` header is not visible here; presumably the
# `parse_indices` helper called further down -- confirm): for every key of
# `parameters`, store the entry's 'index' value, or len(sequence) minus that
# value when the entry's 'from_end' flag is truthy, then return the dict.
#
# Script part:
#   * open an in-memory SQLite database and create the table
#     barcodes(seqid, celbc, umi);
#   * iterate the Bank opened on snakemake.input.fastqfile[0] in chunks of
#     10000 via `chunks`, a helper defined outside this view;
#   * for each record, slice the decoded sequence with the "umi_start"/
#     "umi_end" and "cell_bc_start"/"cell_bc_end" indices computed from
#     config['params']['barcoding'], take the first space-separated token of
#     the decoded comment as seqid, and collect (seqid, cel, umi) in `r`;
#   * insert the collected rows with executemany (presumably once per chunk,
#     since `r` is reset at the start of each chunk -- confirm);
#   * create a unique index on seqid and print the output target.
indices = {} for index in parameters: if parameters[index]['from_end']: indices[index] = len(sequence) - parameters[index]['index'] else: indices[index] = parameters[index]['index'] return(indices) r = [] print("Reading source file into in-memory database of barcodes: {}".format(snakemake.input['fastqfile'])) con = sqlite3.connect(":memory:") cur = con.cursor() print("") cur.execute('CREATE TABLE barcodes (seqid TEXT, celbc TEXT, umi TEXT)') # reading from file and writing to database in chunks to save memory fastq_parser = Bank(snakemake.input.fastqfile[0]) for chunk in chunks(fastq_parser, 10000): r = [] for seq in chunk: sequence = seq.sequence.decode("utf-8") indices = parse_indices(sequence, config['params']['barcoding']) umi = sequence[indices["umi_start"]:indices["umi_end"]] cel = sequence[indices["cell_bc_start"]:indices["cell_bc_end"]] seqid = seq.comment.decode("utf-8").split(" ")[0] r.append((seqid, cel, umi)) cur.executemany('INSERT INTO barcodes VALUES (?,?,?)', r) r = None print("Creating index on read indentifiers") cur.execute('CREATE UNIQUE INDEX seqidx ON barcodes (seqid)') print("Writing output file: {}".format(snakemake.output))
# NOTE(review): the line below is whitespace-collapsed and begins with the
# trailing keyword arguments of a call whose opening is not visible here
# (presumably the parser.add_argument(...) that defines the
# `reads_to_process` option -- confirm).
#
# After that call:
#   * parse the command line and copy graph_file, reads_file and out_file
#     into local names;
#   * process_interval stays None unless reads_to_process was supplied, in
#     which case the '<start>:<end>' string is split on ':' and both parts
#     are converted to int;
#   * open a Bank on reads_file and print its uri and type;
#   * build a Graph from the option string '-in <graph_file>' and read its
#     kmerSize into KMER_SIZE;
#   * set expected_seed_ratio to 0.8 ** KMER_SIZE and initialise the nseqs,
#     total_path_found and total_read_length counters to 0.
# Statements after the `if` are presumably top-level rather than nested in
# it -- confirm against the original layout.
type=str, help='<start>:<end> indices of reads to process. For debugging/batching.', default=None) args = parser.parse_args() graph_file = args.graph_file reads_file = args.reads_file out_file = args.out_file process_interval = None if args.reads_to_process is not None: begin, end = args.reads_to_process.split(':') process_interval = int(begin), int(end) bank = Bank(reads_file) print("File '%s' is of type: %s" % (bank.uri, bank.type)) graph = Graph('-in %s' % graph_file) KMER_SIZE = graph.kmerSize expected_seed_ratio = 0.8**KMER_SIZE nseqs = 0 # pickle.dump(minhash, open('minhash', 'wb')) # bktree = pickle.load(open('BKTREE_19', 'rb')) print("Got the nodes") total_path_found = 0 total_read_length = 0
# but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. # =========================================================================== # we import pyGATB Bank from gatb import Bank # We will use a file containing some Fasta sequences F_NAME = '../thirdparty/gatb-core/gatb-core/test/db/query.fa' # We create the bank representation of the Fasta sequence file bank = Bank(F_NAME) print("File '%s' is of type: %s" % (bank.uri, bank.type)) nseqs = 0 # We iterate over some sequences. for i, seq in enumerate(bank): # 'seq' is of type 'Sequence'. # Accessing 'Sequence' internals is done as follows: # sequence header : seq.comment # sequence quality: seq.quality (Fastq only) # sequence letters: seq.sequence # sequence size : len(seq) seqid = seq.comment.decode("utf-8").split(" ")[0] if i < 5: