def main(argv):
    """Collect the reads named in a list file and write them out as FASTA.

    All three FASTQ sets are loaded up front via ``read_seqs``; every
    (read id, source) pair from the list file that is present in the matching
    set is handed to a ``DuplicateReadFinder``, which finally writes the
    output file.

    Args:
        argv: Sequence of at least three strings:
            argv[0] -- path of the list file; each non-blank line holds a
                read id and a comma-separated list of source labels,
                separated by a single TAB.
            argv[1] -- directory containing the FASTQ files. The second
                '_'-separated field of this path string is used as the set
                name that prefixes the FASTQ file names.
            argv[2] -- output path prefix; '.fa' is appended.

    Raises:
        IndexError: if ``argv`` is too short or ``argv[1]`` contains no '_'.
        ValueError: if a non-blank list line does not consist of exactly two
            TAB-separated fields.
        KeyError: if a source label is not one of 'U', 'R1' or 'R2'.
    """
    fn_list = argv[0]
    seqdir = argv[1]

    # The output is opened before any input is read (as before), so an
    # unwritable destination fails immediately. ``with`` guarantees the
    # handle is closed on every exit path, including errors.
    with open(argv[2] + '.fa', 'wb') as seqfile:
        setname = seqdir.split('_')[1]

        sys.stderr.write('%s> Reading sets...\n' % get_timestamp())
        seq_d = {}
        for label, pattern in (('U', '%s_unpaired_all.fastq'),
                               ('R1', '%s_R1_paired.fastq'),
                               ('R2', '%s_R2_paired.fastq')):
            # read_seqs is used below as a mapping from read id to sequence.
            seq_d[label] = read_seqs(os.path.join(seqdir, pattern % setname))

        sys.stderr.write('%s> Checking for duplicates...\n' % get_timestamp())
        drf = DuplicateReadFinder()
        with open(fn_list) as listing:
            for line in listing:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                id_, sources = line.split('\t')
                sources = sources.split(',')

                # The suffix depends only on the record, not on the
                # individual source, so compute it once per line:
                # 'p' for two sources, 'u' for a single R* source, '' otherwise.
                if len(sources) == 2:
                    suffix = 'p'
                elif sources[0].startswith('R'):
                    suffix = 'u'
                else:
                    suffix = ''

                for k in sources:
                    reads = seq_d[k]
                    if id_ in reads:
                        drf.add_sequence(id_ + '_' + k + suffix, reads[id_])

        sys.stderr.write('%s> Writing sequences...\n' % get_timestamp())
        # Every 'U' is replaced by 'T' on output (presumably an RNA -> DNA
        # alphabet conversion of the sequences -- confirm against
        # DuplicateReadFinder.write_sequences).
        drf.write_sequences(seqfile,
                            modify_output=lambda x: x.replace('U', 'T'))

    sys.stderr.write('%s> Done.\n' % get_timestamp())
def main(argv):
    """Stream the FASTQ sets, keep the listed reads and write them as FASTA.

    The list file is parsed first into a ``wanted`` mapping (read id ->
    output label). Each FASTQ file is then passed to ``read_seqs`` together
    with a ``DuplicateReadFinder`` and that mapping; finally the finder
    writes the output file.

    Args:
        argv: Sequence of at least three strings:
            argv[0] -- path of the list file; each non-blank line holds a
                read id and a comma-separated list of source labels,
                separated by a single TAB.
            argv[1] -- directory containing the FASTQ files. The second
                '_'-separated field of this path string is used as the set
                name that prefixes the FASTQ file names.
            argv[2] -- output path prefix; '.fa' is appended.

    Raises:
        IndexError: if ``argv`` is too short or ``argv[1]`` contains no '_'.
        ValueError: if a non-blank list line does not consist of exactly two
            TAB-separated fields.
    """
    fn_list = argv[0]
    seqdir = argv[1]

    # The output is opened before any input is read (as before), so an
    # unwritable destination fails immediately. ``with`` guarantees the
    # handle is closed on every exit path, including errors.
    with open(argv[2] + '.fa', 'wb') as seqfile:
        drf = DuplicateReadFinder()

        wanted = {}
        with open(fn_list) as listing:
            for line in listing:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                id_, sources = line.split('\t')
                sources = sources.split(',')

                # 'p' for two sources, 'u' for a single R* source,
                # '' otherwise.
                if len(sources) == 2:
                    suffix = 'p'
                elif sources[0].startswith('R'):
                    suffix = 'u'
                else:
                    suffix = ''

                # Only one label is stored per read id: the original loop
                # over all sources overwrote the same key, so the last
                # source wins. That behaviour is kept here.
                # NOTE(review): for two-source records this drops the first
                # source's label -- confirm read_seqs expects exactly this.
                wanted[id_] = '%s_%s%s' % (id_, sources[-1], suffix)

        setname = seqdir.split('_')[1]

        sys.stderr.write(
            '%s> Reading sequences and checking for duplicates...\n'
            % get_timestamp())
        for pattern in ('%s_unpaired_all.fastq',
                        '%s_R1_paired.fastq',
                        '%s_R2_paired.fastq'):
            read_seqs(os.path.join(seqdir, pattern % setname), drf, wanted)

        sys.stderr.write('%s> Writing sequences...\n' % get_timestamp())
        # Every 'U' is replaced by 'T' on output (presumably an RNA -> DNA
        # alphabet conversion of the sequences -- confirm against
        # DuplicateReadFinder.write_sequences).
        drf.write_sequences(seqfile,
                            modify_output=lambda x: x.replace('U', 'T'))

    sys.stderr.write('%s> Done.\n' % get_timestamp())