def parse_index_file(index_fn, format='fasta'):
    """Map FASTQ sequence IDs to their barcodes.

    Args:
        index_fn: Path to the index file.
        format: Layout of the index file, one of:
            'fasta' - records read with util.iter_fst; IDs are stored
                      exactly as yielded.
            'tab'   - one "<sequence id> <barcode>" pair per line, split
                      on any whitespace; empty lines are ignored.
            'fastq' - records read with util.iter_fsq; the text after the
                      last space in the header and the header's first
                      character are removed before storing the ID.

    Returns:
        dict mapping sequence ID -> barcode. Any other ``format`` value
        yields an empty dict.

    Raises:
        ValueError: if a non-empty 'tab' line does not hold exactly two
            whitespace-separated fields.
    """
    s2b = {}  # maps sequences to barcodes

    # Case 1: index file is FASTA format
    if format == 'fasta':
        for [s, b] in util.iter_fst(index_fn):
            # note: I'm pretty sure this won't work for downstream, because you need
            # to remove the first character from sequence ID
            s2b[s] = b

    # Case 2: index file is tab-delimited
    elif format == 'tab':
        # Context manager so the handle is closed even if a line is malformed.
        with open(index_fn) as fh:
            for line in fh:
                fields = line.split()
                if not fields:
                    # Blank line (e.g. trailing newline at end of file).
                    continue
                [s, b] = fields
                s2b[s] = b

    # Case 3: index file is FASTQ format
    elif format == 'fastq':
        for [s, b, _, _] in util.iter_fsq(index_fn):
            # If sequence ID has :Y:0: thing at the end (standard Illumina format), remove it
            # For this kind of fastq line: @SL-MAJ:AY3TB170104:AY3TB:1:1101:10000:7854 :N:0:
            s = s.rsplit(' ', 1)[0]
            # Drop the first character of the header before using it as key.
            s2b[s[1:]] = b

    return s2b
def parse_index_file(index_fn, format='fasta'):
    """Map FASTQ sequence IDs to their barcodes.

    Args:
        index_fn: Path to the index file.
        format: Layout of the index file, one of:
            'fasta' - records read with util.iter_fst; IDs are stored
                      exactly as yielded.
            'tab'   - one "<sequence id> <barcode>" pair per line, split
                      on any whitespace; empty lines are ignored.
            'fastq' - records read with util.iter_fsq; the text after the
                      last space in the header and the header's first
                      character are removed before storing the ID.

    Returns:
        dict mapping sequence ID -> barcode. Any other ``format`` value
        yields an empty dict.

    Raises:
        ValueError: if a non-empty 'tab' line does not hold exactly two
            whitespace-separated fields.
    """
    s2b = {}  # maps sequences to barcodes

    # Case 1: index file is FASTA format
    if format == 'fasta':
        for [s, b] in util.iter_fst(index_fn):
            # note: I'm pretty sure this won't work for downstream, because you need
            # to remove the first character from sequence ID
            s2b[s] = b

    # Case 2: index file is tab-delimited
    elif format == 'tab':
        # Context manager so the handle is closed even if a line is malformed.
        with open(index_fn) as fh:
            for line in fh:
                fields = line.split()
                if not fields:
                    # Blank line (e.g. trailing newline at end of file).
                    continue
                [s, b] = fields
                s2b[s] = b

    # Case 3: index file is FASTQ format
    elif format == 'fastq':
        for [s, b, _, _] in util.iter_fsq(index_fn):
            # If sequence ID has :Y:0: thing at the end (standard Illumina format), remove it
            # For this kind of fastq line: @SL-MAJ:AY3TB170104:AY3TB:1:1101:10000:7854 :N:0:
            s = s.rsplit(' ', 1)[0]
            # Drop the first character of the header before using it as key.
            s2b[s[1:]] = b

    return s2b
parser.add_argument('--fastq', help='FASTQ file', required=True) parser.add_argument('--blast', help='BLAST file', required=True) parser.add_argument('--sample', help='Sample name', required=True) parser.add_argument('--prefix', help='OTU prefix', default='bacteria') parser.add_argument('--out', help='Output prefix (counts)') args = parser.parse_args() # --------------- # Initialize data # --------------- print('Initializing data') # 1. Read FASTQ sequences into dictionary seqs = {} for record in util.iter_fsq(args.fastq): sid = record[0][1:] seq = record[1] seqs[sid] = seq # 2. Map: microbial contigs to GCF IDs contig2gcf = {} for line in open('/home/unix/csmillie/aviv/db/refseq/meta/contig2gcf.txt'): contig, gcf = re.sub('"', '', line).rstrip().split('\t') contig2gcf[contig] = gcf # 3. Map: GCF IDs to taxonomy gcf2sp = {} gcf2gn = {} for line in open('/home/unix/csmillie/aviv/db/refseq/meta/gcf.taxonomy_table.txt'):
parser.add_argument('--prefix', help='Prefix to add', type=str, default='') parser.add_argument('--prefix_sep', help='Prefix separator', type=str, default='.') parser.add_argument('--debug', help='Debug mode', action='store_true', default=False) args = parser.parse_args() # get iterator if args.fst: iter_seq = util.iter_fst(args.fst) elif args.fsq: iter_seq = util.iter_fsq(args.fsq) elif args.FST: iter_seq = util.iter_fst(sys.stdin) elif args.FSQ: iter_seq = util.iter_fsq(sys.stdin) else: quit('error: must specify fst, fsq, FST, or FSQ') # initialize variables keep = {} remove = {} # load IDs/coordinates to keep if args.keep: for line in open(args.keep): line = line.rstrip().split('\t')
#!/usr/bin/env python
"""Print FASTQ records as FASTA ('>id' line followed by the sequence)."""
import util

# NOTE(review): iter_fsq() is called without a path; presumably it falls back
# to reading standard input -- confirm against util.
q = util.iter_fsq()
for record in q:
    # record[0] is the header line and [1:] drops its first character;
    # record[1] is the sequence. A single parenthesised argument prints
    # identically under Python 2 and Python 3.
    print('>%s\n%s' % (record[0][1:], record[1]))
line = line.rstrip().split() folder = line[0] sample = line[1] info[sample] = folder # Extract info [sample, site] = args.sid.split('.') folder = info[sample] site = sites[site] # Get sequence IDs seqs = {} orgs = 'archaea bacteria fungi protozoa viral'.split() for org in orgs: fn = './%s/%s_%s_bowtie2_contam.fastq' % (org, args.sid, org) for record in util.iter_fsq(fn): if len(record) > 0: seqs[record[0][1:]] = record print 'Found %d sequences' % (len(seqs)) # Parse BAM file out = open(args.out, 'w') fn = '/home/unix/csmillie/Gut_Human/data/%s/%s/outs/possorted_genome_bam.bam' % ( folder, site) if not os.path.exists(fn): print 'BAM file not found' for line in os.popen('samtools view -f 4 %s' % (fn)): read = line.split()[0] if read in seqs: cell = ''
"""Print the FASTA/FASTQ records whose ID appears in a list of subset IDs."""
import argparse
import util

# parse args
parser = argparse.ArgumentParser()
parser.add_argument('-f', help='FASTA file')
parser.add_argument('-q', help='FASTQ file')
# The subset file is always read below, so a missing -s is reported by
# argparse instead of surfacing as a TypeError from open(None).
parser.add_argument('-s', help='Subset ids', required=True)
args = parser.parse_args()

# load subset
# A set gives constant-time membership tests in the record loop; the context
# manager closes the file as soon as the IDs are loaded.
with open(args.s) as fh:
    subset = set(line.rstrip() for line in fh)

# get iterator
# If neither -f nor -q is given, the empty default yields no records.
# If both are given, the FASTQ input takes precedence.
iter_seq = ''
if args.f:
    iter_seq = util.iter_fst(args.f)
if args.q:
    iter_seq = util.iter_fsq(args.q)

# subset file
for record in iter_seq:
    # Drop the header's first character, then keep only the text before the
    # first ';' as the sequence ID.
    sid = record[0][1:].split(';')[0]
    if sid in subset:
        # Single parenthesised argument prints identically on Python 2 and 3.
        print('\n'.join(record))