def dereplicate(fasta):
    '''
    Tally identical sequences in a fasta file.

    Each record is read with SeqIO.parse(fasta, 'fasta'). The record's
    sequence (as a string) is the key, and the sample is whatever
    util.strip_fasta_label returns for the record id.

    fasta : whatever SeqIO.parse accepts as its first argument

    returns : (counts, provenances)
        counts : dict {seq => total number of records with that sequence}
        provenances : dict {seq => {sample => number of records}}
    '''

    totals = {}
    by_sample = {}

    for entry in SeqIO.parse(fasta, 'fasta'):
        sequence = str(entry.seq)
        label = util.strip_fasta_label(entry.id)

        # overall abundance of this sequence
        totals[sequence] = totals.get(sequence, 0) + 1

        # abundance of this sequence within the sample it came from
        per_sample = by_sample.setdefault(sequence, {})
        per_sample[label] = per_sample.get(label, 0) + 1

    return totals, by_sample
def process_line(line):
    '''
    Handle one tab-separated mapping line.

    The first column (after util.strip_fasta_label) is the query and the
    second column is the target. Queries with a real target are passed to
    write_out together with that target. Queries whose target equals no_hit
    are passed to write_out with the literal 'no_hit', but only when
    save_no_hit is true; otherwise they are dropped.

    no_hit, save_no_hit and write_out are not defined in this function;
    presumably they come from the enclosing scope.
    '''

    columns = line.rstrip().split('\t')
    query_label = util.strip_fasta_label(columns[0])
    hit = columns[1]

    if hit == no_hit:
        # a miss is only reported when the caller asked to keep misses
        if save_no_hit:
            write_out(query_label, 'no_hit')
        return

    write_out(query_label, hit)
def process_line(line):
    '''
    Write the "seq: otu" entry for one tab-separated uparse mapping line.

    The first column (after util.strip_fasta_label) is the sequence label and
    the second column is the hit type:

      'otu'     : the OTU is the fifth column when the third column is '*',
                  otherwise the sixth column
      'match'   : the OTU is the fifth column
      'chimera' : nothing is written

    output is not defined in this function; presumably it is a writable
    handle from the enclosing scope.

    raises : ValueError if the hit type is none of the above. (Previously
        such a line failed with an UnboundLocalError because no OTU had been
        assigned.)
    '''

    fields = line.rstrip().split('\t')
    seq = util.strip_fasta_label(fields[0])
    hit_type = fields[1]

    if hit_type == 'chimera':
        # chimeras are not assigned to any OTU
        return

    if hit_type == 'otu':
        otu = fields[4] if fields[2] == '*' else fields[5]
    elif hit_type == 'match':
        otu = fields[4]
    else:
        raise ValueError(
            "unrecognized hit type '{}' for sequence '{}'".format(hit_type, seq))

    output.write("{}: {}\n".format(seq, otu))
def b6_to_dict(b6, no_hit="*", save_no_hit=True):
    '''
    Build a dict {query => target} from a blast6 mapping file.

    b6 : path of the mapping file; each line is split on whitespace, the
        first field (after util.strip_fasta_label) is the query and the
        second field is the target
    no_hit : target value that marks a query without a hit
    save_no_hit : if true, queries without a hit are kept (mapped to the
        no_hit value as it appears in the file); if false, they are left out

    returns : dict. If a query appears more than once, the last kept line
        wins.
    '''

    mapping = {}

    with open(b6) as handle:
        for row in handle:
            columns = row.split()
            label = util.strip_fasta_label(columns[0])
            hit = columns[1]

            # drop misses unless the caller asked to keep them
            if hit == no_hit and not save_no_hit:
                continue

            mapping[label] = hit

    return mapping
def up_to_dict(up, keep_chimera=False):
    '''
    Build a dict {seq => otu} from a uparse mapping file.

    up : path of the mapping file; each line is split on whitespace, the
        first field (after util.strip_fasta_label) is the sequence label, the
        second field is the hit type and the fifth field is the OTU
    keep_chimera : if true, lines whose hit type is 'chimera' are kept and
        mapped to the literal 'chimera'; if false, those lines are skipped

    returns : dict. If a sequence appears more than once, the last kept line
        wins.
    '''

    mapping = {}

    with open(up) as handle:
        for row in handle:
            columns = row.split()
            label = util.strip_fasta_label(columns[0])
            kind = columns[1]
            # NOTE(review): the OTU is always read from the fifth column
            # here, whatever the non-chimera hit type is -- confirm that this
            # is intended for every kind of line in the mapping file
            assigned = columns[4]

            if kind == 'chimera':
                if not keep_chimera:
                    continue
                assigned = 'chimera'

            mapping[label] = assigned

    return mapping
def b6_line_to_query_hit(line):
    '''
    Pull the query and hit out of one tab-separated blast6 line.

    line : a line of the mapping file; trailing whitespace is removed before
        it is split on tabs, and it must have at least two fields

    returns : (query, hit), where query is the first field passed through
        util.strip_fasta_label and hit is the second field unchanged
    '''

    columns = line.rstrip().split("\t")
    raw_query, hit = columns[0:2]
    return util.strip_fasta_label(raw_query), hit