def combine_with_hits(clean, s_db, search_out, hits):
    """
    Combine sequences with their best hits from a search.

    Writes every sequence from ``clean`` followed by the database sequence
    of each best hit (header prefixed with ``best-hit_``, sequence
    upper-cased) to ``<clean minus extension>.best<hits>refs.fa``.

    Parameters:
        clean: path to the fasta file of query sequences; also used to
            derive the output file name.
        s_db: path to the search database fasta; its ``.tch`` index is
            built with fasta2tch if it does not exist yet.
        search_out: path to the search output, parsed with numblast.
        hits: number of hits passed to numblast; also embedded in the
            output file name.

    Returns:
        Path of the combined fasta file. If that file already exists it is
        returned untouched.
    """
    # IDs of the best hits; the handle is closed as soon as parsing is done
    with open(search_out) as search_handle:
        best = set(
            hit[1].split()[0]
            for hit in numblast(search_handle, hits, False, False)
        )
    combo_path = '%s.best%srefs.fa' % (clean.rsplit('.', 1)[0], hits)
    if os.path.exists(combo_path):
        return combo_path
    # create tch for search database before any output is written
    s_tch = '%s.tch' % (s_db)
    if not os.path.exists(s_tch):
        fasta2tch(s_db)
    with open(combo_path, 'w') as combo:
        # write() instead of the Python 2-only `print >> handle` statement
        for seq in parse_fasta(clean):
            combo.write('\n'.join(seq) + '\n')
        id2seq = hash.Hash()
        id2seq.open(s_tch)
        try:
            # get sequences for best hits from tch
            for hit in best:
                seq = id2seq[hit].split('\n')
                header = remove_char(seq[0].split()[0]).replace('>', '>best-hit_')
                combo.write('\n'.join([header, seq[1].upper()]) + '\n')
        finally:
            # release the tch handle even if a lookup fails
            id2seq.close()
    return combo_path
def find_16S(fastas, hmms, bit_thresh=float(20), length_thresh=500, masking=True, buffer=0):
    """
    Extract 16S gene sequences from fasta files based on HMM search output.

    1) parse hmm output (via find_coordinates, using bit_thresh) into
        group2hmm[seq][group] = [model, strand, coordinates, matches, gaps]
    2) for each sequence with hits, walk through its groups
    3) skip groups whose aligned (non-insertion) length is below
        length_thresh; the buffer is not counted towards this length
    4) optionally mask insertion regions (masking is True)
    5) export the complete gene region (including insertions) plus up to
        `buffer` flanking bases on each side, clipped to the contig

    Coordinates are treated as 1-based and inclusive.

    Yields:
        [header, sequence] for every reported 16S gene; sequences on the
        '-' strand are passed through rc() before being yielded.
    """
    # group2hmm[seq][group] = [model, strand, coordinates, matches, gaps]
    group2hmm = find_coordinates(hmms, bit_thresh)
    for fasta in fastas:
        for record in parse_fasta(fasta):
            seq_id = record[0].split('>')[1].split()[0]
            if seq_id not in group2hmm:
                continue
            record[1] = record[1].upper()
            gene_number = 0  # running count of 16S genes reported for this contig
            for model, strand, coords, matches, gaps in group2hmm[seq_id].values():
                # length of each insertion, and total number of insertion bases
                insert_lens = [gap[1] - gap[0] + 1 for gap in gaps]
                ins_bases = sum(insert_lens)
                # total span of the hit vs. span left after removing insertions
                total_len = coords[1] - coords[0] + 1
                aligned_len = total_len - ins_bases
                if aligned_len < length_thresh:
                    continue
                gene_number += 1
                # widen the retrieval window by the buffer, clipped to the contig
                contig_len = len(record[1])
                ret_coords = [
                    max(coords[0] - buffer, 1),
                    min(coords[1] + buffer, contig_len),
                    coords[2],
                ]
                buffer_ends = check_buffer(coords, contig_len, buffer)
                if masking is True:
                    record[1] = mask_sequence(record[1], gaps)
                gene = record[1][ret_coords[0] - 1:ret_coords[1]]
                # pair each match with the insertion that follows it;
                # the final match is labelled 'end'
                model_pos = ';'.join(
                    '%s-%s(%s)' % (match[2], match[3], ins_len)
                    for match, ins_len in zip(matches, insert_lens + ['end'])
                )
                header = '%s 16SfromHMM::model=%s seq=%s pos=%s-%s strand=%s total-len=%s 16S-len=%s model-pos(ins-len)=%s buffer-len=%s/%s ins-bases=%s' % (
                    record[0], model, gene_number,
                    ret_coords[0], ret_coords[1], strand,
                    total_len, aligned_len, model_pos,
                    buffer_ends[0], buffer_ends[1], ins_bases,
                )
                # reverse complement if strand is reverse
                if strand == '-':
                    gene = rc(['', gene])[1]
                yield [header, gene]
def headerid2desc(fasta, subset=False):
    """
    Map sequence IDs in a fasta file to a description string.

    The ID is the first whitespace-delimited token of the header (without
    the leading '>'); the description is the full header followed by
    ' len:<sequence length>'.

    Parameters:
        fasta: fasta input passed to parse_fasta.
        subset: container of IDs to keep. False (the default) or None
            keeps every sequence.

    Returns:
        dict of {id: description}. If an ID occurs more than once the last
        occurrence wins.
    """
    keep_all = subset is False or subset is None
    db = {}
    for seq in parse_fasta(fasta):
        header = seq[0].split('>')[1]
        seq_id = header.split()[0]
        # only store IDs that were asked for; previously every ID was
        # stored before the subset check, which made the filter a no-op
        if keep_all or seq_id in subset:
            db[seq_id] = '%s len:%s' % (header, len(seq[1]))
    return db
def strip_masked(fasta, min_len, print_masked):
    """
    Split each sequence of a fasta file into its non-masked part and its
    masked regions, as determined by parse_masked(seq, min_len).

    Yields:
        [0, [header, sequence]] for the sequence with masked regions
        removed (header is tagged with 'removed_masked >=<min_len>').
        If print_masked is True, this is followed by one
        [1, [header, sequence]] per non-empty masked region, numbered
        from 1 in the header ('insertion:<n>').
    """
    for record in parse_fasta(fasta):
        kept, masked = parse_masked(record, min_len)
        stripped_header = '%s removed_masked >=%s' % (record[0], min_len)
        yield [0, [stripped_header, ''.join(kept)]]
        if print_masked is not True:
            continue
        # number only the regions that actually contain something
        number = 0
        for region in masked:
            if region != []:
                number += 1
                region_header = '%s insertion:%s' % (record[0], number)
                yield [1, [region_header, ''.join(region)]]
def check_type(fasta):
    """
    Guess whether a fasta file holds nucleotide or protein sequences.

    Only the first sequence is inspected: if it contains any residue that
    is neither A/T/G/C nor one of the ignored characters (N, U, '.', '-',
    space), the file is called 'prot'; otherwise 'nucl'. A file without
    any sequences is reported as 'nucl'.
    """
    nucleotides = ('A', 'T', 'G', 'C')
    ignored = ('N', 'U', '.', '-', ' ')
    for record in parse_fasta(fasta):
        for residue in record[1].upper():
            if residue not in ignored and residue not in nucleotides:
                return 'prot'
        # first sequence looked like nucleotides; do not read any further
        return 'nucl'
    return 'nucl'
def _ko_functions(kegg, kos):
    """Yield (ko, function) for every function stored for each KO; 'n/a' if the KO is not in kegg."""
    for ko in kos:
        if ko in kegg:
            for function in kegg[ko].split('|'):
                yield ko, function
        else:
            yield ko, 'n/a'


def ko2kegg(file, option, file_type):
    """
    Look up KEGG functions for the KO identifiers found in the input.

    Parameters:
        file: fasta input (file_type 'fasta'), an iterable of lines
            (file_type 'list'), or anything accepted directly by find_ko
            (any other file_type).
        option: passed to option2kegg to select the tch database to open.
        file_type: 'fasta', 'list', or anything else (see Yields).

    Yields:
        'fasta': the header of each sequence, followed by one
            '\\t<id>\\t<ko>\\t<function>' line per KO function found in that
            header ('n/a' when the KO is not in the database).
        'list': same as 'fasta', but per non-blank (stripped) line, with the
            first token of the line used as the id.
        otherwise: [ko, function] pairs for the KOs returned by
            find_ko(file).

    The database handle is closed when the generator finishes, is closed
    early by the caller, or raises.
    """
    tch = option2kegg(option)
    kegg = hash.Hash()
    kegg.open(tch)
    try:
        if file_type == 'fasta':
            for sequence in parse_fasta(file):
                header = sequence[0].split('>')[1]
                seq_id = header.split()[0]
                yield header
                kos = set(find_ko(header.split()))
                for ko, function in _ko_functions(kegg, kos):
                    # - id - k - function
                    yield '\t%s\t%s\t%s' % (seq_id, ko, function)
        elif file_type == 'list':
            for line in file:
                line = line.strip()
                fields = line.split()
                if not fields:
                    continue
                line_id = fields[0]
                yield line
                kos = set(find_ko(fields))
                for ko, function in _ko_functions(kegg, kos):
                    # - id - k - function
                    yield '\t%s\t%s\t%s' % (line_id, ko, function)
        else:
            kos = set(find_ko(file))
            for ko, function in _ko_functions(kegg, kos):
                yield [ko, function]
    finally:
        # runs on exhaustion, on generator.close(), and on errors alike
        kegg.close()
def de_rep(fastas, append_index, return_original = False):
    """
    De-replicate fasta files based on sequence names.

    The name is the first whitespace-delimited token of the header. The
    first sequence seen with a given name is yielded unchanged. For later
    sequences with the same name:
        - if append_index == True, a new name is obtained from
          append_index_id() and the sequence is yielded with the header
          rewritten as '>new_name rest of header'
        - otherwise the sequence is dropped

    Yields:
        the [header, sequence] record, or, when return_original is True,
        [split original header, record].
    """
    seen = []
    for fasta in fastas:
        for record in parse_fasta(fasta):
            fields = record[0].split('>')[1].split()
            name = fields[0]
            if name not in seen:
                seen.append(name)
                output = record
            elif append_index == True:
                unique, seen = append_index_id(name, seen)
                renamed_header = '>%s %s' % (unique, ' '.join(fields[1:]))
                output = [renamed_header, record[1]]
            else:
                # duplicate name and no re-indexing requested: skip it
                continue
            if return_original is True:
                yield [fields, output]
            else:
                yield output