def test_pcdhit():
    """Check that filtering the test alignment at 0.7 identity keeps 61 records."""
    import os
    import lilbio
    import pcdhit
    from lilbio.funcs import uppercase_only

    # Parse the Stockholm alignment, passing uppercase_only as the parser's
    # ``func`` argument.
    alignment_path = os.path.join(tests_dir(), alignment_file)
    parsed = lilbio.parse(alignment_path, 'stockholm', func=uppercase_only)

    # pcdhit.filter is consumed fully here so the records can be counted.
    survivors = list(pcdhit.filter(parsed, 0.7))
    assert len(survivors) == 61
def filter(records, threshold=0.9):
    """Yield non-redundant records selected by cd-hit.

    cdhit: http://weizhongli-lab.org/cd-hit/

    cd-hit clusters sequences that meet a similarity threshold and returns a
    representative record for each cluster::

        cdhit -i <fin> -c <threshold> -o <fout>

    Parameters
    ----------
    records : iterable
        Iterable of (header, sequence) tuples.
    threshold : float, optional (0.9)
        Sequence identity threshold (cd-hit '-c <thr>' option). Must satisfy
        ``0.7 <= threshold <= 1.0``.

    Yields
    ------
    (header, sequence) : tuple (str, str)
        For each non-redundant record, a tuple containing header and sequence.

    Raises
    ------
    CdhitNotFoundError
        If ``is_command(['cd-hit', 'cdhit'])`` returns None.
    IdentityThresholdError
        If ``threshold`` is outside the closed interval [0.7, 1.0].

    Notes
    -----
    This is a generator function, so the checks above (and the cd-hit run
    itself) only happen once iteration starts, not when ``filter`` is called.
    """
    # check for cd-hit on path
    cdhit_exe = is_command(['cd-hit', 'cdhit'])
    logger.debug('cd-hit executable: %r', cdhit_exe)
    if cdhit_exe is None:
        raise CdhitNotFoundError

    if not 0.7 <= threshold <= 1.0:
        raise IdentityThresholdError

    # open tmp files
    with opentf() as fin, opentf() as fout:
        print_input_fasta(records, fin)
        call_cdhit(cdhit_exe, fin, fout, threshold)
        for rec in lilbio.parse(fout, fmt='fasta'):
            # The FASTA header read back is expected to look like
            # '<header>@<sequence>' (presumably written that way by
            # print_input_fasta -- TODO confirm). Split on the LAST '@' only,
            # so an original header that itself contains '@' no longer breaks
            # the two-value unpacking.
            head, seq = rec[0].rsplit('@', 1)
            yield head, seq
def test_stockholm():
    """Parse '1.sto' as Stockholm and compare the repr against RECORDS."""
    sto_path = os.path.join(tests_dir(), '1.sto')
    parsed = list(lilbio.parse(sto_path, 'stockholm'))
    # The comparison is on the repr of the whole list of parsed records.
    observed = repr(parsed)
    assert observed == RECORDS
def test_fasta():
    """Parse '1.fa' as FASTA and compare the repr against RECORDS."""
    fasta_path = os.path.join(tests_dir(), '1.fa')
    parsed = list(lilbio.parse(fasta_path, 'fasta'))
    # The comparison is on the repr of the whole list of parsed records.
    observed = repr(parsed)
    assert observed == RECORDS