def identify_otus_unnamed(seq_file, cluster_similarity):
    """
    Yield, cluster by cluster, the ids of the sequences in each OTU.

    Runs ``uclust.cluster`` on ``seq_file`` with ``pct_id`` set to
    ``cluster_similarity``, writing the result to a scratch file obtained
    from ``util.ntf``, and then groups the parsed records by cluster.

    This is a generator: nothing (including the log message) happens until
    the first item is requested, and the scratch file stays open while
    clusters are being consumed.

    Each yielded item is a list holding the ``query_label`` of every record
    in one cluster.
    """
    logging.info("Running UCLUST on unnamed sequences at %f", cluster_similarity)
    with util.ntf(prefix="uclust") as uc_out:
        # Sort and cluster
        uclust.cluster(
            seq_file,
            uc_out.name,
            pct_id=cluster_similarity,
            quiet=True,
        )
        records = uclust.parse_uclust_out(uc_out)
        # Iterate inside the ``with`` so the handle is still open while the
        # records are being read.
        for _, members in uclust.sequences_by_cluster(records):
            yield [member.query_label for member in members]
def identify_otus_unnamed(seq_file, cluster_similarity):
    """
    Generate one list of sequence ids per OTU.

    ``seq_file`` is clustered with ``uclust.cluster`` at the identity given
    by ``cluster_similarity``; the output goes to a scratch file supplied by
    ``util.ntf``. The parsed output is grouped with
    ``uclust.sequences_by_cluster`` and, for every group, the
    ``query_label`` of each member record is collected into a list and
    yielded.

    Because this is a generator, the work starts lazily on first iteration
    and the scratch file remains open until iteration finishes.
    """
    logging.info('Running UCLUST on unnamed sequences at %f',
                 cluster_similarity)
    with util.ntf(prefix='uclust') as scratch:
        # Sort and cluster
        uclust.cluster(seq_file, scratch.name,
                       pct_id=cluster_similarity, quiet=True)
        parsed = uclust.parse_uclust_out(scratch)
        grouped = uclust.sequences_by_cluster(parsed)

        # The first element of each pair is unused; only the member records
        # matter here.
        for _, records in grouped:
            labels = []
            for record in records:
                labels.append(record.query_label)
            yield labels
def cluster(seqfile, seqnames, identity=1.0, prefix='cluster-', threads=None):
    """
    Cluster a subset of sequences with UCLUST and return the assignments.

    The sequences named in ``seqnames`` are fetched from ``seqfile`` via
    ``wrap.esl_sfetch`` into a scratch FASTA file, clustered with
    ``uclust.cluster``, and the resulting output is parsed into a data frame.

    Parameters:
        seqfile: sequence source handed to ``wrap.esl_sfetch``.
        seqnames: names of the sequences to fetch and cluster.
        identity: passed to ``uclust.cluster`` as ``pct_id`` (default 1.0).
        prefix: prefix for the two scratch files. Any ``/`` is replaced
            before use so that it cannot be taken for a directory separator.
        threads: passed through to ``uclust.cluster``.

    Returns:
        The parsed UCLUST output restricted to the columns ``type``,
        ``query_label`` and ``target_label``, with rows whose ``type`` is
        ``'C'`` removed (presumably the per-cluster summary records --
        confirm against the UCLUST output format).
    """
    # A '/' in the prefix would be read as a path separator when the scratch
    # file name is built, pointing at a directory that does not exist.
    # NOTE(review): assumes util.ntf uses ``prefix`` as a file-name component.
    prefix = prefix.replace('/', '\\')
    with util.ntf(prefix=prefix, suffix='.fasta') as fa, \
            util.ntf(prefix=prefix, suffix='.uc') as uc:
        wrap.esl_sfetch(seqfile, seqnames, fa)
        # Make sure the fetched sequences are on disk before the clustering
        # step opens the file by name.
        fa.flush()
        uclust.cluster(fa.name, uc.name, pct_id=identity,
                       pre_sorted=False, quiet=True, threads=threads)
        df = uclust.parse_uclust_as_df(uc)
        df = df[df.type != 'C']
        df = df[['type', 'query_label', 'target_label']]
        return df
def cluster(seqfile, seqnames, identity=1.0, prefix='cluster-', threads=None):
    """
    Run UCLUST over selected sequences and return the cluster assignments.

    ``seqnames`` are pulled out of ``seqfile`` with ``wrap.esl_sfetch`` into
    a scratch ``.fasta`` file, which is then clustered by ``uclust.cluster``
    (``pct_id=identity``, ``pre_sorted=False``, ``quiet=True``) into a
    scratch ``.uc`` file. Both scratch files come from ``util.ntf`` and
    share ``prefix``, with every ``/`` in it replaced by a backslash first.

    Returns the parsed ``.uc`` contents limited to the ``type``,
    ``query_label`` and ``target_label`` columns, excluding rows whose
    ``type`` equals ``'C'``.
    """
    # / confuses the filesystem
    safe_prefix = prefix.replace('/', '\\')

    with util.ntf(prefix=safe_prefix, suffix='.fasta') as fasta_tmp:
        with util.ntf(prefix=safe_prefix, suffix='.uc') as uc_tmp:
            wrap.esl_sfetch(seqfile, seqnames, fasta_tmp)
            # Flush so the clustering step, which opens the file by name,
            # sees the fetched sequences.
            fasta_tmp.flush()

            uclust.cluster(
                fasta_tmp.name,
                uc_tmp.name,
                pct_id=identity,
                pre_sorted=False,
                quiet=True,
                threads=threads,
            )

            hits = uclust.parse_uclust_as_df(uc_tmp)
            not_cluster_row = hits.type != 'C'
            wanted_columns = ['type', 'query_label', 'target_label']
            return hits[not_cluster_row][wanted_columns]