Пример #1
0
def identify_otus_unnamed(seq_file, cluster_similarity):
    """
    Yield one list of sequence ids per cluster.

    Runs ``uclust.cluster`` on ``seq_file`` with ``pct_id=cluster_similarity``,
    writing to a temporary file, then parses that file and groups the records
    with ``uclust.sequences_by_cluster``.  For every group, the
    ``query_label`` of each member record is collected into a list and
    yielded.

    This is a generator: nothing runs until iteration starts, and the
    temporary-file context stays open until the generator is exhausted
    or closed.
    """
    logging.info("Running UCLUST on unnamed sequences at %f", cluster_similarity)
    with util.ntf(prefix="uclust") as uc_out:
        # Cluster into the temporary file, then read the result back.
        uclust.cluster(seq_file, uc_out.name, pct_id=cluster_similarity, quiet=True)
        records = uclust.parse_uclust_out(uc_out)
        for _, members in uclust.sequences_by_cluster(records):
            labels = []
            for record in members:
                labels.append(record.query_label)
            yield labels
Пример #2
0
def identify_otus_unnamed(seq_file, cluster_similarity):
    """
    Generate the sequence ids belonging to each cluster.

    ``seq_file`` is clustered via ``uclust.cluster`` at
    ``pct_id=cluster_similarity`` into a temporary file; the parsed output is
    grouped by ``uclust.sequences_by_cluster`` and, for each group, a list of
    the members' ``query_label`` attributes is yielded.

    Because the ``yield`` sits inside the ``with`` block, the temporary file
    context remains open while the caller is still consuming the generator.
    """
    logging.info(
        'Running UCLUST on unnamed sequences at %f', cluster_similarity)
    with util.ntf(prefix='uclust') as uc_file:
        # Write clustering results to the temp file.
        uclust.cluster(seq_file,
                       uc_file.name,
                       pct_id=cluster_similarity,
                       quiet=True)
        # Parse the results and group them per cluster.
        parsed = uclust.parse_uclust_out(uc_file)
        grouped = uclust.sequences_by_cluster(parsed)
        for _, members in grouped:
            yield [member.query_label for member in members]
Пример #3
0
def cluster(seqfile, seqnames, identity=1.0, prefix='cluster-', threads=None):
    """
    Cluster the named sequences and return the resulting assignments.

    The sequences ``seqnames`` are fetched from ``seqfile`` into a temporary
    FASTA file with ``wrap.esl_sfetch``, clustered with ``uclust.cluster`` at
    ``pct_id=identity`` (``pre_sorted=False``, ``quiet=True``), and the output
    is parsed with ``uclust.parse_uclust_as_df``.

    Returns the parsed frame with rows of type ``'C'`` removed, restricted to
    the columns ``type``, ``query_label`` and ``target_label``.

    ``prefix`` is used for the names of both temporary files; ``threads`` is
    passed through to ``uclust.cluster`` unchanged.
    """
    # A '/' in the prefix would be read as a path separator in the temporary
    # file name, so replace it before building the files.
    # NOTE(review): assumes util.ntf forwards `prefix` to the temp-file name
    # -- confirm.
    prefix = prefix.replace('/', '\\')
    with util.ntf(prefix=prefix, suffix='.fasta') as fa, \
            util.ntf(prefix=prefix, suffix='.uc') as uc:
        wrap.esl_sfetch(seqfile, seqnames, fa)
        # Flush so the clustering step, which is given the file by name,
        # sees the sequences just written through the handle.
        fa.flush()
        uclust.cluster(fa.name,
                       uc.name,
                       pct_id=identity,
                       pre_sorted=False,
                       quiet=True,
                       threads=threads)
        df = uclust.parse_uclust_as_df(uc)
        # Drop rows whose record type is 'C'.
        df = df[df.type != 'C']
        df = df[['type', 'query_label', 'target_label']]

        return df
Пример #4
0
def cluster(seqfile, seqnames, identity=1.0, prefix='cluster-', threads=None):
    """
    Cluster the named sequences and return the resulting assignments.

    ``seqnames`` are fetched from ``seqfile`` into a temporary FASTA file
    (``wrap.esl_sfetch``), clustered by ``uclust.cluster`` at
    ``pct_id=identity``, and the output is parsed with
    ``uclust.parse_uclust_as_df``.  Rows of type ``'C'`` are dropped and only
    the ``type``, ``query_label`` and ``target_label`` columns are returned.

    Any ``'/'`` in ``prefix`` is replaced with a backslash before it is used
    for the temporary file names.
    """
    # '/' in a file-name prefix confuses the filesystem; swap it out.
    prefix = prefix.replace('/', '\\')
    with util.ntf(prefix=prefix, suffix='.fasta') as fasta:
        with util.ntf(prefix=prefix, suffix='.uc') as uc_out:
            wrap.esl_sfetch(seqfile, seqnames, fasta)
            # Flush the handle before the file is handed over by name.
            fasta.flush()
            uclust.cluster(
                fasta.name, uc_out.name,
                pct_id=identity, pre_sorted=False,
                quiet=True, threads=threads)
            records = uclust.parse_uclust_as_df(uc_out)
            # Keep every row whose type is not 'C'.
            keep = records.type != 'C'
            records = records[keep]
            return records[['type', 'query_label', 'target_label']]