Example #1
def evaluate(model, corpus, ecotypes, outfile):
    '''
    A test battery:
    
    - SOMO task -- find the domain in a sequence that does not fit
    - Ecotype task -- separate niche-specific subpopulations of selected species

    Writes one (accuracy, subtask) pair per line to <outfile>.
    '''
    with open(outfile, 'w+') as out:

        # (1)
        eprint('SOMO task ...')
        SOMO = odd_one(corpus, model, n_not_odd=5)
        out.write(f'{str(SOMO)}\tSOMO\n')

        # (2)
        eprint('Ecotype task ...')
        m = load_embedding(model)
        for task in ['vibrio', 'prochlorococcus', 'pseudomonas']:
            rank, eco = load_ecotypes(f'{ecotypes}/{task}.tsv')
            d = {k: v2 for k, (v1, v2) in eco.items()}
            for k, v in ecotype_task(d, m).items():
                if k != 'NA':
                    out.write(f'{v}\t{k}\n')

        eprint('Evaluation done.')
Example #2
def odd_one(fp_test_corpus, fp_model, n_not_odd=5):
    '''
    SOMO task -- find the domain in a sequence of domains that does not fit.
    '''
    import random
    from nanotext.io import load_embedding

    model = load_embedding(fp_model)
    vocab = list(model.wv.vocab.keys())
    pos, neg = 0, 0

    with open(fp_test_corpus, 'r') as file:
        for line in file:
            genome, contig, domains = line.strip().split('\t')
            domains = domains.split(',')
            for seq in chunks(domains, n_not_odd):
                if len(seq) > 1:  # otherwise it's a 50:50 coin flip
                    odd = random.choice(vocab)
                    guess = model.wv.doesnt_match(seq + [odd])
                    if guess == odd:
                        pos += 1
                    else:
                        neg += 1

    return round(pos / (pos + neg), 4)
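The chunks helper used in odd_one is not shown here; a minimal sketch, assuming it simply yields successive slices of size n from a list, could look like this:

def chunks(lst, n):
    # Hypothetical helper (not part of the example above): yield successive
    # chunks of at most n items from lst.
    for i in range(0, len(lst), n):
        yield lst[i:i + n]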
Example #3
def evaluate(model, corpus, outfile, clusters):
    '''
    A test battery:
    
    - SOMO
    - king/queen analogy
    - various clusterings w/ associated tables
        - E. coli
        - Clostridia
        - Chlamydia
        - Prochlorococcus
    - closest genomes (Prochlorococcus or Tara)
    '''
    results = {}

    # SOMO task -- tests word embedding quality
    eprint('SOMO task ...')
    SOMO = odd_one(corpus, model, n_not_odd=5)
    results['SOMO'] = SOMO

    # Ecotype task -- tests document embedding quality
    eprint('Ecotype task ...')
    truth, cluster_labels = [], []
    d = defaultdict(list)

    with open(clusters, 'r') as file:
        _ = next(file)  # header
        for line in file:
            row = line.strip().split('\t')
            clade = row[3]
            cluster = row[6]
            if (clade in ['HL', 'LL']) and (cluster != '-1'):
                d[clade].append(row[2])  # genome UID
                truth.append(clade)
                cluster_labels.append(cluster)
    h, c, v = [round(i, 4) for i in hcv(truth, cluster_labels)]
    results.update(
        dict(zip('homogeneity completeness vscore'.split(), [h, c, v])))

    # Distance
    eprint('Median cosine distance between points of different ecotypes ...')
    model = load_embedding(model)
    dists = []
    for i, j in product(d['HL'], d['LL']):
        try:
            a = model.docvecs[i]
            b = model.docvecs[j]
            dists.append(cosine(a, b))
        except KeyError:
            continue

    results['ecotypes_distance'] = round(float(1 - np.median(dists)), 4)

    with open(outfile, 'w+') as out:
        json.dump(results, out, indent=4)
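hcv is not defined in this example; presumably it wraps scikit-learn's homogeneity/completeness/V-measure metrics. A minimal stand-in under that assumption:

from sklearn.metrics import homogeneity_completeness_v_measure

def hcv(truth, labels):
    # Assumed wrapper: return (homogeneity, completeness, v_measure)
    # for a clustering against the ground-truth labels.
    return homogeneity_completeness_v_measure(truth, labels)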
Example #4
def predict(genome, embedding, db, model, out, topn):
    '''
    From a <genome> w/ annotated protein domains, predict a phenotype. Requires
    the learned <model> (genotype-phenotype mapping) as well as a genome
    <embedding>. Writes the closest <topn> matches from a database <db> to <out>.

    Usage:

    \b
    nanotext predict \\
        --out - \\
        --model data/media_prediction.h5 \\
        --db data/embedding.media.json \\
        --embedding data/embedding.genomes.model \\
        --genome data/TARA_ION_MAG_00012.domtbl.tsv \\
        --topn 3
    '''
    import json
    import os

    import numpy as np
    import tensorflow as tf
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # turn off debugging info
    from keras.models import load_model

    from nanotext.io import load_embedding, smart_open, eprint
    from nanotext.utils import infer_genome_vector

    eprint('Loading embedding model for genomes ...')
    e = load_embedding(embedding)
    eprint('Inferring genome vector ...')
    v = infer_genome_vector(genome, e)

    eprint('Loading media vector database ...')
    with open(db, 'r') as file:
        vv = json.load(file)  # vv .. vectors

    eprint('Loading predictive model ...')
    nn = load_model(model)

    y_hat = nn.predict(np.array([v]))[0]  # [0] .. only single genome for now
    sim = cosim2(y_hat, vv, topn)
    with smart_open(out) as fh:
        # fh.write('\nmedium\tcosine\n')
        for name, cos in sim:
            fh.write(f'{name}\t{round(cos, 4)}\n')
    eprint('Done.')
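cosim2 is imported from elsewhere in nanotext and not shown; the sketch below assumes it ranks the predicted vector against every vector in the database by cosine similarity and returns the topn (name, similarity) pairs:

from scipy.spatial.distance import cosine

def cosim2(query, db, topn):
    # Hypothetical reimplementation: db maps a medium name to its vector;
    # cosine similarity is 1 minus the cosine distance.
    sims = [(name, 1 - cosine(query, vec)) for name, vec in db.items()]
    return sorted(sims, key=lambda x: x[1], reverse=True)[:topn]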
Example #5
    def __init__(self, fp, mode='ensemble', norm=None, names=None):

        self.mode = mode
        if mode == 'ensemble':
            nn = ['22', '45', '93']
        elif mode == 'core':
            nn = ['93']
        elif mode == 'accessory':
            nn = ['22']
        else:
            raise ValueError(
                'Mode not implemented (try "ensemble", "core" or "accessory")')

        self.models = []
        for n in nn:
            p = Path(fp) / f'{n}/nanotext_r89.model'
            model = load_embedding(str(p))
            self.models.append(model)

        self.norm = norm
        eprint('Subtracting mean from model(s) ...')
        self.nomean = self._demean(self.models)

        if not names:
            self.names = self.models[0].docvecs.index2entity
        else:
            self.names = names

        self.dim = len(self.models[0].docvecs[0])

        eprint('Indexing model(s) ...')
        if norm:
            eprint(f'{self.norm} norm will be applied to vectors')
        found, m, self.index = index_model(self.names,
                                           [i for i in self.nomean], self.norm)
        self.embedding = dict(zip(found, m))

        self.means = []
        for model in self.models:
            mu = np.mean([model.docvecs[i] for i in range(len(model.docvecs))],
                         axis=0)
            self.means.append(mu)

        self.warn_on_ensemble_inference = False
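_demean is called above but not shown; a minimal sketch, assuming it centres each model's document vectors by subtracting that model's mean vector:

    def _demean(self, models):
        # Assumed behaviour: return, per model, the matrix of document vectors
        # with the model's mean document vector subtracted.
        demeaned = []
        for model in models:
            vectors = np.array(
                [model.docvecs[i] for i in range(len(model.docvecs))])
            demeaned.append(vectors - vectors.mean(axis=0))
        return demeaned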
Example #6
def cluster_subset(model, rank, name, taxonomy, outfile, soft, ecotypes,
                   projection_method):
    '''
    TODO: https://github.com/lmcinnes/umap/issues/90

    Iterate over a taxonomic rank such as class and cluster using HDBSCAN. One 
    reason we believe we can do this is that at higher ranks there are clear
    boundaries between organisms. The main motivation behind it is that
    HDBSCAN clusters are rather coarse when the whole dataset is clustered at 
    once, and "soft-clustering" does not scale.


    python ~/Dropbox/repos_git/nanotext/nanotext/workflows/train_nanotext/scripts/cluster.py -m nanotext_r89.model --name Clostridia --taxonomy /Users/phi/data_local/databases/gtdb/bac_taxonomy_r83.tsv -o clusters.tsv
    '''
    # args = Namespace(
    #     model='nanotext_r89.model',
    #     rank='class',
    #     name='Oxyphotobacteria',
    #     taxonomy='/Users/phi/data_local/databases/gtdb/bac_taxonomy_r83.tsv',
    #     outfile='clusters.tsv',
    #     soft=False,
    #     )

    args = Namespace(model=model,
                     rank=rank,
                     name=name,
                     taxonomy=taxonomy,
                     outfile=outfile,
                     soft=soft,
                     ecotypes=ecotypes)
    # For reference, umap.UMAP defaults include: n_neighbors=15, n_components=2,
    # metric='euclidean', min_dist=0.1, spread=1.0, init='spectral',
    # random_state=None, transform_seed=42, ...
    config_umap_visualisation = {
        'metric': 'cosine',
        'n_components': 2,
        # 'n_neighbors': 5,
        # min_dist=0.05,
        # 'spread': 5,
        'random_state': 42,
    }

    config_umap_dim_reduction = {
        'metric': 'cosine',
        'n_components': 10,
        # 'n_neighbors': 10,
        # 'min_dist': 0.05,
        # 'spread': 5,
        'random_state': 42,
    }

    # min_cluster_size 3 leaf works great
    config_hdbscan = {
        'min_cluster_size': 5,
        # 'min_samples': 1,
        'cluster_selection_method': 'eom',
    }

    # Filter taxonomy file for a given rank
    names = []
    with open(args.taxonomy, 'r') as file:
        for line in file:
            if f'{args.rank[0]}__{args.name}' in line:  # d__ for domain etc.
                names.append(line.strip().split('\t')[0])
    eprint(f'There are {len(names)} data points for {args.rank} {args.name}.')

    # Extract only those vectors
    model = load_embedding(args.model)
    m, found = [], []
    for i in names:
        try:
            m.append(model.docvecs[strip_name(i)])
            found.append(strip_name(i))
        except KeyError:
            continue

    if args.ecotypes:
        ecotypes = {}
        with open(args.ecotypes) as csvfile:
            _ = next(csvfile)
            reader = csv.reader(csvfile, delimiter='\t')
            for row in reader:
                genome, e, curator = row[0], row[-2], row[-1]
                if '_' in e:
                    subtype = e
                    e = e.split('_')[0]
                else:
                    subtype = 'NA'
                ecotypes[genome] = (e, subtype, curator)

        # Extend sample list
        # TODO: this will be unnecessary once we have the r89 tax
        for i in ecotypes.keys():
            try:
                m.append(model.docvecs[i])
                found.append(i)
            except KeyError:
                continue

    m = np.array(m, dtype='float64')
    ratio = int(round(m.shape[0] / len(names), 2) * 100)
    eprint(f'Of those, {m.shape[0]} ({ratio}%) are present in the model.')

    pm = projection_method.upper()
    eprint(f'Projecting points (visualisation) using {pm} ...')
    if projection_method == 'tsne':
        projection = TSNE(n_components=2, random_state=42).fit_transform(m)
    elif projection_method == 'umap':
        reducer = umap.UMAP(**config_umap_visualisation)
        projection = reducer.fit_transform(m)
        # projection[-10:] == reducer.embedding_[-10:]
    else:
        eprint('No valid projection method. Abort!')
        sys.exit(-1)

    eprint('Projecting points (dimension reduction) ...')
    # Reduce dimensions before clustering
    reducer = umap.UMAP(**config_umap_dim_reduction)
    m_redux = reducer.fit_transform(m)

    eprint('Clustering ...')
    m_norm = normalize(m_redux, norm='l2', axis=1)
    # prediction_data=True, eom/ leaf
    clusterer = hdbscan.HDBSCAN(**config_hdbscan)
    cluster_labels = clusterer.fit_predict(m_norm)
    # Or soft clustering: init clusterer w/ prediction_data=True
    # soft_clusters = hdbscan.all_points_membership_vectors(clusterer)
    # cluster_labels = [np.argmax(x) for x in soft_clusters]

    if args.ecotypes:
        with open(args.outfile, 'w+') as out:
            out.write('c1\tc2\tname\tclade\tsubclade\tcollection\tcluster\n')

            for i, j, k in zip(projection, found, cluster_labels):
                c1, c2 = i
                clade, subclade, curator = ecotypes.get(
                    j, 2 * ['NA'] + ['GTDB'])
                out.write(
                    f'{c1}\t{c2}\t{j}\t{clade}\t{subclade}\t{curator}\t{k}\n')
    else:
        with open(args.outfile, 'w+') as out:
            out.write('c1\tc2\tname\tcluster\n')

            for i, j, k in zip(projection, found, cluster_labels):
                c1, c2 = i
                out.write(f'{c1}\t{c2}\t{j}\t{k}\n')

    eprint('Done.')
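strip_name comes from nanotext.utils and is not shown here; the sketch below assumes it removes a GTDB accession prefix (e.g. RS_ or GB_) so the name matches the keys of the document vectors:

def strip_name(name):
    # Assumed behaviour: drop a leading 'RS_' or 'GB_' GTDB prefix if present.
    for prefix in ('RS_', 'GB_'):
        if name.startswith(prefix):
            return name[len(prefix):]
    return name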
Example #7
def taxonomy(query, taxonomy, embedding, topn, outfile, fmt, steps):
    '''
    Given a query vector, get the <topn> closest vectors and their taxonomy,
    then report their raw taxonomy or use a majority vote to identify the
    most likely one.

    Usage:

    \b
    nanotext taxonomy \\
        --embedding nanotext_r89.model --taxonomy bac_taxonomy_r86.tsv \\
        --query JFOD01_pfam.tsv --fmt pfamscan --topn 10 -o results.json

    '''

    # TODO: new fmt -- name, cos, ranks
    # TODO: 2nd output -- majority vote across columns
    from collections import Counter, defaultdict
    import json
    import random

    import numpy as np
    from sklearn.manifold import TSNE
    import umap

    from nanotext.io import load_taxonomy_gtdb, load_embedding, eprint
    from nanotext.utils import infer_genome_vector, strip_name


    config_umap_visualisation = {
        'metric': 'cosine',
        'n_components': 2,
        # 'repulsion_strength': 5,
        # 'n_neighbors': 5,
        # min_dist=0.05,
        # 'spread': 5,
        'random_state': 42,
        }


    ranks = [
        'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    notfound = []
    
    db = load_taxonomy_gtdb(taxonomy)
    model = load_embedding(embedding)

    v_query = infer_genome_vector(query, model, fmt=fmt, steps=steps)
    sim = model.docvecs.most_similar([v_query], topn=topn)


    taxcollector = {i: [] for i in ranks}
    distance = {}

    for name, cos_sim in sim:
        distance[name] = round(cos_sim, 4)
        try:
            for k, v in zip(ranks, db[name]):
                taxcollector[k].append(v)

        except KeyError:
            eprint(f'{name} has no taxonomy record')
            continue


    # What is the last uniform rank, i.e. the most specific rank on which
    # all hits still agree?
    cache = ()
    for i in ranks:
        if (len(set(taxcollector[i])) == 1) and (taxcollector[i][0] != ''):
            cache = (i, taxcollector[i][0])
        else:
            break
    

    # p__Firmicutes_A
    # Collect the UIDs for this rank.
    eprint(f'Will collect all vectors for {cache[0]} {cache[1]} ...')
    names = []
    with open(taxonomy, 'r') as file:
        for line in file:
            # if 'c__Clostridia' in line:
            # if 'f__Pseudomonadaceae' in line:
            # if ('p__Firmicutes_A' in line) or ('p__Firmicutes_B' in line): 
            if f'{cache[0][0]}__{cache[1]}' in line:  # d__ for domain etc.
                names.append(line.strip().split('\t')[0])


    # Collect the associated document vector for each UID.
    m, found = [], []
    for name in names:
        try:
            m.append(model.docvecs[strip_name(name)])
            found.append(strip_name(name))
        except KeyError:
            continue


    # Project into 2D. The UMAP reducer is kept for reference; TSNE is used below.
    m = np.array(m, dtype='float64')
    reducer = umap.UMAP(**config_umap_visualisation)
    # projection = reducer.fit_transform(m)
    eprint('Projecting with TSNE ...')
    projection = TSNE(n_components=2, random_state=42).fit_transform(m)

    results = defaultdict(list)
    # results['query'].extend(reducer.transform([v_query])[0])
    # results['query'].extend(7*['query'])

    for i, j in zip(found, projection):
        results[i].extend(j)
        results[i].extend(db[i])


    # Add distance info.
    for k, v in results.items():
        results[k].append(distance.get(k, 'NA'))

    # majority, majority_ratio = {}, {}

    # for rank, taxa in vote.items():
    #     results['raw'][rank] = taxa
    #     cnt = Counter(taxa)
    #     maxn = max(cnt.values())
    #     hits = [k for k, v in cnt.items() if v == maxn]
    #     pick = random.choice(hits)
    #     majority[rank] = pick
    #     majority_ratio[rank] = round(cnt[pick]/len(taxa), 2)

    # results['majority'] = majority
    # results['ratio'] = majority_ratio
    
    with open(outfile, 'w+') as out:
        out.write('\t'.join(
            'name c1 c2 domain phylum class order family genus species cos'.split())+'\n')
        # json.dump(dict(sorted(results.items())), out, indent=4)
        for k, v in results.items():
            line = '\t'.join([str(i) for i in [k]+v])+'\n'
            out.write(line)
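The commented-out majority vote above could be restored along these lines (a sketch based directly on those commented lines, with ties broken at random as in the original):

def majority_vote(taxcollector):
    # For each rank, pick the most frequent taxon among the topn hits and
    # report which fraction of hits agreed with that pick.
    majority, ratio = {}, {}
    for rank, taxa in taxcollector.items():
        cnt = Counter(taxa)
        maxn = max(cnt.values())
        hits = [k for k, v in cnt.items() if v == maxn]
        pick = random.choice(hits)
        majority[rank] = pick
        ratio[rank] = round(cnt[pick] / len(taxa), 2)
    return majority, ratio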
Example #8
config_umap_dim_reduction = {
    'metric': 'cosine',
    'n_components': 10,
    # 'n_neighbors': 10,
    # 'min_dist': 0.05,
    # 'spread': 5,
    'random_state': 42,
}

config_hdbscan = {
    'min_cluster_size': 7,
    'min_samples': 1,
    'cluster_selection_method': 'eom',
}

model = load_embedding(args.model)
names = model.docvecs.index2entity

# https://github.com/scikit-learn-contrib/hdbscan
# because t-SNE for clustering is no good
# stats.stackexchange.com/questions/308132
m = []
for i in names:  # names .. just a list of accession IDs
    m.append(model.docvecs[i])
m = np.array(m, dtype='float64')

eprint('Projecting points (dimension reduction) ...')
# Reduce dimensions before clustering
reducer = umap.UMAP(**config_umap_dim_reduction)
m_redux = reducer.fit_transform(m)
m_norm = normalize(m_redux, norm='l2', axis=1)
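config_hdbscan is set up above but the clustering call itself is cut off; the snippet presumably continues roughly as in Example #6:

eprint('Clustering ...')
clusterer = hdbscan.HDBSCAN(**config_hdbscan)
cluster_labels = clusterer.fit_predict(m_norm)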