def evaluate(model, corpus, ecotypes, outfile):
    '''
    Run the test battery and write the scores to <outfile>.

    Two tasks are evaluated:

    - SOMO task -- find the domain in a sequence that does not fit
    - Ecotype task -- separate niche-specific subpopulations of misc species

    Every result is written as one tab-separated "<score> <task>" line.
    Nothing is returned; the output file is the only product.

    Arguments:

    - model -- path of the embedding model (handed to odd_one() and
      load_embedding())
    - corpus -- path of the test corpus used for the SOMO task
    - ecotypes -- directory holding one "<task>.tsv" file per ecotype task
    - outfile -- path of the result file (opened with mode "w+")
    '''
    ecotype_tasks = ('vibrio', 'prochlorococcus', 'pseudomonas')

    with open(outfile, 'w+') as out:
        # (1) word-level task
        eprint('SOMO task ...')
        somo_score = odd_one(corpus, model, n_not_odd=5)
        out.write(str(somo_score) + '\tSOMO\n')

        # (2) document-level task, one table per organism group
        eprint('Ecotype task ...')
        embedding = load_embedding(model)
        for task in ecotype_tasks:
            # The first return value (a rank) is not needed here.
            _, eco = load_ecotypes(f'{ecotypes}/{task}.tsv')
            # Keep only the second element of each value pair.
            labels = {key: second for key, (_first, second) in eco.items()}
            scores = ecotype_task(labels, embedding)
            for subtask, score in scores.items():
                if subtask == 'NA':
                    continue  # unlabelled group, not reported
                out.write(f'{score}\t{subtask}\n')

    eprint('Evaluation done.')
def odd_one(fp_test_corpus, fp_model, n_not_odd=5):
    '''
    SOMO task -- spot the word that does not belong to a sequence.

    Each corpus line holds three tab-separated fields: genome, contig and a
    comma-separated list of domains. The domain list is cut into pieces with
    chunks(domains, n_not_odd) (helper defined elsewhere; presumably
    consecutive pieces of at most <n_not_odd> items -- confirm). A word drawn
    at random from the model vocabulary is appended to each piece and the
    model is asked which word does not match. A trial counts as a hit when
    the model picks the random word.

    Arguments:

    - fp_test_corpus -- path of the tab-separated test corpus
    - fp_model -- path of the embedding model
    - n_not_odd -- chunk size passed to chunks()

    Returns the fraction of hits, rounded to 4 digits.

    Raises ValueError if the corpus does not yield a single usable chunk,
    because the accuracy is undefined in that case.
    '''
    import random

    from nanotext.io import load_embedding

    model = load_embedding(fp_model)
    vocab = list(model.wv.vocab.keys())
    pos, neg = 0, 0

    with open(fp_test_corpus, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank (e.g. trailing) lines would break the unpacking below.
                continue
            genome, contig, domains = line.split('\t')
            for seq in chunks(domains.split(','), n_not_odd):
                if len(seq) > 1:  # otherwise its a 50:50 coin flip
                    odd = random.choice(vocab)
                    guess = model.wv.doesnt_match(seq + [odd])
                    if guess == odd:
                        pos += 1
                    else:
                        neg += 1

    total = pos + neg
    if not total:
        raise ValueError(
            f'No sequence with more than one domain found in '
            f'{fp_test_corpus}; cannot compute the SOMO accuracy.')
    return round(pos / total, 4)
def evaluate(model, corpus, outfile, clusters):
    '''
    Run the test battery and dump the scores as JSON to <outfile>.

    Tasks:

    - SOMO -- tests word embedding quality (see odd_one())
    - Ecotype clustering -- homogeneity, completeness and v-score of the
      cluster labels against the HL/LL clade labels
    - Ecotype distance -- 1 minus the median of cosine(a, b) over all
      HL x LL genome pairs that are present in the model

    Arguments:

    - model -- path of the embedding model
    - corpus -- path of the test corpus for the SOMO task
    - outfile -- path of the JSON result file
    - clusters -- tab-separated cluster table with a header line; column 3
      is read as genome UID, column 4 as clade and column 7 as cluster label

    Result keys: SOMO, homogeneity, completeness, vscore, ecotypes_distance.
    "ecotypes_distance" is null when no HL/LL pair could be compared.
    '''
    results = {}

    # SOMO task -- tests word embedding quality
    eprint('SOMO task ...')
    results['SOMO'] = odd_one(corpus, model, n_not_odd=5)

    # Ecotype task -- tests document embedding quality
    eprint('Ecotype task ...')
    truth, cluster_labels = [], []
    genomes = defaultdict(list)  # clade -> genome UIDs
    with open(clusters, 'r') as file:
        next(file)  # header
        for line in file:
            row = line.strip().split('\t')
            clade, cluster = row[3], row[6]
            # All fields are strings, so the noise label has to be compared
            # as '-1'; comparing against the integer -1 never filters
            # anything.
            if clade in ('HL', 'LL') and cluster != '-1':
                genomes[clade].append(row[2])  # genome UID
                truth.append(clade)
                cluster_labels.append(cluster)

    h, c, v = [round(score, 4) for score in hcv(truth, cluster_labels)]
    results.update(
        dict(zip('homogeneity completeness vscore'.split(), [h, c, v])))

    # Distance
    eprint('Median cosine distance btw/ points of different ecotypes ...')
    # Separate name: do not overwrite the <model> path argument.
    embedding = load_embedding(model)
    distances = []
    for i, j in product(genomes['HL'], genomes['LL']):
        try:
            a = embedding.docvecs[i]
            b = embedding.docvecs[j]
        except KeyError:
            continue  # genome not part of the model
        distances.append(cosine(a, b))

    # NOTE(review): the stored value is 1 - median(cosine(a, b)); if cosine()
    # is a distance this is a similarity despite the key name -- confirm.
    if distances:
        results['ecotypes_distance'] = round(
            float(1 - np.median(distances)), 4)
    else:
        # The median of nothing is NaN, which is not valid JSON.
        results['ecotypes_distance'] = None

    with open(outfile, 'w+') as out:
        json.dump(results, out, indent=4)
def predict(genome, embedding, db, model, out, topn):
    '''
    From a <genome> w/ annotated protein domains predict a phenotype. Requires
    the learned <model> (genotype-phenotype mapping) as well as a genome
    <embedding>. Return the closest <topn> vectors from a database <db>.

    Usage:

    \b
    nanotext predict \\
        --out - \\
        --model data/media_prediction.h5 \\
        --db data/embedding.media.json \\
        --embedding data/embedding.genomes.model \\
        --genome data/TARA_ION_MAG_00012.domtbl.tsv \\
        --topn 3
    '''
    # Developer notes (kept out of the docstring, which looks like CLI help
    # text): the result is written to <out> as tab-separated
    # "<name> <cosine rounded to 4 digits>" lines, one per hit returned by
    # cosim2() (defined elsewhere). Nothing is returned.
    import json
    import os

    # Has to be in the environment *before* TensorFlow is imported, so that
    # it is already in place when the native library sets up its logging.
    # It has no effect if TensorFlow was imported earlier in this process.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # turn off debugging info

    import numpy as np
    import tensorflow as tf  # noqa: F401 -- import kept from the original
    from keras.models import load_model

    from nanotext.io import load_embedding, smart_open, eprint
    from nanotext.utils import infer_genome_vector

    eprint('Loading embedding model for genomes ...')
    e = load_embedding(embedding)

    eprint('Inferring genome vector ...')
    v = infer_genome_vector(genome, e)

    eprint('Loading media vector database ...')
    with open(db, 'r') as file:
        vv = json.load(file)  # vv .. vectors

    eprint('Loading predictive model ...')
    nn = load_model(model)
    # The network expects a batch; [0] .. only single genome for now
    y_hat = nn.predict(np.array([v]))[0]

    sim = cosim2(y_hat, vv, topn)

    with smart_open(out) as fh:
        # No header line is written.
        for name, cos in sim:
            fh.write(f'{name}\t{round(cos, 4)}\n')
    eprint('Done.')
def __init__(self, fp, mode='ensemble', norm=None, names=None):
    '''
    Load the embedding model(s) below <fp> and index their vectors.

    Arguments:

    - fp -- directory that holds one subdirectory per model; each model is
      read from "<fp>/<n>/nanotext_r89.model"
    - mode -- which models to load: "ensemble" (22, 45, 93), "core" (93)
      or "accessory" (22)
    - norm -- passed on to index_model(); when set, a message announces that
      this norm will be applied to the vectors
    - names -- entities to index; defaults to all entities of the first
      model

    Raises ValueError for an unknown <mode>.
    '''
    self.mode = mode

    if mode == 'ensemble':
        nn = ['22', '45', '93']
    elif mode == 'core':
        nn = ['93']
    elif mode == 'accessory':
        nn = ['22']
    else:
        raise ValueError(
            'Mode not implemented (try "ensemble", "core" or "accessory")')

    self.models = [
        load_embedding(str(Path(fp) / f'{n}/nanotext_r89.model'))
        for n in nn]
    self.norm = norm

    eprint('Subtracting mean from model(s) ...')
    # _demean() is defined elsewhere on this class.
    self.nomean = self._demean(self.models)

    # Always set the attribute: it used to be left undefined whenever the
    # caller supplied <names>, which broke the indexing step below.
    if names:
        self.names = names
    else:
        self.names = self.models[0].docvecs.index2entity
    self.dim = len(self.models[0].docvecs[0])

    eprint('Indexing model(s) ...')
    if norm:
        eprint(f'{self.norm} norm will be applied to vectors')
    # index_model() is defined elsewhere; presumably it returns the names it
    # could resolve, their vectors and a search index -- confirm.
    found, m, self.index = index_model(
        self.names, list(self.nomean), self.norm)
    self.embedding = dict(zip(found, m))

    # Per-model mean document vector.
    self.means = []
    for model in self.models:
        mu = np.mean(
            [model.docvecs[i] for i in range(len(model.docvecs))], axis=0)
        self.means.append(mu)

    self.warn_on_ensemble_inference = False
def cluster_subset(model, rank, name, taxonomy, outfile, soft, ecotypes,
                   projection_method):
    '''
    TODO: https://github.com/lmcinnes/umap/issues/90

    Iterate over a taxonomic rank such as class and cluster using HDBSCAN.
    One reason we believe we can do this is that at higher ranks there are
    clear boundaries between organisms. The main motivation behind it is that
    HDBSCAN clusters are rather coarse when the whole dataset is clustered at
    once, and "soft-clustering" does not scale.

    Example:

    python cluster.py -m nanotext_r89.model --name Clostridia
        --taxonomy bac_taxonomy_r83.tsv -o clusters.tsv

    Arguments:

    - model -- path of the embedding model
    - rank, name -- taxon to select, e.g. "class" and "Clostridia"; taxonomy
      lines containing "<first letter of rank>__<name>" are selected
    - taxonomy -- taxonomy table; the first tab-separated field of a line is
      the genome name
    - outfile -- path of the tab-separated result table
    - soft -- currently unused
    - ecotypes -- optional tab-separated table (with header) of additional
      genomes; first column genome, second to last ecotype, last curator
    - projection_method -- "tsne" or "umap", used for the 2D projection

    The result table holds the 2D coordinates, the genome name and its
    cluster label, plus clade, subclade and collection columns when
    <ecotypes> is given. The process exits with status -1 on an invalid
    projection method or when no vector could be found.
    '''
    # Fail fast, before the model is loaded and anything is computed.
    if projection_method not in ('tsne', 'umap'):
        eprint('No valid projection method. Abort!')
        sys.exit(-1)

    config_umap_visualisation = {
        'metric': 'cosine',
        'n_components': 2,
        # 'n_neighbors': 5,
        # min_dist=0.05,
        # 'spread': 5,
        'random_state': 42,
    }

    config_umap_dim_reduction = {
        'metric': 'cosine',
        'n_components': 10,
        # 'n_neighbors': 10,
        # 'min_dist': 0.05,
        # 'spread': 5,
        'random_state': 42,
    }

    # min_cluster_size 3 leaf works great
    config_hdbscan = {
        'min_cluster_size': 5,
        # 'min_samples': 1,
        'cluster_selection_method': 'eom',
    }

    # Filter taxonomy file for a given rank
    needle = f'{rank[0]}__{name}'  # d__ for domain etc.
    with open(taxonomy, 'r') as file:
        names = [
            line.strip().split('\t')[0] for line in file if needle in line]
    eprint(f'There are {len(names)} data points for {rank} {name}.')

    # Extract only those vectors
    embedding = load_embedding(model)
    m, found = [], []
    for i in names:
        uid = strip_name(i)
        try:
            m.append(embedding.docvecs[uid])
        except KeyError:
            continue
        found.append(uid)

    ecotype_table = {}
    if ecotypes:
        with open(ecotypes) as csvfile:
            next(csvfile)  # header
            reader = csv.reader(csvfile, delimiter='\t')
            for row in reader:
                genome, e, curator = row[0], row[-2], row[-1]
                if '_' in e:
                    # "<clade>_<suffix>": keep the full label as subtype.
                    subtype = e
                    e = e.split('_')[0]
                else:
                    subtype = 'NA'
                ecotype_table[genome] = (e, subtype, curator)

        # Extend sample list
        # TODO: this will be unnecessary once we have the r89 tax
        seen = set(found)
        for genome in ecotype_table:
            if genome in seen:
                continue  # already selected via the taxonomy
            try:
                m.append(embedding.docvecs[genome])
            except KeyError:
                continue
            found.append(genome)
            seen.add(genome)

    m = np.array(m, dtype='float64')
    if m.shape[0] == 0:
        eprint('None of the data points are present in the model. Abort!')
        sys.exit(-1)

    # <names> can be empty when all vectors come from the ecotype table.
    ratio = int(round(m.shape[0] / len(names), 2) * 100) if names else 0
    eprint(f'Of those, {m.shape[0]} ({ratio}%) are present in the model.')

    pm = projection_method.upper()
    eprint(f'Projecting points (visualisation) using {pm} ...')
    if projection_method == 'tsne':
        projection = TSNE(n_components=2, random_state=42).fit_transform(m)
    else:  # 'umap', validated above
        reducer = umap.UMAP(**config_umap_visualisation)
        projection = reducer.fit_transform(m)

    eprint('Projecting points (dimension reduction) ...')
    # Reduce dimensions before clustering
    reducer = umap.UMAP(**config_umap_dim_reduction)
    m_redux = reducer.fit_transform(m)

    eprint('Clustering ...')
    m_norm = normalize(m_redux, norm='l2', axis=1)
    # NOTE(review): m_norm is computed but the clustering below runs on the
    # 2D <projection>. Kept as is because switching the input changes the
    # results -- confirm which one is intended.
    clusterer = hdbscan.HDBSCAN(**config_hdbscan)
    cluster_labels = clusterer.fit_predict(projection)
    # Or soft clustering: init clusterer w/ prediction_data=True
    # soft_clusters = hdbscan.all_points_membership_vectors(clusterer)
    # cluster_labels = [np.argmax(x) for x in soft_clusters]

    with open(outfile, 'w+') as out:
        if ecotypes:
            out.write('c1\tc2\tname\tclade\tsubclade\tcollection\tcluster\n')
            for (c1, c2), j, k in zip(projection, found, cluster_labels):
                # Genomes without ecotype record come from the taxonomy.
                clade, subclade, curator = ecotype_table.get(
                    j, 2 * ['NA'] + ['GTDB'])
                out.write(
                    f'{c1}\t{c2}\t{j}\t{clade}\t{subclade}\t{curator}\t{k}\n')
        else:
            out.write('c1\tc2\tname\tcluster\n')
            for (c1, c2), j, k in zip(projection, found, cluster_labels):
                out.write(f'{c1}\t{c2}\t{j}\t{k}\n')
    eprint('Done.')
def taxonomy(query, taxonomy, embedding, topn, outfile, fmt, steps):
    '''
    Given a query vector, get the <n> closest vectors and their taxonomy and
    then report their <raw> taxonomy or use <majority vote> to identify the
    most likely (?) one.

    Usage:

    \b
    nanotext taxonomy \\
        --embedding nanotext_r89.model --taxonomy bac_taxonomy_r86.tsv \\
        --query JFOD01_pfam.tsv --fmt pfamscan --topn 10 -o results.json
    '''
    # TODO: new fmt name, cos, ranks; 2nd output majority vote across columns
    #
    # Developer notes (kept out of the docstring, which looks like CLI help
    # text). What the code currently does:
    #
    # 1. infer a vector for <query> and fetch its <topn> nearest documents
    # 2. find the lowest rank at which all hits with a taxonomy record agree
    # 3. collect every genome of that taxon from the <taxonomy> file
    # 4. project their vectors to 2D with t-SNE
    # 5. write name, coordinates, lineage and (for the nearest hits) the
    #    rounded cosine similarity to <outfile>, tab-separated
    #
    # The process exits with status -1 when the hits share no rank or when
    # none of the collected genomes is present in the model.
    from collections import defaultdict
    import sys

    import numpy as np
    from sklearn.manifold import TSNE

    from nanotext.io import load_taxonomy_gtdb, load_embedding, eprint
    from nanotext.utils import infer_genome_vector, strip_name

    ranks = [
        'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']

    db = load_taxonomy_gtdb(taxonomy)
    model = load_embedding(embedding)

    v_query = infer_genome_vector(query, model, fmt=fmt, steps=steps)
    sim = model.docvecs.most_similar([v_query], topn=topn)

    taxcollector = {rank: [] for rank in ranks}
    distance = {}
    for name, cos_sim in sim:
        distance[name] = round(cos_sim, 4)
        try:
            lineage = db[name]
        except KeyError:
            eprint(f'{name} has no taxonomy record')
            continue
        for rank, taxon in zip(ranks, lineage):
            taxcollector[rank].append(taxon)

    # What is the last uniform rank?
    cache = ()
    for rank in ranks:
        taxa = taxcollector[rank]
        if len(set(taxa)) == 1 and taxa[0] != '':
            cache = (rank, taxa[0])  # e.g. ('phylum', 'Firmicutes_A')

    if not cache:
        # Without this guard the lookups below fail with an IndexError.
        eprint('The closest vectors do not share any taxonomic rank. Abort!')
        sys.exit(-1)

    # Collect the UIDs for this rank.
    eprint(f'Will collect all vectors for {cache[0]} {cache[1]} ...')
    needle = f'{cache[0][0]}__{cache[1]}'  # d__ for domain etc.
    with open(taxonomy, 'r') as file:
        names = [
            line.strip().split('\t')[0] for line in file if needle in line]

    # Collect the associated document vector for each UID.
    m, found = [], []
    for name in names:
        uid = strip_name(name)
        try:
            m.append(model.docvecs[uid])
        except KeyError:
            continue
        found.append(uid)

    if not m:
        eprint(f'No vectors found for {cache[0]} {cache[1]}. Abort!')
        sys.exit(-1)

    # Project into 2D.
    m = np.array(m, dtype='float64')
    eprint('Projecting with TSNE ...')
    projection = TSNE(n_components=2, random_state=42).fit_transform(m)

    results = defaultdict(list)
    for uid, coords in zip(found, projection):
        results[uid].extend(coords)
        results[uid].extend(db[uid])

    # Add distance info; only the nearest neighbours have one.
    for uid, row in results.items():
        row.append(distance.get(uid, 'NA'))

    with open(outfile, 'w+') as out:
        header = ('name c1 c2 domain phylum class order family genus '
                  'species cos').split()
        out.write('\t'.join(header) + '\n')
        for uid, row in results.items():
            out.write('\t'.join(str(i) for i in [uid] + row) + '\n')
# Settings for the UMAP dimension reduction that precedes clustering.
config_umap_dim_reduction = dict(
    metric='cosine',
    n_components=10,
    # n_neighbors=10,
    # min_dist=0.05,
    # spread=5,
    random_state=42,
)

# HDBSCAN settings.
config_hdbscan = dict(
    min_cluster_size=7,
    min_samples=1,
    cluster_selection_method='eom',
)

model = load_embedding(args.model)
names = model.docvecs.index2entity  # names .. just a list of accession IDs

# https://github.com/scikit-learn-contrib/hdbscan
# because t-SNE for clustering is no good
# stats.stackexchange.com/questions/308132
# One document vector per accession ID, stacked into a float64 matrix.
m = np.array([model.docvecs[uid] for uid in names], dtype='float64')

eprint('Projecting points (dimension reduction) ...')
# Reduce dimensions before clustering
reducer = umap.UMAP(**config_umap_dim_reduction)
m_redux = reducer.fit_transform(m)
# Scale every reduced vector (row) to unit l2 norm.
m_norm = normalize(m_redux, norm='l2', axis=1)