def genotype_and_saving(inputs, pool, **params):
    """Assign a barcode to every input sketch and hand the result to save2mash.

    Each element of ``inputs`` is indexed positionally: ``i[0]`` is the
    numeric index of the record, ``i[2]`` the path of its sketch file,
    ``i[3]`` the name under which Mash reports it and ``i[4]`` the value
    every barcode level is initialised with.

    The best hit of each input is looked up twice:

    1. against the existing database (one ``utils.run_mash`` job per input,
       distributed over ``pool``), and
    2. against the other inputs of this batch (a single all-vs-all run on a
       temporary merged sketch), considering only partners with a smaller
       index so that a record can only inherit from an earlier record.

    For every barcode level whose threshold in ``params['barcode_dist']`` is
    at least as large as the best-hit distance, the input inherits the hit's
    code at that level; inheritance stops at the first level that is too
    strict.

    Args:
        inputs: sequence of per-record sequences as described above.
        pool: object exposing ``map(func, iterable)`` (e.g. a process pool).
        **params: run configuration; this function reads ``barcode_dist``,
            ``dbname``, ``mash`` and ``n_thread`` and forwards everything to
            ``utils.run_mash`` / ``save2mash``.

    Returns:
        Whatever ``save2mash(inputs, codes, **params)`` returns.
    """
    barcode_dist = params['barcode_dist']
    input_idx = {i[3]: i[0] for i in inputs}
    codes = {i[0]: [i[4] for _ in barcode_dist] for i in inputs}

    # Best hit of every input against the existing database.
    res = {}
    for r in pool.map(utils.run_mash, [[i[2], None, 1, params] for i in inputs]):
        if len(r) > 0:
            res[input_idx[r[0][0]]] = r[0]

    # Merge all input sketches into one file for the all-vs-all comparison.
    # The argument vector is built as a list so that paths containing
    # whitespace stay intact; only the configured executable string is
    # split, which keeps a "program plus options" setting working.
    merged_input = os.path.join(params['dbname'], 'merged_input.msh')
    paste_cmd = '{mash}'.format(**params).split()
    paste_cmd += ['paste', merged_input]
    paste_cmd += [i[2] for i in inputs]
    subprocess.Popen(
        paste_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    ).communicate()

    try:
        for r in utils.run_mash([merged_input, merged_input, params['n_thread'], params]):
            query_id, hit_id = input_idx[r[0]], input_idx[r[1]]
            if query_id > hit_id:
                # Keep the hit only if it is closer than what is known so far.
                if query_id not in res or res[query_id][2] > r[2]:
                    # Replace the hit name with its numeric index; the
                    # non-string type later marks it as a batch-internal hit.
                    r[1] = hit_id
                    res[query_id] = r
    finally:
        # Remove the temporary sketch even when the comparison fails.
        os.unlink(merged_input)

    # Visit records in index order so that a batch-internal hit (always a
    # smaller index) already carries its final code when it is inherited.
    for idx, code in sorted(codes.items()):
        if res.get(idx, None) is not None:
            best_sim, best_hit = res[idx][2], res[idx][1]
            if isinstance(best_hit, basestring):
                # Database hit: dot-separated record whose fields each carry
                # a one-character prefix in front of the integer code.
                best_hit = [int(r[1:]) for r in best_hit.split('.')]
            else:
                best_hit = codes[best_hit]
            for i, d in enumerate(barcode_dist):
                if d >= best_sim:
                    code[i] = int(best_hit[i])
                else:
                    break
    return save2mash(inputs, codes, **params)
def query_sample(params):
    """Compare a query against the database and print the hits as JSON.

    The query file is sketched with ``utils.get_mash`` and compared with
    ``utils.run_mash``. Element 1 of every returned row is used as a
    dot-separated record and element 2 as a distance (similarity is reported
    as ``1 - distance``). Hits are kept when their similarity is at least
    98% of the first hit's similarity and their distance does not exceed
    the first threshold in ``params['barcode_dist']``.

    The printed JSON object has two keys:

    * ``matches`` - one entry per retained hit with its record, similarity,
      organism name and assembly accession (looked up in the metadata table
      through the last field of the record);
    * ``groups`` - one entry per distinct record prefix, for each level
      whose distance threshold the hit satisfies, enriched with
      ``utils.retrieve_info`` and ordered by distance, then by level from
      the most specific one.

    Both lists are empty when no hit is returned.

    Args:
        params: raw parameters, normalised by ``utils.load_paramDict``. Must
            provide ``query`` (an existing file), ``dbname``, ``n_thread``
            and ``barcode_dist``; ``dtype == 'read'`` selects read mode.

    Raises:
        AssertionError: if the query file or the database metadata file is
            missing.
    """
    params = utils.load_paramDict(params)
    assert 'query' in params and os.path.isfile(params['query']), 'no query'
    existing_data = os.path.join(params['dbname'], 'db_metadata.msg')
    # Test the file itself: the path string alone is always truthy.
    assert os.path.isfile(existing_data), 'no data in the database.'
    data = pd.read_msgpack(existing_data)

    is_read = params.get('dtype', 'fasta') == 'read'
    msh_file = utils.get_mash(params['query'], is_read=is_read, **params)
    try:
        if is_read:
            result = utils.run_mash([msh_file, None, params['n_thread'], params], is_read=True)
        else:
            result = utils.run_mash([msh_file, None, params['n_thread'], params])
    finally:
        # The query sketch is temporary; remove it even if the search fails.
        os.unlink(msh_file)

    groups, matches = [], []
    if len(result) > 0:
        r_id = np.array([r[2] for r in result])
        result = np.array([r[1].split('.') for r in result])
        keep = (1 - r_id >= 0.98 * (1 - r_id[0])) & (r_id <= params['barcode_dist'][0])
        result, r_id = result[keep], r_id[keep]

        # Metadata lookup keyed by the last record field ('a' + index).
        m = {
            'a' + k: [n, a]
            for k, n, a in data[['index', 'organism_name', 'assembly_accession']].as_matrix()
        }
        # Built before the loop below, which blanks fields of `result`.
        matches = [
            dict(record='.'.join(r), similarity=1 - i,
                 organism_name=m[r[-1]][0], assembly_accession=m[r[-1]][1])
            for r, i in zip(result, r_id)
        ]

        best_by_group = {}
        for level, (dcut, dgroup) in enumerate(zip(params['barcode_dist'], result.T[:-1])):
            # `dgroup` is a view into `result`: hits that are too distant
            # for this level lose their code in place.
            dgroup[r_id > dcut] = ''
            first_rows = np.unique(dgroup, return_index=True)[1]
            for row in first_rows:
                hit = result[row]
                if hit[level] != '':
                    tag = '.'.join(hit[:level + 1])
                    best_by_group[tag] = [r_id[row], -level, '.'.join(hit)]

        groups = [
            dict(group=c, similarity=1.0 - d[0])
            for c, d in sorted(best_by_group.items(), key=lambda x: x[1])
        ]
        for g in groups:
            g.update(utils.retrieve_info(g['group'], data=data, **params))

    print(json.dumps(dict(groups=groups, matches=matches), sort_keys=True, indent=2))
import os, sys, pandas as pd, numpy as np, json, msgpack import utils if __name__ == '__main__': params = utils.load_params(sys.argv) assert 'query' in params and os.path.isfile(params['query']), 'no query' existing_data = os.path.join(params['dbname'], 'db_metadata.msg') assert existing_data, 'no data in the database.' data = pd.read_msgpack(existing_data) if params.get('dtype', 'fasta') == 'read': msh_file = utils.get_mash(params['query'], is_read=True, **params) result = utils.run_mash([msh_file, None, params['n_thread'], params], is_read=True) else: msh_file = utils.get_mash(params['query'], is_read=False, **params) result = utils.run_mash([msh_file, None, params['n_thread'], params]) os.unlink(msh_file) if len(result) > 0: r_id = np.array([r[2] for r in result]) result = np.array([r[1].split('.') for r in result]) result, r_id = result[(1 - r_id >= 0.98 * (1 - r_id[0])) & (r_id <= params['barcode_dist'][0])], r_id[ (1 - r_id >= 0.98 * (1 - r_id[0])) & (r_id <= params['barcode_dist'][0])] groups = {} m = { 'a' + k: [n, a]