def from_table(form):
    # Parse the uploaded abundance table: tab-separated, '#' lines ignored,
    # header row of sample names, functional entry IDs in the first column
    raw_table = pd.read_csv(form.input_file.data, delimiter='\t', comment='#',
                            header=0, index_col=0).dropna(how='all')
    root = models.Tree.objects().get(source=form.target.data)['tree']
    nodes = tree.get_nodes(root)
    entry_to_layer = {node['entry']: node['layer'] for node in nodes}
    # Build one profile element per input row
    profile = []
    for entry in raw_table.index:
        profile.append({
            'entry': entry,
            'layer': analysis.get_layer(entry, entry_to_layer),
            'values': [raw_table.loc[entry].tolist()]
        })
    colors = []
    if form.color_file.data:
        colors = pd.read_csv(form.color_file.data, header=None,
                             delimiter='\t').values.tolist()
    utcnow = datetime.datetime.utcnow()
    return models.Profile(
        profile_id=uuid.uuid4(),
        profile=profile,
        series=['Raw'],
        columns=[raw_table.columns.tolist()],
        colors=colors,
        target=form.target.data,
        description=form.description.data,
        added_at=utcnow,
        expire_at=utcnow + datetime.timedelta(days=app.config['FUNCTREE_PROFILE_TTL_DAYS']),
        private=form.private.data).save().profile_id
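
# A minimal, self-contained sketch (not used by the app) of the table format that
# from_table() expects and of the profile elements it builds from it. The entry IDs
# and values below are made up for illustration; 'layer' normally comes from the
# functional tree via analysis.get_layer() and is omitted here.
def _example_from_table_input():
    import io
    import pandas as pd

    tsv = (
        '# example upload: tab-separated, "#" lines are ignored\n'
        'entry\tsample1\tsample2\n'
        'K00001\t1.0\t0.0\n'
        'K00002\t3.5\t2.1\n'
    )
    raw_table = pd.read_csv(io.StringIO(tsv), delimiter='\t', comment='#',
                            header=0, index_col=0).dropna(how='all')
    # One profile element per row; the real code also resolves 'layer'
    return [{'entry': entry, 'values': [raw_table.loc[entry].tolist()]}
            for entry in raw_table.index]
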
def calc_coverages(df, target, result_holder, method='mean'):
    """Calculate module coverages for the submitted KO abundance table.

    TODO: replace this whole part by a call to Omixer-RPM (omixer-rpm.jar)
    to cut the computation time (currently around two minutes).
    """
    root = models.Tree.objects().get(source=target)['tree']
    definition = models.Definition.objects().get(source=target)['definition']
    root = copy.deepcopy(root)
    nodes = tree.get_nodes(root)
    entry_to_layer = {node['entry']: node['layer'] for node in nodes}
    # Drop the KO-level leaves so only module-and-above nodes remain
    tree.delete_children(root, 'module')
    nodes_no_ko = tree.get_nodes(root)
    ### HACK_START
    graphs = crckm.format_definition(definition)
    # Write the KO table to a temporary file for crckm
    from tempfile import NamedTemporaryFile
    f = NamedTemporaryFile(delete=False)
    f.close()
    tmp_file = f.name
    df.to_csv(tmp_file, sep='\t', encoding='utf-8')
    df_crckm = crckm.calculate(ko_file=tmp_file, module_graphs=graphs,
                               method=method, threshold=0)
    os.unlink(tmp_file)
    #tmp_out = "/tmp/out_dir"
    # call
    #kegg_db = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static/data/ortholog_mapping/module_definition_2017_ipath_version')
    #call("java -jar /opt/omixer-rpm/omixer-rpm.jar -i %s -o %s -a 2 -c 0 -d %s -e 2 -s average > /dev/null" % (tmp_file, tmp_out, kegg_db), shell=True, env=os.environ.copy())
    # read the coverage matrix
    #module_coverage = os.path.join(tmp_out, 'modules-coverage.tsv')
    #df_crckm = pd.read_csv(module_coverage, delimiter='\t', comment='#', header=0, index_col=0)
    ### HACK_END
    results = {}
    analysis.calc_abundances(df_crckm, nodes_no_ko, method, results)
    # Concatenate the user's input (binarised to presence/absence) and the results
    df_out = pd.concat([df.applymap(lambda x: int(bool(x))), results[method]])
    result_holder['modulecoverage'] = df_out
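
# A rough sketch of what the Omixer-RPM replacement hinted at in the HACK block above
# could look like: write the KO table to a temporary file, shell out to omixer-rpm.jar
# with the flags from the commented-out call, and read back 'modules-coverage.tsv'.
# The jar location, database path and output layout are taken from those comments and
# may well differ in a real deployment; this helper is illustrative only.
def _calc_coverages_omixer(df, kegg_db, jar='/opt/omixer-rpm/omixer-rpm.jar'):
    import os
    import subprocess
    import tempfile

    import pandas as pd

    tmp_dir = tempfile.mkdtemp()
    ko_file = os.path.join(tmp_dir, 'input.tsv')
    out_dir = os.path.join(tmp_dir, 'out')
    df.to_csv(ko_file, sep='\t', encoding='utf-8')
    # Flags copied from the commented-out call above
    subprocess.check_call([
        'java', '-jar', jar,
        '-i', ko_file, '-o', out_dir,
        '-a', '2', '-c', '0', '-d', kegg_db, '-e', '2', '-s', 'average',
    ])
    coverage_file = os.path.join(out_dir, 'modules-coverage.tsv')
    return pd.read_csv(coverage_file, delimiter='\t', comment='#',
                       header=0, index_col=0)
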
def calc_abundances(f1, f2, target):
    df1 = analysis.load_input(f1)
    df2 = analysis.load_input(f2)
    root = models.Tree.objects().get(source=target)['tree']
    nodes = tree.get_nodes(root)
    entry_to_layer = {node['entry']: node['layer'] for node in nodes}
    # Transform external annotations to KEGG KOs
    if target.lower() in ['kegg', 'foam', 'enteropathway']:
        df1 = analysis.map_external_annotations(df1)
        df2 = analysis.map_external_annotations(df2)
    # Summarise each input over the functional tree
    results1 = {}
    analysis.calc_abundances(df1, nodes, 'mean', results1)
    results2 = {}
    analysis.calc_abundances(df2, nodes, 'mean', results2)
    # Compare the two groups entry by entry
    df_result = pd.DataFrame()
    for entry in (set(results1['mean'].index) & set(results2['mean'].index)):
        try:
            result = scipy.stats.mannwhitneyu(results1['mean'].loc[entry],
                                              results2['mean'].loc[entry])
        except ValueError:
            # mannwhitneyu raises ValueError e.g. when all values are identical
            continue
        pvalue = result.pvalue
        # Score: -log10(p) for significant entries, 0 otherwise
        score = -np.log10(pvalue) if pvalue < 0.05 else 0
        df_result.loc[entry, 'P-value'] = pvalue
        df_result.loc[entry, 'Score'] = score
    profile = []
    for entry in df_result.index:
        values = [df_result.loc[entry].tolist()]
        profile.append({
            'entry': entry,
            'layer': analysis.get_layer(entry, entry_to_layer),
            'values': values
        })
    data = {
        'profile': profile,
        'series': ['Mann-Whitney U test'],
        'columns': [['P-value', 'Score']]
    }
    return data
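
# Small illustration (not called by the app) of the scoring rule used above:
# significant p-values are mapped to -log10(p), everything else to 0, so e.g.
# p = 0.01 gives a score of 2.0 and p = 0.2 gives 0. The helper name and the
# alpha parameter are illustrative; the code above hard-codes 0.05.
def _pvalue_to_score(pvalue, alpha=0.05):
    import numpy as np
    return -np.log10(pvalue) if pvalue < alpha else 0.0
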
def calc_abundances(df, nodes, method, results):
    """Generate mean or sum abundances for all levels of the functional tree.

    FIXME: this is algorithmically sub-optimal as it walks the tree in an
    arbitrary order and also visits all the leaves. A better way could be:
        - map the lowest layer first and keep a list of unmapped elements
        - summarise upper layers by aggregating their children, replacing
          an entry whenever something from that list is matched
        - repeat until the root is reached
    """
    df_dict = {}
    for node in nodes:
        entry_profile = None
        # Compute a value only for entries not processed yet
        if node['entry'] not in df_dict:
            if 'children' not in node:
                # Leaf node: if it is in the abundance matrix, take the row as is
                try:
                    entry_profile = df.loc[node['entry']]
                except KeyError:
                    pass
            else:
                # Internal node: collect the leaf ids below the current node
                targets = [
                    child_node['entry']
                    for child_node in tree.get_nodes(node)
                    if 'children' not in child_node
                ]
                try:
                    # Select the rows of all descendant leaves ...
                    loc = df.loc[targets]
                    # ... and aggregate them per sample (mean or sum)
                    entry_profile = getattr(loc, method)()
                except KeyError:
                    # None of the descendant leaves is in the submitted profile
                    pass
        if entry_profile is not None:
            # Keep the per-sample values in column order
            df_dict[node['entry']] = entry_profile.tolist()
    df_out = pd.DataFrame.from_dict(df_dict, orient='index')
    if not df_out.empty:
        df_out.columns = df.columns
    df_out = df_out.dropna(how='all').fillna(0.0)
    results[method] = df_out
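
# A sketch of the bottom-up strategy described in the FIXME above, assuming the same
# nested node layout ({'entry': ..., 'children': [...]}) used elsewhere in this module.
# Each node is visited once: leaves take their row from the abundance matrix, inner
# nodes aggregate whatever their children produced. Note that for 'mean' this averages
# the child summaries rather than all descendant leaves, which follows the FIXME's
# outline but is not numerically identical to what calc_abundances() computes today.
# Illustrative only; not wired into the application.
def _aggregate_bottom_up(node, df, method, out):
    import pandas as pd

    if 'children' not in node:
        # Leaf: take the row from the abundance matrix if it is present
        profile = df.loc[node['entry']] if node['entry'] in df.index else None
    else:
        # Inner node: aggregate the profiles computed for its children
        child_profiles = [
            _aggregate_bottom_up(child, df, method, out)
            for child in node['children']
        ]
        child_profiles = [p for p in child_profiles if p is not None]
        if child_profiles:
            profile = getattr(pd.concat(child_profiles, axis=1), method)(axis=1)
        else:
            profile = None
    if profile is not None:
        # Keep the first value seen for entries that occur more than once in the tree
        out.setdefault(node['entry'], profile)
    return profile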