示例#1
0
def from_table(form):
    """Build a ``models.Profile`` from an uploaded table and save it.

    ``form.input_file.data`` is parsed as tab-separated text: lines starting
    with ``#`` are treated as comments, the first remaining line is the
    header and the first column becomes the row index. Rows whose values are
    all missing are dropped. Each remaining row becomes one profile item
    holding the entry, its layer (looked up through ``analysis.get_layer``)
    and the row values as a single series named ``'Raw'``.

    Args:
        form: Submitted form. The attributes read here are
            ``input_file.data``, ``color_file.data`` (optional
            tab-separated file without header), ``target.data``,
            ``description.data`` and ``private.data``.

    Returns:
        The ``profile_id`` attribute of the object returned by ``save()``.
    """
    raw_table = pd.read_csv(form.input_file.data,
                            delimiter='\t',
                            comment='#',
                            header=0,
                            index_col=0).dropna(how='all')
    root = models.Tree.objects().get(source=form.target.data)['tree']
    entry_to_layer = {
        node['entry']: node['layer'] for node in tree.get_nodes(root)
    }

    # ``.loc`` replaces ``.ix``, which was removed in pandas 1.0; the lookup
    # is label based because ``entry`` comes straight from the index.
    profile = [
        {
            'entry': entry,
            'layer': analysis.get_layer(entry, entry_to_layer),
            'values': [raw_table.loc[entry].tolist()]
        }
        for entry in raw_table.index
    ]

    colors = []
    if form.color_file.data:
        # ``.values`` replaces ``as_matrix()``, which was removed in
        # pandas 1.0; both yield the underlying 2-D array.
        colors = pd.read_csv(form.color_file.data, header=None,
                             delimiter='\t').values.tolist()

    utcnow = datetime.datetime.utcnow()
    ttl = datetime.timedelta(days=app.config['FUNCTREE_PROFILE_TTL_DAYS'])
    return models.Profile(
        profile_id=uuid.uuid4(),
        profile=profile,
        series=['Raw'],
        columns=[raw_table.columns.tolist()],
        colors=colors,
        target=form.target.data,
        description=form.description.data,
        added_at=utcnow,
        expire_at=utcnow + ttl,
        private=form.private.data).save().profile_id
示例#2
0
def calc_coverages(df, target, result_holder, method='mean'):
    """Compute module coverages for ``df`` and store them in ``result_holder``.

    The input table is written to a temporary tab-separated file, handed to
    ``crckm.calculate`` together with the module graphs built from the
    stored definition of ``target``, and the resulting table is aggregated
    over the tree (with the children of 'module' nodes removed) by
    ``analysis.calc_abundances``.

    TODO: replace the crckm step by a call to Omixer-RPM.jar to cut the
    computation time (the original note quotes about 2 minutes for the
    current implementation). The intended invocation is kept as a comment
    in the body.

    Args:
        df: Input abundance table (pandas DataFrame).
        target: Value of the ``source`` field used to look up the stored
            tree and definition.
        result_holder: Mutable mapping; the result is stored under the key
            ``"modulecoverage"``.
        method: Aggregation method name passed to ``crckm.calculate`` and
            ``analysis.calc_abundances``.

    Returns:
        None. The output is the input table converted to 0/1 values,
        followed by the rows of the aggregated coverage table.
    """
    # Local imports: neither name is otherwise required by this function's
    # module-level dependencies.
    from tempfile import NamedTemporaryFile

    import pandas as pd

    root = models.Tree.objects().get(source=target)['tree']
    definition = models.Definition.objects().get(source=target)['definition']

    # delete_children() is called for its effect on ``root``, so operate on
    # a deep copy instead of the object returned by the tree lookup.
    root = copy.deepcopy(root)
    tree.delete_children(root, 'module')
    nodes_no_ko = tree.get_nodes(root)

    ### HACK_START
    graphs = crckm.format_definition(definition)

    # crckm reads its input from a path, so round-trip the table through a
    # temporary file. delete=False + close() lets the file be reopened by
    # name; try/finally guarantees it is removed even when the calculation
    # fails.
    handle = NamedTemporaryFile(delete=False)
    handle.close()
    tmp_file = handle.name
    try:
        df.to_csv(tmp_file, sep='\t', encoding='utf-8')
        df_crckm = crckm.calculate(ko_file=tmp_file,
                                   module_graphs=graphs,
                                   method=method,
                                   threshold=0)
    finally:
        os.unlink(tmp_file)
    # Planned Omixer-RPM replacement (not active):
    #tmp_out = "/tmp/out_dir"
    # call
    #kegg_db = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static/data/ortholog_mapping/module_definition_2017_ipath_version')
    #call("java -jar /opt/omixer-rpm/omixer-rpm.jar -i %s -o %s -a 2 -c 0 -d %s -e 2 -s average > /dev/null" % (tmp_file, tmp_out, kegg_db), shell=True, env=os.environ.copy())
    # read the coverage matrix
    #module_coverage = os.path.join(tmp_out, 'modules-coverage.tsv')
    #df_crckm = pd.read_csv(module_coverage, delimiter='\t', comment='#', header=0, index_col=0)
    ### HACK_END

    results = {}
    analysis.calc_abundances(df_crckm, nodes_no_ko, method, results)

    # Element-wise conversion of the user's input to 0/1. DataFrame.map is
    # the current name of applymap; resolve it on the class so a column
    # that happens to be called "map" cannot shadow the method.
    elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap
    presence = elementwise(df, lambda x: int(bool(x)))

    # Concatenate user's input and results. pd.concat replaces
    # DataFrame.append, which was removed in pandas 2.0.
    df_out = pd.concat([presence, results[method]])
    result_holder["modulecoverage"] = df_out
示例#3
0
def calc_abundances(f1, f2, target):
    """Compare two abundance inputs entry by entry with a Mann-Whitney U test.

    Both inputs are loaded with ``analysis.load_input``, optionally mapped
    through ``analysis.map_external_annotations`` (for the "kegg", "foam"
    and "enteropathway" targets, case-insensitive) and aggregated with the
    'mean' method over the nodes of the tree stored for ``target``. Every
    entry present in both aggregated tables is then tested.

    Args:
        f1: First input, passed to ``analysis.load_input``.
        f2: Second input, passed to ``analysis.load_input``.
        target: Value of the ``source`` field used to look up the tree.

    Returns:
        dict with the keys ``'profile'`` (list of
        ``{'entry', 'layer', 'values'}`` items where ``values`` is
        ``[[p-value, score]]``), ``'series'`` and ``'columns'``. The score
        is ``-log10(p)`` when ``p < 0.05`` and 0 otherwise.
    """
    df1 = analysis.load_input(f1)
    df2 = analysis.load_input(f2)
    root = models.Tree.objects().get(source=target)['tree']
    nodes = tree.get_nodes(root)
    entry_to_layer = {node['entry']: node['layer'] for node in nodes}

    # transform external annotations to kegg KOs
    if target.lower() in ["kegg", "foam", "enteropathway"]:
        df1 = analysis.map_external_annotations(df1)
        df2 = analysis.map_external_annotations(df2)

    results1 = {}
    analysis.calc_abundances(df1, nodes, 'mean', results1)
    results2 = {}
    analysis.calc_abundances(df2, nodes, 'mean', results2)
    means1 = results1['mean']
    means2 = results2['mean']

    df_result = pd.DataFrame()
    for entry in set(means1.index) & set(means2.index):
        try:
            result = scipy.stats.mannwhitneyu(means1.loc[entry],
                                              means2.loc[entry])
        except ValueError:
            # Deliberate best effort: entries for which scipy refuses to
            # compute the test are left out of the result.
            continue
        pvalue = result.pvalue
        score = -np.log10(pvalue) if pvalue < 0.05 else 0
        df_result.loc[entry, 'P-value'] = pvalue
        df_result.loc[entry, 'Score'] = score

    # ``.loc`` replaces ``.ix``, which was removed in pandas 1.0.
    profile = [
        {
            'entry': entry,
            'layer': analysis.get_layer(entry, entry_to_layer),
            'values': [df_result.loc[entry].tolist()]
        }
        for entry in df_result.index
    ]

    return {
        'profile': profile,
        'series': ['Mann-Whitney U test'],
        'columns': [['P-value', 'Score']]
    }
示例#4
0
def calc_abundances(df, nodes, method, results):
    """
    Generates mean or sum for all levels of functional Tree

    For every node in ``nodes`` (first occurrence of each entry only):
      - a leaf (no 'children' key) takes its row of ``df`` as is, and is
        skipped when the entry is not in ``df``;
      - an inner node is aggregated, with the DataFrame method named by
        ``method``, over the rows of those of its leaves that are present
        in ``df``; it is skipped when none is present.

    Args:
        df: Abundance table (pandas DataFrame) indexed by entry.
        nodes: Iterable of node dicts with an 'entry' key and, for inner
            nodes, a 'children' key.
        method: Name of a no-argument DataFrame reduction, e.g. 'mean' or
            'sum'.
        results: Mutable mapping; the output table is stored under
            ``results[method]``.

    Returns:
        None. ``results[method]`` receives a DataFrame with one row per
        computed entry and the columns of ``df``; rows that are entirely
        missing are dropped and remaining gaps filled with 0.0.

    FIXME: this is algorithmically sub-optimal as it goes over the tree randomly and also visits all the leaves.
        A better way could be:
            - map the lowest level first, keeping a list of unmapped elements
            - summarize upper layers by matching them from children and then replace if something from the list is matched
            - repeat till root
    """
    known_entries = df.index
    df_dict = {}
    for node in nodes:
        entry = node['entry']
        # Each entry is computed once, even if it occurs several times.
        if entry in df_dict:
            continue

        if 'children' not in node:
            # Leaf: if the node is in the abundance matrix, input as is.
            try:
                entry_profile = df.loc[entry]
            except KeyError:
                # The entry on the tree is not in the submitted profile.
                continue
        else:
            # Leaf ids below the current node, restricted to the rows that
            # exist. Selecting a list containing missing labels raises
            # KeyError on pandas >= 1.0, which used to discard every node
            # that was only partially covered by the input.
            targets = [
                child_node['entry'] for child_node in tree.get_nodes(node)
                if 'children' not in child_node
                and child_node['entry'] in known_entries
            ]
            if not targets:
                continue
            # Sample abundance for a biological entry, aggregated over its
            # leaves; getattr() avoids evaluating a formatted code string.
            entry_profile = getattr(df.loc[targets], method)()

        # Materialise the values: dict views are lazy in Python 3.
        df_dict[entry] = list(entry_profile.to_dict().values())

    df_out = pd.DataFrame.from_dict(df_dict, "index")
    if not df_out.empty:
        df_out.columns = df.columns
        df_out = df_out.dropna(how='all').fillna(0.0)
    results[method] = df_out