Example #1
import sys

import utils  # project-local module providing cypher_call and process_result

def get_data(in_param_datasource, in_param_searchQuery, in_param_maxResults):
    result = None
    error = None
    response_status = 200

    qryStr = "MATCH (ds:Datasource {label: '" + in_param_datasource + "'})-[:DATASOURCE_OF]->(:Feature)-[:PARENT_OF*]->(f:Feature) WHERE f.label contains '" + in_param_searchQuery + "' " \
             "RETURN f.label as gene, f.start as start, f.end as end, 'neo4j' as seqName, f.id as nodeId, f.taxonomy as level " \
             "ORDER BY f.depth limit " + in_param_maxResults
    try:
        rq_res = utils.cypher_call(qryStr)
        df = utils.process_result(rq_res)

        result = []

        for index, row in df.iterrows():
            temp = row.to_dict()
            result.append(temp)

    except Exception:  # avoid a bare except clause
        error_info = sys.exc_info()
        error = str(error_info[0]) + " " + str(error_info[1]) + " " + str(
            error_info[2])
        response_status = 500

    return result, error, response_status
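All of the get_data endpoints in this listing follow the same convention: they return a (result, error, response_status) triple, with response_status set to 500 when the Cypher call fails. A minimal caller sketch (the arguments are placeholders; note that in_param_maxResults is concatenated into the query string, so it must be passed as a string):

# Hypothetical caller; only the triple convention comes from the example above.
result, error, status = get_data("my_datasource", "BRCA", "50")
if status == 200:
    for feature in result:
        print(feature["gene"], feature["start"], feature["end"])
else:
    print("query failed:", error)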
Example #2
def run(train_x, train_y, test_x, test_y, share=0.4, count=100):
    best_c = find_best_c(train_x, train_y, share, count)
    print(best_c)
    w1, w2 = train(train_x, train_y, best_c, count)
    print(w2)
    res = test(test_x, test_y, w1, w2)
    print(res)
    return utils.process_result(res)
Example #3
def find_best_c(x, y, share):
    x_train, x_check = utils.split_data(x, share)
    y_train, y_check = utils.split_data(y, share)

    best_c = 2**-7
    best_f1 = 0
    for i in range(-7, 7):
        c = 2**i
        v = train(x_train, y_train, c)
        p, r = utils.process_result(test(x_check, y_check, v))
        f1 = utils.f1(p, r)
        if f1 > best_f1:
            best_f1 = f1
            best_c = c
    return best_c
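find_best_c scores each candidate C by the F1 of the precision and recall that utils.process_result extracts from the test run. The actual utils.f1 is not shown in these examples; a plausible implementation under the standard definition F1 = 2pr / (p + r) would be:

# Assumed helper matching the utils.f1(p, r) calls above; the real
# implementation is not part of this listing.
def f1(precision, recall):
    # harmonic mean of precision and recall, 0 when both are 0
    if precision + recall == 0:
        return 0.0
    return 2.0 * precision * recall / (precision + recall)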
Example #4
def find_best_c(x, y, share, count):
    x_train, x_check = utils.split_data(x, share)
    y_train, y_check = utils.split_data(y, share)

    best_f1 = 0
    best_c = -1
    c = 10
    while c <= 40:
        w1, w2 = train(x_train, y_train, c, count)
        p, r = utils.process_result(test(x_check, y_check, w1, w2))
        f1 = utils.f1(p, r)
        if f1 > best_f1:
            best_f1 = f1
            best_c = c
        c += 10
    return best_c
Example #5
import utils  # project-local module providing cypher_call and process_result

def get_data(in_datasource):
    """
    Returns the range of features in the database.  The cypher query finds the root of the Neo4j feature hierarchy and
    retrieves the start and end values which denote the range of features.

    Args:
     in_datasource: namespace to query

    Returns:
     arr: Feature range under root of tree
    """

    qryStr = "MATCH (ds:Datasource {label: '" + in_datasource + "'})-[:DATASOURCE_OF]->(f:Feature {id:'0-0'}) RETURN  f.start as start, f.end as end"

    rq_res = utils.cypher_call(qryStr)
    df = utils.process_result(rq_res)

    arr = []
    arr.append([None, df['start'][0], df['end'][0]])

    return arr
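The returned list wraps a single [None, start, end] triple, so callers unpack the range positionally (the datasource label here is a placeholder):

# Hypothetical usage of the range endpoint above.
arr = get_data("my_datasource")
_, range_start, range_end = arr[0]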
Example #6
def run(train_x, train_y, test_x, test_y, test_share=0.4):
    best_c = find_best_c(train_x, train_y, test_share)
    vector = train(train_x, train_y, best_c)
    # print(vector)
    res = test(test_x, test_y, vector)
    return utils.process_result(res)
Example #7
import math
import sys

import numpy
import pandas

import utils  # project-local helpers (cypher_call, process_result, process_result_graph)

def get_data(in_params_selectedLevels, in_params_samples, in_datasource):
    """
    Computes Alpha Diversity using the specified samples and level of hierarchy
    :param in_params_selectedLevels: Hierarchy level to compute Alpha Diversity
    :param in_params_samples: Samples to use for computing Alpha Diversity
    :return:

    Args:
        in_params_selectedLevels: Hierarchy level to compute Alpha Diversity
        in_params_samples: Samples to use for computing Alpha Diversity
        in_datasource: datasource to query
    Returns:
        resRowsCols: Alpha diversity for the samples at the selected level
    """

    tick_samples = in_params_samples.replace("\"", "\'")
    diversity_type = "shannon"
    # get the min selected Level if aggregated at multiple levels
    result = None
    error = None
    response_status = 200

    qryStr = "MATCH (s:Sample)-[:COUNT]->(f:Feature)<-[:LEAF_OF]-(:Feature)<-[:PARENT_OF*]-(:Feature)<-[:DATASOURCE_OF]-(ds:Datasource {label: '" + in_datasource + "'}) RETURN f.depth as depth  LIMIT 1"

    try:
        rq_res = utils.cypher_call(qryStr)
        df = utils.process_result(rq_res)

        depth_val = df['depth'].values[0]
        # int(None) would raise, so check before converting
        minSelectedLevel = int(depth_val) if depth_val is not None else 6

        for level in in_params_selectedLevels.keys():
            if in_params_selectedLevels[level] == 2 and int(
                    level) < minSelectedLevel:
                minSelectedLevel = int(level)

    except Exception:
        error_info = sys.exc_info()
        error = str(error_info[0]) + " " + str(error_info[1]) + " " + str(
            error_info[2])
        response_status = 500
        return result, error, response_status


    qryStr = "MATCH (ds:Datasource {label: '" + in_datasource + "'})-[:DATASOURCE_OF]->(:Feature)-[:PARENT_OF*]->(f:Feature)-[:LEAF_OF]->()<-[v:COUNT]-(s:Sample) WHERE (f.depth=" + str(minSelectedLevel) + ") " \
        "AND s.id IN " + tick_samples + " with distinct f, s, SUM(v.val) as agg RETURN distinct agg, s.id, " \
        "f.label as label, f.leafIndex as index, f.end as end, f.start as start, f.id as id, f.lineage as lineage, " \
        "f.lineageLabel as lineageLabel, f.order as order"

    try:
        rq_res = utils.cypher_call(qryStr)
        df = utils.process_result(rq_res)

        forDiversityDF = df[["agg", "s.id", "label"]]

        forDiversityMat = pandas.pivot_table(df,
                                             index=["label"],
                                             columns="s.id",
                                             values="agg",
                                             fill_value=0)

        alphaDiversityVals = []
        cols = {}
        sample_ids = list(set(forDiversityDF["s.id"]))
        if diversity_type == "shannon":
            for i in range(0, len(sample_ids)):
                # .ix and .get_values() were removed from pandas; use .iloc/.values
                col_vals = forDiversityMat.iloc[:, i].values
                props = list()
                totalSum = col_vals.sum()

                for k in range(0, len(col_vals)):
                    temp_prop = float(col_vals[k] / totalSum)
                    if temp_prop != 0.0:
                        props.append(float((temp_prop * math.log(temp_prop))))
                    else:
                        props.append(0.0)

                nd_props = numpy.asarray(props, dtype=float)
                alphaDiversity = -(nd_props.sum())

                alphaDiversityVals.append(alphaDiversity)
                cols[forDiversityMat.columns.values[i]] = alphaDiversity

        sampleQryStr = "MATCH (s:Sample) WHERE s.id IN " + tick_samples + " RETURN s"

        sample_rq_res = utils.cypher_call(sampleQryStr)
        sample_df = utils.process_result_graph(sample_rq_res)

        vals = []
        for index, row in sample_df.iterrows():
            temp = {}
            for key in row.keys().values:
                temp[key] = row[key]
            temp['alphaDiversity'] = cols[row['id']]
            temp['sample_id'] = temp['id']
            del temp['id']
            vals.append(temp)

        result = {"data": vals}

    except Exception:
        error_info = sys.exc_info()
        error = str(error_info[0]) + " " + str(error_info[1]) + " " + str(
            error_info[2])
        response_status = 500

    return result, error, response_status
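The inner loop above computes Shannon diversity, H = -sum(p_k * ln(p_k)), for each sample column of the pivoted count matrix. The same per-column computation can be written compactly with numpy (a restatement for clarity, not part of the original code):

import numpy

# Vectorized Shannon diversity for one column of counts; mirrors the
# loop above, treating 0 * log(0) as 0 by dropping zero proportions.
def shannon_diversity(counts):
    counts = numpy.asarray(counts, dtype=float)
    props = counts / counts.sum()
    props = props[props > 0]
    return -(props * numpy.log(props)).sum()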
Example #8
def run(train_x, train_y, test_x, test_y, steps=1000):
    vector = train(train_x, train_y, steps)
    # print(vector)
    res = test(test_x, test_y, vector)
    return utils.process_result(res)
Example #9
import sys

import utils  # project-local helpers (cypher_call, process_result)

# row_to_dict and df_to_tree are assumed to be project-local helpers defined
# alongside this endpoint.
def get_data(in_params_selection, in_params_order, in_params_selected_levels,
             in_params_nodeId, in_params_depth, in_datasource):
    """
    Finds and returns the hierarchy of taxonomic features in the database. The hierarchy is traversed starting
    at the root node, following PARENT_OF relationships until all leaf nodes are discovered. The
    results are formatted according to the metaviz API specification.

    Args:
        in_params_selection: The samples selected
        in_params_order: The order of the features
        in_params_selected_levels: The levels for aggregation of each feature node or all nodes by default
        in_params_nodeId: The id of the root node
        in_params_depth: level depth to query at
        in_datasource: namespace to query
    Returns:
     result: Hierarchy of levels in the database

    """
    root_node = in_params_nodeId
    root_node = root_node.replace('"', "")

    taxonomy = False
    result = None
    error = None
    response_status = 200

    if len(root_node) == 0 or root_node == "0-0":
        root_node = "0-0"
        qryStr = "MATCH (ds:Datasource {label: '" + in_datasource + "'})-[:DATASOURCE_OF]->(f:Feature {id:'" + root_node + "'})-[:PARENT_OF*0..3]->(f2:Feature) " \
                 "with collect(f2) + f as nodesFeat unwind nodesFeat as ff " \
                 "return distinct ff.lineage as lineage, ff.start as start, ff.label as label, " \
                 "ff.leafIndex as leafIndex, ff.parentId as parentId, ff.depth as depth, ff.partition as partition, " \
                 "ff.end as end, ff.id as id, ff.lineageLabel as lineageLabel, ff.nchildren as nchildren, " \
                 "ff.taxonomy as taxonomy, ff.nleaves as nleaves, ff.order as order ORDER by ff.depth, ff.leafIndex, ff.order"

        tQryStr = "MATCH (ds:Datasource {label: '" + in_datasource + "'})-[:DATASOURCE_OF]->(f:Feature) RETURN DISTINCT f.taxonomy as taxonomy, f.depth as depth ORDER BY f.depth"
        taxonomy = True
    else:
        qryStr = "MATCH (ds:Datasource {label: '" + in_datasource + "'})-[:DATASOURCE_OF]->(:Feature)-[:PARENT_OF*]->(f:Feature {id:'" + root_node + "'})-[:PARENT_OF*0..3]->(f2:Feature) " \
                 "OPTIONAL MATCH (f)<-[:PARENT_OF]-(fParent:Feature) with collect(f2) + f + fParent as nodesFeat " \
                 "unwind nodesFeat as ff return distinct ff.lineage as lineage, ff.start as start, " \
                 "ff.label as label, ff.leafIndex as leafIndex, ff.parentId as parentId, ff.depth as depth, " \
                 "ff.partition as partition, ff.end as end, ff.id as id, ff.lineageLabel as lineageLabel, " \
                 "ff.nchildren as nchildren, ff.taxonomy as taxonomy, ff.nleaves as nleaves, ff.order as order " \
                 "ORDER by ff.depth, ff.leafIndex, ff.order"
    try:
        rq_res = utils.cypher_call(qryStr)
        df = utils.process_result(rq_res)

        if len(df) > 0:
            # convert columns to int
            df['start'] = df['start'].astype(int)
            df['end'] = df['end'].astype(int)
            df['order'] = df['order'].astype(int)
            df['leafIndex'] = df['leafIndex'].astype(int)
            df['nchildren'] = df['nchildren'].astype(int)
            df['nleaves'] = df['nleaves'].astype(int)
            df['depth'] = df['depth'].astype(int)

            # restore current order, selection and levels from input params
            for key in in_params_order.keys():
                df.loc[df['id'] == key, 'order'] = in_params_order[key]

            for key in in_params_selection.keys():
                df.loc[df['id'] == key,
                       'selectionType'] = in_params_selection[key]

            for key in in_params_selected_levels.keys():
                df.loc[df['depth'] == int(key),
                       'selectionType'] = in_params_selected_levels[key]

            root = df.iloc[0]
            other = df.loc[1:, ]

            rootDict = row_to_dict(root)
            result = df_to_tree(rootDict, other)

            if taxonomy:
                trq_res = utils.cypher_call(tQryStr)
                tdf = utils.process_result(trq_res)

                result['rootTaxonomies'] = tdf['taxonomy'].values.tolist()

    except Exception:
        error_info = sys.exc_info()
        error = str(error_info[0]) + " " + str(error_info[1]) + " " + str(
            error_info[2])
        response_status = 500

    return result, error, response_status
Example #10
import sys

import pandas
from sklearn.decomposition import PCA  # the PCA usage below matches scikit-learn's API

import utils  # project-local helpers (cypher_call, process_result, process_result_graph)

def get_data(in_params_selectedLevels, in_params_samples, in_datasource):
    """
    Computes PCA over the selected samples and the given level of the hierarchy

    Args:
     in_params_selectedLevels:  Level of hierarchy of features to compute PCA
     in_params_samples: Samples to use to compute PCA
     in_datasource: datasource to query

    Returns:
     result, error, response_status: PCA coordinates for the samples at the
         selected level, any error message, and an HTTP-style status code

    """

    tick_samples = in_params_samples.replace("\"", "\'")

    # get the min selected Level if aggregated at multiple levels

    qryStr = "MATCH (s:Sample)-[:COUNT]->(f:Feature)<-[:LEAF_OF]-(:Feature)<-[:PARENT_OF*]-(:Feature)<-[:DATASOURCE_OF]-(ds:Datasource {label: '" + in_datasource + "'}) RETURN f.depth as depth  LIMIT 1"

    result = None
    error = None
    response_status = 200

    try:
        rq_res = utils.cypher_call(qryStr)
        df = utils.process_result(rq_res)

    except Exception:
        error_info = sys.exc_info()
        error = str(error_info[0]) + " " + str(error_info[1]) + " " + str(
            error_info[2])
        response_status = 500
        return result, error, response_status

    depth_val = df['depth'].values[0]
    # int(None) would raise, so check before converting
    minSelectedLevel = int(depth_val) if depth_val is not None else 6

    for level in in_params_selectedLevels.keys():
        if in_params_selectedLevels[level] == 2 and int(
                level) < minSelectedLevel:
            minSelectedLevel = int(level)

    qryStr = "MATCH (ds:Datasource {label: '" + in_datasource + "'})-[:DATASOURCE_OF]->(:Feature)-[:PARENT_OF*]->(f:Feature)-[:LEAF_OF]->()<-[v:COUNT]-(s:Sample) WHERE (f.depth=" + str(minSelectedLevel) + ") " \
         "AND s.id IN " + tick_samples + " with distinct f, s, SUM(v.val) as agg RETURN distinct agg, s.id, f.label " \
         "as label, f.leafIndex as index, f.end as end, f.start as start, f.id as id, f.lineage as lineage, " \
         "f.lineageLabel as lineageLabel, f.order as order"

    try:
        rq_res = utils.cypher_call(qryStr)
        df = utils.process_result(rq_res)

        forPCAmat = pandas.pivot_table(df,
                                       index=["label"],
                                       columns="s.id",
                                       values="agg",
                                       fill_value=0)

        pca = PCA(n_components=2)
        pca.fit(forPCAmat)
        variance_explained = pca.explained_variance_ratio_

        cols = {}
        cols['PC1'] = pca.components_[0]
        cols['PC2'] = pca.components_[1]

        samplesQryStr = "MATCH (s:Sample) WHERE s.id IN " + tick_samples + " RETURN s"

        samples_rq_res = utils.cypher_call(samplesQryStr)
        samples_df = utils.process_result_graph(samples_rq_res)
        vals = []

        for index, row in samples_df.iterrows():
            temp = {}
            for key in row.keys().values:
                temp[key] = row[key]
            # map by sample id rather than by row position, since the order of
            # samples_df rows need not match the pivot's column order
            pos = list(forPCAmat.columns.values).index(row['id'])
            temp['PC1'] = cols['PC1'][pos]
            temp['PC2'] = cols['PC2'][pos]
            temp['sample_id'] = temp['id']
            del temp['id']
            vals.append(temp)

        result = {"data": vals}

        variance_explained[0] = round(variance_explained[0] * 100.0, 2)
        variance_explained[1] = round(variance_explained[1] * 100.0, 2)
        result['pca_variance_explained'] = variance_explained

    except Exception:
        error_info = sys.exc_info()
        error = str(error_info[0]) + " " + str(error_info[1]) + " " + str(
            error_info[2])
        response_status = 500

    return result, error, response_status
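explained_variance_ratio_ gives the fraction of total variance captured by each retained component, which the example rescales to percentages. A standalone check of that API with scikit-learn on toy data (an illustration only, not from the original):

import numpy
from sklearn.decomposition import PCA

# Toy check of the variance-explained API used above.
X = numpy.random.RandomState(0).rand(10, 5)
pca = PCA(n_components=2)
pca.fit(X)
print(pca.explained_variance_ratio_)  # two fractions summing to <= 1.0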
Example #11
import ast
import sys

import pandas

import utils  # project-local helpers (cypher_call, process_result)

def get_data(in_params_start, in_params_end, in_params_order, in_params_selection, in_params_selectedLevels,
             in_params_samples, in_datasource):
    """
    Aggregates counts to the selected nodes in the feature hierarchy and returns the counts for the samples selected.

    Args:
        in_params_start: Start of range for features to use during aggregation
        in_params_end: End of range for features to use during aggregation
        in_params_order: Order of features
        in_params_selection: Features nodes and the selection type of expanded, aggregated, or removed
        in_params_selectedLevels: Level of the hierarchy to use
        in_params_samples: Samples to compute aggregation with
        in_datasource: namespace to query

    Returns:
        result, error, response_status: Aggregated counts for the selected Features
            over the selected Samples, any error message, and an HTTP-style status code
    """
    tick_samples = in_params_samples.replace("\"", "\'")
    result = None
    error = None
    response_status = 200

    # get the min selected Level if aggregated at multiple levels
    qryStr = "MATCH (s:Sample)-[:COUNT]->(f:Feature)<-[:LEAF_OF]-(:Feature)<-[:PARENT_OF*]-(:Feature)<-[:DATASOURCE_OF]-(ds:Datasource {label: '" + in_datasource + "'}) RETURN f.depth as depth  LIMIT 1"

    try:
        rq_res = utils.cypher_call(qryStr)
        df = utils.process_result(rq_res)

        depth_val = df['depth'].values[0]
        # int(None) would raise, so check before converting
        minSelectedLevel = int(depth_val) if depth_val is not None else 6

        for level in in_params_selectedLevels.keys():
            if in_params_selectedLevels[level] == 2 and int(level) < minSelectedLevel:
                minSelectedLevel = int(level)

        # user selection nodes for custom aggregation - decides the cut
        selNodes = "["
        selFlag = 0
        for node in in_params_selection.keys():
            if in_params_selection[node] == 2:
                selNodes += "'" +  node + "',"
                selFlag = 1

        if selFlag == 1:
            selNodes = selNodes[:-1]
        selNodes += "]"

    except Exception:
        error_info = sys.exc_info()
        error = str(error_info[0]) + " " + str(error_info[1]) + " " + str(error_info[2])
        response_status = 500
        return result, error, response_status


    qryStr = "MATCH (ds:Datasource {label: '" + in_datasource + "'}) " \
        "MATCH (ds)-[:DATASOURCE_OF]->(:Feature)-[:PARENT_OF*]->(f:Feature) MATCH (f)-[:LEAF_OF]->()<-[v:COUNT]-(s:Sample)" \
        "USING INDEX s:Sample(id) WHERE (f.depth=" + str(minSelectedLevel) + " OR f.id IN " + selNodes + ") AND " \
        "(f.start >= " + in_params_start + " AND " \
        "f.end <= " + in_params_end + ") AND s.id IN " + tick_samples + " with distinct f, s, SUM(v.val) as agg " \
        "RETURN distinct agg, s.id, f.label as label, f.leafIndex as index, f.end as end, f.start as start, " \
        "f.id as id, f.lineage as lineage, f.lineageLabel as lineageLabel, f.order as order"

    try:
        rq_res = utils.cypher_call(qryStr)
        df = utils.process_result(rq_res)

        if len(df) > 0:
            # change column type
            df['index'] = df['index'].astype(int)
            df['start'] = df['start'].astype(int)
            df['end'] = df['end'].astype(int)
            df['order'] = df['order'].astype(int)

            # update order based on req
            for key in in_params_order.keys():
                df.loc[df['id'] == key, 'order'] = in_params_order[key]

            for key in in_params_selection.keys():
                lKey = key.split('-')
                if int(lKey[0]) <= minSelectedLevel:
                    if in_params_selection[key] == 0:
                        # user selected nodes to ignore!
                        df = df[~df['lineage'].str.contains(key)]
                    elif in_params_selection[key] == 2:
                        df = df[~(df['lineage'].str.contains(key) & ~df['id'].str.contains(key))]

            # create a pivot_table where columns are samples and rows are features
            df_pivot = pandas.pivot_table(
                df,
                index=["id", "label", "index", "lineage", "lineageLabel", "start", "end", "order"],
                columns="s.id", values="agg",
                fill_value=0).sort_index(level="index")  # sortlevel() was removed from newer pandas

            cols = {}

            for col in df_pivot:
                cols[col] = df_pivot[col].values.tolist()

            rows = {}
            rows['metadata'] = {}

            metadata_row = ["end", "start", "index"]

            for row in df_pivot.index.names:
                if row in metadata_row:
                    rows[row] = df_pivot.index.get_level_values(row).values.tolist()
                else:
                    rows['metadata'][row] = df_pivot.index.get_level_values(row).values.tolist()

            result = {"cols": cols, "rows": rows, "globalStartIndex": (min(rows['start']))}

        else:
            cols = {}

            # ast.literal_eval is safer than eval() on request input
            samples = ast.literal_eval(in_params_samples)

            for sa in samples:
                cols[sa] = []

            rows = { "end": [], "start": [], "index": [], "metadata": {} }
            result = {"cols": cols, "rows": rows, "globalStartIndex": None}
    except Exception:
        error_info = sys.exc_info()
        error = str(error_info[0]) + " " + str(error_info[1]) + " " + str(error_info[2])
        response_status = 500

    return result, error, response_status
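The pivot in this example turns the long-form (feature, sample, agg) rows into a feature-by-sample matrix keyed by a composite row index. A minimal illustration of that reshaping (toy data, not from the original):

import pandas

# Long-form rows of (label, s.id, agg) become a label x sample matrix,
# with missing combinations filled with 0 -- the same pattern as above.
long_df = pandas.DataFrame({
    "label": ["fA", "fA", "fB"],
    "s.id":  ["s1", "s2", "s1"],
    "agg":   [3, 5, 2],
})
mat = pandas.pivot_table(long_df, index="label", columns="s.id",
                         values="agg", fill_value=0)
print(mat)  # rows fA, fB; columns s1, s2; fB/s2 filled with 0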
Example #12
import utils  # project-local helpers (cypher_call, process_result)

def get_data(in_datasource):
    """
    This function returns the set of all samples in the database. The first cypher query finds all samples in the
    database. The second cypher query finds the min and max count values across
    all features and samples. These are returned along with data source information, including the name and taxonomic
    hierarchy level names.

    Args:
     in_datasource: namespace to query

    Returns:
     result: Sample nodes information in database
    """
    qryStr = "MATCH (ds:Datasource {label: '" + in_datasource + "'})-[:DATASOURCE_OF]->()-[LEAF_OF]->()<-[:COUNT]-(s:Sample)" \
             "RETURN DISTINCT ds,s"

    rq_res = utils.cypher_call(qryStr)
    df = utils.process_result(rq_res)
    measurements = []

    anno = []
    df.fillna(0, inplace=True)
    dsGroup = []
    dsId = []
    dsDescription = []

    for index, row in df.iterrows():
        temp = row['s']
        measurements.append(temp['id'])
        del temp['id']
        anno.append(temp)
        dsGroup.append(row['ds']['label'])
        dsId.append(row['ds']['label'])
        dsDescription.append(row['ds']['description'])

    rowQryStr = "MATCH ()-[r]-() WHERE EXISTS(r.val) RETURN min(r.val) as minVal, max(r.val) as maxVal"

    rq_res2 = utils.cypher_call(rowQryStr)
    df2 = utils.process_result(rq_res2)

    result = {
        "id": measurements,
        "name": measurements,
        "datasourceGroup": dsGroup,
        "datasourceId": dsId,
        "datasourceDescription": dsDescription,
        "defaultChartType": "",
        "type": "feature",
        "minValue": df2['minVal'][0],
        "maxValue": df2['maxVal'][0],
        "annotation": anno,
        "metadata": [
            "label", "id", "taxonomy1", "taxonomy2", "taxonomy3", "taxonomy4",
            "taxonomy5", "taxonomy6", "taxonomy7", "lineage"
        ]
    }

    return result