def get_data(in_param_datasource, in_param_searchQuery, in_param_maxResults):
    """Search features whose label contains the query string.

    Args:
        in_param_datasource: datasource label to search under
        in_param_searchQuery: substring matched against feature labels
        in_param_maxResults: maximum number of rows to return (string;
            concatenated into the query's LIMIT clause)

    Returns:
        (result, error, response_status): result is a list of row dicts on
        success, otherwise None with an error message and HTTP status 500.
    """
    result = None
    error = None
    response_status = 200

    # NOTE(review): query is assembled by concatenating request parameters —
    # injection risk; prefer driver-level query parameters if available.
    qryStr = "MATCH (ds:Datasource {label: '" + in_param_datasource + "'})-[:DATASOURCE_OF]->(:Feature)-[:PARENT_OF*]->(f:Feature) WHERE f.label contains '" + in_param_searchQuery + "' " \
             "RETURN f.label as gene, f.start as start, f.end as end, 'neo4j' as seqName, f.id as nodeId, f.taxonomy as level " \
             "ORDER BY f.depth limit " + in_param_maxResults

    try:
        rq_res = utils.cypher_call(qryStr)
        df = utils.process_result(rq_res)

        result = []
        for index, row in df.iterrows():
            temp = row.to_dict()
            result.append(temp)
    except Exception:
        # was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt propagate
        error_info = sys.exc_info()
        error = str(error_info[0]) + " " + str(error_info[1]) + " " + str(
            error_info[2])
        response_status = 500

    return result, error, response_status
def run(train_x, train_y, test_x, test_y, share=0.4, count=100):
    """Tune C on a hold-out split, train with it, and score the test set."""
    chosen_c = find_best_c(train_x, train_y, share, count)
    print(chosen_c)

    weights_a, weights_b = train(train_x, train_y, chosen_c, count)
    print(weights_b)

    outcome = test(test_x, test_y, weights_a, weights_b)
    print(outcome)
    return utils.process_result(outcome)
def find_best_c(x, y, share):
    """Grid-search C over powers of two in [2^-7, 2^6], maximizing F1 on a split."""
    x_train, x_check = utils.split_data(x, share)
    y_train, y_check = utils.split_data(y, share)

    best_c, best_f1 = 2 ** -7, 0
    for exponent in range(-7, 7):
        candidate = 2 ** exponent
        model = train(x_train, y_train, candidate)
        precision, recall = utils.process_result(test(x_check, y_check, model))
        score = utils.f1(precision, recall)
        if score > best_f1:
            best_f1, best_c = score, candidate
    return best_c
def find_best_c(x, y, share):
    """Pick the power-of-two C in [2^-7, 2^6] with the best validation F1.

    NOTE(review): this duplicates an earlier identical find_best_c definition;
    the later binding is the one that survives module import.
    """
    x_train, x_check = utils.split_data(x, share)
    y_train, y_check = utils.split_data(y, share)

    best_f1 = 0
    best_c = 2 ** -7
    for power in range(-7, 7):
        c = 2 ** power
        fitted = train(x_train, y_train, c)
        p, r = utils.process_result(test(x_check, y_check, fitted))
        current = utils.f1(p, r)
        if current > best_f1:
            best_c = c
            best_f1 = current
    return best_c
def find_best_c(x, y, share, count):
    """Try c in {10, 20, 30, 40} and return the one with the best hold-out F1.

    Returns -1 when no candidate beats an F1 of 0.
    """
    x_train, x_check = utils.split_data(x, share)
    y_train, y_check = utils.split_data(y, share)

    best_f1, best_c = 0, -1
    for c in range(10, 41, 10):
        w1, w2 = train(x_train, y_train, c, count)
        precision, recall = utils.process_result(test(x_check, y_check, w1, w2))
        score = utils.f1(precision, recall)
        if score > best_f1:
            best_f1, best_c = score, c
    return best_c
def get_data(in_datasource):
    """Return the feature range under the root of the hierarchy.

    Fetches the root Feature (id '0-0') of the datasource and reads its
    start/end values, which bound the whole feature range.

    Args:
        in_datasource: namespace to query

    Returns:
        arr: single-element list of [None, start, end]
    """
    qryStr = "MATCH (ds:Datasource {label: '" + in_datasource + "'})-[:DATASOURCE_OF]->(f:Feature {id:'0-0'}) RETURN f.start as start, f.end as end"
    df = utils.process_result(utils.cypher_call(qryStr))
    return [[None, df['start'][0], df['end'][0]]]
def run(train_x, train_y, test_x, test_y, test_share=0.4):
    """Tune C on a validation split, fit on the training data, and score the test set."""
    c = find_best_c(train_x, train_y, test_share)
    model = train(train_x, train_y, c)
    return utils.process_result(test(test_x, test_y, model))
def get_data(in_params_selectedLevels, in_params_samples, in_datasource):
    """Compute Shannon alpha diversity for the given samples at a hierarchy level.

    Args:
        in_params_selectedLevels: map of level -> selection flag; levels
            flagged 2 lower the aggregation depth
        in_params_samples: quoted list of sample ids to include
        in_datasource: datasource to query

    Returns:
        (result, error, response_status): result is {"data": [...]} with one
        dict per sample (sample annotations plus 'alphaDiversity'), or None
        with an error message and HTTP status 500 on failure.
    """
    tick_samples = in_params_samples.replace("\"", "\'")
    diversity_type = "shannon"

    result = None
    error = None
    response_status = 200

    # get the min selected Level if aggregated at multiple levels
    qryStr = "MATCH (s:Sample)-[:COUNT]->(f:Feature)<-[:LEAF_OF]-(:Feature)<-[:PARENT_OF*]-(:Feature)<-[:DATASOURCE_OF]-(ds:Datasource {label: '" + in_datasource + "'}) RETURN f.depth as depth LIMIT 1"

    try:
        rq_res = utils.cypher_call(qryStr)
        df = utils.process_result(rq_res)
        minSelectedLevel = int(df['depth'].values[0])
        if minSelectedLevel is None:
            minSelectedLevel = 6
        for level in in_params_selectedLevels.keys():
            if in_params_selectedLevels[level] == 2 and int(
                    level) < minSelectedLevel:
                minSelectedLevel = int(level)
    except Exception:
        # was a bare `except:`; narrowed so system-exiting exceptions propagate
        error_info = sys.exc_info()
        error = str(error_info[0]) + " " + str(error_info[1]) + " " + str(
            error_info[2])
        response_status = 500
        return result, error, response_status

    qryStr = "MATCH (ds:Datasource {label: '" + in_datasource + "'})-[:DATASOURCE_OF]->(:Feature)-[:PARENT_OF*]->(f:Feature)-[:LEAF_OF]->()<-[v:COUNT]-(s:Sample) WHERE (f.depth=" + str(minSelectedLevel) + ") " \
             "AND s.id IN " + tick_samples + " with distinct f, s, SUM(v.val) as agg RETURN distinct agg, s.id, " \
             "f.label as label, f.leafIndex as index, f.end as end, f.start as start, f.id as id, f.lineage as lineage, " \
             "f.lineageLabel as lineageLabel, f.order as order"

    try:
        rq_res = utils.cypher_call(qryStr)
        df = utils.process_result(rq_res)

        forDiversityDF = df[["agg", "s.id", "label"]]
        forDiversityMat = pandas.pivot_table(df, index=["label"],
                                             columns="s.id", values="agg",
                                             fill_value=0)
        alphaDiversityVals = []
        cols = {}
        sample_ids = list(set(forDiversityDF["s.id"]))

        if diversity_type == "shannon":
            for i in range(0, len(sample_ids)):
                # DataFrame.ix and Series.get_values() were removed from
                # pandas; .iloc/.values are the supported equivalents
                col_vals = forDiversityMat.iloc[:, i].values
                props = list()
                totalSum = col_vals.sum()
                for k in range(0, len(col_vals)):
                    temp_prop = float(col_vals[k] / totalSum)
                    if temp_prop != 0.0:
                        # Shannon term: p * ln(p); zeros contribute nothing
                        props.append(float((temp_prop * math.log(temp_prop))))
                    else:
                        props.append(0.0)
                nd_props = numpy.asarray(props, dtype=float)
                alphaDiversity = -(nd_props.sum())
                alphaDiversityVals.append(alphaDiversity)
                cols[forDiversityMat.columns.values[i]] = alphaDiversity

        sampleQryStr = "MATCH (s:Sample) WHERE s.id IN " + tick_samples + " RETURN s"
        sample_rq_res = utils.cypher_call(sampleQryStr)
        sample_df = utils.process_result_graph(sample_rq_res)

        vals = []
        for index, row in sample_df.iterrows():
            temp = {}
            for key in row.keys().values:
                temp[key] = row[key]
            temp['alphaDiversity'] = cols[row['id']]
            temp['sample_id'] = temp['id']
            del temp['id']
            vals.append(temp)

        result = {"data": vals}
    except Exception:
        error_info = sys.exc_info()
        error = str(error_info[0]) + " " + str(error_info[1]) + " " + str(
            error_info[2])
        response_status = 500

    return result, error, response_status
def run(train_x, train_y, test_x, test_y, steps=1000):
    """Train for the given number of steps and evaluate on the test set."""
    model = train(train_x, train_y, steps)
    evaluation = test(test_x, test_y, model)
    return utils.process_result(evaluation)
def get_data(in_params_selection, in_params_order, in_params_selected_levels,
             in_params_nodeId, in_params_depth, in_datasource):
    """Return the taxonomic feature hierarchy rooted at the requested node.

    Traverses PARENT_OF relationships from the root node (up to 3 levels per
    request), restores the caller's order/selection/level state onto the
    rows, and formats the result as a tree per the metaviz API.

    Args:
        in_params_selection: selection state keyed by feature node id
        in_params_order: feature order keyed by feature node id
        in_params_selected_levels: selection state keyed by hierarchy depth
        in_params_nodeId: id of the root node ('' or '0-0' means tree root)
        in_params_depth: level depth to query at (currently unused here)
        in_datasource: namespace to query

    Returns:
        (result, error, response_status): result is the hierarchy tree dict
        (with 'rootTaxonomies' when queried from the tree root), or None
        with an error message and HTTP status 500 on failure.
    """
    root_node = in_params_nodeId
    root_node = root_node.replace('"', "")

    taxonomy = False
    result = None
    error = None
    response_status = 200

    if len(root_node) == 0 or root_node == "0-0":
        # query from the tree root; also fetch the taxonomy level names
        root_node = "0-0"
        qryStr = "MATCH (ds:Datasource {label: '" + in_datasource + "'})-[:DATASOURCE_OF]->(f:Feature {id:'" + root_node + "'})-[:PARENT_OF*0..3]->(f2:Feature) " \
                 "with collect(f2) + f as nodesFeat unwind nodesFeat as ff " \
                 "return distinct ff.lineage as lineage, ff.start as start, ff.label as label, " \
                 "ff.leafIndex as leafIndex, ff.parentId as parentId, ff.depth as depth, ff.partition as partition, " \
                 "ff.end as end, ff.id as id, ff.lineageLabel as lineageLabel, ff.nchildren as nchildren, " \
                 "ff.taxonomy as taxonomy, ff.nleaves as nleaves, ff.order as order ORDER by ff.depth, ff.leafIndex, ff.order"
        tQryStr = "MATCH (ds:Datasource {label: '" + in_datasource + "'})-[:DATASOURCE_OF]->(f:Feature) RETURN DISTINCT f.taxonomy as taxonomy, f.depth as depth ORDER BY f.depth"
        taxonomy = True
    else:
        # interior node: include its parent so the subtree can be stitched in
        qryStr = "MATCH (ds:Datasource {label: '" + in_datasource + "'})-[:DATASOURCE_OF]->(:Feature)-[:PARENT_OF*]->(f:Feature {id:'" + root_node + "'})-[:PARENT_OF*0..3]->(f2:Feature) " \
                 "OPTIONAL MATCH (f)<-[:PARENT_OF]-(fParent:Feature) with collect(f2) + f + fParent as nodesFeat " \
                 "unwind nodesFeat as ff return distinct ff.lineage as lineage, ff.start as start, " \
                 "ff.label as label, ff.leafIndex as leafIndex, ff.parentId as parentId, ff.depth as depth, " \
                 "ff.partition as partition, ff.end as end, ff.id as id, ff.lineageLabel as lineageLabel, " \
                 "ff.nchildren as nchildren, ff.taxonomy as taxonomy, ff.nleaves as nleaves, ff.order as order " \
                 "ORDER by ff.depth, ff.leafIndex, ff.order"

    try:
        rq_res = utils.cypher_call(qryStr)
        df = utils.process_result(rq_res)

        if len(df) > 0:
            # convert columns to int (the original converted 'depth' twice)
            df['start'] = df['start'].astype(int)
            df['end'] = df['end'].astype(int)
            df['order'] = df['order'].astype(int)
            df['leafIndex'] = df['leafIndex'].astype(int)
            df['nchildren'] = df['nchildren'].astype(int)
            df['nleaves'] = df['nleaves'].astype(int)
            df['depth'] = df['depth'].astype(int)

            # restore current order, selection and levels from input params
            for key in in_params_order.keys():
                df.loc[df['id'] == key, 'order'] = in_params_order[key]
            for key in in_params_selection.keys():
                df.loc[df['id'] == key, 'selectionType'] = in_params_selection[key]
            for key in in_params_selected_levels.keys():
                df.loc[df['depth'] == int(key), 'selectionType'] = in_params_selected_levels[key]

            # first row is the subtree root; the rest are its descendants
            root = df.iloc[0]
            other = df.loc[1:, ]
            rootDict = row_to_dict(root)
            result = df_to_tree(rootDict, other)

            if taxonomy:
                trq_res = utils.cypher_call(tQryStr)
                tdf = utils.process_result(trq_res)
                result['rootTaxonomies'] = tdf['taxonomy'].values.tolist()
    except Exception:
        # was a bare `except:`; narrowed so system-exiting exceptions propagate
        error_info = sys.exc_info()
        error = str(error_info[0]) + " " + str(error_info[1]) + " " + str(
            error_info[2])
        response_status = 500

    return result, error, response_status
def get_data(in_params_selectedLevels, in_params_samples, in_datasource):
    """Compute a 2-component PCA over the selected samples at the chosen level.

    Args:
        in_params_selectedLevels: map of level -> selection flag; levels
            flagged 2 lower the aggregation depth
        in_params_samples: quoted list of sample ids to use
        in_datasource: datasource to query

    Returns:
        (result, error, response_status): result is {"data": [...],
        "pca_variance_explained": [...]}, with PC1/PC2 per sample, or None
        with an error message and HTTP status 500 on failure.
    """
    tick_samples = in_params_samples.replace("\"", "\'")

    # get the min selected Level if aggregated at multiple levels
    qryStr = "MATCH (s:Sample)-[:COUNT]->(f:Feature)<-[:LEAF_OF]-(:Feature)<-[:PARENT_OF*]-(:Feature)<-[:DATASOURCE_OF]-(ds:Datasource {label: '" + in_datasource + "'}) RETURN f.depth as depth LIMIT 1"

    result = None
    error = None
    response_status = 200

    try:
        rq_res = utils.cypher_call(qryStr)
        df = utils.process_result(rq_res)
    except Exception:
        # was a bare `except:`; narrowed so system-exiting exceptions propagate
        error_info = sys.exc_info()
        error = str(error_info[0]) + " " + str(error_info[1]) + " " + str(
            error_info[2])
        response_status = 500
        return result, error, response_status

    minSelectedLevel = int(df['depth'].values[0])
    if minSelectedLevel is None:
        minSelectedLevel = 6
    for level in in_params_selectedLevels.keys():
        if in_params_selectedLevels[level] == 2 and int(
                level) < minSelectedLevel:
            minSelectedLevel = int(level)

    qryStr = "MATCH (ds:Datasource {label: '" + in_datasource + "'})-[:DATASOURCE_OF]->(:Feature)-[:PARENT_OF*]->(f:Feature)-[:LEAF_OF]->()<-[v:COUNT]-(s:Sample) WHERE (f.depth=" + str(minSelectedLevel) + ") " \
             "AND s.id IN " + tick_samples + " with distinct f, s, SUM(v.val) as agg RETURN distinct agg, s.id, f.label " \
             "as label, f.leafIndex as index, f.end as end, f.start as start, f.id as id, f.lineage as lineage, " \
             "f.lineageLabel as lineageLabel, f.order as order"

    try:
        rq_res = utils.cypher_call(qryStr)
        df = utils.process_result(rq_res)

        # rows = features, columns = samples; PCA components then index samples
        forPCAmat = pandas.pivot_table(df, index=["label"], columns="s.id",
                                       values="agg", fill_value=0)
        pca = PCA(n_components=2)
        pca.fit(forPCAmat)
        variance_explained = pca.explained_variance_ratio_

        cols = {}
        cols['PC1'] = pca.components_[0]
        cols['PC2'] = pca.components_[1]

        samplesQryStr = "MATCH (s:Sample) WHERE s.id IN " + tick_samples + " RETURN s"
        samples_rq_res = utils.cypher_call(samplesQryStr)
        samples_df = utils.process_result_graph(samples_rq_res)

        vals = []
        for index, row in samples_df.iterrows():
            temp = {}
            for key in row.keys().values:
                temp[key] = row[key]
            # NOTE(review): assumes samples_df has a default RangeIndex so
            # `index` lines up positionally with the component columns —
            # verify against utils.process_result_graph
            temp['PC1'] = cols['PC1'][index]
            temp['PC2'] = cols['PC2'][index]
            temp['sample_id'] = temp['id']
            del temp['id']
            vals.append(temp)

        result = {"data": vals}

        variance_explained[0] = round(variance_explained[0] * 100.0, 2)
        variance_explained[1] = round(variance_explained[1] * 100.0, 2)
        result['pca_variance_explained'] = variance_explained
    except Exception:
        error_info = sys.exc_info()
        error = str(error_info[0]) + " " + str(error_info[1]) + " " + str(
            error_info[2])
        response_status = 500

    return result, error, response_status
def get_data(in_params_start, in_params_end, in_params_order,
             in_params_selection, in_params_selectedLevels, in_params_samples,
             in_datasource):
    """Aggregate counts to the selected hierarchy nodes for the given samples.

    Args:
        in_params_start: start of the feature range to aggregate over
        in_params_end: end of the feature range to aggregate over
        in_params_order: feature order keyed by feature node id
        in_params_selection: per-node selection (expanded/aggregated/removed)
        in_params_selectedLevels: hierarchy level(s) selected for aggregation
        in_params_samples: quoted list of sample ids to aggregate over
        in_datasource: datasource to query

    Returns:
        (result, error, response_status): result has "cols" (per-sample count
        lists), "rows" (feature metadata), and "globalStartIndex", or None
        with an error message and HTTP status 500 on failure.
    """
    tick_samples = in_params_samples.replace("\"", "\'")

    result = None
    error = None
    response_status = 200

    # get the min selected Level if aggregated at multiple levels
    qryStr = "MATCH (s:Sample)-[:COUNT]->(f:Feature)<-[:LEAF_OF]-(:Feature)<-[:PARENT_OF*]-(:Feature)<-[:DATASOURCE_OF]-(ds:Datasource {label: '" + in_datasource + "'}) RETURN f.depth as depth LIMIT 1"

    try:
        rq_res = utils.cypher_call(qryStr)
        df = utils.process_result(rq_res)
        minSelectedLevel = int(df['depth'].values[0])
        if minSelectedLevel is None:
            minSelectedLevel = 6
        for level in in_params_selectedLevels.keys():
            if in_params_selectedLevels[level] == 2 and int(level) < minSelectedLevel:
                minSelectedLevel = int(level)

        # user selection nodes for custom aggregation - decides the cut
        selNodes = "["
        selFlag = 0
        for node in in_params_selection.keys():
            if in_params_selection[node] == 2:
                selNodes += "'" + node + "',"
                selFlag = 1
        if selFlag == 1:
            selNodes = selNodes[:-1]
        selNodes += "]"
    except Exception:
        # was a bare `except:`; narrowed so system-exiting exceptions propagate
        error_info = sys.exc_info()
        error = str(error_info[0]) + " " + str(error_info[1]) + " " + str(error_info[2])
        response_status = 500
        return result, error, response_status

    qryStr = "MATCH (ds:Datasource {label: '" + in_datasource + "'}) " \
             "MATCH (ds)-[:DATASOURCE_OF]->(:Feature)-[:PARENT_OF*]->(f:Feature) MATCH (f)-[:LEAF_OF]->()<-[v:COUNT]-(s:Sample)" \
             "USING INDEX s:Sample(id) WHERE (f.depth=" + str(minSelectedLevel) + " OR f.id IN " + selNodes + ") AND " \
             "(f.start >= " + in_params_start + " AND " \
             "f.end <= " + in_params_end + ") AND s.id IN " + tick_samples + " with distinct f, s, SUM(v.val) as agg " \
             "RETURN distinct agg, s.id, f.label as label, f.leafIndex as index, f.end as end, f.start as start, " \
             "f.id as id, f.lineage as lineage, f.lineageLabel as lineageLabel, f.order as order"

    try:
        rq_res = utils.cypher_call(qryStr)
        df = utils.process_result(rq_res)

        if len(df) > 0:
            # change column type
            df['index'] = df['index'].astype(int)
            df['start'] = df['start'].astype(int)
            df['end'] = df['end'].astype(int)
            df['order'] = df['order'].astype(int)

            # update order based on req
            for key in in_params_order.keys():
                df.loc[df['id'] == key, 'order'] = in_params_order[key]

            for key in in_params_selection.keys():
                lKey = key.split('-')
                if int(lKey[0]) <= minSelectedLevel:
                    if in_params_selection[key] == 0:
                        # user selected nodes to ignore!
                        df = df[~df['lineage'].str.contains(key)]
                    elif in_params_selection[key] == 2:
                        df = df[~(df['lineage'].str.contains(key) & ~df['id'].str.contains(key))]

            # create a pivot_table where columns are samples and rows are features.
            # DataFrame.sortlevel was removed from pandas; sort_index(level=...)
            # is the supported equivalent
            df_pivot = pandas.pivot_table(
                df,
                index=["id", "label", "index", "lineage", "lineageLabel",
                       "start", "end", "order"],
                columns="s.id", values="agg",
                fill_value=0).sort_index(level="index")

            cols = {}
            for col in df_pivot:
                cols[col] = df_pivot[col].values.tolist()

            rows = {}
            rows['metadata'] = {}
            metadata_row = ["end", "start", "index"]
            for row in df_pivot.index.names:
                if row in metadata_row:
                    rows[row] = df_pivot.index.get_level_values(row).values.tolist()
                else:
                    rows['metadata'][row] = df_pivot.index.get_level_values(row).values.tolist()

            result = {"cols": cols, "rows": rows,
                      "globalStartIndex": (min(rows['start']))}
        else:
            # safe literal parsing: ast.literal_eval replaces eval so a
            # request-supplied string cannot execute arbitrary code
            import ast
            cols = {}
            samples = ast.literal_eval(in_params_samples)
            for sa in samples:
                cols[sa] = []
            rows = {
                "end": [],
                "start": [],
                "index": [],
                "metadata": {}
            }
            result = {"cols": cols, "rows": rows, "globalStartIndex": None}
    except Exception:
        error_info = sys.exc_info()
        error = str(error_info[0]) + " " + str(error_info[1]) + " " + str(error_info[2])
        response_status = 500

    return result, error, response_status
def get_data(in_datasource):
    """Return all samples in the datasource plus global count range metadata.

    The first cypher query finds all samples attached to the datasource; the
    second finds the min/max count value across all features and samples.
    These are returned with datasource name/description and the taxonomy
    metadata field names.

    Args:
        in_datasource: namespace to query

    Returns:
        result: measurement/sample information dict for the datasource
    """
    # Fixes vs. original query: `[:LEAF_OF]` (with the colon) matches the
    # relationship *type*, whereas `[LEAF_OF]` only bound a variable named
    # LEAF_OF to any relationship; also added the missing space before RETURN
    # where the two string literals join.
    qryStr = "MATCH (ds:Datasource {label: '" + in_datasource + "'})-[:DATASOURCE_OF]->()-[:LEAF_OF]->()<-[:COUNT]-(s:Sample) " \
             "RETURN DISTINCT ds,s"
    rq_res = utils.cypher_call(qryStr)
    df = utils.process_result(rq_res)

    measurements = []
    anno = []
    df.fillna(0, inplace=True)

    dsGroup = []
    dsId = []
    dsDescription = []

    for index, row in df.iterrows():
        temp = row['s']
        measurements.append(temp['id'])
        del temp['id']
        anno.append(temp)
        dsGroup.append(row['ds']['label'])
        dsId.append(row['ds']['label'])
        dsDescription.append(row['ds']['description'])

    # global min/max over every relationship carrying a count value
    rowQryStr = "MATCH ()-[r]-() WHERE EXISTS(r.val) RETURN min(r.val) as minVal, max(r.val) as maxVal"
    rq_res2 = utils.cypher_call(rowQryStr)
    df2 = utils.process_result(rq_res2)

    result = {
        "id": measurements,
        "name": measurements,
        "datasourceGroup": dsGroup,
        "datasourceId": dsId,
        "datasourceDescription": dsDescription,
        "defaultChartType": "",
        "type": "feature",
        "minValue": df2['minVal'][0],
        "maxValue": df2['maxVal'][0],
        "annotation": anno,
        "metadata": [
            "label", "id", "taxonomy1", "taxonomy2", "taxonomy3", "taxonomy4",
            "taxonomy5", "taxonomy6", "taxonomy7", "lineage"
        ]
    }

    return result