import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

import gslr  # provides GraphOptions, gslr, predict, num_correct (used below)


def GSLR(pathway_id_and_filepath_and_nodes_and_edges_and_costs):
    """Run 10-fold cross-validation of graph-sparse logistic regression on one pathway.

    The single tuple argument unpacks to (pathway_id, filepath, nodes, edges, costs);
    returns the pathway_id, the per-fold test accuracies, and the features that
    received a nonzero fitted weight in any fold.
    """

    pathway_id, filepath, nodes, edges, costs = pathway_id_and_filepath_and_nodes_and_edges_and_costs

    # the dataset was written with dataset.to_csv(filename, index=True, header=True), so the class labels live in the index
    dataset = pd.read_csv(filepath, index_col=0)
    y = LabelEncoder().fit_transform(dataset.index.tolist())

    # align the columns with the pathway's node list (nodes absent from the dataset become NaN columns)
    dataset = dataset.transpose().reindex(index=nodes).transpose()
    X = dataset.values

    d = len(nodes)  # number of features (pathway nodes)
    c = 2           # number of classes

    graph_opts = gslr.GraphOptions(edges=edges,
                                   root=-1,
                                   num_clusters=1,
                                   pruning='strong')

    sparsity_low = 50
    sparsity_high = 100

    verbosity_level = 0

    num_steps = 35
    possible_steps = np.array([0.03, 0.1, 0.3])
    steps = np.tile(possible_steps, (num_steps, 1))  # a (35, 3) array: the three step-size options repeated for every step

    features = []
    accuracies = []

    # 10-fold stratified cross-validation
    for train, test in StratifiedKFold(n_splits=10).split(X, y):

        W0 = np.zeros((c, d))  # start each fold from all-zero weights

        W_hat, losses = gslr.gslr(X[train],
                                  y[train],
                                  W0,
                                  sparsity_low,
                                  sparsity_high,
                                  graph_opts,
                                  steps,
                                  verbosity_level,
                                  edge_costs=costs,
                                  edge_costs_multiplier=2)

        yhat = gslr.predict(X[test], W_hat)
        num_cor = gslr.num_correct(y[test], yhat)
        accuracy = num_cor / float(len(test))  # held-out accuracy for this fold

        features.append(W_hat[0])
        accuracies.append(accuracy)

    # keep the columns (nodes) that received a nonzero weight in at least one fold
    features = pd.DataFrame(features, columns=dataset.columns)
    features = features.columns[(features != 0).any()].tolist()

    return pathway_id, accuracies, features
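
The single packed-tuple argument suggests GSLR is meant to be mapped over many pathways in parallel. The driver below is a minimal sketch under that assumption; the pathways dict, file path, and summary loop are hypothetical placeholders, not part of the original code.

from multiprocessing import Pool

# hypothetical: pathways maps each pathway_id to its (nodes, edges, costs) triple;
# filepath points at the expression matrix written with to_csv(index=True, header=True)
pathways = {}
filepath = 'ovarian_inbiomap_exp.tsv'

jobs = [(pathway_id, filepath, nodes, edges, costs)
        for pathway_id, (nodes, edges, costs) in pathways.items()]

with Pool() as pool:
    results = pool.map(GSLR, jobs)

for pathway_id, accuracies, features in results:
    print(pathway_id, sum(accuracies) / len(accuracies), len(features))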
Example #2
def GSLR(pathway_id_and_filepath_and_nodes_and_edges_and_costs_and_low_and_high):

	pathway_id, filepath, nodes, edges, costs, sparsity_low, sparsity_high = pathway_id_and_filepath_and_nodes_and_edges_and_costs_and_low_and_high

	print()
	print('-----------------')
	print(pathway_id)
	print(str(sparsity_low)+'-'+str(sparsity_high))
	print()

	# the dataset was written with dataset.to_csv(filename, index=True, header=True), so the class labels live in the index
	dataset = pd.read_csv(filepath, index_col=0)
	y = LabelEncoder().fit_transform(dataset.index.tolist())

	dataset = dataset.transpose().reindex(index=nodes).transpose()
	X = dataset.values

	d = len(nodes)
	c = 2

	graph_opts = gslr.GraphOptions(edges=edges, root=-1, num_clusters=1, pruning='strong')

	verbosity_level = 1

	num_steps = 35
	possible_steps = np.array([0.03,0.1,0.3])
	steps = np.tile(possible_steps, (num_steps, 1))

	featuresets = []
	accuracies = []

	for train, test in StratifiedKFold(n_splits=10).split(X, y):

		print()
		print('fold')
		print()

		W0 = np.zeros((c, d))

		W_hat, losses = gslr.gslr(X[train], y[train], W0, sparsity_low, sparsity_high, graph_opts, steps, verbosity_level, edge_costs=costs, edge_costs_multiplier=2)

		yhat = gslr.predict(X[test], W_hat)
		num_cor = gslr.num_correct(y[test], yhat)
		accuracy = num_cor / float(len(test))
		accuracies.append(accuracy)

		features = pd.DataFrame(W_hat, columns=dataset.columns)
		features = features.columns[(features != 0).any()].tolist()
		featuresets.append(features)


	return pathway_id, (sparsity_low, sparsity_high), accuracies, featuresets
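
Example #2 additionally threads the sparsity window through the packed argument, which suggests sweeping several (low, high) windows per pathway. A hypothetical job list for such a sweep, reusing the placeholder pathways dict and filepath from the sketch above:

# the windows below are illustrative values drawn from the surrounding examples
sparsity_windows = [(30, 70), (50, 100), (150, 400)]

jobs = [(pathway_id, filepath, nodes, edges, costs, low, high)
        for pathway_id, (nodes, edges, costs) in pathways.items()
        for low, high in sparsity_windows]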
Example #3
def GSLR(filepath_and_pathway_ids_and_nodes_and_edges):

	filepath, pathway_id_1, pathway_id_2, nodes, edges = filepath_and_pathway_ids_and_nodes_and_edges

	# the dataset was written with dataset.to_csv(filename, index=True, header=True), so the class labels live in the index
	dataset = pd.read_csv(filepath, index_col=0)
	y = LabelEncoder().fit_transform(dataset.index.tolist())

	dataset = dataset.transpose().reindex(index=nodes).transpose()
	X = dataset.values

	d = len(nodes)
	c = 2

	graph_opts = gslr.GraphOptions(edges=edges, root=-1, num_clusters=1, pruning='strong')

	sparsity_low = 30
	sparsity_high = 70

	verbosity_level = 0

	num_steps = 50
	possible_steps = np.array([0.1,0.2])
	steps = np.tile(possible_steps, (num_steps, 1))

	features = []
	accuracies = []

	for train, test in StratifiedKFold(n_splits=10).split(X, y):

		W0 = np.zeros((c, d))

		W_hat, losses = gslr.gslr(X[train], y[train], W0, sparsity_low, sparsity_high, graph_opts, steps, verbosity_level)

		yhat = gslr.predict(X[test], W_hat)
		num_cor = gslr.num_correct(y[test], yhat)
		accuracy = num_cor / float(len(test))

		features.append(W_hat[0])
		accuracies.append(accuracy)

	features = pd.DataFrame(features, columns=dataset.columns)
	features = features.columns[(features != 0).any()].tolist()

	return pathway_id_1, pathway_id_2, accuracies, features
Example #4
	sparsity_low = 150
	sparsity_high = 400

	verbosity_level = 1

	num_steps = 100
	possible_steps = np.array([0.03, 0.1, 0.3])
	steps = np.tile(possible_steps, (num_steps, 1))

	W0 = np.zeros((c, d))

	W_hat, losses = gslr.gslr(X, y, W0, sparsity_low, sparsity_high, graph_opts, steps, verbosity_level, edge_costs=inbiomap_experimentally.cost.values, edge_costs_multiplier=6)

	yhat = gslr.predict(X, W_hat)
	num_cor = gslr.num_correct(y, yhat)

	return num_cor, W_hat, losses


if __name__ == "__main__":


	### I. Load Ovarian Cancer Proteomics Dataset

	# medullo = pd.read_csv('/Users/alex/Documents/proteomics/data_preparation/proteomics_data/medullo_inbiomap_exp.tsv', index_col=0)
	dataset = pd.read_csv('/Users/alex/Documents/proteomics/data_preparation/proteomics_data/ovarian_inbiomap_exp.tsv', index_col=0)
	# brca = pd.read_csv('/Users/alex/Documents/proteomics/data_preparation/proteomics_data/brca_inbiomap_exp.tsv', index_col=0)

	# medullo_labels = pd.read_csv('/Users/alex/Documents/proteomics/data_preparation/proteomics_data/raw/medullo_labels.csv', index_col=0)
	labels = pd.read_csv('/Users/alex/Documents/proteomics/data_preparation/proteomics_data/raw/ovarian_labels.csv', index_col=0)