Example #1
def answer_four():
    import pandas as pd
    G1 = answer_three()
    G2 = nx.read_edgelist('Employee_Relationships.txt',
                          delimiter='\t',
                          data=[('Score', int)])
    df1 = nx.to_pandas_dataframe(G1)
    df2 = nx.to_pandas_dataframe(G2, weight='Score')
    data = pd.DataFrame(
        columns=['E1', 'E2', 'Relationship_Score', 'Nb_Common_Movies'
                 ]).set_index(['E1', 'E2'])
    employees_list = list(employees)
    for i in range(0, len(employees_list)):
        for j in range(i + 1, len(employees_list)):
            data.loc[(employees_list[i], employees_list[j]),
                     'Nb_Common_Movies'] = df1.loc[employees_list[i],
                                                   employees_list[j]]
            data.loc[(employees_list[i], employees_list[j]),
                     'Relationship_Score'] = df2.loc[employees_list[i],
                                                     employees_list[j]]

    data['Nb_Common_Movies'] = data['Nb_Common_Movies'].astype(float)
    data['Relationship_Score'] = data['Relationship_Score'].astype(float)
    return data['Relationship_Score'].corr(data['Nb_Common_Movies'],
                                           method='pearson')
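
Note: nx.to_pandas_dataframe, used throughout this page, was deprecated in NetworkX 2.0 and removed in later releases in favor of nx.to_pandas_adjacency (Example #7 below tests exactly this rename). A minimal sketch of the modern call, assuming NetworkX >= 2.0:

import networkx as nx

# to_pandas_adjacency is the drop-in replacement for to_pandas_dataframe
G = nx.Graph()
G.add_edge('a', 'b', weight=3)
G.add_edge('b', 'c', weight=1)
adj = nx.to_pandas_adjacency(G, weight='weight', nonedge=0)
print(adj.loc['a', 'b'])  # 3.0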
Example #2
def prune_homology_graph(df, chim_dir):
    to_remove = []
    df['brk_left_cut'] = df['name'].str.split(":").str[0:3].str.join(sep=":")
    df['brk_right_cut'] = df['name'].str.split(":").str[3:6].str.join(sep=":")
    left_nodes = set(df[df['brk_left_cut'].duplicated()]['brk_left_cut'])
    right_nodes = df[df['brk_right_cut'].duplicated()]['brk_right_cut']
    all_nodes = list(zip(left_nodes, itertools.repeat("left"))) + list(
        zip(right_nodes, itertools.repeat("right")))
    for node, hom_side in all_nodes:
        node_members = df[((df['brk_' + hom_side + '_cut'] == node))]['name']
        node_graph = nx.Graph()
        node_graph.add_nodes_from(node_members, exprs=10)
        for jxn1, jxn2 in itertools.combinations(node_members, 2):
            pair_score = get_pairwise_hom(jxn1, jxn2, chim_dir, hom_side)
            if pair_score != 0:
                node_graph.add_edge(jxn1, jxn2, weight=pair_score)
        # nx.draw_networkx(node_graph, pos=nx.shell_layout(node_graph), node_size=100)
        # plt.show()
        adj_mat = nx.to_pandas_dataframe(node_graph)
        node_compare = adj_mat[adj_mat.sum() > 0].index.tolist()
        if len(node_compare) > 0:
            node_homdf = df[df['name'].isin(node_compare)][[
                'name', 'TPM_Fusion', 'TPM_Left', 'TPM_Right'
            ]].set_index('name')
            node_homdf['max_pairs'] = node_homdf[['TPM_Left',
                                                  'TPM_Right']].max(axis=1)
            node_homdf = node_homdf.sort_values(['TPM_Fusion', 'max_pairs'],
                                                ascending=False)
            node_remove = node_homdf.iloc[1:].index.tolist()
            to_remove.extend(node_remove)
    # use list of to_remove to mark homologous fusions
    return to_remove
Example #3
def answer_four():
        
    # Your Code Here
    
    # load graph
    G = answer_three()
    
    # extract employee edge weights from graph into dataframe
    employee_weights = nx.to_pandas_dataframe(G)
    
    # load the text file into a dataframe
    df = pd.read_csv('Employee_Relationships.txt', sep='\t', header = None)
    
    # add column names to the dataframe
    df.columns = ['Employee_1', 'Employee_2', 'Friendship_score']
    
    # add dummy column for movie weight for movies in common
    df['Movie_score'] = 0
    
    # add employee weights to the friendship score dataframe
    df['Movie_score'] = employee_weights.lookup(df['Employee_1'], df['Employee_2'])
    
    # calculate the Pearson correlation
    Pearson_corr = df['Friendship_score'].corr(df['Movie_score'])
            
    return Pearson_corr  # Your Answer Here
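
Note: DataFrame.lookup, used here and in Examples #15 and #18, was deprecated in pandas 1.2 and removed in pandas 2.0. A sketch of an index-based replacement, assuming adj is a square adjacency DataFrame and e1/e2 are equal-length label sequences:

def pairwise_lookup(adj, e1, e2):
    # translate labels to positions, then index the underlying array
    rows = adj.index.get_indexer(e1)
    cols = adj.columns.get_indexer(e2)
    return adj.to_numpy()[rows, cols]

# e.g. df['Movie_score'] = pairwise_lookup(employee_weights,
#                                          df['Employee_1'], df['Employee_2'])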
Example #4
def maximum_spanning_tree(table, undirected=False):
    sys.stderr.write("Calculating MST score...\n")
    table = table.copy()
    table["distance"] = 1.0 / table["nij"]
    G = nx.from_pandas_dataframe(table,
                                 source="src",
                                 target="trg",
                                 edge_attr=["distance", "nij"])
    T = nx.minimum_spanning_tree(G, weight="distance")
    table2 = pd.melt(nx.to_pandas_dataframe(T, weight="nij").reset_index(),
                     id_vars="index")
    table2 = table2[table2["value"] > 0]
    table2.rename(columns={
        "index": "src",
        "variable": "trg",
        "value": "cij"
    },
                  inplace=True)
    table2["score"] = table2["cij"]
    table = table.merge(table2, on=["src", "trg"])
    if undirected:
        table["edge"] = table.apply(
            lambda x: "%s-%s" %
            (min(x["src"], x["trg"]), max(x["src"], x["trg"])),
            axis=1)
        table = table.drop_duplicates(subset=["edge"])
        table = table.drop(columns="edge")
    return table[["src", "trg", "nij", "score"]]
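
Note: inverting nij to reuse minimum_spanning_tree works, but NetworkX 2.0+ also ships nx.maximum_spanning_tree, which avoids the 1/nij detour. A hedged sketch of the same backbone, assuming the src/trg/nij columns above:

import networkx as nx

def mst_backbone(table):
    # build the graph on the original weights and take the maximum
    # spanning tree directly (NetworkX >= 2.0)
    G = nx.from_pandas_edgelist(table, source="src", target="trg",
                                edge_attr="nij")
    T = nx.maximum_spanning_tree(G, weight="nij")
    return nx.to_pandas_edgelist(T)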
Example #5
File: sna.py  Project: kerstinopen/repoSNA
def graph_to_pandas_time_series(graph):
    """
    Transform a graph into a pandas time series dataframe.
    """

    time_dataframe = nx.to_pandas_dataframe(graph)

    return time_dataframe
Example #6
def CalcularAdyacencia(Grafo, fileName):
    # writes the adjacency matrix of Grafo to fileName as a TSV file

    pathFile = os.getcwd() + "\\" + fileName
    if path.exists(pathFile):
        os.remove(pathFile)

    df = nx.to_pandas_dataframe(Grafo)
    df.to_csv(pathFile, header=None, index=None, mode='w', sep='\t')
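
Note: the "\\" separator ties CalcularAdyacencia to Windows, and the os.remove call is redundant because to_csv overwrites. A portable sketch of the same export (nx.to_pandas_adjacency assumes NetworkX >= 2.0):

import os
import networkx as nx

def write_adjacency(G, file_name):
    # os.path.join picks the right separator on every platform
    path_file = os.path.join(os.getcwd(), file_name)
    df = nx.to_pandas_adjacency(G)
    df.to_csv(path_file, header=False, index=False, sep='\t')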
Example #7
 def test_from_adjacency(self):
     nodelist = [1, 2]
     dftrue = pd.DataFrame([[1, 1], [1, 0]], dtype=int, index=nodelist, columns=nodelist)
     G = nx.Graph([(1, 1), (1, 2)])
     df = nx.to_pandas_adjacency(G, dtype=int)
     pd.testing.assert_frame_equal(df, dftrue)
     # deprecated
     df = nx.to_pandas_dataframe(G, dtype=int)
     pd.testing.assert_frame_equal(df, dftrue)
Example #8
 def A(self, t, dtype="numpy"):
     nodelist = self.get_living(t, indices_only=True)
     if dtype == "sparse":
         return nx.adjacency_matrix(self, nodelist)
     elif dtype == "numpy":
         return np.array(nx.to_numpy_matrix(self, nodelist))
     elif dtype == "pandas":
         return nx.to_pandas_dataframe(self, nodelist)
     else:
         raise ValueError("Unknown dtype")
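
Note: in this dtype dispatch, nx.to_numpy_matrix and nx.to_pandas_dataframe are both legacy names. A sketch of the same dispatch for NetworkX >= 2.6, where the replacements are to_numpy_array and to_pandas_adjacency:

import networkx as nx

def adjacency(G, nodelist, dtype="numpy"):
    if dtype == "sparse":
        return nx.adjacency_matrix(G, nodelist)   # SciPy sparse matrix
    elif dtype == "numpy":
        return nx.to_numpy_array(G, nodelist)     # ndarray, no np.matrix
    elif dtype == "pandas":
        return nx.to_pandas_adjacency(G, nodelist)
    raise ValueError("Unknown dtype")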
Example #9
def answer_four():
    df = nx.to_pandas_dataframe(answer_three())
    
    dfl = []
    for i in range(len(rdata)):
        dfl.append(df.loc[rdata.iloc[i][0]][rdata.iloc[i][1]])
        
    rdata[3] = dfl
    
    return (rdata.corr()).iloc[0][3]
    
Example #10
def answer_four():
        
    df = pd.read_table("Employee_Relationships.txt",header=None)   
    df.columns = ["one","two","relation"]
    g = answer_two()
    P = answer_three()
    
    df1 = nx.to_pandas_dataframe(P)
    df["common"] = df.apply(lambda x: df1.loc[x[0],x[1]],axis=1)
    correlation = df.corr(method='pearson')
    
    return correlation.iloc[0,1]
Example #11
    def add_edge_table(self, weight="weight"):
        adj_df = nx.to_pandas_dataframe(self.network,
                                        weight=weight,
                                        nonedge=np.nan)
        edge_table = melt_upper_triu(adj_df)
        edge_table = edge_table.loc[pd.notnull(edge_table)].reset_index()
        edge_table.columns = ['Gene1', 'Gene2', weight]
        edge_table[['Gene1', 'Gene2']] = (edge_table[[
            'Gene1', 'Gene2'
        ]].applymap(lambda x: self.node_2_name.get(x, x)))

        return edge_table
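
Note: melt_upper_triu is an external helper not shown on this page. One plausible sketch, assuming it should return the strict upper triangle of a square DataFrame as a Series keyed by (row, column), so that the NaN non-edges survive until the notnull filter above:

import numpy as np

def melt_upper_triu(df):
    # keep each unordered pair once: mask the diagonal and below,
    # then stack into a (row, column)-indexed Series
    keep = np.triu(np.ones(df.shape, dtype=bool), k=1)
    return df.where(keep).stack(dropna=False)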
Example #12
def doubly_stochastic(table, undirected=False, return_self_loops=False):
    sys.stderr.write("Calculating DST score...\n")
    table = table.copy()
    table2 = table.copy()
    original_nodes = len(set(table["src"]) | set(table["trg"]))
    table = pd.pivot_table(table,
                           values="nij",
                           index="src",
                           columns="trg",
                           aggfunc="sum",
                           fill_value=0)
    row_sums = table.sum(axis=1)
    attempts = 0
    while np.std(row_sums) > 1e-12:
        table = table.div(row_sums, axis=0)
        col_sums = table.sum(axis=0)
        table = table.div(col_sums, axis=1)
        row_sums = table.sum(axis=1)
        attempts += 1
        if attempts > 1000:
            warnings.warn(
                "Matrix could not be reduced to doubly stochastic. See Sec. 3 of Sinkhorn 1964",
                RuntimeWarning)
            return pd.DataFrame()
    table = pd.melt(table.reset_index(), id_vars="src")
    table = table[table["src"] < table["trg"]]
    table = table[table["value"] > 0].sort_values(by="value", ascending=False)
    i = 0
    G = nx.Graph()
    while nx.number_connected_components(G) != 1 or nx.number_of_nodes(
            G) < original_nodes:
        edge = table.iloc[i]
        G.add_edge(edge["src"], edge["trg"], weight=edge["value"])
        i += 1
    table = pd.melt(nx.to_pandas_dataframe(G).reset_index(), id_vars="index")
    table = table[table["value"] > 0]
    table.rename(columns={
        "index": "src",
        "variable": "trg",
        "value": "cij"
    },
                 inplace=True)
    table["score"] = table["cij"]
    table = table.merge(table2[["src", "trg", "nij"]], on=["src", "trg"])
    if not return_self_loops:
        table = table[table["src"] != table["trg"]]
    if undirected:
        table = table[table["src"] <= table["trg"]]
    return table[["src", "trg", "nij", "score"]]
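
Note: the normalization loop above is Sinkhorn-Knopp iteration. A minimal standalone sketch of the same idea on a dense nonnegative NumPy matrix, assuming total support so the iteration converges:

import numpy as np

def sinkhorn(A, tol=1e-12, max_iter=1000):
    # alternately rescale rows and columns until both sums are ~1
    A = np.asarray(A, dtype=float).copy()
    for _ in range(max_iter):
        A /= A.sum(axis=1, keepdims=True)
        A /= A.sum(axis=0, keepdims=True)
        if np.std(A.sum(axis=1)) <= tol:
            return A
    raise RuntimeError("not reducible to doubly stochastic; "
                       "see Sec. 3 of Sinkhorn 1964")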
Example #14
def airport_log_flow(data):
    import networkx as nx
    import seaborn as sb
    matrix = data[["log_PAX", "Departure", "Arrival"]]
    group = matrix.groupby(['Departure', 'Arrival'], as_index=False).mean()
    G = nx.Graph()
    for i in range(126):
        G.add_edge(group["Departure"][i],
                   group["Arrival"][i],
                   weight=group["log_PAX"][i])

    adjacency_matrix = nx.to_pandas_dataframe(G)

    plt.figure(figsize=(15, 15))
    sb.heatmap(adjacency_matrix, cmap="OrRd")
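
Note: the hard-coded range(126) breaks if the grouped frame does not have exactly 126 Departure/Arrival pairs. A hedged variant that builds the same weighted graph straight from the grouped frame (from_pandas_edgelist assumes NetworkX >= 2.0):

import networkx as nx
import seaborn as sb
import matplotlib.pyplot as plt

def airport_log_flow(data):
    group = (data[["log_PAX", "Departure", "Arrival"]]
             .groupby(["Departure", "Arrival"], as_index=False).mean())
    # one edge per grouped row, however many routes there are
    G = nx.from_pandas_edgelist(group, "Departure", "Arrival",
                                edge_attr="log_PAX")
    adjacency_matrix = nx.to_pandas_adjacency(G, weight="log_PAX")
    plt.figure(figsize=(15, 15))
    sb.heatmap(adjacency_matrix, cmap="OrRd")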
Example #15
def answer_four():
    
    G = pd.read_csv('Employee_Movie_Choices.txt', sep = '\t', skiprows = 1, names = ['Employee', 'Movie'])
    G1 = nx.from_pandas_dataframe(G, 'Employee', 'Movie')
    
    G1.add_nodes_from(employees, type = 'employee', bipartite = 0)
    G1.add_nodes_from(movies, type = 'movie', bipartite = 1)
    
    X = set(employees)
    P = bipartite.weighted_projected_graph(G1, X)
    
    G2 = nx.to_pandas_dataframe(P)
    
    H = pd.read_csv('Employee_Relationships.txt', sep = '\t', header = None, names = ['Employee', 'Workmate', 'Relation_rating'])
    H['Shared_movies'] = G2.lookup(H['Employee'], H['Workmate'])
    
    return H['Relation_rating'].corr(H['Shared_movies'])
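
Note: nx.from_pandas_dataframe, used here and in Example #21, became nx.from_pandas_edgelist in NetworkX 2.0 with the same source/target arguments. A sketch of the renamed call on the same file:

import networkx as nx
import pandas as pd

G = pd.read_csv('Employee_Movie_Choices.txt', sep='\t',
                skiprows=1, names=['Employee', 'Movie'])
G1 = nx.from_pandas_edgelist(G, 'Employee', 'Movie')  # NetworkX >= 2.0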
Example #16
def q4(P: nx.Graph):
    '''
    P: weighted projection graph which tells us how many movies different pairs of employees have in common.
    '''
    rel_df = pd.read_csv('Employee_Relationships.txt',
                         delim_whitespace=True,
                         header=None,
                         names=['n1', 'n2', 'weight'])
    emp_df = nx.to_pandas_dataframe(P).unstack().reset_index().query(
        'level_0 != level_1').reset_index(drop=True)
    df = pd.merge(rel_df,
                  emp_df,
                  how='left',
                  right_on=['level_0', 'level_1'],
                  left_on=['n1', 'n2']).loc[:, ['weight', 0]]
    df.columns = ['relationship_score', 'num_movies_common']
    return df.corr().iloc[0, 1]
Example #17
def blockmodel_output(G, t=1.15):
    """Creates hierarchical cluster of graph G from distance matrix"""
    # Makes life easier to have consecutively labeled integer nodes
    H = nx.convert_node_labels_to_integers(G, label_attribute='label')
    # Create distance matrix
    path_length = dict(nx.all_pairs_shortest_path_length(H))
    distances = np.zeros((len(H), len(H)))
    for u, p in path_length.items():
        for v, d in p.items():
            distances[u][v] = d
    # Create hierarchical cluster
    Y = distance.squareform(distances)
    Z = hierarchy.complete(Y)  # Creates HC using farthest point linkage
    # This partition selection is arbitrary, for illustrative purposes
    membership = list(hierarchy.fcluster(Z, t=t))
    # Create collection of lists for blockmodel
    partitions = defaultdict(list)
    for n, p in zip(list(range(len(G))), membership):
        partitions[p].append(n)

    # Build blockmodel graph
    #BM = nx.blockmodel(H, partitions) # change in nx 2.0
    p_values = list(partitions.values())
    BM = nx.quotient_graph(H, p_values, relabel=True)

    label_dict = dict([(n, H.nodes[n]['label']) for n in H])
    order = [label_dict[item] for sublist in p_values for item in sublist]
    nm = nx.to_pandas_dataframe(G)
    nm = nm.reindex(index=order)
    nm.columns = nm.index

    ho = homophily(G, 'type')

    output = {
        'G': G,
        'H': H,
        'partitions': partitions,
        'BM': BM,
        'nm': nm,
        'label_dict': label_dict,
        'order': order,
        'distances': distances
    }
    output.update(ho)
    return output
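
Note: homophily(G, 'type') is an external helper not shown here; the caller only requires that it return a dict. One plausible hedged stand-in, using NetworkX's attribute assortativity (Newman's mixing coefficient) as the homophily measure:

import networkx as nx

def homophily(G, attr):
    # hypothetical helper: > 0 means nodes preferentially attach to
    # nodes sharing the same attr value
    return {'homophily': nx.attribute_assortativity_coefficient(G, attr)}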
Example #18
def answer_four():

    # Your Code Here
    df = pd.read_csv('Employee_Relationships.txt', delimiter="\t", header=None)
    df.rename(columns={
        0: 'employee1',
        1: 'employee2',
        2: 'score'
    },
              inplace=True)
    #df['movies_in_common'] = None
    #df_2 = pd.read_csv('Employee_Movie_Choices.txt', delimiter = "\t")
    df_1 = nx.to_pandas_dataframe(answer_three())
    df['shared_movies'] = df_1.lookup(df['employee1'], df['employee2'])
    #df_3 = df_2.groupby('#Employee')['Movie'].apply(list)
    #df_3 = df_3.reset_index()
    #my_list = []
    #for x in df_3["Movie"]:
    #for y in df_3["Movie"]:
    #if x != y:
    #my_list.append(len(set(x) & set(y)))
    #final = pd.merge(df,df_3,on='employee1')
    corr_P = df['score'].corr(df['shared_movies'], method='pearson')
    return corr_P  # Your Answer Here
Example #19
 def network_to_pandas(self):
     """
     Returns network in pandas format
     """
     return nx.to_pandas_dataframe(self.g)
Example #20
import networkx as nx
import pandas as pd
from networkx.algorithms import bipartite

# df = pd.read_excel("FKT Data/fkt_cooccurrence.xlsx")
# G = nx.from_pandas_dataframe(df, 'Booking_Service_Id', 'Name_Product')
# W = bipartite.weighted_projected_graph(G, df['Name_Product'].unique())
# X = nx.to_pandas_dataframe(W)

df = pd.read_excel("co-occurrence.xlsx")
df = df[~df.CATEGORY_PRODUCT.isnull()]
G = nx.from_pandas_dataframe(df, 'BOOKING_ID', 'CATEGORY_PRODUCT')
W = bipartite.weighted_projected_graph(G, df['CATEGORY_PRODUCT'].unique())
X = nx.to_pandas_dataframe(W)
X.to_excel("product_pairing.xlsx")
Example #21
def graph_centrality(df,
                     cent_type='betweenness',
                     keep_thresh=0.5,
                     cond_type='add_one',
                     corr_type='spearman',
                     weighted=False,
                     corr_dir='none'):
    '''

    :param df: @type pandas DataFrame
    :param cent_type: @type string - valid values: betweenness, degree, closeness, eigenvector
    :param keep_thresh: @type float - default 0.5
    :param cond_type: @type: string - valid values: add_one, hellinger
    :param corr_type: @type: string - valid values: spearman, kendall, pearson, MIC
    :param weighted: @type: boolean - True if you want to produce a graph with weighted edges, False otherwise
    :param corr_dir: @type: string - valid values: none, positive, negative
    :return:
    '''
    data = df.copy()
    conditioned_df = condition(data, cond_type)  # condition data
    w_corr_df = find_correlation(conditioned_df, corr_type)
    if corr_dir == 'positive':
        w_corr_df_b = 1 - w_corr_df.copy(
        )  #only keep strong positive correlations (small positive numbers)
    elif corr_dir == 'negative':
        w_corr_df_b = 1 + w_corr_df.copy(
        )  # only keep strong negative correlations (small negative numbers)
    else:
        w_corr_df_b = 1 - abs(w_corr_df.copy(
        ))  # keep both strong positive and negative correlations
    w_corr_df_b[(
        w_corr_df_b >= 1 - keep_thresh
    )] = 1  # set anything greater than the threshold value to 1 so we can remove it.
    labels = list(w_corr_df_b.index)
    temp = abs(w_corr_df_b.copy())
    temp.insert(0, 'var1', labels)

    if weighted:
        attr = 'weight'
    else:
        attr = 'edge'

    df_b = pd.melt(temp, 'var1', var_name='var2', value_name=attr)
    df_b = df_b.loc[(
        (df_b[attr] <= 1 - keep_thresh) &
        (df_b[attr] > 0.0)), :]  # take only those edge pairs that made the cut
    df_g = networkx.from_pandas_dataframe(df_b, 'var1', 'var2',
                                          attr)  # takes a list of valid edges
    networkx.write_graphml(df_g, 'graph.graphml')
    #networkx.draw(df_g, with_labels=True)
    #networkx.draw(df_g)
    #pylab.show()

    #print('adjacency matrix', networkx.to_pandas_dataframe(df_g))
    am = networkx.to_pandas_dataframe(df_g)
    am.to_csv('adj_matrix.csv')

    if cent_type == 'betweenness':
        centrality = networkx.betweenness_centrality(df_g)
    elif cent_type == 'degree':
        centrality = networkx.degree_centrality(df_g)
    elif cent_type == 'closeness':
        centrality = networkx.closeness_centrality(df_g)
    elif cent_type == 'eigenvector':
        centrality = networkx.eigenvector_centrality(df_g)
    else:
        print('error, unknown centrality')
        return -1

    centrality_df = pd.DataFrame.from_dict(centrality, orient='index')
    centrality_df.columns = ['metric']

    if not centrality_df.empty:
        centrality_df = centrality_df[centrality_df.iloc[:, 0] > 0]

    if not centrality_df.empty:
        centrality_df.sort_values('metric',
                                  axis=0,
                                  ascending=False,
                                  inplace=True)
    '''fig = plt.figure()
    plt.hist(centrality_df, bins=20)
    plt.xlabel('Centrality')
    plt.ylabel('Frequency')
    plt.title('Graph Centrality Distribution')
    plt.tight_layout()
    #fig.savefig('test.jpg')
    plt.show()'''

    return centrality_df
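
Note: the elif chain over cent_type (and the print-and-return -1 error path) can be collapsed into a dispatch table; a sketch, raising instead of returning a sentinel:

import networkx

CENTRALITY_FUNCS = {
    'betweenness': networkx.betweenness_centrality,
    'degree': networkx.degree_centrality,
    'closeness': networkx.closeness_centrality,
    'eigenvector': networkx.eigenvector_centrality,
}

def compute_centrality(graph, cent_type):
    try:
        return CENTRALITY_FUNCS[cent_type](graph)
    except KeyError:
        raise ValueError('unknown centrality: %s' % cent_type)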
Example #22
	elif edgeMat.columns[i] in G4:
		groundTruth[i] = 3
	elif edgeMat.columns[i] in G5:
		groundTruth[i] = 4
	elif edgeMat.columns[i] in G6:
		groundTruth[i] = 5
	else:
		groundTruth[i] = 6
for kClusters in range(8):
	clusteringAlgorithms(kClusters+4,groundTruth,edgeMat,pos,FG)
'''
#HERE ending comments

#Clustering for the cleared FG
kClusters = 0
edgeMat = nx.to_pandas_dataframe(FGcleared)
groundTruth = edgeMat.values.tolist()
pos = nx.spring_layout(FGcleared)
for i in range(len(groundTruth)):
	if edgeMat.columns[i] in G1:
		groundTruth[i] = 0
	elif edgeMat.columns[i] in G2:
		groundTruth[i] = 1
	elif edgeMat.columns[i] in G3:
		groundTruth[i] = 2
	elif edgeMat.columns[i] in G4:
		groundTruth[i] = 3
	elif edgeMat.columns[i] in G5:
		groundTruth[i] = 4
	else:
		groundTruth[i] = 5
Example #23
    np.unique(ix_a).shape
    # Num groups: 2654509

    return ix_a, _a


"""c"""

a = np.column_stack((i_a0, i_a1))

G = nx.Graph()

G.add_edges_from(a)
G.add_nodes_from(i_ex0)

a = nx.to_pandas_dataframe(G)
# Number of nodes: 5654509

nx.number_connected_components(G)
# 2654509

# Fast:

l_out0 = []

iCount = 100000

for x in nx.connected_components(G):
    l_out0.append(x)
    iCount = iCount - 1
    if iCount == 0:
Example #24
                       node_color = values, node_size = 500)
nx.draw_networkx_labels(G, pos)
options = {
    'node_color': 'blue',
    'node_size': 700,
    'width': 2,
    'arrowstyle': '-|>',
    'arrowsize': 7,
}
nx.draw_networkx_edges(G, pos, edgelist=red_edges, edge_color='r', arrows=True)
nx.draw_networkx_edges(G, pos, edgelist=black_edges, arrows=True, **options)
nx.draw_networkx(G, pos, edgelist=black_edges, arrows=True, **options)
plt.show()

print len(G.adjacency_list())
print len(G.nodes())
nx.to_edgelist(G)
nx.to_dict_of_dicts(G)
print nx.to_dict_of_lists(G)
adjacentMat = nx.to_pandas_dataframe(G)  # shows the existing directions
print adjacentMat
col = adjacentMat['F']
row = adjacentMat.loc['F']
print col
print row
totalNeighbours = col + row

G.neighbors('F')
G.predecessors('F')
nx.predecessor(G,'F')
G.successors('F')
Example #25
                (pos + 1):], np.concatenate(
                    (t['weight'][:pos], t['weight'][(pos + 1):]))
            seq.insert(0, current)
        else:
            pos = t['target'].index(end)
            end = current
            end_t, end_d = t['target'][:pos] + t['target'][
                (pos + 1):], np.concatenate(
                    (t['weight'][:pos], t['weight'][(pos + 1):]))
            seq.append(current)
        targets = start_t + end_t
        degree = np.concatenate((start_d, end_d))
    return seq


g_ppi2 = nx.read_gpickle('D:/PPI-Topic/Processed_data/g_ppi_newdeg.csv')

di_ppi2 = nx.to_dict_of_lists(g_ppi2)
c_ppi2 = nx.to_pandas_dataframe(g_ppi2)
seq = []
net_ppi2 = transform(di_ppi2, c_ppi2)
for node in g_ppi2.nodes():
    for i in range(WALK_TIME):
        seq.append(random_walk(node, net_ppi2) + ['#'])
    # print('node',node,'complete')
lexico = sorted(g_ppi2.nodes(), key=g_ppi2.degree, reverse=True)
lexico1 = {e: lexico.index(e) for e in lexico}
lexico1['#'] = -1
seq_concat = list(itertools.chain.from_iterable(seq))
seq_int = [lexico1[e] for e in seq_concat]