Example #1
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np


def export_KG(G, filename):

    # Compute onion layers and PageRank and attach them as node attributes.
    cores = nx.onion_layers(G.to_undirected())
    ranks = nx.pagerank(G)
    nx.set_node_attributes(G, cores, "core")
    nx.set_node_attributes(G, ranks, "pagerank")

    # Compute the layout and per-node colors (onion layer), sizes (PageRank),
    # and labels.
    node_colors = []
    node_sizes = []
    node_attri = {}
    pos_attrs = {}
    pos_nodes = nx.kamada_kawai_layout(G)
    for ndx, coords in pos_nodes.items():
        pos_attrs[ndx] = (coords[0], coords[1])  # label position (offset hook left at zero)
        node = G.nodes[ndx]
        node_colors.append(node["core"])
        node_sizes.append(node["pagerank"])
        if "index" in node:
            node_attri[ndx] = "{}:'{}'".format(node["index"], node["token"])
        else:
            node_attri[ndx] = ""
    # Min-max normalize PageRank values into node sizes and layers into colors.
    node_sizes = np.array(node_sizes)
    node_sizes = (node_sizes - node_sizes.min()) / (node_sizes.max() -
                                                    node_sizes.min())
    node_sizes *= 150
    node_colors = np.array(node_colors)
    node_colors = (node_colors - node_colors.min()) / (node_colors.max() -
                                                       node_colors.min())
    node_colors = [plt.cm.rainbow(x) for x in node_colors]

    # Edge widths from edge weights (0.1 where missing), min-max normalized.
    edges_width = []
    for _, _, data in G.edges(data=True):
        edges_width.append(data.get("weight", 0.1))
    edges_width = np.array(edges_width)
    edges_width = (edges_width - edges_width.min()) / (edges_width.max() -
                                                       edges_width.min())

    # Draw nodes/edges and labels, then save the figure.
    nx.draw(G,
            pos_nodes,
            arrowsize=3,
            width=edges_width,
            node_size=node_sizes,
            node_color=node_colors)
    nx.draw_networkx_labels(G,
                            pos_attrs,
                            labels=node_attri,
                            font_weight='bold',
                            font_size=8)
    plt.savefig(filename, bbox_inches='tight')
    plt.clf()
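
A minimal usage sketch for export_KG (the toy graph, its "index"/"token" node
attributes, and the output filename are invented for illustration):

import networkx as nx

G = nx.DiGraph()
G.add_edge(0, 1, weight=0.5)
G.add_edge(1, 2, weight=1.0)
G.add_edge(2, 0, weight=0.2)
G.add_edge(2, 3, weight=0.8)
for i in G.nodes:
    G.nodes[i]["index"] = i        # hypothetical node metadata
    G.nodes[i]["token"] = f"t{i}"  # hypothetical node metadata
export_KG(G, "kg.png")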
Example #2
def test_onion_layers(self):
    layers = nx.onion_layers(self.G)
    nodes_by_layer = [
        sorted(n for n in layers if layers[n] == val) for val in range(1, 7)
    ]
    assert nodes_equal(nodes_by_layer[0], [21])
    assert nodes_equal(nodes_by_layer[1], [17, 18, 19, 20])
    assert nodes_equal(nodes_by_layer[2], [10, 12, 13, 14, 15, 16])
    assert nodes_equal(nodes_by_layer[3], [9, 11])
    assert nodes_equal(nodes_by_layer[4], [1, 2, 4, 5, 6, 8])
    assert nodes_equal(nodes_by_layer[5], [3, 7])
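
For context, a standalone sketch of the same layer-grouping on an invented toy
graph (nodes peel off in waves of minimum degree, and onion_layers records the
wave each node falls in):

import networkx as nx

# A 4-clique with two pendant nodes hanging off node 3.
G = nx.Graph([(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3), (3, 4), (3, 5)])
layers = nx.onion_layers(G)
by_layer = {}
for node, layer in layers.items():
    by_layer.setdefault(layer, []).append(node)
print(by_layer)  # the pendant nodes peel first, the clique last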
Example #3
# (tail of the clust() helper defined above, not shown; it returns
#  (silh.mean(), n, labels, silh, pipe), which is unpacked below)

core = nx.k_core(nx.Graph(G))

# Capitalize all occurrences of keywords for easy display on the output
pattern = re.compile(f"\\b({tz.pipe(keywords, tz.pluck(0), '|'.join)})\\b")  # TODO, make matching case insensitive
nice_pars = nice_pars.apply(lambda x: re.sub(pattern, lambda m: m.group().upper(), x))  # TODO, add [[]] around our keywords
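
# Aside on the TODO above: a sketch of the same substitution with
# case-insensitive matching via re.IGNORECASE ('kw' and 'pat' are toy,
# illustrative names, not the project's).
kw = [("graph", 0.9), ("network", 0.8)]
pat = re.compile("\\b({})\\b".format("|".join(w for w, _ in kw)), re.IGNORECASE)
print(pat.sub(lambda m: m.group().upper(), "A Graph is a network."))
# -> 'A GRAPH is a NETWORK.'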

core_pars = np.array(nice_pars)[core.nodes]
core_vecs = vecs[core.nodes]

sil_u, n, lab, sil, p = clust(nx.adjacency_matrix(core), core_vecs, 8)

len(lab), len(sil)

layers = nx.onion_layers(core)

len(core.nodes)

# TODO, drop items of silhouette <= 0
df = pd.DataFrame(data=[{"Label": par, "Cluster ID": cid, "Silhouette Score": ss} for par, cid, ss in zip(core_pars, lab, sil)])
df['Cluster ID'] = df.apply(lambda row: "T" + str(row['Cluster ID']), axis=1)

# Footer rows so the CSV export imports into gsheet's tree map plotter correctly
# (DataFrame.append was removed in pandas 2.0, so build the rows and concat once).
footer = [{"Label": cid, "Cluster ID": NAME_OF_TEXT, "Silhouette Score": None}
          for cid in df['Cluster ID'].unique()]
footer.append({"Label": NAME_OF_TEXT, "Cluster ID": None, "Silhouette Score": None})
df = pd.concat([df, pd.DataFrame(footer)], ignore_index=True)

df.to_csv("out.csv", index=False)
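
The k_core call above keeps the maximal subgraph in which every node has at
least k neighbors; with k omitted it returns the main core. A toy illustration,
unrelated to the project's data:

import networkx as nx

G = nx.Graph([(0, 1), (1, 2), (2, 0), (2, 3)])  # a triangle plus one pendant
core = nx.k_core(G)  # main core: the triangle, where every node keeps degree >= 2
print(sorted(core.nodes))  # [0, 1, 2]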
Example #4
File: main.py Project: mhlr/cartographer
def main(args):
    name_of_pdf_dir = os.path.basename(args.directory_with_pdfs)

    all_text = get_all_pdf_text_concatenated(args.directory_with_pdfs)

    pars = pd.Series(all_text.split('\n\n')).str.replace('\n', ' ')

    pars.str.len().apply(lambda x: np.log2(x + 1)).astype(int).value_counts()  # TODO, is this being stored anywhere?

    text_keywords = keywords(all_text, scores=True, lemmatize=True, words=args.num_keywords)

    lower_bound_chars, upper_bound_chars = args.lower_bound_chars, args.upper_bound_chars
    word_count = int((lower_bound_chars + upper_bound_chars) / (2 * (avg_word_len + 1)))
    lens = pars.str.len()  # paragraph lengths
    nice_pars = pars[(lens >= lower_bound_chars)]  # paragraphs we want to use

    nice_pars = nice_pars.apply(
        partial(text_reduce_return,
                upper_bound_chars=upper_bound_chars, max_word_count=word_count)
    )

    vecs = emb(tuple(nice_pars), args.tfhub_sentence_encoder_url).numpy()

    D = sk.metrics.pairwise_distances(vecs, metric='cosine')  # pairwise cosine distances
    R = scipy.sparse.csgraph.minimum_spanning_tree(D).max()  # connectivity radius: the largest MST edge
    G = neighbors.radius_neighbors_graph(vecs, R, metric='cosine')

    core = nx.k_core(nx.Graph(G))

    # Capitalize all occurrences of keywords for easy display on the output
    # TODO, make matching case insensitive
    pattern = re.compile(f"\\b({tz.pipe(text_keywords, tz.pluck(0), '|'.join)})\\b")
    nice_pars = nice_pars.apply(
        lambda x: re.sub(pattern, lambda m: m.group().upper(), x))  # TODO add [[]] around our keywords for zettelkasten

    core_nodes = core.nodes
    core_pars = np.array(nice_pars)[core_nodes]
    core_vecs = vecs[core_nodes]

    sil_u, n, lab, sil, p = clust(nx.adjacency_matrix(core), core_vecs, 8)

    layers = nx.onion_layers(core)

    df = pd.DataFrame(
        data=[{"Label": par, "Cluster ID": cid, "Silhouette Score": ss} for par, cid, ss in zip(core_pars, lab, sil)])

    df = df[df["Silhouette Score"] > 0]

    df['Cluster ID'] = df.apply(lambda row: "T" + str(row['Cluster ID']), axis=1)

    # add footer to dataframe so that csv export will be imported by gsheet's tree map plotter correctly
    # (DataFrame.append was removed in pandas 2.0, so build the rows and concat once)
    footer = [{"Label": cid, "Cluster ID": name_of_pdf_dir, "Silhouette Score": None}
              for cid in df['Cluster ID'].unique()]
    footer.append({"Label": name_of_pdf_dir, "Cluster ID": None, "Silhouette Score": None})
    df = pd.concat([df, pd.DataFrame(footer)], ignore_index=True)

    df.to_csv(args.output_filename, index=False)

    return {
        "text_keywords": text_keywords
    }
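
A note on the graph construction in main(): the largest edge of a minimum
spanning tree over the pairwise-distance matrix is the smallest radius at which
a radius-neighbors graph stays connected, which is presumably why R is chosen
that way. A self-contained sketch of just that step, on made-up vectors:

import numpy as np
import scipy.sparse.csgraph
from sklearn import metrics, neighbors

vecs = np.random.RandomState(0).rand(20, 8)  # made-up embedding vectors
D = metrics.pairwise_distances(vecs, metric='cosine')
R = scipy.sparse.csgraph.minimum_spanning_tree(D).max()  # largest MST edge
A = neighbors.radius_neighbors_graph(vecs, R, metric='cosine')
print(A.shape)  # (20, 20) sparse adjacency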
Example #5
# read test data
df_test = pd.read_csv(root / 'test.csv', dtype={'authorID': np.int64})
n_test = df_test.shape[0]

# load the graph
G = nx.read_edgelist(root / 'collaboration_network.edgelist',
                     delimiter=' ',
                     nodetype=int)
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
print('Number of nodes:', n_nodes)
print('Number of edges:', n_edges)

# computes structural features for each node
core_number = nx.core_number(G)  # dict that associates node -> core_number
onion_number = nx.onion_layers(G)
avg_neighbor_degree = nx.average_neighbor_degree(G)
degree_centrality = nx.degree_centrality(G)
clustering = nx.clustering(G)

# create the training matrix. each node is represented as a vector of features:
# (1) degree, (2) core number, (3) average neighbor degree, (4) onion layer,
# (5) degree centrality; a sixth column is allocated, presumably for the
# clustering coefficient computed above
X_train_ = np.zeros((n_train, 6))
y_train_ = np.zeros(n_train)
for i, row in df_train.iterrows():
    node = row['authorID']
    X_train_[i, 0] = G.degree(node)
    X_train_[i, 1] = core_number[node]
    X_train_[i, 2] = avg_neighbor_degree[node]
    X_train_[i, 3] = onion_number[node]
    X_train_[i, 4] = degree_centrality[node]