def main():
    parser = argparse.ArgumentParser(
        description='plots cluster sizes, sorted descending')
    parser.add_argument('--cluster-labels',
                        type=argparse.FileType('r'),
                        help='path to input .json.bz2 cluster labels file',
                        required=True)
    parser.add_argument('--img',
                        type=argparse.FileType('w'),
                        help='path of output img file',
                        required=True)

    args = parser.parse_args()
    input_cluster_labels_path = args.cluster_labels.name
    output_img_path = args.img.name

    logger.info('loading cluster labels')
    cluster_labels = load_communities(input_cluster_labels_path)

    labels, counts = np.unique(cluster_labels, return_counts=True)
    counts[::-1].sort()

    logger.info('plotting sorted cluster sizes')
    xlabel = 'Cluster'
    ylabel = 'Anzahl Dokumente'
    scatter_plot(counts, output_img_path, xlabel, ylabel, False, 3)
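# NOTE: scatter_plot is a project-specific helper that is not shown here. As a rough,
# self-contained sketch of the same plot (an assumption, not the actual helper),
# the sorted cluster sizes could be drawn with plain matplotlib:
def _sketch_plot_sorted_cluster_sizes(cluster_labels, output_img_path):
    import matplotlib
    matplotlib.use('Agg')  # render to file, no display needed
    import matplotlib.pyplot as plt
    import numpy as np

    # count documents per cluster label, then sort the counts descending in place
    _, counts = np.unique(cluster_labels, return_counts=True)
    counts[::-1].sort()  # sorting the reversed view leaves counts in descending order
    plt.scatter(np.arange(len(counts)), counts, s=3)
    plt.xlabel('Cluster')
    plt.ylabel('Anzahl Dokumente')
    plt.savefig(output_img_path)
    plt.close()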
def main():
    parser = argparse.ArgumentParser(description='plots given 2d-transformed documents represented by their topic distributions (optional: with clusters)')
    parser.add_argument('--documents-2d', type=argparse.FileType('r'), help='path to input document-2d-data (.npz)', required=True)
    parser.add_argument('--cluster-labels', type=argparse.FileType('r'), help='path to input cluster labels .json.bz2 file')
    parser.add_argument('--img-file', type=argparse.FileType('w'), help='path to output img file', required=True)

    args = parser.parse_args()
    input_documents_2d_path = args.documents_2d.name
    input_cluster_labels_path = args.cluster_labels.name if args.cluster_labels else None
    output_img_path = args.img_file.name

    logger.info('loading 2d-transformed document topics')
    documents_2d = load_document_topics(input_documents_2d_path)     
    
    if input_cluster_labels_path:
        logger.info('loading cluster labels')
        cluster_labels = load_communities(input_cluster_labels_path)
        cluster_labels = np.array(cluster_labels)
    else:
        logger.info('no cluster labels given')
        cluster_labels = None
        
    logger.info('plotting 2d-documents')
    size = 1
    scatter_2d_plot(documents_2d[:,0], documents_2d[:,1], output_img_path, labels=cluster_labels, rasterized=True, size=size)
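# NOTE: scatter_2d_plot is a project-specific helper. A minimal sketch of how the
# 2d-embedded documents could be drawn and optionally colored by cluster label
# (an assumption about the helper, not its actual implementation):
def _sketch_scatter_2d_plot(x, y, output_img_path, labels=None, rasterized=True, size=1):
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    # color the points by cluster label if labels are given, otherwise draw them uniformly
    plt.scatter(x, y, c=labels, s=size, rasterized=rasterized)
    plt.savefig(output_img_path)
    plt.close()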
def main():
    parser = argparse.ArgumentParser(
        description=
        'maps a given partitioning (clustering/communities) file with document labels and a given metadata file with document titles to a doctitle->partitionlabel file'
    )
    parser.add_argument(
        '--partitions',
        type=argparse.FileType('r'),
        help=
        'path to input .json.bz2 partitioning file (communities: JSON-dict / clustering: JSON-list)',
        required=True)
    parser.add_argument('--titles',
                        type=argparse.FileType('r'),
                        help='path to input .json.bz2 titles file',
                        required=True)
    parser.add_argument(
        '--title-partitions',
        type=argparse.FileType('w'),
        help='path to output doctitle->partitionlabel .json file',
        required=True)

    args = parser.parse_args()
    input_partitions_path = args.partitions.name
    input_titles_path = args.titles.name
    output_title_partitions_path = args.title_partitions.name

    logger.info('running with:\n{}'.format(
        pformat({
            'input_partitions_path': input_partitions_path,
            'input_titles_path': input_titles_path,
            'output_title_partitions_path': output_title_partitions_path
        })))

    # load titles and partitioning
    titles = load_titles(input_titles_path)
    partitions = load_communities(input_partitions_path)

    # build the title->partition-label mapping
    if isinstance(partitions, dict):
        # graph communities come as a dict: derive the doc ID from the document's graph label (e.g. "d123"), then look up the corresponding doc title
        title_partitions = {
            titles[doc_id[1:]]: comm_label
            for doc_id, comm_label in partitions.items()
        }
    else:
        # a clustering comes as a list: treat the index of each cluster label as the doc ID, then look up the corresponding doc title
        title_partitions = {
            titles[str(doc_id)]: comm_label
            for doc_id, comm_label in enumerate(partitions) if comm_label >= 0
        }
    logger.info('generated {} title_partitions'.format(len(title_partitions)))
    logger.debug('title_partitions \n{}'.format(title_partitions))

    # save the title->partition-label mapping
    logger.info('saving title communities')
    save_data_to_json(title_partitions, output_title_partitions_path)
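# A small worked example of the two mapping branches above (toy data, hypothetical names):
# graph communities come as a dict of node labels like "d123", so the leading "d" is
# stripped to recover the doc ID; a clustering comes as a plain list whose index is the
# doc ID, and negative labels (presumably noise points) are skipped.
def _example_title_partition_mapping():
    titles = {'0': 'Doc A', '1': 'Doc B', '2': 'Doc C'}

    communities = {'d0': 5, 'd2': 7}
    title_comms = {titles[doc_id[1:]]: label for doc_id, label in communities.items()}
    # -> {'Doc A': 5, 'Doc C': 7}

    clustering = [1, -1, 0]
    title_clus = {titles[str(doc_id)]: label
                  for doc_id, label in enumerate(clustering) if label >= 0}
    # -> {'Doc A': 1, 'Doc C': 0}
    return title_comms, title_clus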
def main():
    parser = argparse.ArgumentParser(description='calculates various centrality-related stats (only the giant component of the graph is considered!)')
    parser.add_argument('--coauth-graph', type=argparse.FileType('r'), help='path to input pickled, gzipped graph file', required=True)
    parser.add_argument('--communities', type=argparse.FileType('r'), help='path to input .json.bz2 communities file', required=True)
    parser.add_argument('--titles', type=argparse.FileType('r'), help='path to input .json.bz2 titles file', required=True)
    parser.add_argument('--K', type=int, help='number K of considered, equidistant communities with indices floor(k*(N-1)/(K-1)) for k=0,...,K-1', required=True)
    parser.add_argument('--J', type=int, help='maximum number of top-ranked nodes considered per community', required=True)
    
    args = parser.parse_args()
    input_coauth_graph_path = args.coauth_graph.name
    input_communities_path = args.communities.name
    input_titles_path = args.titles.name
    K = args.K
    J = args.J
    
    logger.info('running with:\n{}'.format(pformat({'input_coauth_graph_path':input_coauth_graph_path, 'input_communities_path':input_communities_path, 'input_titles_path':input_titles_path, 'K':K, 'J':J})))
    
    logger.info('loading graph from {}'.format(input_coauth_graph_path))
    coauth_graph = Graph.Read_Picklez(input_coauth_graph_path)
    logger.info('using the largest connected component instead of the actual graph')
    coauth_graph = coauth_graph.components().giant()
    log_igraph(coauth_graph)
    
    communities = load_communities(input_communities_path)
    titles = load_titles(input_titles_path)
    
    logger.info('creating vertex clustering of community labels')
    node_labels = [communities[name] for name in coauth_graph.vs['name']]
    community_structure = VertexClustering(coauth_graph, membership=node_labels)
    logger.debug('created vertex clustering {}'.format(community_structure))
        
    community_sizes = list(enumerate(community_structure.sizes()))
    community_sizes.sort(key=lambda t:t[1], reverse=True)
    logger.debug('community sizes, sorted descending\n{}'.format(community_sizes))
        
    logger.info('filtering to communities of at least {} nodes'.format(J))
    community_sizes = [(commid,size) for commid,size in community_sizes if size >= J]
    logger.info('filtered to {} communities'.format(len(community_sizes)))
        
    N = len(community_sizes)
    logger.info('selecting from N={} filtered communities, considering K={} equidistant communities'.format(N, K))
    community_indices = [math.floor(k*(N-1)/(K-1)) for k in range(0,K)]
    logger.info('considering indices {}'.format(community_indices))
    considered_communities = [community_sizes[i] for i in community_indices]
    logger.info('considering communities (id,size): {}'.format(considered_communities))
      
    find_max_nodes_per_community(community_structure, considered_communities, titles, J, degree)
    find_max_nodes_per_community(community_structure, considered_communities, titles, J, strength)
    find_max_nodes_per_community(community_structure, considered_communities, titles, J, betweenness)
    find_max_nodes_per_community(community_structure, considered_communities, titles, J, weighted_betweenness)
    find_max_nodes_per_community(community_structure, considered_communities, titles, J, closeness)
    find_max_nodes_per_community(community_structure, considered_communities, titles, J, weighted_closeness)
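# Worked example of the equidistant selection above: with N filtered communities
# (sorted descending by size) and K picks, the chosen indices are floor(k*(N-1)/(K-1))
# for k = 0..K-1, i.e. the largest community, the smallest one that survived the
# filter, and evenly spaced communities in between.
def _example_equidistant_indices(N=10, K=4):
    import math
    return [math.floor(k * (N - 1) / (K - 1)) for k in range(K)]
    # N=10, K=4 -> [0, 3, 6, 9]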
def main():
    parser = argparse.ArgumentParser(
        description=
        'compares two clusterings/community structures by computing the normalized mutual information score of both documenttitle->clusterlabel mappings (the comparison is based on the intersection of equal document titles)'
    )
    parser.add_argument(
        '--clusterings',
        nargs=2,
        type=argparse.FileType('r'),
        metavar=('CLUS1', 'CLUS2'),
        help='path to two title-clustering files (.json/.json.bz2)',
        required=True)

    args = parser.parse_args()
    input_clusterings_paths = (args.clusterings[0].name,
                               args.clusterings[1].name)

    clustering1 = load_communities(input_clusterings_paths[0])
    clustering2 = load_communities(input_clusterings_paths[1])

    logger.info('intersecting clusterings by document titles')
    intersect_titles = sorted(clustering1.keys() & clustering2.keys())
    logger.debug('intersect titles \n{}'.format(intersect_titles))
    logger.info('number of intersect titles {}'.format(len(intersect_titles)))

    intersect_labels1 = [clustering1[title] for title in intersect_titles]
    logger.debug('labels of intersect titles in clustering 1 \n{}'.format(
        intersect_labels1))
    intersect_labels2 = [clustering2[title] for title in intersect_titles]
    logger.debug('labels of intersect titles in clustering 2 \n{}'.format(
        intersect_labels2))

    intersect_labels1 = np.array(intersect_labels1)
    intersect_labels2 = np.array(intersect_labels2)

    score = normalized_mutual_info_score(intersect_labels1, intersect_labels2)
    logger.info('normalized-mutual-info: {}'.format(score))
def main():
    parser = argparse.ArgumentParser(description='creates a file of clusterings: clusters are sorted descending by size, cluster elements are sorted by distance to cluster centroid')    
    parser.add_argument('--document-topics', type=argparse.FileType('r'), help='path to input document-topic-file (.npz)', required=True)
    parser.add_argument('--cluster-labels', type=argparse.FileType('r'), help='path to input .json.bz2 clustering file', required=True)
    parser.add_argument('--titles', type=argparse.FileType('r'), help='path to input .json.bz2 titles file', required=True)  
    parser.add_argument('--centrality-data', type=argparse.FileType('w'), help='path to output .json cluster->centrality_data file', required=True)
    parser.add_argument('--max-docs-per-clus', type=int, help='maximum number of top-ranked documents considered per cluster', required=True)
    parser.add_argument('--metric', help='dissimilarity metric to the cluster centroids (must be supported by scipy cdist)', required=True)
    
    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    input_cluster_labels_path = args.cluster_labels.name
    input_titles_path = args.titles.name
    output_centrality_data_path = args.centrality_data.name
    max_docs_per_clus = args.max_docs_per_clus
    metric = args.metric
    
    logger.info('running with:\n{}'.format(pformat({'input_document_topics_path':input_document_topics_path, 'input_cluster_labels_path':input_cluster_labels_path, 'input_titles_path':input_titles_path, 'output_centrality_data_path':output_centrality_data_path, 'max_docs_per_clus':max_docs_per_clus, 'metric':metric})))
        
    document_topics = load_document_topics(input_document_topics_path)
    cluster_labels = load_communities(input_cluster_labels_path)
    document_titles = load_titles(input_titles_path)
        
    clusters = get_clusters_from_labels(cluster_labels)    
    logger.info('computing {}-centralities of {} documents in {} communities'.format(metric, len(cluster_labels), len(clusters)))
    centrality_data = {}
    for clus_id, cluster in enumerate(clusters):
        max_doc_ids, centralities = get_top_central_cluster_docs(cluster, document_topics, max_docs_per_clus, metric)
        logger.debug('max doc ids {}'.format(max_doc_ids))
        logger.debug('max doc centralities {}'.format(centralities))
        max_doc_titles = get_document_titles(max_doc_ids, document_titles)
        logger.debug('max titles: {}'.format(max_doc_titles))
        centrality_data_of_cluster = {
            'size': len(cluster),
            'titles': max_doc_titles, 
            'centralities': centralities
        }
        centrality_data[clus_id] = centrality_data_of_cluster
    
    logger.info('saving cluster centrality data (titles,centralities) of {} clusters'.format(len(centrality_data)))
    save_data_to_json(centrality_data, output_centrality_data_path)
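# NOTE: get_top_central_cluster_docs is a project-specific helper. A minimal sketch of one
# plausible implementation (an assumption, not the actual code): compute each cluster
# document's dissimilarity to the cluster centroid with scipy's cdist and return the IDs
# and distances of the closest documents; the real helper may turn distances into a
# different centrality score. Assumes cluster is a list of row indices into document_topics.
def _sketch_top_central_cluster_docs(cluster, document_topics, max_docs, metric):
    import numpy as np
    from scipy.spatial.distance import cdist

    doc_ids = np.asarray(cluster)
    cluster_topics = document_topics[doc_ids]               # topic vectors of this cluster
    centroid = cluster_topics.mean(axis=0, keepdims=True)   # cluster centroid
    dists = cdist(cluster_topics, centroid, metric=metric)[:, 0]
    order = np.argsort(dists)[:max_docs]                    # closest documents first
    return doc_ids[order].tolist(), dists[order].tolist()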
def main():
    parser = argparse.ArgumentParser(
        description=
        'plots the descending purities of each cluster (purity: highest cosine similarity to a [0,...,0,1,0,...,0] topic vector)'
    )
    parser.add_argument('--document-topics',
                        type=argparse.FileType('r'),
                        help='path to input document-topic-file (.npz)',
                        required=True)
    parser.add_argument('--cluster-labels',
                        type=argparse.FileType('r'),
                        help='path to input .json.bz2 clustering file',
                        required=True)
    parser.add_argument('--plot',
                        type=argparse.FileType('w'),
                        help='path to output purity plot file',
                        required=True)

    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    input_cluster_labels_path = args.cluster_labels.name
    output_plot_path = args.plot.name

    document_topics = load_document_topics(input_document_topics_path)
    cluster_labels = load_communities(input_cluster_labels_path)

    clusters = get_clusters_from_labels(cluster_labels)
    logger.info('calculating purity of {} clusters'.format(len(clusters)))
    cluster_purities = [
        get_cluster_purity(cluster, document_topics) for cluster in clusters
    ]
    logger.info('calculated {} purity values'.format(len(cluster_purities)))

    cluster_purities = np.array(cluster_purities)
    cluster_purities[::-1].sort()

    xlabel = 'Cluster'
    ylabel = 'Reinheit'
    logger.info('plotting purities to {}'.format(output_plot_path))
    scatter_plot(cluster_purities, output_plot_path, xlabel, ylabel)
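# NOTE: get_cluster_purity is a project-specific helper. Following the description above
# (purity = highest cosine similarity to a one-hot [0,...,0,1,0,...,0] topic vector), a
# minimal sketch might look like this, assuming purity is measured on the cluster's mean
# topic vector (an assumption, not the actual implementation):
def _sketch_cluster_purity(cluster, document_topics):
    import numpy as np

    mean_topics = document_topics[cluster].mean(axis=0)
    # cosine similarity to the i-th one-hot vector is mean_topics[i] / ||mean_topics||,
    # so the best match is simply the largest component divided by the vector norm
    return mean_topics.max() / np.linalg.norm(mean_topics)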
def main():
    parser = argparse.ArgumentParser(
        description=
        'calculates silhouette coefficient of a given clustering and its document-topic-matrix'
    )
    parser.add_argument('--document-topics',
                        type=argparse.FileType('r'),
                        help='path to input document-topic-file (.npz)',
                        required=True)
    parser.add_argument('--cluster-labels',
                        type=argparse.FileType('r'),
                        help='path to input .json.bz2 cluster labels file',
                        required=True)
    parser.add_argument('--metric',
                        choices=_VALID_METRICS,
                        help='distance function to use',
                        required=True)

    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    input_cluster_labels_path = args.cluster_labels.name
    metric = args.metric

    logger.info('loading document topics')
    document_topics = load_document_topics(input_document_topics_path)
    logger.info('loading cluster labels')
    cluster_labels = load_communities(input_cluster_labels_path)
    logger.debug(cluster_labels)

    logger.info('calculating unsupervised evaluation metrics')
    sil_score = silhouette_score(document_topics,
                                 cluster_labels,
                                 metric=metric)  # higher = better
    logger.info('{} silhouette coefficient: {}'.format(metric, sil_score))
    ch_score = calinski_harabaz_score(
        document_topics, cluster_labels
    )  # between-cluster scatter divided by within-cluster scatter, incl. penalty terms -> higher = better
    logger.info('calinski harabaz score: {}'.format(ch_score))
def main():
    parser = argparse.ArgumentParser(
        description=
        'calculates the most central documents of each community and writes their centrality data (titles, centralities) to a JSON file (exactly min(#nodes of community, J) titles are saved per community)'
    )
    parser.add_argument('--coauth-graph',
                        type=argparse.FileType('r'),
                        help='path to input pickled, gzipped graph file',
                        required=True)
    parser.add_argument('--communities',
                        type=argparse.FileType('r'),
                        help='path to input .json.bz2 communities file',
                        required=True)
    parser.add_argument('--titles',
                        type=argparse.FileType('r'),
                        help='path to input .json.bz2 titles file',
                        required=True)
    parser.add_argument(
        '--centrality-data',
        type=argparse.FileType('w'),
        help='path to output .json community->centrality_data file',
        required=True)
    centrality_measures = {
        'degree': degree,
        'strength': strength,
        'betweenness': betweenness,
        'closeness': closeness,
        'weighted_betweenness': weighted_betweenness,
        'weighted_closeness': weighted_closeness
    }
    parser.add_argument('--centrality-measure',
                        choices=centrality_measures,
                        help='centrality measure',
                        required=True)
    parser.add_argument(
        '--max-docs-per-comm',
        type=int,
        help='maximum number of top-ranked nodes considered per community',
        required=True)

    args = parser.parse_args()
    input_coauth_graph_path = args.coauth_graph.name
    input_communities_path = args.communities.name
    input_titles_path = args.titles.name
    output_centrality_data_path = args.centrality_data.name
    centrality_measure = args.centrality_measure
    max_docs_per_comm = args.max_docs_per_comm

    logger.info('running with:\n{}'.format(
        pformat({
            'input_coauth_graph_path': input_coauth_graph_path,
            'input_communities_path': input_communities_path,
            'input_titles_path': input_titles_path,
            'output_centrality_data_path': output_centrality_data_path,
            'centrality_measure': centrality_measure,
            'max_docs_per_comm': max_docs_per_comm
        })))

    logger.info('loading graph from {}'.format(input_coauth_graph_path))
    coauth_graph = Graph.Read_Picklez(input_coauth_graph_path)
    log_igraph(coauth_graph)

    communities = load_communities(input_communities_path)
    titles = load_titles(input_titles_path)

    # remove nodes that do not appear in the saved community structure (e.g. because they are not in the giant community)
    logger.info('removing nodes of graph without community labels')
    node_names = coauth_graph.vs['name']
    node_names_of_communities = communities.keys()
    node_names_not_in_communities = set(node_names) - set(
        node_names_of_communities)
    coauth_graph.delete_vertices(node_names_not_in_communities)
    logger.info('graph stats after removing')
    log_igraph(coauth_graph)

    logger.info('creating vertex clustering of community labels')
    node_labels = [communities[name] for name in coauth_graph.vs['name']]
    community_structure = VertexClustering(coauth_graph,
                                           membership=node_labels)
    logger.debug('created vertex clustering {}'.format(community_structure))

    logger.info(
        'computing {}-centralities of {} documents in {} communities'.format(
            centrality_measure, community_structure.n,
            len(community_structure)))
    centrality_function = centrality_measures[centrality_measure]
    centrality_data = {}
    for comm_id in range(len(community_structure)):
        comm_subgraph = community_structure.subgraph(comm_id)
        max_node_names_centralities = get_top_nodes_of_communities(
            comm_subgraph, max_docs_per_comm, centrality_function)
        logger.debug(
            'max_node_names_weights {}'.format(max_node_names_centralities))
        max_node_names, centralities = zip(*max_node_names_centralities)
        max_doc_titles = get_document_titles_of_node_names(
            max_node_names, titles)
        logger.debug('max titles: {}'.format(max_doc_titles))
        centrality_data_of_community = {
            'size': comm_subgraph.vcount(),
            'titles': max_doc_titles,
            'centralities': centralities
        }
        centrality_data[comm_id] = centrality_data_of_community

    logger.info(
        'saving community centrality data (titles,centralities) of {} communities'
        .format(len(centrality_data)))
    save_data_to_json(centrality_data, output_centrality_data_path)
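# NOTE: get_top_nodes_of_communities is a project-specific helper. From its usage above it
# returns (node name, centrality) pairs for the most central nodes of a community subgraph.
# A minimal sketch under that assumption, where centrality_function (degree, strength,
# weighted_betweenness, ...) returns one value per vertex of the subgraph, in vertex order:
def _sketch_top_nodes_of_community(comm_subgraph, max_nodes, centrality_function):
    centralities = centrality_function(comm_subgraph)
    names_centralities = list(zip(comm_subgraph.vs['name'], centralities))
    names_centralities.sort(key=lambda t: t[1], reverse=True)  # most central first
    return names_centralities[:max_nodes]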