def main():
    parser = argparse.ArgumentParser(
        description='maps given high-dimensional documents to 2d document representations with t-SNE')
    parser.add_argument('--document-topics', type=argparse.FileType('r'),
                        help='path to input document-topic file (.npz)', required=True)
    parser.add_argument('--documents-2d', type=argparse.FileType('w'),
                        help='path to output document-2d-data (.npz)', required=True)
    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    output_documents_2d_path = args.documents_2d.name

    document_topics = load_document_topics(input_document_topics_path)
    # model = decomposition.PCA(n_components=2)
    model = TSNE(n_components=2, verbose=1, perplexity=100, n_iter=1000)
    logger.info('running 2d-transformation with model {}'.format(model))
    documents_2d = model.fit_transform(document_topics)
    logger.debug('2d-transformation result\n{}'.format(documents_2d))
    logger.info('saving 2d-documents')
    save_npz(output_documents_2d_path, documents_2d)
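
# What follows is a minimal sketch of the I/O helpers this script relies on
# (load_document_topics, save_npz); their real implementations are not shown
# here and may differ, e.g. they could use scipy.sparse instead of dense arrays.
import numpy as np

def load_document_topics(path):
    # assumes a dense matrix stored as the first (or only) array of the .npz archive
    npz = np.load(path)
    return npz[npz.files[0]]

def save_npz(path, array):
    # store the array compressed under numpy's default key 'arr_0'
    np.savez_compressed(path, array)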
def main():
    parser = argparse.ArgumentParser(
        description='plots given 2d-transformed documents represented by their topic distributions (optional: with clusters)')
    parser.add_argument('--documents-2d', type=argparse.FileType('r'),
                        help='path to input document-2d-data (.npz)', required=True)
    parser.add_argument('--cluster-labels', type=argparse.FileType('r'),
                        help='path to input cluster labels .json.bz2 file')
    parser.add_argument('--img-file', type=argparse.FileType('w'),
                        help='path to output image file', required=True)
    args = parser.parse_args()
    input_documents_2d_path = args.documents_2d.name
    input_cluster_labels_path = args.cluster_labels.name if args.cluster_labels else None
    output_img_path = args.img_file.name

    logger.info('loading 2d-transformed document topics')
    documents_2d = load_document_topics(input_documents_2d_path)
    if input_cluster_labels_path:
        logger.info('loading cluster labels')
        cluster_labels = load_communities(input_cluster_labels_path)
        cluster_labels = np.array(cluster_labels)
    else:
        logger.info('no cluster labels given')
        cluster_labels = None

    logger.info('plotting 2d-documents')
    size = 1
    scatter_2d_plot(documents_2d[:, 0], documents_2d[:, 1], output_img_path,
                    labels=cluster_labels, rasterized=True, size=size)
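
# A hedged sketch of the scatter_2d_plot helper used above; the signature is
# taken from the call site, everything else (colormap, figure handling) is an
# assumption and may differ from the repo's actual plotting helper.
import matplotlib
matplotlib.use('Agg')  # render to file without a display
import matplotlib.pyplot as plt

def scatter_2d_plot(x, y, img_path, labels=None, rasterized=False, size=1):
    fig, ax = plt.subplots()
    # color points by cluster label when labels are given, otherwise plain points
    ax.scatter(x, y, c=labels, s=size, cmap='tab20', rasterized=rasterized)
    fig.savefig(img_path, bbox_inches='tight')
    plt.close(fig)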
def main():
    parser = argparse.ArgumentParser(
        description='plots 1. the average probability per topic 2. the CDF of these probabilities')
    parser.add_argument('--document-topics', type=argparse.FileType('r'),
                        help='path to input document-topic file (.npz)', required=True)
    parser.add_argument('--topic-avg-probs', type=argparse.FileType('w'),
                        help='path to output avg prob plot file', required=True)
    parser.add_argument('--topic-avg-probs-cdf', type=argparse.FileType('w'),
                        help='path to output avg prob CDF plot file', required=True)
    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    output_topic_avg_probs_path = args.topic_avg_probs.name
    output_topic_avg_probs_cdf_path = args.topic_avg_probs_cdf.name

    document_topics = load_document_topics(input_document_topics_path)
    logger.info('calculating average probability per topic')
    average_topic_props = np.average(document_topics, axis=0)
    logger.info('shape of average result {}'.format(average_topic_props.shape))
    average_topic_props[::-1].sort()  # sort in descending order, in place
    logger.info('sum over averages {}'.format(average_topic_props.sum()))

    logger.info('plotting average topic probabilities')
    xlabel = 'Topic'
    ylabel = 'Ø Anteil'
    scatter_plot(average_topic_props, output_topic_avg_probs_path, xlabel, ylabel)

    average_topic_props_cdf = np.cumsum(average_topic_props)
    logger.info('plotting cumulative distribution function of average topic probabilities')
    xlabel = 'Topic'
    ylabel = 'Ø Anteil (CDF)'
    scatter_plot(average_topic_props_cdf, output_topic_avg_probs_cdf_path, xlabel, ylabel)
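
# A hedged sketch of the scatter_plot helper assumed above: the signature matches
# the call sites (values, output path, axis labels), values are plotted against
# their index; the repo's actual styling may differ.
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np

def scatter_plot(values, img_path, xlabel, ylabel):
    fig, ax = plt.subplots()
    ax.scatter(np.arange(len(values)), values, s=2)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.savefig(img_path, bbox_inches='tight')
    plt.close(fig)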
def main():
    parser = argparse.ArgumentParser(
        description='creates a file of clusterings: clusters are sorted descending by size, cluster elements are sorted by distance to the cluster centroid')
    parser.add_argument('--document-topics', type=argparse.FileType('r'),
                        help='path to input document-topic file (.npz)', required=True)
    parser.add_argument('--cluster-labels', type=argparse.FileType('r'),
                        help='path to input .json.bz2 clustering file', required=True)
    parser.add_argument('--titles', type=argparse.FileType('r'),
                        help='path to input .json.bz2 titles file', required=True)
    parser.add_argument('--centrality-data', type=argparse.FileType('w'),
                        help='path to output .json cluster->centrality_data file', required=True)
    parser.add_argument('--max-docs-per-clus', type=int,
                        help='maximum number of top-ranked documents considered per cluster', required=True)
    parser.add_argument('--metric',
                        help='dissimilarity to the centroids (must be a metric supported by scipy\'s cdist)', required=True)
    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    input_cluster_labels_path = args.cluster_labels.name
    input_titles_path = args.titles.name
    output_centrality_data_path = args.centrality_data.name
    max_docs_per_clus = args.max_docs_per_clus
    metric = args.metric
    logger.info('running with:\n{}'.format(pformat({
        'input_document_topics_path': input_document_topics_path,
        'input_cluster_labels_path': input_cluster_labels_path,
        'input_titles_path': input_titles_path,
        'output_centrality_data_path': output_centrality_data_path,
        'max_docs_per_clus': max_docs_per_clus,
        'metric': metric
    })))

    document_topics = load_document_topics(input_document_topics_path)
    cluster_labels = load_communities(input_cluster_labels_path)
    document_titles = load_titles(input_titles_path)

    clusters = get_clusters_from_labels(cluster_labels)
    logger.info('computing {}-centralities of {} documents in {} communities'.format(
        metric, len(cluster_labels), len(clusters)))
    centrality_data = {}
    for clus_id, cluster in enumerate(clusters):
        max_doc_ids, centralities = get_top_central_cluster_docs(
            cluster, document_topics, max_docs_per_clus, metric)
        logger.debug('max doc ids {}'.format(max_doc_ids))
        logger.debug('max doc centralities {}'.format(centralities))
        max_doc_titles = get_document_titles(max_doc_ids, document_titles)
        logger.debug('max titles: {}'.format(max_doc_titles))
        centrality_data_of_cluster = {
            'size': len(cluster),
            'titles': max_doc_titles,
            'centralities': centralities
        }
        centrality_data[clus_id] = centrality_data_of_cluster

    logger.info('saving cluster centrality data (titles, centralities) of {} clusters'.format(
        len(centrality_data)))
    save_data_to_json(centrality_data, output_centrality_data_path)
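
# A hedged sketch of the centrality step behind get_top_central_cluster_docs:
# distances of a cluster's documents to the cluster centroid are computed with
# scipy's cdist (the --metric help text names cdist), and the max_docs closest
# documents are returned. The helper name, signature and return order are taken
# from the call site; the internals are assumptions.
import numpy as np
from scipy.spatial.distance import cdist

def get_top_central_cluster_docs(cluster, document_topics, max_docs, metric):
    cluster = np.asarray(list(cluster))
    cluster_docs = document_topics[cluster]
    centroid = cluster_docs.mean(axis=0, keepdims=True)
    # dissimilarity of every cluster document to the centroid
    dissimilarities = cdist(cluster_docs, centroid, metric=metric).ravel()
    # smallest dissimilarity first, i.e. most central documents first
    top_indices = np.argsort(dissimilarities)[:max_docs]
    return cluster[top_indices].tolist(), dissimilarities[top_indices].tolist()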
def main():
    parser = argparse.ArgumentParser(
        description='plots the descending purities of each cluster (purity: highest cosine similarity to a [0,...,0,1,0,...,0] topic vector)')
    parser.add_argument('--document-topics', type=argparse.FileType('r'),
                        help='path to input document-topic file (.npz)', required=True)
    parser.add_argument('--cluster-labels', type=argparse.FileType('r'),
                        help='path to input .json.bz2 clustering file', required=True)
    parser.add_argument('--plot', type=argparse.FileType('w'),
                        help='path to output purity plot file', required=True)
    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    input_cluster_labels_path = args.cluster_labels.name
    output_plot_path = args.plot.name

    document_topics = load_document_topics(input_document_topics_path)
    cluster_labels = load_communities(input_cluster_labels_path)

    clusters = get_clusters_from_labels(cluster_labels)
    logger.info('calculating purity of {} clusters'.format(len(clusters)))
    cluster_purities = [get_cluster_purity(cluster, document_topics) for cluster in clusters]
    logger.info('calculated {} purity values'.format(len(cluster_purities)))

    cluster_purities = np.array(cluster_purities)
    cluster_purities[::-1].sort()  # sort in descending order, in place
    xlabel = 'Cluster'
    ylabel = 'Reinheit'
    logger.info('plotting purities to {}'.format(output_plot_path))
    scatter_plot(cluster_purities, output_plot_path, xlabel, ylabel)
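
# A hedged sketch of get_cluster_purity following the description above: the
# cluster centroid is compared against every one-hot topic vector via cosine
# similarity and the highest similarity is the purity. The helper name and
# signature come from the call site; the internals are an assumption.
import numpy as np

def get_cluster_purity(cluster, document_topics):
    centroid = document_topics[list(cluster)].mean(axis=0)
    # cosine similarity to the i-th one-hot vector is centroid[i] / ||centroid||,
    # so the purity is the largest component of the L2-normalized centroid
    return float(centroid.max() / np.linalg.norm(centroid))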
def main():
    parser = argparse.ArgumentParser(
        description='removes outliers from documents based on a given outlier scores file')
    parser.add_argument('--documents', type=argparse.FileType('r'),
                        help='path to input documents file (.npz)', required=True)
    parser.add_argument('--outlier-scores', type=argparse.FileType('r'),
                        help='path to input JSON outlier scores file', required=True)
    parser.add_argument('--filtered-documents', type=argparse.FileType('w'),
                        help='path to output filtered documents file (.npz)', required=True)
    parser.add_argument('--contamination', type=float,
                        help='relative amount of most noisy samples to remove', required=True)
    args = parser.parse_args()
    input_document_path = args.documents.name
    input_outlier_scores_path = args.outlier_scores.name
    output_filtered_documents_path = args.filtered_documents.name
    contamination = args.contamination

    documents = load_document_topics(input_document_path)
    outlier_scores = load_cluster_labels(input_outlier_scores_path)

    logger.info('filtering documents of shape {} with contamination {}'.format(
        documents.shape, contamination))
    filtered_documents = get_filtered_documents(documents, outlier_scores, contamination)
    logger.info('shape of filtered documents {}'.format(filtered_documents.shape))

    logger.info('saving filtered documents to {}'.format(output_filtered_documents_path))
    save_npz(output_filtered_documents_path, filtered_documents)
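
# A hedged sketch of get_filtered_documents: the `contamination` fraction of
# documents with the highest outlier scores is dropped and the remaining rows
# are kept in their original order. Rounding behavior and internals are
# assumptions; only the signature is taken from the call site.
import numpy as np

def get_filtered_documents(documents, outlier_scores, contamination):
    outlier_scores = np.asarray(outlier_scores)
    num_outliers = int(round(contamination * len(documents)))
    # indices of the num_outliers most noisy samples (empty if nothing to remove)
    outlier_indices = np.argsort(outlier_scores)[-num_outliers:] if num_outliers else []
    keep_mask = np.ones(len(documents), dtype=bool)
    keep_mask[outlier_indices] = False
    return documents[keep_mask]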
def main():
    parser = argparse.ArgumentParser(
        description='calculates local outlier factors of given documents')
    parser.add_argument('--document-topics', type=argparse.FileType('r'),
                        help='path to input document-topic file (.npz)', required=True)
    parser.add_argument('--outlier-scores', type=argparse.FileType('w'),
                        help='path to output JSON outlier scores file (.json)', required=True)
    parser.add_argument('--k-min', type=int,
                        help='minimum number of considered neighbors per sample', required=True)
    parser.add_argument('--k-max', type=int,
                        help='maximum number of considered neighbors per sample', required=True)
    parser.add_argument('--metric', help='distance metric of outlier detection', required=True)
    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    output_outlier_scores_path = args.outlier_scores.name
    k_min, k_max = args.k_min, args.k_max
    metric = args.metric

    document_topics = load_document_topics(input_document_topics_path)
    outlier_scores = calc_max_lof_of_bounds(document_topics, metric, k_min, k_max)
    logger.info('calculated {} LOF-scores'.format(len(outlier_scores)))
    logger.debug('scores \n{}'.format(outlier_scores))

    logger.info('writing scores to {}'.format(output_outlier_scores_path))
    with open(output_outlier_scores_path, 'w') as output_outlier_scores_file:
        json.dump(outlier_scores.tolist(), output_outlier_scores_file, indent=1)
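
# A hedged sketch of calc_max_lof_of_bounds: for every k in [k_min, k_max] an
# sklearn LocalOutlierFactor model is fitted and each sample's final score is
# its maximum LOF over all k. The helper name and signature come from the call
# site; using sklearn and taking the per-sample maximum are assumptions.
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

def calc_max_lof_of_bounds(documents, metric, k_min, k_max):
    max_scores = np.full(len(documents), -np.inf)
    for k in range(k_min, k_max + 1):
        lof = LocalOutlierFactor(n_neighbors=k, metric=metric)
        lof.fit(documents)
        # sklearn stores the negated LOF; negate again to get the usual score
        scores = -lof.negative_outlier_factor_
        max_scores = np.maximum(max_scores, scores)
    return max_scores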
def main():
    parser = argparse.ArgumentParser(
        description='calculates the silhouette coefficient of a given clustering and its document-topic matrix')
    parser.add_argument('--document-topics', type=argparse.FileType('r'),
                        help='path to input document-topic file (.npz)', required=True)
    parser.add_argument('--cluster-labels', type=argparse.FileType('r'),
                        help='path to input .json.bz2 cluster labels file', required=True)
    parser.add_argument('--metric', choices=_VALID_METRICS,
                        help='distance function to use', required=True)
    args = parser.parse_args()
    input_document_topics_path = args.document_topics.name
    input_cluster_labels_path = args.cluster_labels.name
    metric = args.metric

    logger.info('loading document topics')
    document_topics = load_document_topics(input_document_topics_path)
    logger.info('loading cluster labels')
    cluster_labels = load_communities(input_cluster_labels_path)
    logger.debug(cluster_labels)

    logger.info('calculating unsupervised evaluation metrics')
    sil_score = silhouette_score(document_topics, cluster_labels, metric=metric)  # higher is better
    logger.info('{} silhouette coefficient: {}'.format(metric, sil_score))
    # between-cluster scatter divided by within-cluster scatter incl. penalty terms -> higher is better
    ch_score = calinski_harabaz_score(document_topics, cluster_labels)
    logger.info('calinski harabaz score: {}'.format(ch_score))
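
# A hedged sketch of load_communities as used throughout these scripts: the
# cluster labels are assumed to be a plain JSON list (one label per document)
# inside a bzip2-compressed .json.bz2 file; the actual repo helper may differ.
import bz2
import json

def load_communities(path):
    with bz2.open(path, 'rt') as cluster_labels_file:
        return json.load(cluster_labels_file)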