def save_doc2vec_model(doc2vec_model, file_name, directory_path=None):
    """persist trained doc2vec model

    :param doc2vec_model: previously trained doc2vec model
    :param file_name: file name for persisting
    :param directory_path: where to persist the doc2vec model
    """
    file_name = file_name + "-" + time.strftime("%d-%b-%Y-%X")

    if directory_path is not None:
        doc2vec_model_path = directory_path
        if not os.path.exists(doc2vec_model_path + 'doc2vec/'):
            try:
                os.makedirs(doc2vec_model_path + 'doc2vec/')
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
        logger.info("save new doc2vec model at: " + doc2vec_model_path + 'doc2vec/')
        doc2vec_model.save(doc2vec_model_path + 'doc2vec/' + file_name)
    else:
        doc2vec_model_path = settings.DATA_DIR + 'doc2vec/models/'
        logger.info("save new doc2vec model at: " + doc2vec_model_path)
        doc2vec_model.save(doc2vec_model_path + file_name)

def get_doc_vectors_for_new_documents(doc2vec_model, documents_folder_name, documents_file_path=None):
    """retrieve document vectors for new documents based on a previously trained doc2vec model

    :param doc2vec_model: trained doc2vec model
    :param documents_folder_name: name of the documents folder
    :param documents_file_path: file path to the folder
    :return: english and german document vectors
    """
    logger.info('get doc vector values of unseen documents')
    if documents_file_path is None:
        documents_file_path = settings.DATA_DIR + 'crawling_data/' + documents_folder_name + '/'
    else:
        documents_file_path = documents_file_path + documents_folder_name + '/'

    logger.info('get documents from following path: ' + documents_file_path)
    preprocessed_documents_english, preprocessed_documents_german = preprocess_new_documents(documents_file_path)

    doc_vectors_english = doc2vec_model.infer_vector(preprocessed_documents_english,
                                                     alpha=0.025, min_alpha=0.01, steps=1)
    doc_vectors_german = doc2vec_model.infer_vector(preprocessed_documents_german,
                                                    alpha=0.025, min_alpha=0.01, steps=1)

    return doc_vectors_english, doc_vectors_german

def load_existing_model(doc2vec_model_file_path=None, model_file_name=None):
    """load a previously persisted doc2vec model from disk

    :param doc2vec_model_file_path: full file path to the stored model
    :param model_file_name: model file name inside the default project structure
    :return: loaded doc2vec model
    """
    if doc2vec_model_file_path is None and model_file_name is not None:
        logger.info('Loading doc2vec models directly from project structure...')
        doc2vec_model_file_path = settings.DATA_DIR + 'doc2vec/models/' + model_file_name

    logger.info('load model from following path: ' + str(doc2vec_model_file_path))
    loaded_model = gensim.models.Doc2Vec.load(doc2vec_model_file_path)
    return loaded_model

def get_doc_vectors_matrix(doc2vec_model):
    """retrieve the document vectors of the given doc2vec model

    :param doc2vec_model: trained doc2vec model
    :return: document vectors matrix
    """
    logger.info('get document vectors matrix of doc2vec model')
    docvec_vectors = doc2vec_model.docvecs
    return docvec_vectors

def get_word_vectors_matrix(doc2vec_model):
    """retrieve the word vectors of the given doc2vec model

    :param doc2vec_model: trained doc2vec model
    :return: word vectors matrix
    """
    logger.info('get word vectors matrix of doc2vec model')
    word_vectors = doc2vec_model.wv.syn0
    return word_vectors

def get_doc_similarities_by_new_vector(doc2vec_model, new_vector):
    """retrieve cosine similarities for a new (inferred) document vector

    :param doc2vec_model: trained doc2vec model
    :param new_vector: inferred vector of the new document
    :return: cosine similarities for the given new vector
    """
    logger.info('get document similarities by new vector')
    similarities = doc2vec_model.docvecs.most_similar([new_vector])
    return similarities

def get_doc_similarities_by_document_name(doc2vec_model, document_name):
    """retrieve cosine similarities of document by document name

    :param doc2vec_model: trained doc2vec model
    :param document_name: name of document in doc2vec model
    :return: cosine similarities
    """
    logger.info('get document similarities')
    similarities = doc2vec_model.docvecs.most_similar(document_name)
    return similarities

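# Illustrative sketch only (not part of the original module): how the two similarity
# helpers above could be combined. The model file name, the 'example-hostname.ch' doctag
# and the German query tokens are hypothetical placeholders, not actual project artifacts.
def _example_similarity_queries():
    model = load_existing_model(model_file_name='doc2vec-model-german-<timestamp>')
    # neighbours of a document that was part of the training corpus, looked up by its tag
    by_name = get_doc_similarities_by_document_name(model, 'example-hostname.ch')
    # infer a vector for unseen tokens and look up its nearest documents
    inferred = model.infer_vector(['hotel', 'zimmer', 'online', 'buchen'])
    by_vector = get_doc_similarities_by_new_vector(model, inferred)
    return by_name, by_vector
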
def crawl_given_hostnames(hostnames, directory_to_save_results):
    """helper function to crawl the given urls

    :param hostnames: urls to crawl
    :param directory_to_save_results: where to persist the results
    """
    crawler = Crawler()
    logger.info('crawl following urls (hostnames): ' + str(hostnames))
    crawler.crawl_hostnames(hostnames, directory_to_save_results)

def crawl_list_of_hostnames(urls_list_file_path, directory_to_save_results):
    """helper function to crawl a list of hostnames

    :param urls_list_file_path: file path to a CSV list of urls
    :param directory_to_save_results: where to persist the results
    """
    crawler = Crawler()
    logger.info('crawl following list of urls: ' + urls_list_file_path)
    hostnames = crawler.get_hostnames_from_csv_list(urls_list_file_path)
    crawler.crawl_hostnames(hostnames, directory_to_save_results)

def preprocess_new_documents(documents_path):
    """preprocess documents which were not included in a previously trained doc2vec model

    :param documents_path: file path to the newly crawled documents
    :return: preprocessed and concatenated english and german documents
    """
    logger.info('start preprocessing unseen documents')
    folders_in_directory = glob.glob(documents_path)
    if not folders_in_directory:
        raise IOError('no document folders found at: ' + documents_path)

    preprocessed_and_concatenated_documents_english = []
    preprocessed_and_concatenated_documents_german = []
    english_documents_counter = 0
    german_documents_counter = 0

    for folder_name in folders_in_directory:
        logger.info('start getting files of folder ' + folder_name)
        pattern = os.path.join(folder_name, '*.txt')
        file_names = glob.glob(pattern)
        if not file_names:
            raise IOError('no text files found in: ' + folder_name)

        logger.info('start reading in files')
        for file_name in file_names:
            with open(file_name, 'r') as file:
                document = file.read()
                document = gensim.utils.simple_preprocess(document)
                document_language = preprocessing.detect_language(document)

                if document_language == 'english':
                    english_documents_counter += 1
                    preprocessed_document = preprocessing.preprocess_document(document, document_language)
                    preprocessed_and_concatenated_documents_english += preprocessed_document
                elif document_language == 'german':
                    german_documents_counter += 1
                    preprocessed_document = preprocessing.preprocess_document(document, document_language)
                    preprocessed_and_concatenated_documents_german += preprocessed_document

    logger.info('Concatenated and preprocessed ' + str(english_documents_counter) +
                ' documents into one english document')
    logger.info('Concatenated and preprocessed ' + str(german_documents_counter) +
                ' documents into one german document')

    return preprocessed_and_concatenated_documents_english, preprocessed_and_concatenated_documents_german

def create_new_doc2vec_model(documents_file_path=None, save_to_directory=None, single_language_support=False):
    """helper function to create a new doc2vec model

    :param documents_file_path: file path to the crawled and stored documents
    :param save_to_directory: where to save the doc2vec model
    :param single_language_support: just support one language per hostname
    :return: english and german doc2vec model
    """
    if documents_file_path is not None:
        documents_file_path = documents_file_path + '*/'
        logger.info('documents file path has been set to: ' + str(documents_file_path))
    else:
        documents_file_path = settings.DATA_DIR + 'crawling_data/*/'
        logger.info('No documents file path has been given, default file path used: ' + str(documents_file_path))

    logger.info('Start creating new doc2vec model...')
    document_corpus_english, document_corpus_german = create_document_corpus_by_language(documents_file_path,
                                                                                         single_language_support)

    doc2vec_model_english = create_doc2vec_model(document_corpus_english)
    doc2vec_model_german = create_doc2vec_model(document_corpus_german)

    save_doc2vec_model(doc2vec_model_english, 'doc2vec-model-english', directory_path=save_to_directory)
    save_doc2vec_model(doc2vec_model_german, 'doc2vec-model-german', directory_path=save_to_directory)

    return doc2vec_model_english, doc2vec_model_german

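# Illustrative sketch only (not part of the original module): building and persisting
# both language models in one call. The /tmp paths are hypothetical placeholders; the
# crawled data is expected to live in one sub-folder per hostname, as produced by the
# crawler helpers above.
def _example_create_models():
    english_model, german_model = create_new_doc2vec_model(
        documents_file_path='/tmp/crawling_data/',
        save_to_directory='/tmp/models/',
        single_language_support=True)
    return english_model, german_model
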
def create_doc2vec_model(document_corpus):
    """builds a doc2vec model for the given document corpus

    :param document_corpus: previously built document corpus consisting of multiple labeled documents
    :return: doc2vec model
    """
    # doc2vec hyperparameters -- inspired by: https://github.com/jhlau/doc2vec
    vector_size = 300
    window_size = 15
    min_count = 1
    sampling_threshold = 1e-5
    negative_size = 5
    train_epoch = 100
    dm = 0  # 0 = dbow; 1 = dmpv
    worker_count = 3  # number of parallel processes

    logger.info('start building Doc2Vec model')
    model = gensim.models.Doc2Vec(size=vector_size,
                                  window=window_size,
                                  min_count=min_count,
                                  sample=sampling_threshold,
                                  workers=worker_count,
                                  hs=0,
                                  dm=dm,
                                  negative=negative_size,
                                  dbow_words=1,
                                  dm_concat=1,
                                  iter=train_epoch)

    model.build_vocab(document_corpus)
    logger.info("model's vocabulary length: " + str(len(model.wv.vocab)))

    logger.info("start to train the model")
    model.train(document_corpus, total_examples=model.corpus_count, epochs=model.iter)

    return model

def kmeans_clustering(doc2vec_model, tsne_model, model_language, k=3, new_hostnames=None, save_to_directory=None):
    """creates a K-Means clustering for the given t-SNE model

    :param doc2vec_model: data point labels (keys) get inferred from the doc2vec model
    :param tsne_model: tsne model to apply clustering to
    :param model_language: language of the doc2vec model, gets added to the file name
    :param k: how many clusters (k) should be generated
    :param new_hostnames: hostnames which were not included in the doc2vec model during training (new data)
    :param save_to_directory: where to store the plot
    """
    logger.info("Start creating K-Means Clustering...")
    logger.info('Length of the t-sne model = ' + str(len(tsne_model)))

    data_point_labels = list(doc2vec_model.docvecs.doctags.keys())
    if new_hostnames is not None:
        for hostname in new_hostnames:
            data_point_labels.append(hostname)

    logger.info('Amount of data point labels = ' + str(len(data_point_labels)))
    assert (len(tsne_model) == len(data_point_labels))
    assert (k <= len(tsne_model))

    random_state = 0
    logger.info('K-Means parameters: k = %s, random_state = %d' % (k, random_state))
    logger.info('Start training K-Means model...')
    kmeans = KMeans(n_clusters=k, random_state=random_state).fit(tsne_model)
    logger.info('K-Means model successfully built...start with visualization')

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = tsne_model[:, 0].min() - 1, tsne_model[:, 0].max() + 1
    y_min, y_max = tsne_model[:, 1].min() - 1, tsne_model[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Obtain labels for each point in the mesh. Use the last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    logger.info('start creating color plot...')
    plt.figure(figsize=(16, 16))
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')

    plt.plot(tsne_model[:, 0], tsne_model[:, 1], 'k.', markersize=2)

    # Annotate the data points
    for i, txt in zip(tsne_model, data_point_labels):
        plt.annotate(txt, (i[0], i[1]), xytext=(0, -8), textcoords="offset points", va="center", ha="left")

    logger.info('The centroids get plotted as a white X...')
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169, linewidths=3, color='w', zorder=10)

    plt.title('K-Means clustering with K=%d over the t-sne reduced data' % k)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())

    if save_to_directory is None:
        file_path = settings.DATA_DIR + "experiments/clusterer/kmeans/"
    else:
        file_path = save_to_directory

    file_name = 'kmeans_cluster-' + model_language + '-' + time.strftime("%d-%b-%Y-%X") + ".png"
    plt.savefig(file_path + file_name, facecolor="w", dpi=90)
    logger.info("saved " + file_name + " at " + file_path)
    plt.show()

def agglomerative_clustering(doc2vec_model, tsne_model, numbers_of_clusters, model_language, new_hostnames=None,
                             save_to_directory=None):
    """applies the agglomerative clustering algorithm to the given t-SNE model

    :param doc2vec_model: document labels (keys) get inferred from the doc2vec model
    :param tsne_model: tsne model to apply the algorithm to
    :param numbers_of_clusters: how many clusters should get built
    :param model_language: doc2vec model language, gets added to the plot file name
    :param new_hostnames: hostnames which were not included in the doc2vec model during training (new data)
    :param save_to_directory: where to store the plot

    Reference:
    http://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_clustering.html#sphx-glr-auto-examples-cluster-plot-agglomerative-clustering-py
    """
    logger.info("Start creating Agglomerative Clustering...")

    data_point_labels = list(doc2vec_model.docvecs.doctags.keys())
    if new_hostnames is not None:
        for hostname in new_hostnames:
            data_point_labels.append(hostname)

    logger.info('Amount of data point labels = ' + str(len(data_point_labels)))
    logger.info('Length of the t-sne model = ' + str(len(tsne_model)))
    assert (len(tsne_model) == len(data_point_labels))

    # calculate local connectivity
    knn_graph = kneighbors_graph(tsne_model, 30, include_self=False)

    # example: (5, 10, 15, 20, 25, 30)
    numbers_of_clusters = tuple(numbers_of_clusters)

    for connectivity in (None, knn_graph):
        for n_clusters in numbers_of_clusters:
            plt.figure(figsize=(40, 15))
            for index, linkage in enumerate(('average', 'complete', 'ward')):
                plt.subplot(1, 3, index + 1)
                model = AgglomerativeClustering(linkage=linkage,
                                                connectivity=connectivity,
                                                n_clusters=n_clusters)
                t0 = time.time()
                model.fit(tsne_model)
                elapsed_time = time.time() - t0
                plt.scatter(tsne_model[:, 0], tsne_model[:, 1], c=model.labels_, cmap=plt.cm.spectral)

                # Annotate the data points
                for i, txt in zip(tsne_model, data_point_labels):
                    plt.annotate(txt, (i[0], i[1]), xytext=(0, -8), textcoords="offset points",
                                 va="center", ha="left")

                plt.title('linkage=%s (time %.2fs)' % (linkage, elapsed_time),
                          fontdict=dict(verticalalignment='top'))
                plt.axis('equal')
                plt.axis('off')

            plt.subplots_adjust(bottom=0, top=.89, wspace=0, left=0, right=1)
            plt.suptitle('n_cluster=%i, connectivity=%r' % (n_clusters, connectivity is not None), size=17)

            if save_to_directory is None:
                file_path = settings.DATA_DIR + "experiments/clusterer/agglomerative_clustering/"
            else:
                file_path = save_to_directory

            file_name = 'agglomerative_clustering-' + model_language + '-' + time.strftime("%d-%b-%Y-%X") + ".png"
            plt.savefig(file_path + file_name, facecolor="w", dpi=90)
            logger.info("saved " + file_name + " at " + file_path)
            plt.show()

def kmedoid_clustering(doc2vec_model, tsne_model, start_medoids, new_hostnames=None):
    """creates a K-Medoid clustering for the given t-SNE model

    :param doc2vec_model: doc2vec model to infer data point labels (keys) from
    :param tsne_model: tsne model to apply clustering to
    :param start_medoids: medoids which will be used as starting points
    :param new_hostnames: hostnames which were not included in the doc2vec model during training (new data)
    """
    logger.info("Start creating K-Medoid Clustering...")

    data_point_labels = list(doc2vec_model.docvecs.doctags.keys())
    if new_hostnames is not None:
        for hostname in new_hostnames:
            data_point_labels.append(hostname)

    logger.info('Amount of data point labels = ' + str(len(data_point_labels)))
    logger.info('Length of the t-sne model = ' + str(len(tsne_model)))
    assert (len(tsne_model) == len(data_point_labels))

    # Example: start_medoids = [0, 5, 10, 15, 20]
    logger.info('Number of medoids = %s' % len(start_medoids))
    logger.info('Given medoids = %s' % str(start_medoids))

    logger.info('Start creating K-Medoid model...')
    tolerance = 0.2
    kmedoids_instance = kmedoids(tsne_model, start_medoids, tolerance)
    (ticks, result) = timedcall(kmedoids_instance.process)

    clusters = kmedoids_instance.get_clusters()
    medoids = kmedoids_instance.get_medoids()
    print("Sample: ", "\t\tExecution time: ", ticks, "\n")

    cluster_visualizer = ClusterVisualizer(1, data=tsne_model, labels=data_point_labels)
    cluster_visualizer.append_clusters(clusters, tsne_model, 0)
    cluster_visualizer.append_cluster(medoids, marker='*', markersize=12, color='red')
    cluster_visualizer.show(k=len(start_medoids), tolerance=tolerance)

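# Illustrative sketch only (not part of the original module): running the k-medoid
# clustering on a stored 2D t-SNE model. It assumes the t-SNE coordinates are available
# as a NumPy array (one row per doctag); the file names and the medoid indices
# [0, 5, 10, 15, 20] are hypothetical placeholders taken from the example comment above.
def _example_kmedoid_run():
    model = load_existing_model(model_file_name='doc2vec-model-german-<timestamp>')
    tsne_coordinates = np.load('/tmp/tsne-model-german.npy')  # 2D coordinates, one row per doctag
    kmedoid_clustering(model, tsne_coordinates, start_medoids=[0, 5, 10, 15, 20])
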
def main():
    # Some examples...
    '''
    # Create 2D t-SNE model
    doc2vec_model = doc2vec.load_existing_model(
        model_file_name='doc2vec_single_language_full_model_german_18_Feb_2018_22_31_27')
    doc2vec_vector_matrix = doc2vec.get_doc_vectors_matrix(doc2vec_model)
    create_and_save_2d_tsne_model(doc2vec_vector_matrix, 'single_language_full-model-doc2vec-model-german')

    # Example to create a tsne model with new data
    logger.info('Start building tsne model with new data at: ' + time.strftime("%d-%b-%Y-%X"))
    doc2vec_model = doc2vec.load_existing_model(
        model_file_name='standard-models/doc2vec_model_german_17_Feb_2018_02_14_04')
    doc2vec_vector_matrix = doc2vec.create_doc_vector_matrix_for_new_documents(
        doc2vec_model,
        new_documents=['upkbs.ch', 'curaneo.ch', 'bscyb.ch', 'scltigers.ch', 'graubuenden.ch'],
        model_language='german',
        documents_file_path='/home/sandro/vm1/OTA_Clusterer/data/experiments/crawling_data_experiments/')
    create_and_save_2d_tsne_model(doc2vec_vector_matrix, 'full-doc2vec-model-new-data-german')
    logger.info('Finished building tsne model with new data at: ' + time.strftime("%d-%b-%Y-%X"))
    '''

    # Live-Demo pt. 1
    logger.info('LIVE-DEMO Start building tsne model with new data at: ' + time.strftime("%d-%b-%Y-%X"))
    doc2vec_model = doc2vec.load_existing_model(
        model_file_name='doc2vec_single_language_full_model_german_18_Feb_2018_22_31_27')
    doc2vec_vector_matrix = doc2vec.create_doc_vector_matrix_for_new_documents(
        doc2vec_model,
        new_documents=['triaplus.ch', 'fcaarau.ch'],
        model_language='german',
        documents_file_path='/home/sandro/vm1/OTA_Clusterer/data/experiments/crawling_data_experiments/')
    create_and_save_2d_tsne_model(doc2vec_vector_matrix, 'live-demo-german')
    logger.info('Finished building tsne model with new data at: ' + time.strftime("%d-%b-%Y-%X"))

    # Live-Demo pt. 2
    logger.info('LIVE-DEMO Start building 2nd tsne model with new data at: ' + time.strftime("%d-%b-%Y-%X"))
    doc2vec_model = doc2vec.load_existing_model(
        model_file_name='doc2vec_single_language_full_model_english_18_Feb_2018_22_31_27')
    doc2vec_vector_matrix = doc2vec.create_doc_vector_matrix_for_new_documents(
        doc2vec_model,
        new_documents=['hostelscentral.com'],
        model_language='english',
        documents_file_path='/home/sandro/vm1/OTA_Clusterer/data/experiments/crawling_data_experiments/')
    create_and_save_2d_tsne_model(doc2vec_vector_matrix, 'live-demo-english')
    logger.info('Finished building tsne model with new data at: ' + time.strftime("%d-%b-%Y-%X"))

def create_document_corpus_by_language(documents_path, single_language_support=False):
    """reads in crawled documents and creates a doc2vec document corpus

    previously crawled documents get preprocessed and a doc2vec document corpus
    gets built, separated by language (german and english)

    :param documents_path: file path of the crawled documents
    :param single_language_support: just support one language per hostname
    :return: german and english document corpus
    """
    logger.info('start creating document corpus by language')
    folders_in_directory = glob.glob(documents_path)
    if not folders_in_directory:
        raise IOError('no document folders found at: ' + documents_path)

    preprocessed_documents_corpus_english = []
    preprocessed_documents_corpus_german = []

    for folder_name in folders_in_directory:
        logger.info('start getting files of folder ' + folder_name)
        pattern = os.path.join(folder_name, '*.txt')
        file_names = glob.glob(pattern)
        if not file_names:
            raise IOError('no text files found in: ' + folder_name)

        if single_language_support is True:
            logger.info('single language support is enabled')
            preprocessed_documents_by_directory_english = []
            preprocessed_documents_by_directory_german = []

        logger.info('start reading in files')
        for file_name in file_names:
            with open(file_name, 'r') as file:
                document = file.read()
                document = gensim.utils.simple_preprocess(document)
                document_language = preprocessing.detect_language(document)

                if document_language == 'english' or document_language == 'german':
                    preprocessed_document = preprocessing.preprocess_document(document, document_language)
                    tagged_document_name = remove_file_path_from_folder_name(folder_name)
                    tagged_document = gensim.models.doc2vec.TaggedDocument(preprocessed_document,
                                                                           ["{}".format(tagged_document_name)])

                    if document_language == 'english':
                        if single_language_support is True:
                            preprocessed_documents_by_directory_english.append(tagged_document)
                        else:
                            preprocessed_documents_corpus_english.append(tagged_document)
                    elif document_language == 'german':
                        if single_language_support is True:
                            preprocessed_documents_by_directory_german.append(tagged_document)
                        else:
                            preprocessed_documents_corpus_german.append(tagged_document)

        if single_language_support is True:
            number_of_english_documents = len(preprocessed_documents_by_directory_english)
            number_of_german_documents = len(preprocessed_documents_by_directory_german)

            if number_of_english_documents > number_of_german_documents:
                for document in preprocessed_documents_by_directory_english:
                    preprocessed_documents_corpus_english.append(document)
                logger.info('added ' + str(number_of_english_documents) + ' documents from ' + folder_name +
                            ' to english corpus')
            elif number_of_german_documents > number_of_english_documents:
                for document in preprocessed_documents_by_directory_german:
                    preprocessed_documents_corpus_german.append(document)
                logger.info('added ' + str(number_of_german_documents) + ' documents from ' + folder_name +
                            ' to german corpus')
            elif number_of_english_documents == number_of_german_documents:
                logger.info('added documents of ' + folder_name + ' to both corpora')
                for document in preprocessed_documents_by_directory_english:
                    preprocessed_documents_corpus_english.append(document)
                for document in preprocessed_documents_by_directory_german:
                    preprocessed_documents_corpus_german.append(document)

    logger.info('Added ' + str(len(preprocessed_documents_corpus_english)) +
                ' documents to the english document corpus')
    logger.info('Added ' + str(len(preprocessed_documents_corpus_german)) +
                ' documents to the german document corpus')

    return preprocessed_documents_corpus_english, preprocessed_documents_corpus_german

    nargs='+',
    help='Space separated numbers of clusters: 5 10 15 20')

agglomerative_clustering_cli.add_argument(
    '--agglomerative_clustering',
    help='agglomerative clustering algorithm to given tsne model (required params: '
         '-cluster_nr, -load_doc2vec_model, -load_tsne_model, -model_language, '
         '-clustering_dir, -new_hostnames (optional))',
    action='store_true',
    dest='agglomerative_clustering')

args = parser.parse_args()

if args.crawl:
    logger.info('Hostnames to crawl via CLI = ' + str(args.hostnames))
    logger.info('Store crawled data at: ' + args.crawled_dir)
    crawler.crawl_given_hostnames(args.hostnames, args.crawled_dir)

elif args.crawl_list:
    logger.info('Crawl list of hostnames via CLI from: ' + args.urls_list)
    logger.info('Store crawled data at: ' + args.crawled_dir)
    crawler.crawl_list_of_hostnames(args.urls_list, args.crawled_dir)

elif args.create_doc2vec_model:
    logger.info('Create doc2vec model via CLI from following data: ' + args.crawled_dir +
                ' and store data at ' + args.models_dir)
    doc2vec.create_new_doc2vec_model(
        documents_file_path=args.crawled_dir,
        save_to_directory=args.models_dir,