word2vec.get_similar_words(u"France"))) logger.info(u"Words similar to phone ({}) : {}".format( word2vec.get_word_count(u"phone"), word2vec.get_similar_words(u"phone"))) logger.info(u"Words similar to ask ({}) : {}".format( word2vec.get_word_count(u"ask"), word2vec.get_similar_words(u"ask"))) logger.info(u"Words similar to september ({}) : {}".format( word2vec.get_word_count(u"september"), word2vec.get_similar_words(u"september"))) logger.info(u"Words similar to blue ({}) : {}".format( word2vec.get_word_count(u"blue"), word2vec.get_similar_words(u"blue"))) # Test relatedness relatedness, relatedness_words = Metrics.relatedness( wordsim353, word2vec) print(u"Relatedness : {}, on {} words".format(relatedness, relatedness_words)) # If we want a figure if args.image is not None: # Order by word count word_counters = list() word_counts = word2vec.get_word_counts() for word_text in word_counts.keys(): word_counters.append((word_text, word_counts[word_text])) # end for word_counters = sorted(word_counters, key=lambda tup: tup[1], reverse=True)
    spectral_radius=rc_spectral_radius,
    w_sparsity=rc_w_sparsity)

# Add examples
for author_index, author_id in enumerate((args.author1, args.author2)):
    author_path = os.path.join(args.dataset, "total", author_id)
    for file_index in training_set_indexes:
        file_path = os.path.join(author_path, str(file_index) + ".txt")
        classifier.train(io.open(file_path, 'r').read(), author_index)
    # end for
# end for

# Finalize model training
classifier.finalize(verbose=True)

# Init test epoch
test_set = list()

# Get text
for author_index, author_id in enumerate((args.author1, args.author2)):
    author_path = os.path.join(args.dataset, "total", str(author_id))
    for file_index in test_set_indexes:
        file_path = os.path.join(author_path, str(file_index) + ".txt")
        test_set.append((io.open(file_path, 'r').read(), author_index))
    # end for
# end for

# Success rate
success_rate = Metrics.success_rate(classifier, test_set, verbose=True, debug=True)
print(u"Success rate : {}".format(success_rate))
# end if
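# -----------------------------------------------------------------------------
# Hedged sketch (assumption, not the library code): Metrics.success_rate is
# taken here to be the fraction of test samples whose predicted class matches
# the true author index, assuming the classifier exposes a predict() method
# returning a class index for a given text:
# -----------------------------------------------------------------------------
def success_rate_sketch(model, samples):
    """Ratio of correctly classified (text, label) pairs."""
    successes = 0
    for text, true_label in samples:
        predicted_label = model.predict(text)
        if predicted_label == true_label:
            successes += 1
        # end if
    # end for
    return float(successes) / float(len(samples))
# end success_rate_sketch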
# For each distance measure
for distance_measure in ['euclidian', 'cosine', 'cosine_abs']:
    print(u"#" * 100)
    print(u"# " + distance_measure)
    print(u"#" * 100)

    # Similarities
    Visualization.similar_words(
        [u"he", u"computer", u"million", u"Toronto", u"France", u"phone", u"ask", u"september", u"blue",
         u"king", u"man", u"woman"],
        word2vec,
        distance_measure=distance_measure,
        limit=args.n_similar_words)

    # Word computing
    Visualization.king_man_woman(word2vec, u"king", u"man", u"woman", distance_measure=distance_measure)

    # Test relatedness
    relatedness, relatedness_words = Metrics.relatedness(wordsim353, word2vec, distance_measure=distance_measure)
    print(u"Relatedness : {}, on {} words".format(relatedness, relatedness_words))
# end for

# If we want a figure
if args.image is not None:
    selected_words = [u"switzerland", u"france", u"italy", u"spain", u"germany", u"canada", u"belgium",
                      u"bern", u"paris", u"rome", u"madrid", u"berlin", u"ottawa", u"brussels"]
    Visualization.top_words_figure(word2vec, word_embeddings, args.image, args.fig_size, args.count_limit)
    Visualization.words_figure(selected_words, word2vec, word_embeddings, args.image + u"_words", args.fig_size,
                               reduction='PCA')
# end if
# end if
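# -----------------------------------------------------------------------------
# Hedged sketch (assumption about what the three distance_measure keys mean,
# the Visualization internals are not shown here): 'euclidian' ranks
# neighbours by Euclidean distance, 'cosine' by cosine distance
# (1 - cosine similarity), and 'cosine_abs' by 1 - |cosine similarity|, so
# strongly anti-correlated vectors also count as close:
# -----------------------------------------------------------------------------
import numpy as np


def vector_distance(vec1, vec2, distance_measure='cosine'):
    """Distance between two embedding vectors for the given measure."""
    if distance_measure == 'euclidian':
        return np.linalg.norm(vec1 - vec2)
    # end if
    cosine = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    if distance_measure == 'cosine':
        return 1.0 - cosine
    # end if
    return 1.0 - abs(cosine)  # 'cosine_abs'
# end vector_distance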
                    test_set.append((io.open(file_path, 'r').read(), author_index))
                else:
                    # Sentence success rate
                    nlp = spacy.load(args.lang)
                    doc = nlp(io.open(file_path, 'r').read())
                    for sentence in doc.sents:
                        test_set.append((sentence, author_index))
                    # end for
                # end if
            # end for
        # end for

        # Success rate
        success_rate = Metrics.success_rate(classifier, test_set, verbose=args.verbose, debug=args.debug)
        logger.info(u"\t{} - Success rate : {}".format(k, success_rate))

        # Save result
        success_rates[k] = success_rate

        # Reset
        classifier.reset()
    # end for

    # Over all success rate
    logger.info(u"All - Success rate : {}".format(np.average(success_rates)))
# end if
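# -----------------------------------------------------------------------------
# Hedged sketch (assumption about the surrounding cross-validation loop, which
# is not fully shown): the per-fold bookkeeping above is consistent with a
# simple k-fold split over each author's text indices, with success_rates
# preallocated as np.zeros(args.k) before the fold loop so that
# np.average(success_rates) is the mean over all folds:
# -----------------------------------------------------------------------------
import numpy as np


def k_fold_indexes(n_texts, n_folds, fold):
    """Split text indices 0..n_texts-1 into train/test sets for one fold."""
    indexes = np.arange(n_texts)
    fold_size = n_texts // n_folds
    test_set_indexes = indexes[fold * fold_size:(fold + 1) * fold_size]
    training_set_indexes = np.setdiff1d(indexes, test_set_indexes)
    return training_set_indexes, test_set_indexes
# end k_fold_indexes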