def train_word2vec(categories, comments, n_dim):
    """Train a word2vec model on ``comments`` and a classifier on top of it.

    Each comment is tokenized with ``tokenize_document``, a word2vec model is
    built from the tokenized documents, the documents are converted to
    feature vectors with ``w2vectorize``, and an ``SGDClassifier`` (logistic
    loss, L1 penalty) is fitted on those vectors.

    Args:
        categories: Target labels handed to ``classifier.fit``; presumably
            one label per comment, in the same order.
        comments: Iterable of raw documents to tokenize.
        n_dim: Forwarded unchanged to both ``word2vec_model`` and
            ``w2vectorize`` (presumably the embedding dimensionality --
            confirm against those helpers).

    Returns:
        A ``(model, classifier)`` tuple: the word2vec model and the fitted
        ``SGDClassifier``.
    """
    # Imports are kept function-local, as in the original implementation.
    from feature_extraction import tokenize_document, word2vec_model
    from sklearn.linear_model import SGDClassifier

    # NOTE(review): tokenization here relies on tokenize_document's defaults;
    # confirm that callers tokenize their evaluation data the same way.
    tokenized_comments = [tokenize_document(comment) for comment in comments]

    embedding = word2vec_model(tokenized_comments, n_dim)

    # w2vectorize is not imported here; it is resolved from the enclosing
    # module's namespace.
    features = w2vectorize(tokenized_comments, embedding, n_dim)

    sgd = SGDClassifier(loss='log', penalty='l1')
    sgd.fit(features, categories)

    return embedding, sgd
# Print the `collocations` value (bound outside this excerpt), padded with
# blank-line spacing before and after.
print "\n"
print collocations
print "\n"
# The return value is discarded; presumably similar_words reports its
# findings itself -- TODO confirm.
similar_words(comments, "fakeinsult")
# Build a language model from the comments and pretty-print whatever
# model["sound"].samples() returns.
model = language_model(comments)
print "\nSamples: "
import pprint
printer = pprint.PrettyPrinter(indent=4)
printer.pprint(model["sound"].samples())
print "\n"
# `model` is rebound here: from this point on it refers to the word2vec model,
# not the language model above.
# NOTE(review): word2vec_model is called with the raw `comments` and no
# dimension argument -- confirm this matches its signature and expected input.
model = word2vec_model(comments)
print model.similarity('retarded', 'loser')
# Word2vec classification experiment, run only when the 'WordVec' option in
# EXECUTION_SECTION evaluates to true.
if config_parser.getboolean(EXECUTION_SECTION, 'WordVec'):
    from train import train_word2vec
    from train import w2vectorize
    from feature_extraction import tokenize_document
    # The same literal 500 is passed to training and to test vectorization
    # below; the two values must be kept in sync. This also rebinds `model`.
    model, classifier = train_word2vec(categories, comments, 500)
    # NOTE(review): test documents are tokenized with stopwords='english';
    # whether the training side tokenizes the same way is not visible here --
    # confirm.
    test_documents = [
        tokenize_document(document, stopwords='english')
        for document in test_comments
    ]
    test_vecs = w2vectorize(test_documents, model, 500)
    predictions = classifier.predict(test_vecs)
    print "\nWord2Vec Model Result\n"
    # Hand predictions and the expected test labels to prediction_info
    # (defined outside this excerpt).
    prediction_info(predictions, test_categories)
print "\n"
# The return value is discarded; presumably similar_words reports its
# findings itself -- TODO confirm.
similar_words(comments, "fakeinsult")
# Build a language model from the comments and pretty-print whatever
# model["sound"].samples() returns.
model = language_model(comments)
print "\nSamples: "
import pprint
printer = pprint.PrettyPrinter(indent=4)
printer.pprint(model["sound"].samples())
print "\n"
# `model` is rebound here: from this point on it refers to the word2vec model,
# not the language model above.
# NOTE(review): word2vec_model is called with the raw `comments` and no
# dimension argument -- confirm this matches its signature and expected input.
model = word2vec_model(comments)
print model.similarity('retarded', 'loser')
# Word2vec classification experiment, run only when the 'WordVec' option in
# EXECUTION_SECTION evaluates to true.
if config_parser.getboolean(EXECUTION_SECTION, 'WordVec'):
    from train import train_word2vec
    from train import w2vectorize
    from feature_extraction import tokenize_document
    # The same literal 500 is passed to training and to test vectorization
    # below; the two values must be kept in sync. This also rebinds `model`.
    model, classifier = train_word2vec(categories, comments, 500)
    # NOTE(review): test documents are tokenized with stopwords='english';
    # whether the training side tokenizes the same way is not visible here --
    # confirm.
    test_documents = [tokenize_document(document, stopwords='english') for document in test_comments]
    test_vecs = w2vectorize(test_documents, model, 500)
    predictions = classifier.predict(test_vecs)
    print "\nWord2Vec Model Result\n"
    # Hand predictions and the expected test labels to prediction_info
    # (defined outside this excerpt).
    prediction_info(predictions, test_categories)