# Imports for this module (not shown in the original excerpt).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

from data_source import get_labels, get_labelled_tweets


def to_weka_arff(ngram, number_of_features):
    """Vectorise the labelled tweets with tf-idf, keep the top-k features
    by chi-squared score, and dump the result as a Weka ARFF file."""
    count_vect = TfidfVectorizer(ngram_range=(1, ngram), norm='l2', sublinear_tf=True)
    label_list = get_labels()
    tweet_list = get_labelled_tweets()
    features = count_vect.fit_transform(tweet_list)
    features = SelectKBest(chi2, k=number_of_features).fit_transform(features, label_list)
    print features.shape

    # build the ARFF header: one REAL attribute per feature, then the class attribute
    arff_data = []
    arff_data.append("@RELATION sport")
    for i in range(features.shape[1]):
        arff_data.append("@ATTRIBUTE feature" + str(i) + " REAL")
    arff_data.append("@ATTRIBUTE sportclass {neutral,neg,pos}")
    arff_data.append("@DATA")

    # one CSV row per tweet: feature values followed by the label
    array_features = features.toarray()
    for i in range(len(array_features)):
        feature = array_features[i]
        label = label_list[i]
        csv_feature = ",".join(str(x) for x in feature)
        csv_feature = csv_feature + "," + label
        arff_data.append(csv_feature)

    with open('data/sport.arff', 'w') as arff_file:
        for item in arff_data:
            arff_file.write("%s\n" % item)
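# `get_labelled_tweets`, `get_labels` (from data_source), `create_directory`
# and the module-level `class_list` are used throughout this excerpt but are
# not shown in it. Minimal sketches of what they might look like follow; the
# CSV path, column layout and label ordering are assumptions, not the
# project's actual implementation (the label *set* {neutral, neg, pos} is
# confirmed by the ARFF header above).
import csv
import os

# class labels, matching the ARFF header above (ordering assumed)
class_list = ['neutral', 'neg', 'pos']


def create_directory(directory):
    # create the directory if it does not already exist
    if not os.path.exists(directory):
        os.makedirs(directory)


def get_labelled_tweets(path='data/labelled_tweets.csv'):
    # hypothetical loader: one tweet per row, text in the first column
    with open(path, 'rb') as f:
        return [row[0] for row in csv.reader(f)]


def get_labels(path='data/labelled_tweets.csv'):
    # hypothetical loader: label ('neutral', 'neg' or 'pos') in the second column
    with open(path, 'rb') as f:
        return [row[1] for row in csv.reader(f)]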
import timeit

from data_source import get_labelled_tweets

# get_dataset_time, get_record_time and create_directory are project helpers
# (sketched elsewhere in this excerpt).


def find_and_save_timings():
    """Time the saved SVM and AdaBoost models over the full labelled dataset
    and write per-dataset / per-record timings to a text file."""
    tweet_list = get_labelled_tweets()
    num_tweets = len(tweet_list)
    setup = """
from data_source import get_labelled_tweets, get_labels
from sklearn.externals import joblib

tweet_list = get_labelled_tweets()
# do transformation into vector
vectoriser = joblib.load('model/tfidf_vectoriser.pkl')
vectorised_tweet_list = vectoriser.transform(tweet_list)
svm_model = joblib.load('model/tfidf_linsvc.pkl')
svm_model.predict(vectorised_tweet_list)
"""
    test_statement = 'svm_model.predict(vectorised_tweet_list)'
    REPETITIONS = 100

    # check timing of svm (time in microseconds)
    svm_time = timeit.timeit(stmt=test_statement, setup=setup, number=REPETITIONS)
    svm_time_dataset = get_dataset_time(svm_time, REPETITIONS)
    svm_time_record = get_record_time(svm_time_dataset, num_tweets)

    setup_ensemble = """
import cPickle
from data_source import get_labelled_tweets
from sklearn.externals import joblib

tweet_list = get_labelled_tweets()
vectoriser = joblib.load('model/tfidf_vectoriser.pkl')
vectorised_tweet_list = vectoriser.transform(tweet_list)
with open('model/tfidf_ada.pickle', 'rb') as f:
    ensemble_model = cPickle.load(f)
ensemble_model.predict(vectorised_tweet_list)
"""
    test_statement_ensemble = 'ensemble_model.predict(vectorised_tweet_list)'
    ensemble_time = timeit.timeit(stmt=test_statement_ensemble,
                                  setup=setup_ensemble,
                                  number=REPETITIONS)
    ens_time_dataset = get_dataset_time(ensemble_time, REPETITIONS)
    ens_time_record = get_record_time(ens_time_dataset, num_tweets)

    # save results in a txt file
    create_directory('metric_result')
    with open("metric_result/timings.txt", "w") as text_file:
        text_file.write("Number of records in dataset: {0}\n".format(num_tweets))
        text_file.write("Svm dataset time: {0}\n".format(svm_time_dataset))
        text_file.write("Svm record time: {0}\n".format(svm_time_record))
        text_file.write("Ensemble dataset time: {0}\n".format(ens_time_dataset))
        text_file.write("Ensemble record time: {0}\n".format(ens_time_record))
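# `get_dataset_time` and `get_record_time` are project helpers not included in
# this excerpt. A plausible sketch: timeit returns the *total* seconds for all
# repetitions, so the dataset time is total / REPETITIONS and the record time
# is that divided by the record count. The 1e6 microsecond scale factor is an
# assumption taken from the "time in micro seconds" comment above.


def get_dataset_time(total_time, repetitions):
    # average time for one prediction pass over the whole dataset, in microseconds
    return total_time / repetitions * 1e6


def get_record_time(dataset_time, num_records):
    # average time per individual record
    return dataset_time / num_records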
import logging

from gensim.models import Word2Vec
from sklearn.externals import joblib
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import Imputer, label_binarize
from sklearn.svm import LinearSVC

from data_source import get_labels, get_labelled_tweets


def gensim_classifier():
    """Train word2vec embeddings on the tweets, average them per tweet,
    and fit a one-vs-rest linear SVM on the averaged vectors."""
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # split all sentences to list of words
    sentences = []
    for tweet in tweet_list:
        temp_doc = tweet.split()
        sentences.append(temp_doc)

    # parameters for model
    num_features = 100
    min_word_count = 1
    num_workers = 4
    context = 2
    downsampling = 1e-3

    # initialise and train the model (gensim < 4.0 uses `size`, not `vector_size`)
    w2v_model = Word2Vec(sentences,
                         workers=num_workers,
                         size=num_features,
                         min_count=min_word_count,
                         window=context,
                         sample=downsampling,
                         seed=1)

    # project-local split helper, not sklearn's (see the sketch after this function)
    index_value, train_set, test_set = train_test_split(0.80, sentences)
    train_vector = getAvgFeatureVecs(train_set, w2v_model, num_features)
    test_vector = getAvgFeatureVecs(test_set, w2v_model, num_features)
    # tweets with no in-vocabulary words average to NaN, so impute them
    train_vector = Imputer().fit_transform(train_vector)
    test_vector = Imputer().fit_transform(test_vector)

    # train model and predict
    model = LinearSVC()
    classifier_fitted = OneVsRestClassifier(model).fit(train_vector,
                                                       label_list[:index_value])
    result = classifier_fitted.predict(test_vector)

    # output result to csv
    create_directory('data')
    result.tofile("data/w2v_linsvc.csv", sep=',')

    # store the model to mmap-able files; the fitted one-vs-rest classifier is
    # saved (dumping the bare `model` would persist an unfitted estimator,
    # since OneVsRestClassifier clones its base estimator)
    create_directory('model')
    joblib.dump(classifier_fitted, 'model/w2v_linsvc.pkl')

    # evaluation (class_list is the module-level label ordering)
    label_score = classifier_fitted.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(label_list, classes=class_list)
    evaluate(binarise_result, binarise_labels[index_value:], label_score, 'w2v_linsvc')
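# The `train_test_split` used above is a project-local helper (note its
# (ratio, data) signature, unlike sklearn's), and `getAvgFeatureVecs` follows
# the common word2vec document-averaging pattern. Minimal sketches under those
# assumptions; neither is the project's confirmed implementation:
import numpy as np


def train_test_split(ratio, data):
    # hypothetical helper: split `data` at `ratio`, returning the split index too
    index_value = int(ratio * len(data))
    return index_value, data[:index_value], data[index_value:]


def getAvgFeatureVecs(docs, model, num_features):
    # average the word2vec vectors of each document's in-vocabulary words;
    # documents with no known words stay NaN (hence the Imputer step above)
    doc_vecs = np.full((len(docs), num_features), np.nan)
    for i, words in enumerate(docs):
        known = [w for w in words if w in model]  # gensim < 4.0 membership test
        if known:
            doc_vecs[i] = np.mean([model[w] for w in known], axis=0)
    return doc_vecs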
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize
from sklearn.svm import LinearSVC

from data_source import get_labels, get_labelled_tweets


def lin_svc():
    """Fit a one-vs-rest linear SVM on tf-idf vectors of the labelled tweets."""
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # vectorise using tf-idf
    vectoriser = TfidfVectorizer(min_df=3,
                                 max_features=None,
                                 strip_accents='unicode',
                                 analyzer='word',
                                 token_pattern=r'\w{1,}',
                                 ngram_range=(1, 2),
                                 use_idf=1,
                                 smooth_idf=1,
                                 sublinear_tf=1)

    # do transformation into vector
    fitted_vectoriser = vectoriser.fit(tweet_list)
    vectorised_tweet_list = fitted_vectoriser.transform(tweet_list)
    # note: 80% of the data is held out as the test set
    train_vector, test_vector, train_labels, test_labels = train_test_split(
        vectorised_tweet_list, label_list, test_size=0.8, random_state=42)

    # train model and predict
    model = LinearSVC()
    ovr_classifier = OneVsRestClassifier(model).fit(train_vector, train_labels)
    result = ovr_classifier.predict(test_vector)

    # output result to csv
    create_directory('data')
    save_to_csv("data/testset_labels.csv", test_labels)
    result.tofile("data/tfidf_linsvc.csv", sep=',')

    save_model(ovr_classifier, 'tfidf_linsvc')
    save_vectoriser(fitted_vectoriser, 'tfidf_vectoriser')

    # evaluation
    label_score = ovr_classifier.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(test_labels, classes=class_list)
    evaluate(binarise_result, binarise_labels, label_score, 'tfidf_linsvc')
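# `save_model`, `save_vectoriser` and `save_to_csv` are project helpers not
# shown here. Sketches consistent with the paths find_and_save_timings loads
# from ('model/tfidf_linsvc.pkl', 'model/tfidf_vectoriser.pkl'); the exact
# implementations, and save_to_csv's output format, are assumptions:
from sklearn.externals import joblib


def save_model(model, name):
    # persist the fitted classifier to an mmap-able joblib pickle
    create_directory('model')
    joblib.dump(model, 'model/%s.pkl' % name)


def save_vectoriser(vectoriser, name):
    # persist the fitted tf-idf vectoriser alongside the model
    create_directory('model')
    joblib.dump(vectoriser, 'model/%s.pkl' % name)


def save_to_csv(path, values):
    # hypothetical helper: comma-separated values, matching result.tofile(sep=',')
    with open(path, 'w') as f:
        f.write(','.join(str(x) for x in values))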
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

from data_source import get_labels, get_labelled_tweets


def ensemble_classify():
    """Fit an AdaBoost ensemble on tf-idf vectors of the labelled tweets."""
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # vectorise using tf-idf
    vectoriser = TfidfVectorizer(min_df=3,
                                 max_features=None,
                                 strip_accents='unicode',
                                 analyzer='word',
                                 token_pattern=r'\w{1,}',
                                 ngram_range=(1, 2),
                                 use_idf=1,
                                 smooth_idf=1,
                                 sublinear_tf=1)

    # do transformation into vector
    vectoriser.fit(tweet_list)
    vectorised_tweet_list = vectoriser.transform(tweet_list)
    train_vector, test_vector, train_labels, test_labels = train_test_split(
        vectorised_tweet_list, label_list, test_size=0.8, random_state=42)

    n_estimators = 10  # number of weak learners
    model = AdaBoostClassifier(n_estimators=n_estimators)
    ada_classifier = model.fit(train_vector, train_labels)
    result = ada_classifier.predict(test_vector)

    # output result to csv
    create_directory('data')
    result.tofile("data/tfidf_ada.csv", sep=',')
    save_model(ada_classifier, 'tfidf_ada')

    # evaluation
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(test_labels, classes=class_list)
    generate_eval_metrics(binarise_result, 'tfidf_ada', binarise_labels)
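# `generate_eval_metrics` and `evaluate` write the evaluation output hinted at
# by find_and_save_timings' metric_result/ directory. A minimal sketch,
# assuming micro-averaged sklearn metrics on the binarised labels; the real
# helpers may compute more (e.g. ROC curves from `label_score`):
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score


def generate_eval_metrics(binarise_result, name, binarise_labels):
    # write micro-averaged precision/recall/F1 for the binarised predictions
    create_directory('metric_result')
    with open('metric_result/%s.txt' % name, 'w') as f:
        f.write('Precision: {0}\n'.format(
            precision_score(binarise_labels, binarise_result, average='micro')))
        f.write('Recall: {0}\n'.format(
            recall_score(binarise_labels, binarise_result, average='micro')))
        f.write('F1: {0}\n'.format(
            f1_score(binarise_labels, binarise_result, average='micro')))


def evaluate(binarise_result, binarise_labels, label_score, name):
    # same metrics, plus a micro-averaged ROC AUC from the decision scores
    generate_eval_metrics(binarise_result, name, binarise_labels)
    auc = roc_auc_score(binarise_labels, label_score, average='micro')
    with open('metric_result/%s.txt' % name, 'a') as f:
        f.write('ROC AUC (micro): {0}\n'.format(auc))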