def get_final_semeval_data(classes, train_loc, dev_loc, test_loc):
    """
    Loads the final version of the semeval data that was merged by the
    following criteria:
        train: train13, dev13, train14, train15
        dev:   test13, dev14, dev15
        test:  test14, test15
    It should be noted that some redundancy exists between each year's set.

    :param classes: bit flags following the definitions in preprocessing
    :param train_loc: location of training data
    :param dev_loc: location of development data
    :param test_loc: location of test data
    :return: tuple of lists, following the (X, y) scheme
    """
    train_labels, train_tweets, train_pos = parse(train_loc, classes)
    dev_labels, dev_tweets, dev_pos = parse(dev_loc, classes)
    test_labels, test_tweets, test_pos = parse(test_loc, classes)

    train = [e[0] + '\t' + e[1] for e in zip(train_tweets, train_pos)], train_labels
    dev = [e[0] + '\t' + e[1] for e in zip(dev_tweets, dev_pos)], dev_labels
    test = [e[0] + '\t' + e[1] for e in zip(test_tweets, test_pos)], test_labels
    return train, dev, test
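# Usage sketch for get_final_semeval_data -- a minimal example, not the
# canonical driver. Assumptions: `parse` and the POS/NEU/NEG bit flags are
# importable from preprocessing as elsewhere in this repo, and the paths
# (borrowed from get_train_data below) may differ in your checkout.
from preprocessing import parse, POS, NEU, NEG

classes = POS | NEU | NEG
(x_train, y_train), (x_dev, y_dev), (x_test, y_test) = get_final_semeval_data(
    classes,
    'Data/twitterData/train_alternative.tsv',
    'Data/twitterData/dev_alternative.tsv',
    'Data/twitterData/test_alternative.tsv',
)
# each x_* entry looks like "tweet text\tPOS tag sequence"; y_* are the labels
print(len(x_train), len(x_dev), len(x_test))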
def re_train(model, model_loc, train, dev, corpus_loc, num, num_runs, classes):
    """
    Runs the back-feed mechanism.

    :param model: classifier
    :param model_loc: if the model was not loaded, a raw one is trained and
        saved to this location
    :param train: file with annotated training tweets
    :param dev: file with annotated development tweets
    :param corpus_loc: directory of the Sentiment140 corpus
    :param num: number of corpus samples added per iteration
    :param num_runs: number of iterations
    :param classes: variable holding bit flags corresponding to class labels --
        definition is in preprocessing variable 'ltd'
    """
    # load training data, needed either way
    train_labels, train_tweets, train_pos = preprocessing.parse(train, classes)
    x_train = [e[0] + "\t" + e[1] for e in zip(train_tweets, train_pos)]
    y_train = train_labels
    # x_train, y_train = uniformify(x_train, y_train)

    # if the model is raw, train it
    if not hasattr(model, "grid_scores_"):
        logging.info("No trained model given, building training features for "
                     "binary class codes: %s" % bin(classes))
        model.fit(x_train, y_train)
        logging.info("writing new model to disk at %s.." % model_loc)
        with open(model_loc, "wb") as sink:  # pickle requires binary mode
            cPickle.dump(model, sink)
        logging.info("done.")

    # get test data
    dev_labels, dev_tweets, dev_pos = preprocessing.parse(dev, classes)
    x_dev = [e[0] + "\t" + e[1] for e in zip(dev_tweets, dev_pos)]
    y_dev = dev_labels
    # x_dev, y_dev = uniformify(x_dev, y_dev)

    # initial evaluation
    logging.info("Initial evaluation..")
    print_scores(model, x_dev, y_dev, classes)

    # print the label distribution, in order to check how the ratio of pos to
    # neg influences the scoring. Seems pretty balanced now that len(t_gold_1)
    # is equal to len(t_gold_2)
    # print('num t_gold_1 ' + str(len([l for l in y_train if l == 1])))
    # print('num t_gold_2 ' + str(len([l for l in y_train if l == 2])))
    # print('t_1 ' + str(len([l for l in model.predict(x_train) if l == 1])))
    # print('t_2 ' + str(len([l for l in model.predict(x_train) if l == 2])))
    # print('num d_gold_1 ' + str(len([l for l in y_dev if l == 1])))
    # print('num d_gold_2 ' + str(len([l for l in y_dev if l == 2])))
    # print('d_1 ' + str(len([l for l in model.predict(x_dev) if l == 1])))
    # print('d_2 ' + str(len([l for l in model.predict(x_dev) if l == 2])))

    # feedback loop
    logging.info("Initializing backfeed instance..")
    feed = Feeder(corpus_loc)
    logging.info("done. Now starting backfeed loop")
    for count in range(1, num_runs + 1):
        feed.add_best_n(model, num, x_train, y_train)
        logging.info("Retrain run %i" % count)
        print_scores(model, x_dev, y_dev, classes)
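# Example invocation of re_train -- a sketch only. Assumptions: the model is
# a scikit-learn GridSearchCV (the `grid_scores_` check above targets the
# pre-0.18 API), the pipeline and parameter grid are placeholders, the file
# locations are hypothetical, and POS/NEG are the preprocessing bit flags.
from sklearn.grid_search import GridSearchCV  # pre-0.18 module with grid_scores_
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from preprocessing import POS, NEG

pipeline = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LinearSVC())])
model = GridSearchCV(pipeline, {"clf__C": [0.1, 1.0, 10.0]})
re_train(model, "models/backfeed.pickle",
         train="Data/twitterData/train_alternative.tsv",
         dev="Data/twitterData/dev_alternative.tsv",
         corpus_loc="Data/sentiment140/",
         num=100, num_runs=10, classes=POS | NEG)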
def test(sc, file_positive, files_negative, file_model):
    """
    Tests a classification model using positive samples in file_positive and
    negative samples in files_negative. It prints the results to standard
    output.

    :param sc: the spark context
    :type sc: SparkContext
    :param file_positive: the file with tweets to predict
    :type file_positive: str
    :param files_negative: the files with tweets to reject
    :type files_negative: list[str]
    :param file_model: the file where the model is located
    :type file_model: str
    """
    tweets_positive = sc.textFile(file_positive).map(parse_json) \
        .filter(lambda x: is_valid(x) and is_english(x)).cache()
    list_negatives = [sc.textFile(file_negative).map(parse_json)
                      .filter(lambda x: is_valid(x) and is_english(x))
                      for file_negative in files_negative]
    tweets_negative = list_negatives[0]
    for ln in list_negatives[1:]:
        tweets_negative = tweets_negative.union(ln)

    try:
        print("Reading stored classification model")
        model = pickle.load(open(file_model, 'rb'))
        print("Computing predictions")
        threshold = 0.0
        total_positive = tweets_positive.count()
        total_negative = tweets_negative.count()
        true_positives = tweets_positive.filter(
            lambda x: model.predict(parse(x)) > threshold).count()
        true_negatives = tweets_negative.filter(
            lambda x: model.predict(parse(x)) <= threshold).count()
        false_negatives = total_positive - true_positives
        false_positives = total_negative - true_negatives
        print("Results for %s:" % file_model)
        print("  Total positives: %d" % total_positive)
        print("  Total negatives: %d" % total_negative)
        print("  False positives: %d" % false_positives)
        print("  False negatives: %d" % false_negatives)
        precision = 0.0
        recall = 0.0
        try:
            precision = float(true_positives) / float(true_positives + false_positives)
            recall = float(true_positives) / float(true_positives + false_negatives)
        except ZeroDivisionError:
            pass  # leave precision/recall at 0.0 when the denominator is zero
        print("  Precision: %f" % precision)
        print("  Recall: %f" % recall)
        print("Done!")
    except Exception as e:
        print("Error:")
        print(e)
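# Example driver for test() -- a sketch under assumptions: the file paths and
# app name are hypothetical, and parse_json/is_valid/is_english/parse come
# from this repo's preprocessing helpers, as used inside test() above.
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("model-evaluation")
sc = SparkContext(conf=conf)
test(sc,
     file_positive="data/tweets_sports.jsonl",
     files_negative=["data/tweets_politics.jsonl", "data/tweets_random.jsonl"],
     file_model="models/sports.pickle")
sc.stop()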
def get_train_data():
    classes = POS | NEU | NEG
    train_loc = root + 'Data/twitterData/train_alternative.tsv'
    dev_loc = root + 'Data/twitterData/dev_alternative.tsv'
    test_loc = root + 'Data/twitterData/test_alternative.tsv'

    train_labels, train_tweets, train_pos = parse(train_loc, classes)
    dev_labels, dev_tweets, dev_pos = parse(dev_loc, classes)
    test_labels, test_tweets, test_pos = parse(test_loc, classes)

    semeval_train = [e[0] + '\t' + e[1] for e in zip(train_tweets, train_pos)], train_labels
    semeval_dev = [e[0] + '\t' + e[1] for e in zip(dev_tweets, dev_pos)], dev_labels
    semeval_test = [e[0] + '\t' + e[1] for e in zip(test_tweets, test_pos)], test_labels
    return semeval_train, semeval_dev, semeval_test
def predict_labels(tweet):
    x = parse(tweet)
    labels = []
    if sports.predict(x) > 0.0:
        labels.append("sports")
    if politics.predict(x) > 0.0:
        labels.append("politics")
    if technology.predict(x) > 0.0:  # same threshold as the other classifiers
        labels.append("technology")
    return labels
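# Example -- a sketch, assuming sports/politics/technology are pre-loaded
# binary models whose predict() returns a signed score, as the thresholding
# above implies; the pickle locations and the tweet text are hypothetical.
import pickle

sports = pickle.load(open("models/sports.pickle", "rb"))
politics = pickle.load(open("models/politics.pickle", "rb"))
technology = pickle.load(open("models/technology.pickle", "rb"))

print(predict_labels("The senate votes on the new tech regulation bill today"))
# a tweet can carry several labels, e.g. ['politics', 'technology']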
def score(model, model_loc, train, dev, classes):
    """
    Trains and tests the model with the current feature extractors and
    parameters.

    :param model: model to be tested, either an instance of grid_search or a
        path to a previously saved model
    :param model_loc: if the model was not loaded, a raw one is trained and
        saved to this location
    :param train: file with annotated training tweets
    :param dev: file with annotated development tweets
    :param classes: bit flags selecting the classes drawn from the train and
        dev sets
    :return: best parameters found by the current grid search
    """
    # if the model is raw, train it
    if not hasattr(model, "grid_scores_"):
        logging.info("No trained model given, building training features for "
                     "binary class codes: %s" % bin(classes))
        train_labels, train_tweets, train_pos = preprocessing.parse(train, classes)
        x_train = [e[0] + "\t" + e[1] for e in zip(train_tweets, train_pos)]
        y_train = train_labels
        # x_train, y_train = uniformify(x_train, y_train)
        # sample entry:
        # [u'#RonPaul campaign manager Doug Wead : Contrary to media lies , doing JUST GREAT with delegates even after Super Tuesday http://url\t# N N ^ ^ , A P N N , V R A P N R P ^ ^ U']
        model.fit(x_train, y_train)
        logging.info("writing new model to disk at %s.." % model_loc)
        with open(model_loc, "wb") as sink:  # pickle requires binary mode
            cPickle.dump(model, sink)
        logging.info("done.")

    # get test data
    dev_labels, dev_tweets, dev_pos = preprocessing.parse(dev, classes)
    x_dev = [e[0] + "\t" + e[1] for e in zip(dev_tweets, dev_pos)]
    y_dev = dev_labels
    # x_dev, y_dev = uniformify(x_dev, y_dev)

    # test and evaluate
    print_scores(model, x_dev, y_dev, classes)
    return model.best_params_
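# Example -- a sketch: score() takes the same GridSearchCV-style model shown
# in the sketch after re_train above; the paths and pickle location are
# hypothetical, and POS/NEG are assumed importable from preprocessing.
best = score(model, "models/svm.pickle",
             train="Data/twitterData/train_alternative.tsv",
             dev="Data/twitterData/dev_alternative.tsv",
             classes=POS | NEG)
print("best grid-search parameters: %s" % best)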
"""This module produces SemEval Task 3, Subtask B datasets in JSON.""" from itertools import chain from json import dump from preprocessing import parse from xmlfiles import XMLFiles if __name__ == "__main__": result = {} for year, dataset in [(2016, "train"), (2016, "dev"), (2016, "test"), (2017, "test")]: with XMLFiles(year, dataset) as xmlfiles: result["%d-%s" % (year, dataset)] = list( chain(*[parse(xmlfile) for xmlfile in xmlfiles])) with open("result.json", "wb") as jsonfile: dump(result, jsonfile, sort_keys=True, indent=4)
"""This module produces unannotated SemEval Task 3, Subtask A datasets in JSON.""" from itertools import chain from json import dump from preprocessing import parse from xmlfiles import XMLFiles if __name__ == "__main__": result = [] with XMLFiles(2016, "unannotated") as xmlfiles: assert len(xmlfiles) == 1 corpus = parse(xmlfiles[0]) with open("result.json", "w") as jsonfile: dump(corpus, jsonfile, sort_keys=True, indent=4)
import time

import pandas as pd

from preprocessing import parse
from app import appriori
from helper import printResult
from fp import FPGrowth
from ImprovedApp import PartitionedApp

start = time.time()
df = pd.read_csv('adult.data.csv', names=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
dataSet = parse(df, df.shape[0])
sec = time.time() - start
print("\nData Preprocessing time: " + str(sec) + " seconds")

minSup = int(input("Enter minimum relative support as a percentage (0 - 100): "))

print('\n--------------------------------- APRIORI --------------------------------------')
# start = time.time()
# feqApp = appriori(dataSet, minSup)
# printResult(feqApp)
# sec = time.time() - start
# print("\nApriori Execution time: " + str(sec) + " seconds")

print('\n--------------------------------- FP GROWTH --------------------------------------')
start = time.time()
feqFP = FPGrowth(dataSet, minSup)
printResult(feqFP)
sec = time.time() - start
print("\nFP Growth Execution time: " + str(sec) + " seconds")

print('\n--------------------------------- PARTITIONED APRIORI --------------------------------')
# start = time.time()
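# Shape of the inputs -- an illustrative assumption: parse() is taken to turn
# each CSV row of the Adult dataset into one transaction (a list of items),
# which is what both appriori() and FPGrowth() consume; minSup is a relative
# percentage of the df.shape[0] transactions. The items below are made up.
example_dataSet = [
    ["age<=30", "workclass=Private", "income<=50K"],
    ["age>50", "workclass=Self-emp", "income>50K"],
]
# With minSup = 50 on these 2 transactions, an itemset must occur in at
# least 1 transaction (50% of 2) to count as frequent.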
"/media/gaurish/Angela's Files/projects/semeval-2016_2017-task3-subtaskA-unannotated-english/v3.2/train/SemEval2016-Task3-CQA-QL-train-part2-subtaskA.xml", "/media/gaurish/Angela's Files/projects/semeval-2016_2017-task3-subtaskA-unannotated-english/v3.2/dev/SemEval2016-Task3-CQA-QL-dev-subtaskA.xml" ] counter =[] def write_to_file(data,output_path,mode ="w"): with open(output_path, mode, encoding='utf-8') as f: for line in data: json_record = json.dumps(line, ensure_ascii=False) f.write(json_record + '\n') if __name__ == "__main__": result = [] corpus = parse(xmlfiles[0]) (counter.extend([line["THREAD_SEQUENCE"] for line in corpus])) output_path = "train.jsonl" write_to_file(corpus,output_path) corpus = parse(xmlfiles[1]) (counter.extend([line["THREAD_SEQUENCE"] for line in corpus])) output_path = "train.jsonl" write_to_file(corpus,output_path,"a") print(Counter(counter)) corpus = parse(xmlfiles[2]) (counter.extend([line["THREAD_SEQUENCE"] for line in corpus])) output_path = "val.jsonl" write_to_file(corpus,output_path) print(Counter(counter))