Example #1
def get_final_semeval_data(classes, train_loc, dev_loc, test_loc):
    """
    Loads the final version of the SemEval data, merged as follows:
    train: train13, dev13, train14, train15
    dev: test13, dev14, dev15
    test: test14, test15
    Note that the yearly sets overlap, so some redundancy exists between them.
    :param classes: bit flags following the definitions in preprocessing
    :param train_loc: location of the training data
    :param dev_loc: location of the development data
    :param test_loc: location of the test data
    :return: (X, y) tuples for train, dev, and test
    """
    train_labels, train_tweets, train_pos = parse(
        train_loc, classes
    )
    dev_labels, dev_tweets, dev_pos = parse(
        dev_loc, classes
    )
    test_labels, test_tweets, test_pos = parse(
        test_loc, classes
    )
    train = [e[0] + '\t' + e[1] for e in zip(train_tweets, train_pos)], train_labels
    dev = [e[0] + '\t' + e[1] for e in zip(dev_tweets, dev_pos)], dev_labels
    test = [e[0] + '\t' + e[1] for e in zip(test_tweets, test_pos)], test_labels
    return train, dev, test
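A minimal usage sketch; the POS/NEU/NEG bit flags come from preprocessing (see Example #4), and the file paths here are illustrative placeholders:

classes = POS | NEU | NEG  # bit flags defined in preprocessing
(x_train, y_train), (x_dev, y_dev), (x_test, y_test) = get_final_semeval_data(
    classes,
    'train_alternative.tsv',
    'dev_alternative.tsv',
    'test_alternative.tsv',
)
print(len(x_train), len(x_dev), len(x_test))  # sizes of the three splits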
Example #2
def re_train(model, model_loc, train, dev, corpus_loc, num, num_runs, classes):
    """
    Runs the back-feed mechanism.
    :param model: classifier
    :param model_loc: If the model was not loaded, a raw one is trained and
    saved to this location
    :param train: File with annotated training tweets
    :param dev: File with annotated development tweets
    :param corpus_loc: Directory of the Sentiment140 corpus
    :param num: Amount of corpus data added per iteration
    :param num_runs: Number of iterations
    :param classes: Variable holding bit flags corresponding to class labels --
    definition is in preprocessing variable 'ltd'
    """
    # load training data; it is needed either way
    train_labels, train_tweets, train_pos = preprocessing.parse(train, classes)
    x_train = [e[0] + "\t" + e[1] for e in zip(train_tweets, train_pos)]
    y_train = train_labels
    # x_train, y_train = uniformify(x_train, y_train)

    # if model is raw, train it
    if not hasattr(model, "grid_scores_"):
        logging.info("No trained model given, building training features for binary class codes: %s" % bin(classes))
        model.fit(x_train, y_train)

        logging.info("writing new model to disk at %s.." % model_loc)
        with open(model_loc, "wb") as sink:
            cPickle.dump(model, sink)
        logging.info("done.")

    # get test data
    dev_labels, dev_tweets, dev_pos = preprocessing.parse(dev, classes)
    x_dev = [e[0] + "\t" + e[1] for e in zip(dev_tweets, dev_pos)]
    y_dev = dev_labels
    # x_dev, y_dev = uniformify(x_dev, y_dev)

    # initial eval
    logging.info("Initial evaluation..")
    print_scores(model, x_dev, y_dev, classes)

    # print label distribution, in order to check on how the ratio of pos to neg
    # influences the scoring. Seems pretty balanced now that len(t_gold_1) is
    # equal to len(t_gold_2)
    # print('num t_gold_1 '+str(len([l for l in y_train if l == 1])))
    # print('num t_gold_2 '+str(len([l for l in y_train if l == 2])))
    # print('t_1 '+str(len([l for l in model.predict(x_train) if l == 1])))
    # print('t_2 '+str(len([l for l in model.predict(x_train) if l == 2])))
    # print('num d_gold_1 '+str(len([l for l in y_dev if l == 1])))
    # print('num d_gold_2 '+str(len([l for l in y_dev if l == 2])))
    # print('d_1 '+str(len([l for l in model.predict(x_dev) if l == 1])))
    # print('d_2 '+str(len([l for l in model.predict(x_dev) if l == 2])))

    # feedback loop
    logging.info("Initializing backfeed instance..")
    feed = Feeder(corpus_loc)
    logging.info("done. Now starting backfeed loop")
    for count in range(1, num_runs + 1):
        feed.add_best_n(model, num, x_train, y_train)
        logging.info("Retrain run %i" % count)
        print_scores(model, x_dev, y_dev, classes)
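A hypothetical driver for re_train; build_grid_search() is a placeholder for whatever constructs the grid-search classifier in this project, and the paths and flag values are illustrative:

model = build_grid_search()  # hypothetical factory, not part of the snippet
re_train(model, 'model.pkl', 'train.tsv', 'dev.tsv', 'sentiment140/',
         num=100, num_runs=10, classes=POS | NEG)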
Example #3
def test(sc, file_positive, files_negative, file_model):
    """
    Tests a classification model using the positive samples in file_positive and
    the negative samples in files_negative, printing the results to standard output.

    :param sc: The spark context
    :type sc: SparkContext
    :param file_positive: The file with tweets to predict
    :type file_positive: str
    :param files_negative: The files with tweets to reject
    :type files_negative: list[str]
    :param file_model: The file where the model is located
    :type file_model: str
    """
    tweets_positive = sc.textFile(file_positive).map(parse_json) \
        .filter(lambda x: is_valid(x) and is_english(x)).cache()
    list_negatives = [
        sc.textFile(file_negative).map(parse_json)
          .filter(lambda x: is_valid(x) and is_english(x))
        for file_negative in files_negative
    ]
    tweets_negative = list_negatives[0]
    for ln in list_negatives[1:]:
        tweets_negative = tweets_negative.union(ln)
    try:
        print("Reading stored classification model")
        model = pickle.load(open(file_model, 'rb'))
        print("Computing predictions")
        threshold = 0.0
        total_positive = tweets_positive.count()
        total_negative = tweets_negative.count()
        true_positives = tweets_positive.filter(lambda x: model.predict(parse(x)) > threshold).count()
        true_negatives = tweets_negative.filter(lambda x: model.predict(parse(x)) <= threshold).count()
        false_negatives = total_positive - true_positives
        false_positives = total_negative - true_negatives
        print("Results for %s:" % file_model)
        print("  Total positives: %d" % total_positive)
        print("  Total negatives: %d" % total_negative)
        print("  False positives: %d" % false_positives)
        print("  False negatives: %d" % false_negatives)
        precision = 0.0
        recall = 0.0
        try:
            precision = float(true_positives) / float(true_positives + false_positives)
            recall = float(true_positives) / float(true_positives + false_negatives)
        except ZeroDivisionError:
            # no predicted positives or no gold positives; keep the 0.0 defaults
            pass
        print("  Precision: %f" % precision)
        print("  Recall: %f" % recall)
        print("Done!")
    except Exception as e:
        print("Error:")
        print(e)
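An illustrative invocation, assuming a local Spark setup; all file names are placeholders:

from pyspark import SparkContext

sc = SparkContext(appName="tweet-model-test")
test(sc, "positive_tweets.json",
     ["negative_a.json", "negative_b.json"],
     "model.pkl")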
Example #4
def get_train_data():
    classes = POS | NEU | NEG
    train_loc = root + 'Data/twitterData/train_alternative.tsv'
    dev_loc = root + 'Data/twitterData/dev_alternative.tsv'
    test_loc = root + 'Data/twitterData/test_alternative.tsv'
    train_labels, train_tweets, train_pos = parse(
        train_loc, classes
    )
    dev_labels, dev_tweets, dev_pos = parse(
        dev_loc, classes
    )
    test_labels, test_tweets, test_pos = parse(
        test_loc, classes
    )
    semeval_train = [e[0] + '\t' + e[1] for e in zip(train_tweets, train_pos)], train_labels
    semeval_dev = [e[0] + '\t' + e[1] for e in zip(dev_tweets, dev_pos)], dev_labels
    semeval_test = [e[0] + '\t' + e[1] for e in zip(test_tweets, test_pos)], test_labels
    return semeval_train, semeval_dev, semeval_test
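The three returned pairs unpack directly into feature and label lists (root is a path prefix defined elsewhere in the module); a quick sketch of consuming them:

(x_train, y_train), (x_dev, y_dev), (x_test, y_test) = get_train_data()
# each feature entry is a tweet and its POS tag sequence joined by a tab
tweet, pos_tags = x_train[0].split('\t')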
Example #5
def predict_labels(tweet):
    x = parse(tweet)
    labels = []
    if sports.predict(x) > 0.0:
        labels.append("sports")
    if politics.predict(x) > 0.0:
        labels.append("politics")
    if technology.predict(x) > 0.0:  # compare against the threshold like the other two models
        labels.append("technology")
    return labels
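An illustrative call; sports, politics, and technology are pre-trained binary models defined elsewhere, and raw_tweet stands in for whatever input parse expects:

labels = predict_labels(raw_tweet)
print(labels)  # e.g. ['sports'], or [] if no model fires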
Example #7
def score(model, model_loc, train, dev, classes):
    """
    Trains and tests the model with the current feature extractors and
    parameters.
    :param model: Model to be tested, either an instance of grid_search or a
    path to a previously saved model
    :param model_loc: If the model was not loaded, a raw one is trained and
    saved to this location
    :param train: File with annotated training tweets
    :param dev: File with annotated development tweets
    :param classes: Bit flags selecting which classes are drawn from the train and dev sets
    :return: best parameters found by the current grid search
    """
    # if model is raw, train it
    if not hasattr(model, "grid_scores_"):
        logging.info("No trained model given, building training features for binary class codes: %s" % bin(classes))
        train_labels, train_tweets, train_pos = preprocessing.parse(train, classes)
        x_train = [e[0] + "\t" + e[1] for e in zip(train_tweets, train_pos)]
        y_train = train_labels
        # x_train, y_train = uniformify(x_train, y_train)
        # [u'#RonPaul campaign manager Doug Wead : Contrary to media lies , doing JUST GREAT with delegates even after Super Tuesday http://url\t# N N ^ ^ , A P N N , V R A P N R P ^ ^ U']
        model.fit(x_train, y_train)

        logging.info("writing new model to disk at %s.." % model_loc)
        with open(model_loc, "wb") as sink:
            cPickle.dump(model, sink)
        logging.info("done.")

    # get test data
    dev_labels, dev_tweets, dev_pos = preprocessing.parse(dev, classes)
    x_dev = [e[0] + "\t" + e[1] for e in zip(dev_tweets, dev_pos)]
    y_dev = dev_labels
    # x_dev, y_dev = uniformify(x_dev, y_dev)

    # test and evaluate
    print_scores(model, x_dev, y_dev, classes)
    return model.best_params_
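An illustrative call; the grid-search instance and the paths are placeholders:

best_params = score(grid_search_model, 'model.pkl', 'train.tsv', 'dev.tsv',
                    POS | NEG)
print(best_params)  # best parameters found by the grid search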
Example #8
"""This module produces SemEval Task 3, Subtask B datasets in JSON."""

from itertools import chain
from json import dump

from preprocessing import parse
from xmlfiles import XMLFiles

if __name__ == "__main__":
    result = {}
    for year, dataset in [(2016, "train"), (2016, "dev"), (2016, "test"),
                          (2017, "test")]:
        with XMLFiles(year, dataset) as xmlfiles:
            result["%d-%s" % (year, dataset)] = list(
                chain(*[parse(xmlfile) for xmlfile in xmlfiles]))
    with open("result.json", "w") as jsonfile:
        dump(result, jsonfile, sort_keys=True, indent=4)
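A sketch of reading the produced file back; the keys follow the "%d-%s" pattern built above:

import json

with open("result.json") as jsonfile:
    result = json.load(jsonfile)
print(sorted(result))  # ['2016-dev', '2016-test', '2016-train', '2017-test']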
"""This module produces unannotated SemEval Task 3, Subtask A datasets in JSON."""

from itertools import chain
from json import dump

from preprocessing import parse
from xmlfiles import XMLFiles

if __name__ == "__main__":
    result = []
    with XMLFiles(2016, "unannotated") as xmlfiles:
        assert len(xmlfiles) == 1
        corpus = parse(xmlfiles[0])
    with open("result.json", "w") as jsonfile:
        dump(corpus, jsonfile, sort_keys=True, indent=4)
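A quick, illustrative sanity check on the exported corpus:

import json

with open("result.json") as jsonfile:
    corpus = json.load(jsonfile)
print(len(corpus), "parsed entries")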
Example #10
import time
import pandas as pd
from preprocessing import parse
from app import appriori
from helper import printResult
from fp import FPGrowth
from ImprovedApp import PartitionedApp

start = time.time()
df = pd.read_csv('adult.data.csv', names=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14])
dataSet = parse(df, df.shape[0])
sec = time.time() - start
print("\nData Preprocessing time: " + str(sec) + " seconds")
minSup = int(input("Enter minimum relative support as a percentage (0 - 100): "))
print('\n--------------------------------- APRIORI --------------------------------------')
#
#start = time.time()
#feqApp = appriori(dataSet,minSup)
#printResult(feqApp)
#sec = time.time() - start
#print("\nApriori Execution time: " + str(sec)+" seconds")
#print('\n--------------------------------- FP GROWTH --------------------------------------')

start = time.time()
feqFP = FPGrowth(dataSet, minSup)
printResult(feqFP)
sec = time.time() - start
print("\nFP Growth Execution time: " + str(sec) + " seconds")
print('\n--------------------------------- PARTITIONED APRIORI --------------------------------')

#start = time.time()

Example #11

import json
from collections import Counter

from preprocessing import parse

# NOTE: the first entry of this list is missing from the original snippet;
# the code below expects three paths, indexed as xmlfiles[0], [1], and [2].
xmlfiles = [
    "/media/gaurish/Angela's Files/projects/semeval-2016_2017-task3-subtaskA-unannotated-english/v3.2/train/SemEval2016-Task3-CQA-QL-train-part2-subtaskA.xml",
    "/media/gaurish/Angela's Files/projects/semeval-2016_2017-task3-subtaskA-unannotated-english/v3.2/dev/SemEval2016-Task3-CQA-QL-dev-subtaskA.xml"
]

counter = []

def write_to_file(data, output_path, mode="w"):
    with open(output_path, mode, encoding='utf-8') as f:
        for line in data:
            json_record = json.dumps(line, ensure_ascii=False)
            f.write(json_record + '\n')

if __name__ == "__main__":
    result = []
    corpus = parse(xmlfiles[0])
    counter.extend([line["THREAD_SEQUENCE"] for line in corpus])
    output_path = "train.jsonl"
    write_to_file(corpus, output_path)

    corpus = parse(xmlfiles[1])
    counter.extend([line["THREAD_SEQUENCE"] for line in corpus])
    output_path = "train.jsonl"
    write_to_file(corpus, output_path, "a")
    print(Counter(counter))

    corpus = parse(xmlfiles[2])
    counter.extend([line["THREAD_SEQUENCE"] for line in corpus])
    output_path = "val.jsonl"
    write_to_file(corpus, output_path)
    print(Counter(counter))
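An illustrative read-back of the JSON Lines output written above:

with open("train.jsonl", encoding="utf-8") as f:
    records = [json.loads(line) for line in f]
print(len(records), "records in train.jsonl")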