Пример #1
0
def make_submission(with_model):
    """
    Produces a leaderboard submission file.

    Runs preprocess(), train() and predict() for the given model over the
    files returned by `resources`, then writes one "<observation> <result>"
    line per test observation to a timestamped text file and logs its path.

    @param with_model: model selector, forwarded unchanged to preprocess(),
                       train() and predict()
    """
    # Training side: files, their observation IDs and the matching labels
    training_files = resources.train_data_files()
    training_ids = filename_to_id(training_files)
    label_lookup = resources.train_data_labels()
    training_labels = np.array([label_lookup[key] for key in training_ids])

    # Test side: files and their observation IDs
    held_out_files = resources.test_data_files()
    held_out_ids = filename_to_id(held_out_files)

    # Whatever preprocess() yields is splatted into both train() and predict()
    extras = preprocess(with_model, training_files, held_out_files)

    fitted = train(with_model, training_files, training_ids, training_labels,
                   *extras)
    predictions = predict(with_model, fitted, held_out_files, held_out_ids,
                          *extras)

    # Emit the submission file
    timestamp_pattern = "%Y-%m-%d_%H:%M:%S"
    destination = "{}/submission_{}.txt".format(resolve('..'),
                                                strftime(timestamp_pattern))

    with open(destination, 'w') as out:
        for pair in zip(held_out_ids, predictions):
            out.write("{} {}\n".format(*pair))

    info(">> Wrote submission output to {}".format(destination))
Пример #2
0
def parse_xml(filename):
    """
    Reads a CoreNLP XML output file and gathers per-sentence data.

    Each <sentence> element found in the document becomes one dict with the
    following keys:

    - 'tokens'       : [{'word': str, 'lemma': str, 'pos': str, 'ner': str|None}]
                       'word' and 'lemma' are lower-cased; an NER tag of "O"
                       is stored as None
    - 'dependencies' : [(type, governor, dependent)], all lower-cased, taken
                       only from the 'basic-dependencies' block
    - 'sentiment'    : str, the sentence's lower-cased 'sentiment' attribute
    - 'parse'        : str, text of the sentence's <parse> element

    @returns (filename_to_id(filename), [dict:<sentence-data>])
    """

    def token_record(node):
        # Field order mirrors the XML lookups: word, lemma, POS, NER
        surface = node.find('word').text.lower()
        base = node.find('lemma').text.lower()
        part_of_speech = node.find('POS').text
        entity = node.find('NER').text
        # "O" is mapped to None so that "no entity" is explicit
        return {
            'word': surface,
            'lemma': base,
            'pos': part_of_speech,
            'ner': None if entity == "O" else entity,
        }

    def dependency_record(node):
        relation = node.get('type').lower()
        head = node.find('governor').text.lower()
        child = node.find('dependent').text.lower()
        return (relation, head, child)

    dependency_path = ".//dependencies[@type='basic-dependencies']/dep"
    document = ET.parse(filename)
    collected = []

    for sentence_node in document.findall('.//sentence'):
        # Sentence-level fields are read first, then tokens, then dependencies
        sentiment = sentence_node.get('sentiment').lower()
        parse_text = sentence_node.find('.//parse').text

        tokens = [token_record(t)
                  for t in sentence_node.findall('.//token')]
        dependencies = [dependency_record(d)
                        for d in sentence_node.findall(dependency_path)]

        collected.append({
            'tokens': tokens,
            'dependencies': dependencies,
            'sentiment': sentiment,
            'parse': parse_text,
        })

    return (filename_to_id(filename), collected)
Пример #3
0
def all_sentences(for_data):
    """
    Returns a dict of all sentence data derived from CoreNLP. The key is the
    observation ID (filename_to_id() of the CoreNLP file with its extension
    removed), and the value is the sentence list produced by parse_xml() for
    that file.

    The result is cached on disk with pickle: if the cache file for the
    requested data set exists it is loaded and returned as-is; otherwise the
    CoreNLP files are parsed and the result is written to the cache file.

    @param for_data: either 'train' or 'test'
    @returns: {str: [<sentence-data>]}
    """
    assert(for_data in ('train', 'test'))

    is_train = (for_data == 'train')
    data_cache_file = (CORENLP_TRAIN_DATA_CACHE if is_train
                       else CORENLP_TEST_DATA_CACHE)

    # If there's cached data, load it:
    if exists(data_cache_file):

        debug('> Loading cached CoreNLP data from {}'.format(data_cache_file))

        # Pickle streams are bytes, so the file must be opened in binary
        # mode (text mode breaks on Python 3 and can corrupt data on Windows).
        # NOTE(review): pickle.load() executes whatever the file describes;
        # the cache file must only ever come from this program.
        with open(data_cache_file, 'rb') as f:
            return pickle.load(f)

    # Otherwise, generate the output from parse_xml()
    debug('> CoreNLP data {} not found; caching...'.format(data_cache_file))

    if is_train:
        filenames = resources.train_data_files('CoreNLP')
    else:
        filenames = resources.test_data_files('CoreNLP')

    # parse_xml(filename)[1] means to only keep the actual sentence data,
    # not the file name/observation identifier. Also, lops off the ".xml" part
    # from the CoreNLP output filename preserving the original filename
    data = {
        splitext(filename_to_id(filename))[0]: parse_xml(filename)[1]
        for filename in filenames
    }

    # Binary mode for the same reason as when loading
    with open(data_cache_file, 'wb') as f:
        pickle.dump(data, f)

    debug('> CoreNLP data cached to {}'.format(data_cache_file))

    return data
Пример #4
0
def featureize(F, observation_files, CoreNLP_data):
    """
    Builds a numeric feature matrix with one row per observation file.

    Columns of the returned matrix:

    0: number of sentences
    1: number of tokens (junk tokens, per is_junk_token(), are skipped and
       do not contribute to columns 1-4 or 6)
    2: number of tokens whose 'ner' value is not None
    3: number of tokens whose 'pos' is NN, NNS, NNP or NNPS
    4: number of tokens whose word is longer than 6 characters
    5: absolute value of the summed sentiment_to_number() of each sentence
    6: number of tokens whose word is exactly "''"

    @param F: not used by this function
    @param observation_files: sequence of file names; each is converted to an
                              observation ID with filename_to_id()
    @param CoreNLP_data: dict mapping observation ID to a list of sentence
                         dicts carrying 'sentiment' and 'tokens' entries
    @returns: numpy array of shape (len(observation_files), 7), dtype float
    """
    # Penn Treebank noun tags:
    # - NN Noun, singular or mass
    # - NNS Noun, plural
    # - NNP Proper noun, singular
    # - NNPS Proper noun, plural
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')

    m = len(observation_files)

    # Observations. The builtin float is used because the np.float alias
    # no longer exists in current NumPy releases; the resulting dtype is
    # the same.
    X = np.zeros((m, 7), dtype=float)

    for (i, filename) in enumerate(observation_files):

        # Convert the filename to an observation ID
        ob_id = filename_to_id(filename)
        assert (ob_id in CoreNLP_data)

        sent_data = CoreNLP_data[ob_id]
        sent_count = len(sent_data)
        token_count = 0
        ner_count = 0
        noun_count = 0
        over6_count = 0
        quote_count = 0
        sentiment_score = abs(
            sum([sentiment_to_number(sd['sentiment']) for sd in sent_data]))

        # Token data is a dict of the form:
        #   {'lemma': 'count', 'ner': None, 'pos': 'NNS', 'word': 'counts'}
        #     - or -
        #   {'lemma': 'i.b.m.', 'ner': 'ORGANIZATION', 'pos': 'NNP', 'word': 'i.b.m.'}
        all_tokens = chain.from_iterable(sd['tokens'] for sd in sent_data)

        for token_data in all_tokens:

            word = token_data['word']

            if is_junk_token(word):
                continue

            if word == "''":
                quote_count += 1

            if len(word) > 6:
                over6_count += 1

            if token_data['ner'] is not None:
                ner_count += 1

            if token_data['pos'] in noun_tags:
                noun_count += 1

            token_count += 1

        X[i][0] = sent_count
        X[i][1] = token_count
        X[i][2] = ner_count
        X[i][3] = noun_count
        X[i][4] = over6_count
        X[i][5] = sentiment_score
        X[i][6] = quote_count

    return X
Пример #5
0
def test(model_name,
         test_size=0.1,
         suppress_output=False,
         show_results=False,
         *args,
         **kwargs):
    """
    Runs a full test cycle for the given model: splits the training data
    into a kept and a held-out part, trains on the kept part, predicts the
    held-out part and scores the predictions.

    Options:

    test_size:       Forwarded as `test_size` to
                     cross_validation.train_test_split()
    suppress_output: If True, the result report is not printed
    show_results:    Accepted for compatibility; not used by this function
    *args, **kwargs: Accepted and ignored

    @returns (float:accuracy, int:correct_count, int:incorrect_count,
              set:observation IDs whose prediction differs from the true label)
    """
    observations = resources.train_data_files('text')
    labels = resources.train_data_labels()

    # Fed into sklearn cross_validation
    Y = np.array([labels[ob_id] for ob_id in filename_to_id(observations)])

    # Divide the observation data into two sets for training and testing:
    (train_files, test_files, train_labels, true_labels) = \
        cross_validation.train_test_split(observations, Y, test_size=test_size)

    assert (len(train_files) == len(train_labels)
            and len(test_files) == len(true_labels))

    info("> test size: {}, |train| (kept): {}, |test| (held out): {}"
         .format(test_size, len(train_files), len(test_files)))

    # Get any preprocessing data and pass it to train() and predict() later:
    data = preprocess(model_name, train_files, test_files)

    # Generate training features:
    trained_model = train(model_name,
                          train_files,
                          filename_to_id(train_files),
                          train_labels,
                          *data)

    # Same as training: the observation ID is just the basename of the input
    test_observation_ids = filename_to_id(test_files)

    # Use the trained model to make predictions:
    predicted_labels = predict(model_name,
                               trained_model,
                               test_files,
                               test_observation_ids,
                               *data)

    accuracy = metrics.accuracy_score(true_labels, predicted_labels)
    cm = metrics.confusion_matrix(true_labels, predicted_labels)
    f1_score = metrics.f1_score(true_labels, predicted_labels)

    # NOTE(review): the indexing below assumes a 2x2 confusion matrix,
    # i.e. exactly two distinct labels -- confirm against the label data.
    correct = cm[0][0] + cm[1][1]
    incorrect = cm[1][0] + cm[0][1]

    # Collected unconditionally so that the returned set does not depend on
    # whether the report is printed.
    mismatches = [
        (ob_id, truth, guess)
        for (ob_id, truth, guess) in zip(test_observation_ids, true_labels,
                                         predicted_labels)
        if truth != guess
    ]
    incorrect_observations = set(ob_id for (ob_id, _, _) in mismatches)

    if not suppress_output:
        # Every print below takes exactly one string argument, which yields
        # the same output as a Python 2 print statement and as the Python 3
        # print() function.
        line = '*' * 80
        print("")
        print(line)
        print("Accuracy: {}%".format(accuracy * 100.0))
        print("F1-Score: {}".format(f1_score))
        print("Confusion matrix:\n{}".format(cm))
        print("Incorrect labelled as 1: {}; Incorrect labelled as -1: {}".format(
            cm[1][0], cm[0][1]))
        print("Incorrect:")
        for (ob_id, truth, guess) in mismatches:
            print("TRUE: {}, PREDICTED: {}, LEAD: {}".format(
                truth, guess, ob_id))
        print("")
        print(line)
        print("")

    return (accuracy, correct, incorrect, incorrect_observations)
def featureize(F, observation_files, CoreNLP_data):
    """
    Builds a numeric feature matrix with one row per observation file.

    Columns of the returned matrix:

    0: number of sentences
    1: number of tokens (junk tokens, per is_junk_token(), are skipped and
       do not contribute to columns 1-4 or 6)
    2: number of tokens whose 'ner' value is not None
    3: number of tokens whose 'pos' is NN, NNS, NNP or NNPS
    4: number of tokens whose word is longer than 6 characters
    5: absolute value of the summed sentiment_to_number() of each sentence
    6: number of tokens whose word is exactly "''"

    @param F: not used by this function
    @param observation_files: sequence of file names; each is converted to an
                              observation ID with filename_to_id()
    @param CoreNLP_data: dict mapping observation ID to a list of sentence
                         dicts carrying 'sentiment' and 'tokens' entries
    @returns: numpy array of shape (len(observation_files), 7), dtype float
    """
    # Penn Treebank noun tags:
    # - NN Noun, singular or mass
    # - NNS Noun, plural
    # - NNP Proper noun, singular
    # - NNPS Proper noun, plural
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')

    m = len(observation_files)

    # Observations. The builtin float is used because the np.float alias
    # no longer exists in current NumPy releases; the resulting dtype is
    # the same.
    X = np.zeros((m, 7), dtype=float)

    for (i, filename) in enumerate(observation_files):

        # Convert the filename to an observation ID
        ob_id = filename_to_id(filename)
        assert(ob_id in CoreNLP_data)

        sent_data       = CoreNLP_data[ob_id]
        sent_count      = len(sent_data)
        token_count     = 0
        ner_count       = 0
        noun_count      = 0
        over6_count     = 0
        quote_count     = 0
        sentiment_score = abs(sum([sentiment_to_number(sd['sentiment']) for sd in sent_data]))

        # Token data is a dict of the form:
        #   {'lemma': 'count', 'ner': None, 'pos': 'NNS', 'word': 'counts'}
        #     - or -
        #   {'lemma': 'i.b.m.', 'ner': 'ORGANIZATION', 'pos': 'NNP', 'word': 'i.b.m.'}
        all_tokens = chain.from_iterable(sd['tokens'] for sd in sent_data)

        for token_data in all_tokens:

            word = token_data['word']

            if is_junk_token(word):
                continue

            if word == "''":
                quote_count += 1

            if len(word) > 6:
                over6_count += 1

            if token_data['ner'] is not None:
                ner_count += 1

            if token_data['pos'] in noun_tags:
                noun_count += 1

            token_count += 1

        X[i][0] = sent_count
        X[i][1] = token_count
        X[i][2] = ner_count
        X[i][3] = noun_count
        X[i][4] = over6_count
        X[i][5] = sentiment_score
        X[i][6] = quote_count

    return X