import pickle
from itertools import chain
from os.path import exists, splitext
from time import strftime

import numpy as np
import xml.etree.ElementTree as ET
from sklearn import cross_validation, metrics

# Project-internal helpers (resources, filename_to_id, preprocess, train,
# predict, resolve, info, debug, sentiment_to_number, is_junk_token, and the
# CORENLP_*_DATA_CACHE paths) are assumed to be defined/imported elsewhere
# in this codebase.


def make_submission(with_model):
    """ Generates a submission for the leaderboard """
    train_files = resources.train_data_files()
    train_observations = filename_to_id(train_files)

    train_label_dict = resources.train_data_labels()
    train_labels = np.array(
        [train_label_dict[ob_id] for ob_id in train_observations])

    test_files = resources.test_data_files()
    test_observations = filename_to_id(test_files)

    preprocess_data = preprocess(with_model, train_files, test_files)

    trained_model = train(with_model, train_files, train_observations,
                          train_labels, *preprocess_data)

    Y = predict(with_model, trained_model, test_files, test_observations,
                *preprocess_data)

    # Write the output:
    output_file = "{}/submission_{}.txt".format(
        resolve('..'), strftime("%Y-%m-%d_%H:%M:%S"))

    with open(output_file, 'w') as f:
        for (observation, result) in zip(test_observations, Y):
            f.write("{} {}\n".format(observation, result))

    info(">> Wrote submission output to {}".format(output_file))
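# filename_to_id() is defined elsewhere in this codebase; callers above pass
# it both single paths and lists of paths. A minimal sketch of the assumed
# behavior (basename with one extension stripped), for illustration only:
def _filename_to_id_sketch(path_or_paths):
    from os.path import basename, splitext as _splitext
    if isinstance(path_or_paths, (list, tuple)):
        # Map element-wise over a list of filenames:
        return [_splitext(basename(p))[0] for p in path_or_paths]
    # e.g. 'data/train/12345.txt' -> '12345'
    return _splitext(basename(path_or_paths))[0]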
def parse_xml(filename):
    """ Parses a CoreNLP XML output file, returning a tuple of
        (str:observation-id, [dict:<sentence-data>]), where each
        <sentence-data> dict has the following keys:

        - 'tokens'      : [{'word':str, 'lemma':str, 'pos':str, 'ner':str|None}]
        - 'dependencies': [(str:type, str:governor, str:dependent)]
        - 'sentiment'   : str
        - 'parse'       : str

        @returns (str:observation-id, [dict:<sentence-data>])
    """
    sentences = []
    tree = ET.parse(filename)

    for s in tree.findall('.//sentence'):

        sentence = {
             'tokens': []
            ,'dependencies': []
            ,'sentiment': s.get('sentiment').lower()
            ,'parse': s.find('.//parse').text
        }

        # Tokens in the sentence:
        for t in s.findall('.//token'):
            word    = t.find('word').text.lower()
            lemma   = t.find('lemma').text.lower()
            pos_tag = t.find('POS').text
            ner_tag = t.find('NER').text
            # CoreNLP marks non-entities with "O"; normalize that to None:
            if ner_tag == "O":
                ner_tag = None
            data = {'word':word, 'lemma':lemma, 'pos':pos_tag, 'ner':ner_tag}
            sentence['tokens'].append(data)

        # Dependencies in the sentence:
        for dep in s.findall(".//dependencies[@type='basic-dependencies']/dep"):
            dep_type  = dep.get('type').lower()
            governor  = dep.find('governor').text.lower()
            dependent = dep.find('dependent').text.lower()
            data = (dep_type, governor, dependent)
            sentence['dependencies'].append(data)

        sentences.append(sentence)

    return (filename_to_id(filename), sentences)
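# Hypothetical usage of parse_xml(); the XML path below is made up for
# illustration. Shows the shape of the returned sentence data:
def _demo_parse_xml():
    ob_id, sentences = parse_xml('CoreNLP/12345.txt.xml')
    print "observation:", ob_id
    first = sentences[0]
    print "sentiment:", first['sentiment']
    print "first token:", first['tokens'][0]
    print "first dependency:", first['dependencies'][0]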
def all_sentences(for_data):
    """ Returns a dict of all sentence data derived from CoreNLP. The key is
        the truncated filename (observation ID), and the value is the list of
        sentence data generated by parse_xml() for that particular file.

        @returns: {str: [<sentence-data>]}
    """
    assert for_data in ('train', 'test')

    if for_data == 'train':
        data_cache_file = CORENLP_TRAIN_DATA_CACHE
    else:
        data_cache_file = CORENLP_TEST_DATA_CACHE

    # If there's cached data, load it:
    if exists(data_cache_file):
        debug('> Loading cached CoreNLP data from {}'.format(data_cache_file))
        with open(data_cache_file, 'r') as f:
            return pickle.load(f)

    # Otherwise, generate the output from parse_xml() and cache it:
    debug('> CoreNLP data {} not found; caching...'.format(data_cache_file))

    if for_data == 'train':
        filenames = resources.train_data_files('CoreNLP')
    else:
        filenames = resources.test_data_files('CoreNLP')

    #if include_test:
    #    filenames += resources.test_data_files('CoreNLP')

    # parse_xml(filename)[1] keeps only the actual sentence data, not the
    # observation identifier. splitext() lops off the ".xml" part of the
    # CoreNLP output filename, preserving the original filename as the ID.
    data = {splitext(filename_to_id(filename))[0]: parse_xml(filename)[1]
            for filename in filenames}

    with open(data_cache_file, 'w') as f:
        pickle.dump(data, f)

    debug('> CoreNLP data cached to {}'.format(data_cache_file))
    return data
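# Hypothetical usage of all_sentences(): load (or build) the cached training
# data and peek at one observation. Deleting the cache file forces a
# re-parse on the next call.
def _demo_all_sentences():
    corenlp = all_sentences('train')
    ob_id = next(iter(corenlp))
    print "{}: {} sentences, first sentiment = {}".format(
        ob_id, len(corenlp[ob_id]), corenlp[ob_id][0]['sentiment'])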
def featureize(F, observation_files, CoreNLP_data):

    m = len(observation_files) # Number of observations

    X = np.zeros((m, 7), dtype=float)

    for (i, filename) in enumerate(observation_files):

        # Convert the filename to an observation ID:
        ob_id = filename_to_id(filename)
        assert ob_id in CoreNLP_data

        sent_data = CoreNLP_data[ob_id]

        sent_count  = len(sent_data)
        token_count = 0
        ner_count   = 0
        noun_count  = 0
        over6_count = 0
        quote_count = 0
        #is_num_count = 0

        sentiment_score = abs(
            sum([sentiment_to_number(sd['sentiment']) for sd in sent_data]))

        # Token data is a dict of the form:
        # {'lemma': 'count', 'ner': None, 'pos': 'NNS', 'word': 'counts'}
        # - or -
        # {'lemma': 'i.b.m.', 'ner': 'ORGANIZATION', 'pos': 'NNP', 'word': 'i.b.m.'}
        for token_data in chain(*[sd['tokens'] for sd in sent_data]):

            word = token_data['word']

            if is_junk_token(word):
                continue

            # if is_number(word):
            #     is_num_count += 1

            if word == "''":
                quote_count += 1

            if len(word) > 6:
                over6_count += 1

            if token_data['ner'] is not None:
                ner_count += 1

            # Count the number of nouns:
            # - NN    Noun, singular or mass
            # - NNS   Noun, plural
            # - NNP   Proper noun, singular
            # - NNPS  Proper noun, plural
            pos = token_data['pos']
            if pos in ('NN', 'NNS', 'NNP', 'NNPS'):
                noun_count += 1

            token_count += 1

        X[i][0] = sent_count
        X[i][1] = token_count
        X[i][2] = ner_count
        X[i][3] = noun_count
        X[i][4] = over6_count
        X[i][5] = sentiment_score
        X[i][6] = quote_count
        #X[i][7] = is_num_count

    return X
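# sentiment_to_number() and is_junk_token() are project helpers defined
# elsewhere. Plausible sketches of the assumed behavior follow; the exact
# label strings, score values, and junk criteria are assumptions:
_SENTIMENT_SCORES = {'verynegative': -2, 'negative': -1, 'neutral': 0,
                     'positive': 1, 'verypositive': 2}

def _sentiment_to_number_sketch(sentiment):
    # Unrecognized labels fall back to neutral (0):
    return _SENTIMENT_SCORES.get(sentiment, 0)

def _is_junk_token_sketch(word):
    # Skip empty/whitespace-only tokens. Note that "''" must not count as
    # junk, since featureize() checks for it after this filter.
    return not word.strip()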
def test(model_name, test_size=0.1, suppress_output=False,
         show_results=False, *args, **kwargs):
    """ Runs a full test cycle for the given model

        Options:

        test_size:       Fraction of the observations to hold out for testing
        suppress_output: If True, no output will be produced
        show_results:    If True, individual prediction results will be
                         printed. suppress_output must also be False for this
                         option to have an effect

        @returns (float:accuracy, int:correct_count, int:incorrect_count,
                  set:incorrect_observations)
    """
    observations = resources.train_data_files('text')
    labels = resources.train_data_labels()

    # Fed into sklearn cross_validation:
    Y = np.array([labels[ob_id] for ob_id in filename_to_id(observations)])

    # Divide the observation data into two sets for training and testing:
    #(train_files, test_files, hold_out_fold) = nfold_xval(observations, n=n_folds)
    (train_files, test_files, train_labels, true_labels) = \
        cross_validation.train_test_split(observations, Y, test_size=test_size)

    assert (len(train_files) == len(train_labels) and
            len(test_files) == len(true_labels))

    info("> test size: {}, |train| (kept): {}, |test| (held out): {}"\
        .format(test_size, len(train_files), len(test_files)))

    # Get any preprocessing data and pass it to train() and predict() later:
    data = preprocess(model_name, train_files, test_files)

    # Generate training features and train the model:
    trained_model = train(model_name,
                          train_files,
                          filename_to_id(train_files),
                          train_labels,
                          *data)

    # Same as training: the observation ID is just the basename of the input
    test_observation_ids = filename_to_id(test_files)

    # Use the trained model to make predictions:
    predicted_labels = predict(model_name,
                               trained_model,
                               test_files,
                               test_observation_ids,
                               *data)

    accuracy = metrics.accuracy_score(true_labels, predicted_labels)
    cm       = metrics.confusion_matrix(true_labels, predicted_labels)
    f1_score = metrics.f1_score(true_labels, predicted_labels)

    correct   = cm[0][0] + cm[1][1]
    incorrect = cm[1][0] + cm[0][1]

    # Collect the IDs of misclassified observations unconditionally, since
    # they're part of the return value regardless of the output options:
    incorrect_observations = set()
    for i in range(len(test_observation_ids)):
        if true_labels[i] != predicted_labels[i]:
            incorrect_observations.add(test_observation_ids[i])

    if not suppress_output:
        line = '*' * 80
        print
        print line
        print "Accuracy: {}%".format(accuracy * 100.0)
        print "F1-Score: {}".format(f1_score)
        print "Confusion matrix:\n", cm
        print "Incorrectly labelled as 1: {}; incorrectly labelled as -1: {}".format(
            cm[1][0], cm[0][1])
        if show_results:
            print "Incorrect:"
            for i in range(len(test_observation_ids)):
                if true_labels[i] != predicted_labels[i]:
                    print "TRUE: {}, PREDICTED: {}, LEAD: {}".format(
                        true_labels[i], predicted_labels[i],
                        test_observation_ids[i])
        print
        print line
        print

    return (accuracy, correct, incorrect, incorrect_observations)
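# Hypothetical driver: average the accuracy of test() over several random
# splits. The model name 'corenlp' is illustrative only; use whatever names
# preprocess()/train()/predict() actually dispatch on in this codebase.
def _run_trials(model_name='corenlp', trials=5, test_size=0.1):
    accuracies = []
    for _ in range(trials):
        (accuracy, _, _, _) = test(model_name, test_size=test_size,
                                   suppress_output=True)
        accuracies.append(accuracy)
    print "mean accuracy over {} trials: {:.4f}".format(
        trials, sum(accuracies) / len(accuracies))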