def filterArticles(articles):
    relevant_articles = {}
    correct = [0] * (len(int2tags) - 1)
    gold_num = [0] * (len(int2tags) - 1)
    filtered_correct = [0] * (len(int2tags) - 1)
    filtered_gold_num = [0] * (len(int2tags) - 1)
    helper.load_constants()
    print "Num incidents", len(incidents)
    print "Num unfiltered articles", len(articles)
    for incident_id in incidents.keys():
        incident = incidents[incident_id]
        if 'citations' not in incident:
            continue
        for citation_ind, citation in enumerate(incident['citations']):
            saveFile = "../data/raw_data/" + incident_id + "_" + str(citation_ind) + ".raw"
            print "checking for savefile", saveFile
            if saveFile not in articles:
                continue
            article = tokenizer.tokenize(articles[saveFile])
            ents = [incident[e.replace('-', '_')] for e in int2tags[1:]]
            tags, cleanArticle = getTags(article, ents)

            ## Accumulate scores for both unfiltered and filtered articles
            for i in range(1, len(int2tags)):
                correct[i - 1] += 1 if i in tags else 0
                gold_num[i - 1] += ents[i - 1].strip().lower() not in ["unknown"]

            if len(set(tags)) > 2:  ## This is the filtering step
                for i in range(1, len(int2tags)):
                    filtered_correct[i - 1] += 1 if i in tags else 0
                    filtered_gold_num[i - 1] += ents[i - 1].strip().lower() not in ["unknown"]

                # Store the article in a convenient format for writing to the tag file
                relevant_article = {}
                relevant_article['tokens'] = cleanArticle[:1000]
                relevant_article['tags'] = tags
                relevant_article['title'] = citation['Title']
                relevant_article['ents'] = [cleanDelimiters(e) for e in ents]
                relevant_articles[saveFile] = relevant_article

    pickle.dump(relevant_articles, open('EMA_filtered_articles.2.p', 'wb'))

    oracle_scores = [(correct[i] * 1. / gold_num[i], int2tags[i + 1])
                     if gold_num[i] > 0 else 0
                     for i in range(len(correct))]
    filtered_oracle_scores = [(filtered_correct[i] * 1. / filtered_gold_num[i], int2tags[i + 1])
                              if filtered_gold_num[i] > 0 else 0
                              for i in range(len(correct))]

    print "num articles is", len(relevant_articles)
    print "oracle scores", oracle_scores
    print "filtered_oracle_scores", filtered_oracle_scores
    return relevant_articles
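# Usage sketch (illustrative, not part of the original script): `filterArticles`
# expects a dict mapping the "../data/raw_data/<incident>_<n>.raw" paths built
# above to raw article text; `incidents`, `tokenizer`, `getTags`, and
# `cleanDelimiters` are assumed to be defined elsewhere in this module. The
# input pickle filename below is a hypothetical placeholder.
if __name__ == '__main__':
    raw_articles = pickle.load(open('EMA_raw_articles.p', 'rb'))  # hypothetical input file
    filterArticles(raw_articles)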
def main(training_file, trained_model, previous_n, next_n, c, prune, test_file):
    helper.load_constants()
    train_data, identifier = load_data(training_file)
    test_data, test_ident = load_data(test_file)

    ## extract features
    tic = time.clock()
    print "get word_vocab"
    num_words, word_vocab = get_word_vocab(train_data, prune)
    print "feature extract for train"
    trainX, trainY = get_feature_matrix_n(previous_n, next_n, train_data,
                                          num_words, word_vocab, helper.other_features)
    print "feature extract for test"
    testX, testY = get_feature_matrix_n(previous_n, next_n, test_data,
                                        num_words, word_vocab, helper.other_features)
    print time.clock() - tic

    ## train LR
    print "training"
    tic = time.clock()
    clf = LogisticRegression(C=c, multi_class='multinomial', solver='lbfgs')
    clf.fit(trainX, trainY)
    print time.clock() - tic

    print "predicting"
    predictY = clf.predict(testX)
    assert len(predictY) == len(testY)

    print "evaluating"
    evaluatePredictions(predictY, testY)

    # Human-readable names for each feature-matrix column: windowed word and
    # auxiliary features, the current word, and previous-tag indicator features
    # (used by the commented-out getTopFeatures call below).
    feature_list = (word_vocab.keys() + helper.other_features) * (previous_n + next_n + 1) \
        + word_vocab.keys() \
        + ['previous_one'] * len(tags) + ['previous_two'] * len(tags) \
        + ['previous_three'] * len(tags)
    # getTopFeatures(clf, tags, feature_list)

    if trained_model != "":
        pickle.dump([clf, previous_n, next_n, word_vocab, helper.other_features],
                    open(trained_model, "wb"))
    return [clf, previous_n, next_n, word_vocab, helper.other_features]
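# Usage sketch (illustrative): train on one .tag file, evaluate on another,
# and pickle the resulting model. The paths, window sizes, regularization
# strength, and pruning threshold here are placeholders, not values taken
# from the original experiments.
if __name__ == '__main__':
    main('../data/tagged_data/train.tag', 'trained_model.p',
         previous_n=4, next_n=1, c=1.0, prune=5,
         test_file='../data/tagged_data/test.tag')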
import pickle
import inflect
import train_crf as crf
from train import load_data
import helper
import re, pdb, collections
import constants

p = inflect.engine()
# constants.int2tags does not include the leading 'TAG' tag, so prepend it here.
int2tags = ['TAG'] + constants.int2tags
NUM_ENTITIES = len(constants.int2tags)
tags2int = constants.tags2int
tags = range(len(int2tags))
helper.load_constants()
mode = constants.mode

# Running per-tag counters used to accumulate evaluation statistics.
CORRECT = collections.defaultdict(lambda: 0.)
GOLD = collections.defaultdict(lambda: 0.)
PRED = collections.defaultdict(lambda: 0.)


def splitBars(w):
    return [q.strip() for q in w.split('|')]
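# Hedged sketch (not in the original file): one way the CORRECT/GOLD/PRED
# counters above can be turned into per-tag precision/recall/F1 once they have
# been populated. The function name `report_scores` is hypothetical.
def report_scores():
    for tag in GOLD:
        prec = CORRECT[tag] / PRED[tag] if PRED[tag] > 0 else 0.
        rec = CORRECT[tag] / GOLD[tag] if GOLD[tag] > 0 else 0.
        f1 = 2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.
        print tag, "precision", prec, "recall", rec, "f1", f1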
# main loop
def main(trained_model, testing_file, viterbi,
         output_tags="output.tag", output_predictions="output.pred"):
    test_data, identifier = load_data(testing_file)
    evaluate = True

    ## extract features
    if "crf" not in trained_model:
        if not isinstance(trained_model, list):
            clf, previous_n, next_n, word_vocab, other_features = pickle.load(
                open(trained_model, "rb"))
        else:
            clf, previous_n, next_n, word_vocab, other_features = trained_model
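        # Whichever branch ran, we now hold the fitted LogisticRegression
        # together with the context-window sizes, word vocabulary, and
        # auxiliary feature list produced by train.main above.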