def final_scan(exdeflike, indeflike, multi=True, lowercase=False, externalPOS=False, outkey="default"): """train all of the given training data and then test it on the supplied test records. make predictions for NE for each token, then print them out in the format required""" """ #load exdef and indef with open('finalexdef.pickle', 'rb') as infile: exdeflike = pickle.load(infile) with open('finalindef.pickle', 'rb') as infile2: indeflike = pickle.load(infile2) """ #load the test data test_file = '../data/emerging.dev.conll' with open(test_file, 'r') as f3: records = re.split("\n[\t]?\n", f3.read().strip()) numrecs = len(records) os.system("mkdir -p ../data/predictions/" + outkey) #analyze the test data # threshold = [0.138, 0.13] #this is the threshold we found to give the best F1 score #on the training data, using n-fold validation tstarts = { "location": 0.5, "group": 0.5, "product": 0.5, "creative-work": 0.5, "person": 0.5, "corporation": 0.5 } tdiffs = range(-4, 5) # [d for d in range(-49,50)] ## diffs = [-0.49--0.49] allresults = defaultdict(list) ## begin rounds loop here for rnd in range(1, 4): ##range(1,2): ## print "working on rnd " + str(rnd) print "here are the starting thresholds: " for NEtype in tstarts: print NEtype, tstarts[NEtype] print results = defaultdict(list) tdiffs = [tdiff / 10 for tdiff in tdiffs] # [tdiff/100 for tdiff in tdiffs] [-0.0049--0.0049] NEthreshs = { NEtype: [tstarts[NEtype] + tdiff for tdiff in tdiffs] # [t/1000 for t in range(1001)] for NEtype in tstarts } fs = {} for NEtype in NEthreshs: for t in NEthreshs[NEtype]: fkey = str(t) + "-" + NEtype threshfile = re.sub("/data/", "/data/predictions/" + outkey + "/", test_file + "-" + fkey + ".prediction") fs[fkey] = [open(threshfile, 'w'), threshfile] fs[fkey][0].close() numdone = 0 for record in records: print str( 100 * numdone / numrecs) + "% done with round " + str(rnd) numdone += 1 if record: #avoid empty strings data = [ re.split('\t', d) for d in re.split("\n", record) if 
len(re.split("\t", d)) == 2 ] tokens, tags = zip(*data) uppertokens = list(tokens) if lowercase: tokens = [token.lower() for token in tokens] for NEtype in NEthreshs: exavedeflike, inavedeflike = NER.test( tokens, exdeflike, indeflike, multi, NEtype, externalPOS=externalPOS, uppertokens=uppertokens) for t in NEthreshs[NEtype]: #keep track of the NE assignments #for each token with tuples if lowercase: assignments = [[token, 'O'] for token in uppertokens if token] else: assignments = [[token, 'O'] for token in tokens if token] fkey = str(t) + "-" + NEtype #find the NEs using the _LFD_ function as before for indices in NER.LFD(tokens, exavedeflike, inavedeflike, [1.1, t]): # print t, [tokens[ix] for ix in indices] # innums = [inavedeflike[ix][1] for ix in indices if ix != indices[0]] # innums.append(inavedeflike[indices[0]][0]) # print "internal", NER.harmonic_mean(innums) # raw_input() n = 0 for index in indices: if n == 0: assignments[index][1] = 'B-' + NEtype else: assignments[index][1] = 'I-' + NEtype n += 1 #keep track of position in NE ## write out according to file handles, here fs[fkey][0] = open(fs[fkey][1], "a") for i, assignment in enumerate(assignments): fs[fkey][0].writelines("\t".join( [assignment[0], tags[i], assignment[1]]) + "\n") fs[fkey][0].writelines("\n") fs[fkey][0].close() ## evaluate all thresholds and all NE types for the best of the round for fkey in fs: ## fs[fkey][0].close() NEtype = "-".join(re.split("-", fkey)[1:]) t = float(re.split("-", fkey)[0]) filename = fs[fkey][1] try: results[NEtype].append((map( float, re.split("\;", [ re.sub("[^0-9\.\;]+", "", re.sub("\d+$|\d\:", "", r)) for r in re.split( "\n", subprocess.check_output( "python2 ../data/wnuteval.py " + filename, shell=True)) if re.search(NEtype, r) ][0])), t)) except: results[NEtype].append(([0., 0., 0.], t)) allresults[NEtype].append(tuple(results[NEtype][-1])) ## store the best of this round as tstarts for NEtype in results: tstarts[NEtype] = max(results[NEtype], key=lambda 
x: x[0][2])[1] print "here are the end-of-round thresholds: " for NEtype in tstarts: print NEtype, tstarts[NEtype] print with open("../data/predictions/" + outkey + "/allresults.json", "w") as f: f.writelines(json.dumps([tstarts, allresults])) return tstarts, allresults
def final_analysis(
        exdeflikefile,
        indeflikefile,
        multi=True,
        lowercase=True,
        externalPOS=True,
        dev=True,
        thresholds={
            "location": 0.292,
            "group": 0.09,
            "product": 0.131,
            "creative-work": 1.1,
            "person": 0.202,
            "corporation": 1.1
        }):
    """Load pickled likelihood models and write final NE predictions.

    Tags every record of the dev file (dev=True, lines are "token\\ttag") or
    the test file (dev=False, lines are bare tokens), resolves overlapping
    candidate NEs by precedence / span size / internal likelihood, and writes
    CoNLL-style output under ../data/finalpredictions/.

    thresholds maps each NE type to the internal-likelihood cutoff passed to
    NER.LFD.  NOTE(review): mutable default argument -- safe only as long as
    no caller mutates the dict.
    """
    #load exdef and indef
    # NOTE(review): pickles are opened in text mode here ('r', not 'rb') --
    # works under Python 2 / protocol 0; confirm before porting.
    with open(exdeflikefile) as f:
        exdeflike = pickle.load(f)
    with open(indeflikefile) as f:
        indeflike = pickle.load(f)
    # stray string statement below (a leftover docstring) -- it is a no-op
    """train all of the given training data and then test it on the supplied test records. make predictions for NE for each token, then print them out in the format required"""
    """
    #load exdef and indef
    with open('finalexdef.pickle', 'rb') as infile:
        exdeflike = pickle.load(infile)
    with open('finalindef.pickle', 'rb') as infile2:
        indeflike = pickle.load(infile2)
    """
    #load the test data
    # output name embeds fields 1:3 of the underscore-split model filename
    if dev:
        test_file = '../data/emerging.dev.conll'
        outfilename = "../data/finalpredictions/emerging_" + "_".join(
            re.split("_", indeflikefile)[1:3]) + ".dev"
    else:
        test_file = '../data/emerging.test'
        outfilename = "../data/finalpredictions/emerging_" + "_".join(
            re.split("_", indeflikefile)[1:3]) + ".test"
    with open(test_file, 'r') as f3:
        # records are separated by a blank line (which may contain a tab)
        records = re.split("\n[\t]?\n", f3.read().strip())
    #analyze the test data
    #on the training data, using n-fold validation
    # f = open(test_file + ".prediction", 'w')
    # NOTE(review): f is never closed in this function; output relies on the
    # handle being flushed at interpreter exit.
    f = open(outfilename, "w")
    # thresholds = {
    # ## "location": [0.001, 0.157],
    # "location": [1.1, 0.157],
    # ## "group": [0.008, 0.199],
    # "group": [1.1, 0.199],
    # "product": [1.1, 0.215],
    # "creative-work": [1.1,0.499],
    # ## "person": [0.002, 0.167],
    # "person": [1.1, 0.167],
    # "corporation": [1.1, 0.218]
    # }
    for record in records:
        if record:  #avoid empty strings
            if dev:
                # dev lines carry a gold tag after the token
                data = [
                    re.split('\t', d) for d in re.split("\n", record)
                    if len(re.split("\t", d)) == 2
                ]
                tokens, tags = zip(*data)
            else:
                # test lines are bare tokens
                tokens = [
                    re.split('\t', d)[0] for d in re.split("\n", record)
                    if len(re.split("\t", d)) == 1
                ]
            uppertokens = list(tokens)
            if lowercase:
                tokens = [token.lower() for token in tokens]
            #keep track of the NE assignments for each token with tuples
            # output always uses the original-case tokens when lowercasing
            if lowercase:
                assignments = [[token, 'O'] for token in uppertokens if token]
            else:
                assignments = [[token, 'O'] for token in tokens if token]
            ##
            # candidate spans: (indices, NEtype) -> [size, external
            # likelihood, harmonic-mean internal likelihood]
            predictions = {}
            for NEtype in thresholds:
                exavedeflike, inavedeflike = NER.test(tokens,
                                                      exdeflike,
                                                      indeflike,
                                                      multi,
                                                      NEtype,
                                                      externalPOS=externalPOS,
                                                      uppertokens=uppertokens)
                #find the NEs using the _LFD_ function as before
                for indices in NER.LFD(tokens, exavedeflike, inavedeflike,
                                       [1.1, thresholds[NEtype]]):
                    # if exavedeflike[indices] >= thresholds[NEtype][0]:
                    # print NEtype+": ", [tokens[ix] for ix in indices]
                    # print "external", exavedeflike[indices]
                    # internal likelihood: first token contributes its [0]
                    # component, the rest their [1] component
                    innums = [
                        inavedeflike[ix][1] for ix in indices
                        if ix != indices[0]
                    ]
                    innums.append(inavedeflike[indices[0]][0])
                    ## print "internal", NER.harmonic_mean(innums)
                    predictions[(indices, NEtype)] = [
                        len(list(indices)), exavedeflike[indices],
                        NER.harmonic_mean(innums)
                    ]
            ##
            # conflict resolution: a candidate survives only if no overlapping
            # candidate (a) starts earlier, or (b) starts at the same token
            # and is longer, or (c) ties on start/size with higher likelihood
            for indices, NEtype in predictions:
                thissize = predictions[(indices, NEtype)][0]
                thislike = predictions[(indices, NEtype)][2]
                for otherindices, otherNEtype in predictions:
                    thatsize = predictions[(otherindices, otherNEtype)][0]
                    thatlike = predictions[(otherindices, otherNEtype)][2]
                    broken = True
                    for ix in otherindices:
                        if ix in indices:
                            if otherindices[0] < indices[0]:
                                # earlier-starting span wins ("precidence"
                                # [sic] -- typo kept, it is a runtime string)
                                print("precidence, avoided: ",
                                      [tokens[ix] for ix in indices],
                                      " over ",
                                      [tokens[ix] for ix in otherindices])
                                break
                            elif otherindices[0] == indices[0]:
                                if thatsize > thissize:
                                    print("size, avoided: ",
                                          [tokens[ix] for ix in indices],
                                          " over ",
                                          [tokens[ix] for ix in otherindices])
                                    break
                                elif thatlike > thislike:
                                    print("likelihood, avoided: " + NEtype,
                                          [tokens[ix]
                                           for ix in indices], " over " +
                                          otherNEtype,
                                          [tokens[ix]
                                           for ix in otherindices])
                                    break
                    else:
                        # for-else: no break above means this competitor does
                        # not defeat the current span
                        broken = False
                    if broken:
                        break
                else:
                    # for-else: no competitor defeated this span -- keep it
                    print NEtype + ": ", [tokens[ix] for ix in indices]
                    print "internal", thislike
                    ##
                    #assign 'B' to the first, 'I' to the rest
                    n = 0
                    for index in indices:
                        if n == 0:
                            assignments[index][1] = 'B-' + NEtype
                        else:
                            assignments[index][1] = 'I-' + NEtype
                        n += 1  #keep track of position in NE
            ##
            # emit one line per token; dev output keeps the gold tag column
            for i, assignment in enumerate(assignments):
                if dev:
                    f.writelines(
                        "\t".join([assignment[0], tags[i], assignment[1]]) +
                        "\n")
                else:
                    f.writelines("\t".join([assignment[0], assignment[1]]) +
                                 "\n")
            # blank line terminates the record, matching the input format
            f.writelines("\n")