import codecs
import math
import os
import random
from collections import defaultdict

from sklearn.metrics import f1_score, precision_score, recall_score

# Project-local helpers referenced below (util.getfnames, getfnames,
# readconll, Constituent, cos, KL, punc, isnum, wordstart, softwindow,
# freq, uniform, allowedmethods) are assumed to be provided by the
# surrounding package.


def dense2(fof, outfof):
    fnames = util.getfnames(fof)
    isdir = os.path.isdir(fof)
    random.seed(1234567)
    for fname in fnames:
        outlines = []
        with codecs.open(fname, "r", "utf8") as f:
            lines = f.readlines()
        for line in lines:
            sline = line.split("\t")
            if len(sline) > 5:
                tag = sline[0]
                if tag == "O":
                    # maybe add, maybe don't
                    if random.random() < 0.02:
                        outlines.append(line)
                    else:
                        outlines.append("\n")
                else:
                    outlines.append(line)
            else:
                outlines.append(line)
        if isdir:
            outfname = outfof + "/" + os.path.basename(fname)
        else:
            outfname = outfof
        with codecs.open(outfname, "w", "utf8") as out:
            for line in outlines:
                out.write(line)
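
# NOTE (editor's assumption): these utilities appear to operate on a
# 9-column, tab-separated CoNLL-style format, e.g.:
#
#   B-PER  0  3  x  x  Jimi  1.0  0  0.004
#
# with the label in column 0, the surface form in column 5, a token
# weight in column 6, the distance to the nearest entity in column 7,
# and a normalized word frequency in column 8. Blank lines separate
# sentences. This is inferred from the indexing below (sline[0],
# sline[5], sline[6], sline[7], sline[8]); it is not stated in the source.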
def getstats(fof):
    """ Given a file, directory, or comma-separated list of them (fof)
    containing CoNLL files, collect tag and token statistics over the corpus. """
    if "," in fof:
        fofs = fof.split(",")
        fnames = []
        for f in fofs:
            fnames.extend(getfnames(f))
    else:
        fnames = getfnames(fof)
    toks = 0
    totaltags = 0
    totalstarttags = 0
    tags = defaultdict(int)
    for fname in fnames:
        with open(fname) as f:
            for line in f:
                if len(line.strip()) == 0:
                    continue
                if "DOCSTART" in line:
                    continue
                toks += 1
                tag = line.strip().split()[0]
                if tag != "O":
                    tags[tag] += 1
                    totaltags += 1
                    if tag[0] == "B":
                        totalstarttags += 1
    print("Documents: {0}".format(len(fnames)))
    print("Tokens: {0}".format(toks))
    print("Total nes: {0}".format(totalstarttags))
    print("Total ne tokens: {0}".format(totaltags))
    print("NE percentage: {0}%".format(round(100 * totaltags / float(toks), 2)))
    print("Tag dict:")
    tags = sorted(tags.items())
    for t, v in tags:
        print(t, ":", v)
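
# `getfnames` is used throughout but not defined in this file. A minimal
# sketch, assuming it resolves a "file or folder" (fof) argument to a
# sorted list of file paths; the real project helper may differ.
def getfnames_sketch(fof):
    if os.path.isdir(fof):
        return sorted(os.path.join(fof, n) for n in os.listdir(fof))
    return [fof]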
def convert(fof, outfof=""):
    """ Given a file or folder of text files, this will convert to conll files."""
    fnames = getfnames(fof)
    for fname in fnames:
        with codecs.open(fname, "r", "utf8") as f:
            lines = f.readlines()
        fnonly = os.path.basename(fname)
        outname = os.path.join(outfof, fnonly + ".conll")
        with codecs.open(outname, "w", "utf8") as out:
            i = 1
            for line in lines:
                # if you come across an empty line, don't write anything.
                if len(line.strip()) == 0:
                    continue
                sline = line.split()
                for word in sline:
                    if len(word) == 0:
                        continue
                    # peel leading punctuation onto its own rows.
                    while len(word) > 0 and word[0] in punc:
                        out.write("\t".join(["O", "0", str(i), "x", "x", word[0], "x", "x", "0\n"]))
                        word = word[1:]
                        i += 1
                    after = []
                    # peel trailing punctuation; insert so the order is correct.
                    while len(word) > 0 and word[-1] in punc:
                        after.insert(0, word[-1])
                        word = word[:-1]
                    if len(word) > 0:
                        # Now word is pure, unfiltered...
                        out.write("\t".join(["O", "0", str(i), "x", "x", word, "x", "x", "0\n"]))
                    for pn in after:
                        i += 1
                        out.write("\t".join(["O", "0", str(i), "x", "x", pn, "x", "x", "0\n"]))
                    i += 1
                out.write("\n")
        print("Wrote to: ", outname)
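
# `punc` is referenced throughout but never defined in this file. A
# plausible definition, assuming it is the ASCII punctuation set (an
# assumption; the project may import its own):
#
#   import string
#   punc = set(string.punctuation)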
def convert(fof, outfof="", outformat="iob2"):
    """ Given a file or folder of text files, this will convert to conll files.
    Assume that the input is always iob2. """
    fnames = getfnames(fof)
    for fname in fnames:
        with codecs.open(fname, "r", "utf8") as f:
            lines = f.readlines()
        fnonly = os.path.basename(fname)
        outname = os.path.join(outfof, fnonly + ".conll")
        with codecs.open(outname, "w", "utf8") as out:
            i = 1
            last = ""
            for line in lines:
                line = line.strip()
                if len(line) == 0:
                    out.write(line + "\n")
                    last = ""
                else:
                    sline = line.split("\t")
                    if len(sline) == 1:
                        sline.insert(0, "@@@")
                    word, tag = sline
                    if outformat == "iob1":
                        prefix = tag[0]
                        label = tag[2:]
                        if tag != "O":
                            # IOB1 uses B- only when an entity immediately
                            # follows another entity of the same type.
                            if last == label and prefix == "B":
                                prefix = "B"
                            else:
                                prefix = "I"
                            tag = "{}-{}".format(prefix, label)
                        last = label
                    out.write(" ".join([word, "O", "O", tag.upper() + "\n"]))
                    i += 1
        print("Wrote to: ", outname)
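
# A worked example of the iob2 -> iob1 rewrite above (comment only):
#
#   iob2:  John/B-PER Smith/I-PER Mary/B-PER
#   iob1:  John/I-PER Smith/I-PER Mary/B-PER
#
# The first entity opens with I- because IOB1 reserves B- for an entity
# that immediately follows another entity of the same type.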
def func(folder, outfolder, labels=[], words=[]):
    fnames = getfnames(folder)
    isfolder = os.path.isdir(folder)
    labdict = {}
    for lab in labels:
        ls = lab.split(":")
        if len(ls) == 1:
            # a bare label (no colon) maps to O.
            labdict[ls[0]] = "O"
        else:
            labdict[ls[0]] = ls[1]
    for fname in fnames:
        with open(fname) as f:
            lines = f.readlines()
        outlines = []
        for line in lines:
            sline = line.split("\t")
            if len(sline) > 5:
                if labels == [] and words == []:
                    sline[0] = "O"
                elif len(labels) > 0 and words == []:
                    if sline[0] in labdict:
                        sline[0] = labdict[sline[0]]
                elif len(labels) == 0 and len(words) > 0:
                    if wordstart(sline[5], words):
                        sline[0] = "O"
                elif len(labels) > 0 and len(words) > 0:
                    if sline[0] in labdict and wordstart(sline[5], words):
                        sline[0] = "O"
            outlines.append("\t".join(sline))
        if isfolder:
            fnonly = os.path.basename(fname)
            outpath = outfolder + "/" + fnonly
        else:
            outpath = outfolder
        with open(outpath, "w") as out:
            for line in outlines:
                out.write(line)
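
# `wordstart` is not defined in this file. A minimal sketch, assuming it
# tests whether a token begins with any of the given strings; the real
# helper may behave differently (e.g. exact match or case folding).
def wordstart_sketch(token, words):
    return any(token.startswith(w) for w in words)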
def preparewords(fof, outfile):
    """ puts all words in one big line. Typically used for training a
    word2vec model. TODO: consider cleaning punctuation. """
    fnames = getfnames(fof)
    print("writing words to", outfile)
    with codecs.open(outfile, "w", "utf-8") as out:
        for fname in fnames:
            with codecs.open(fname, "r", "utf-8") as f:
                lines = f.readlines()
            for line in lines:
                sline = line.split("\t")
                if len(sline) > 5:
                    out.write(sline[5] + " ")
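
# The docstring above mentions word2vec training; a minimal sketch of
# consuming the output with gensim (gensim is an assumption -- the source
# never names a specific library):
#
#   from gensim.models import Word2Vec
#   from gensim.models.word2vec import LineSentence
#   model = Word2Vec(LineSentence("words.txt"))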
def preparesents(fof, outfile):
    """ Reads CoNLL files to create a file to be used for language model
    training. One sentence per line. """
    fnames = getfnames(fof)
    print("writing sentences to", outfile)
    with codecs.open(outfile, "w", "utf-8") as out:
        for fname in fnames:
            with codecs.open(fname, "r", "utf-8") as f:
                lines = f.readlines()
            currsent = ""
            for line in lines:
                sline = line.split("\t")
                if len(sline) > 5:
                    currsent += sline[5] + " "
                else:
                    out.write(currsent.strip() + "\n")
                    currsent = ""
            # flush the final sentence if the file lacks a trailing blank line.
            if currsent.strip():
                out.write(currsent.strip() + "\n")
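
# Example (comment only, assuming the column layout sketched above): a
# document whose column-5 tokens spell "John runs ." and "Mary sleeps ."
# across two sentences yields two lines in outfile:
#
#   John runs .
#   Mary sleeps .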
def densify(fof, outfof, window=-1):
    """ Given a file, this will densify the file. That is, keep only tokens
    within a window of labels. By default (window=-1), this does nothing."""
    fnames = util.getfnames(fof)
    isdir = os.path.isdir(fof)
    for fname in fnames:
        outlines = set()
        i = 0
        with codecs.open(fname, "r", "utf8") as f:
            lines = f.readlines()
        if window == -1:
            outwrite = lines
            containscontent = True
        else:
            for line in lines:
                sline = line.split("\t")
                if len(sline) > 5:
                    tag = sline[0]
                    if tag != "O":
                        # this is a label: add window tokens before and after.
                        # don't even need to worry about range checking!
                        for j in range(i, i - window - 1, -1):
                            if j < 0 or len(lines[j].strip()) == 0:
                                break
                            outlines.add(j)
                        for j in range(i, i + window + 1):
                            if j >= len(lines) or len(lines[j].strip()) == 0:
                                break
                            outlines.add(j)
                else:
                    outlines.add(i)
                i += 1
            # conflate empty lines.
            outwrite = []
            lastlinewasempty = False
            containscontent = False
            for i, line in enumerate(lines):
                if i in outlines:
                    isempty = len(line.strip()) == 0
                    if isempty:
                        if not lastlinewasempty:
                            lastlinewasempty = True
                            outwrite.append(line)
                    else:
                        containscontent = True
                        outwrite.append(line)
                        lastlinewasempty = False
        if isdir:
            outfname = outfof + "/" + os.path.basename(fname)
        else:
            outfname = outfof
        # only write if outlines isn't empty.
        if containscontent:
            with codecs.open(outfname, "w", "utf8") as out:
                for line in outwrite:
                    out.write(line)
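
# Usage sketch ("train.conll" / "dense.conll" are hypothetical paths):
# keep only tokens within 2 positions of a labeled token, collapsing the
# resulting runs of blank lines.
#
#   densify("train.conll", "dense.conll", window=2)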
def getstats(folders):
    if len(folders) > 2:
        print(">2 folders is not supported. Will only operate on {} and {}"
              .format(folders[0], folders[1]))
    # these will only ever have two elements
    namedicts = []
    tokendicts = []
    tagdicts = []
    for folder in folders:
        files = util.getfnames(folder)
        names = defaultdict(int)
        tags = defaultdict(int)
        tokens = defaultdict(int)
        weights = 0
        nametokens = 0
        sentences = 0
        for f in files:
            cdoc = readconll(f)
            sentences += len(cdoc.sentenceends)
            for t in cdoc.tokens:
                if t.weight > 0:
                    tokens[t.s] += t.weight
                    weights += t.weight
            for c in cdoc.getconstituents():
                # assume that all tokens in a constituent share the same weight.
                for t in c.tokens:
                    if t.weight > 0:
                        nametokens += t.weight
                        names[c.label + "\t" + c.surf()] += t.weight
                        tags[c.label] += t.weight
        namedicts.append(names)
        tokendicts.append(tokens)
        tagdicts.append(tags)
        numtokens = sum(tokens.values())
        numnames = sum(names.values())
        uniqnames = len(names.keys())
        try:
            reps = numnames / float(uniqnames)
            uniqs = uniqnames / float(numnames)
            tagpercentage = nametokens / numtokens
            weightedtagpercentage = nametokens / weights
        except ZeroDivisionError:
            reps = 0.0
            uniqs = 0.0
            tagpercentage = 0.
            weightedtagpercentage = 0.
        print("{}: {}".format("Folder", folder))
        print(" {:<20}{:>10}".format("Documents", len(files)))
        print(" {:<20}{:>10,}".format("Num tokens", numtokens))
        print(" {:<20}{:>10,}".format("Num unique tokens", len(tokens.keys())))
        print(" {:<20}{:>10,}".format("Num sentences", sentences))
        print(" {:<20}{:>10,}".format("Num names", numnames))
        print(" {:<20}{:>10,}".format("Num name tokens", nametokens))
        print(" {:<20}{:>10,}".format("Num unique name tokens", uniqnames))
        print(" {:<20}{:>10.2}".format("Avg num repetitions", reps))
        print(" {:<20}{:>10.2}".format("Unique / total", uniqs))
        print(" {:<20}{:>10.2%}".format("Tag %", tagpercentage))
        print(" {:<20}{:>10.2%}".format("Weighted Tag %", weightedtagpercentage))
        print(" Tag dict")
        for t in sorted(tags):
            print("  {}: {} ({:.2%})".format(t, tags[t], tags[t] / float(numnames)))
    if len(namedicts) == 2:
        print("Comparison:")
        if tagdicts[0].keys() != tagdicts[1].keys():
            print("  ***Mismatching tag set!***")
        namecos, nameinter, nameweight = cos(namedicts[0], namedicts[1])
        vocabcos, vocabinter, vocabweight = cos(tokendicts[0], tokendicts[1])
        vocabkl, inter = KL(tokendicts[0], tokendicts[1])
        tagcos, taginter, tagweight = cos(tagdicts[0], tagdicts[1])
        print("  Names cos sim: {}".format(namecos))
        print("  Vocab cos sim: {}".format(vocabcos))
        print("  Vocab overlap: {}".format(vocabinter))
        print("  Vocab KL divg: {}".format(vocabkl))
        print("  Tag cos sim: {}".format(tagcos))
        numtestnames = sum(namedicts[1].values())
        print("  %names in test seen in training: {}".format(
            nameinter / float(numtestnames)))
        numtesttokens = sum(tokendicts[1].values())
        print("  %tokens in test seen in training: {}".format(
            vocabinter / float(numtesttokens)))
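
# `cos` and `KL` are project helpers not shown here. A minimal sketch of
# `cos`, assuming (from the call sites above) that it returns the cosine
# similarity of two count dictionaries plus the shared-key mass of the
# second and first dicts respectively; the real helper may differ.
def cos_sketch(d1, d2):
    shared = set(d1) & set(d2)
    dot = sum(d1[k] * d2[k] for k in shared)
    n1 = math.sqrt(sum(v * v for v in d1.values()))
    n2 = math.sqrt(sum(v * v for v in d2.values()))
    sim = dot / (n1 * n2) if n1 and n2 else 0.0
    return sim, sum(d2[k] for k in shared), sum(d1[k] for k in shared)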
def func(folder, outfolder, mention):
    random.seed(1234567)
    fnames = getfnames(folder)
    isfolder = os.path.isdir(folder)
    # word frequencies.
    wfreq = defaultdict(int)
    weightmass = defaultdict(int)
    # make a first pass to gather word frequencies
    # and distances from entities.
    dists = defaultdict(lambda: defaultdict(int))

    # is entity?
    def isent(s):
        return s != "O"

    for fname in fnames:
        # this measures the distance from token i to the nearest named entity
        lastindex = 0
        # True if the previous entity boundary was a sentence break.
        prevempty = True
        with open(fname) as f:
            lines = f.readlines()
        for i, line in enumerate(lines):
            sline = line.split("\t")
            if len(sline) > 5:
                wfreq[sline[5]] += 1
            ent = isent(sline[0])
            empty = len(sline) < 5
            if ent or empty:
                dists[fname][i] = 0
                # when you see this, then go over all tokens since then...
                dd = i - lastindex
                startbias = 0
                endbias = 0
                if empty and prevempty:
                    # these are just large numbers...
                    startbias = 10000
                    endbias = 10000
                elif empty:
                    endbias = 10000
                elif prevempty:
                    startbias = 10000
                for j in range(lastindex + 1, i):
                    disttolast = j - lastindex
                    disttonext = dd - disttolast
                    dists[fname][j] = min(disttolast + startbias,
                                          disttonext + endbias)
                lastindex = i
                prevempty = empty

    # normalize the word frequencies
    mx = max(wfreq.values())
    for w in wfreq:
        wfreq[w] /= mx

    for fname in fnames:
        with open(fname) as f:
            lines = f.readlines()
        outlines = []
        for i, line in enumerate(lines):
            sline = line.strip().split("\t")
            if len(sline) > 5:
                sline[7] = str(dists[fname][i])
                if sline[5] in punc or isnum(sline[5]):
                    theta = 1.0
                    sline[6] = 0.0 if random.random() > theta else 1.0
                elif sline[0] == "O":
                    # probability of including
                    #theta = wfreq[sline[5]]
                    theta = 0.01
                    sline[6] = 0.0 if random.random() > theta else 1.0
                    #sline[6] = 0.0
                    #sline[6] = windowed(dists[fname][i])
                    #sline[6] = 1.0
                    #sline[6] = softwindowed(dists[fname][i])
                    #sline[6] = rand()
                else:
                    # probability of including
                    theta = 0.3
                    sline[6] = 0.0 if random.random() > theta else 1.0
                weightmass[sline[0]] += sline[6]
                sline[6] = str(sline[6])
                sline[8] = str(wfreq[sline[5]])
                outlines.append("\t".join(sline) + "\n")
            else:
                outlines.append("\n")
        if isfolder:
            fnonly = os.path.basename(fname)
            outpath = outfolder + "/" + fnonly
        else:
            outpath = outfolder
        with open(outpath, "w") as out:
            for line in outlines:
                out.write(line)

    for tag in sorted(weightmass):
        print("{}: {}".format(tag, weightmass[tag]))
    tags = 0
    for k in weightmass:
        if k == "O":
            continue
        tags += weightmass[k]
    print("Final ratio R: {:.2%}".format(tags / sum(weightmass.values())))
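
# `isnum` is not defined in this file; a minimal sketch, assuming it
# tests whether a token parses as a number:
def isnum_sketch(s):
    try:
        float(s)
        return True
    except ValueError:
        return False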
def func(folder, outfolder, precision, recall):
    # This is v1 seed:
    random.seed(1234567)
    # v2 seed:
    #random.seed(4343)
    fnames = getfnames(folder)
    # this contains the sets of constituents and frequencies
    d = defaultdict(int)
    namesdocs = []
    total = 0
    labels = []
    for fname in fnames:
        cdoc = readconll(fname)
        cons = cdoc.getconstituents()
        namesdocs.append((fname, cdoc))
        for c in cons:
            d[c.surf()] += 1
            total += 1
            labels.append(c.label)
    # build the set of names we will keep...
    goal = recall * total
    currnum = 0
    activecons = set()
    # impose ordering
    ditems = sorted(d.items())
    # make it random, but consistent.
    random.shuffle(ditems)
    for c, freq in ditems:
        activecons.add(c)
        currnum += freq
        if currnum >= goal:
            break
    print("Writing to {}".format(outfolder))
    for fname, cdoc in namesdocs:
        cons = cdoc.getconstituents()
        numpos = len(cons)
        for con in cons:
            # discard all the names we don't keep.
            if con.surf() not in activecons:
                cdoc.removeconstituent(con)
                numpos -= 1
        badspanstoadd = math.ceil(numpos / precision - numpos)
        for _ in range(badspanstoadd):
            start = random.randrange(0, len(cdoc.tokens) - 5)
            length = random.randrange(1, 3)
            end = start + length
            randlabel = random.choice(labels)
            newcon = Constituent(randlabel, cdoc.tokens[start:end], (start, end))
            cdoc.addconstituent(newcon)
        with open(outfolder + "/" + os.path.basename(fname), "w") as out:
            cdoc.write(out)
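
# Worked example of the noise budget above (comment only): with
# numpos = 90 surviving names and precision = 0.9,
# badspanstoadd = ceil(90 / 0.9 - 90) = 10, so 10 random spurious spans
# are added, giving 90 / (90 + 10) = 0.9 precision at the span level.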
def func(folder, outfolder, mention, methods, defaultweight):
    """ methods is a list. Can contain: punc, window, softwindow, freq,
    uniform, random """
    random.seed(1234567)
    fnames = getfnames(folder)
    isfolder = os.path.isdir(folder)
    # word frequencies.
    wfreq = defaultdict(int)
    # make a first pass to gather word frequencies
    # and distances from entities.
    dists = defaultdict(lambda: defaultdict(int))

    # is entity?
    def isent(s):
        return s != "O"

    for fname in fnames:
        # this measures the distance from token i to the nearest named entity
        lastindex = 0
        # True if the previous entity boundary was a sentence break.
        prevempty = True
        with open(fname) as f:
            lines = f.readlines()
        for i, line in enumerate(lines):
            sline = line.split("\t")
            if len(sline) > 5:
                wfreq[sline[5]] += 1
            ent = isent(sline[0])
            empty = len(sline) < 5
            if ent or empty:
                dists[fname][i] = 0
                # when you see this, then go over all tokens since then...
                dd = i - lastindex
                startbias = 0
                endbias = 0
                if empty and prevempty:
                    # these are just large numbers...
                    startbias = 10000
                    endbias = 10000
                elif empty:
                    endbias = 10000
                elif prevempty:
                    startbias = 10000
                for j in range(lastindex + 1, i):
                    disttolast = j - lastindex
                    disttonext = dd - disttolast
                    dists[fname][j] = min(disttolast + startbias,
                                          disttonext + endbias)
                lastindex = i
                prevempty = empty

    # normalize the word frequencies
    mx = max(wfreq.values())
    for w in wfreq:
        wfreq[w] /= mx

    for method in methods:
        if len(method) > 0 and method not in allowedmethods:
            print("Warning: {} not a supported method. Ignoring.".format(method))

    for fname in fnames:
        with open(fname) as f:
            lines = f.readlines()
        outlines = []
        for i, line in enumerate(lines):
            sline = line.strip().split("\t")
            if len(sline) > 5:
                sline[7] = str(dists[fname][i])
                if mention:
                    # in this case, we don't want weights.
                    sline[6] = 1.0
                    if sline[0] != "O":
                        sline[0] = "B-MNT"
                elif sline[0] == "O":
                    if defaultweight is not None:
                        sline[6] = defaultweight
                    try:
                        sline[6] = float(sline[6])
                    except Exception:
                        sline[6] = 0.0
                    if "random" in methods:
                        if random.random() < 0.25:
                            sline[6] += 1.0
                    # These methods give weights to all tokens.
                    if "softwindow" in methods:
                        sline[6] += softwindow(dists[fname][i])
                    if "freq" in methods:
                        sline[6] += freq(wfreq[sline[5]])
                    if "uniform" in methods:
                        sline[6] += uniform()
                    # The following give weights to just a few.
                    if "punc" in methods:
                        if sline[5] in punc or isnum(sline[5]):
                            sline[6] += 1.0
                    if "window" in methods:
                        if dists[fname][i] <= 1:
                            sline[6] += 1.0
                    if sline[6] == "x":
                        sline[6] = 0.0
                else:
                    sline[6] = 1.0
                sline[6] = str(sline[6])
                sline[8] = str(wfreq[sline[5]])
                outlines.append("\t".join(sline) + "\n")
            else:
                outlines.append("\n")
        if isfolder:
            fnonly = os.path.basename(fname)
            outpath = outfolder + "/" + fnonly
        else:
            outpath = outfolder
        with open(outpath, "w") as out:
            for line in outlines:
                out.write(line)
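
# `softwindow`, `freq`, `uniform`, and `allowedmethods` are project
# helpers not shown in this file. Minimal sketches, assuming softwindow
# decays with distance from an entity, freq downweights frequent words
# (its input is already normalized to [0, 1] above), and uniform is a
# constant; the real definitions may differ.
allowedmethods_sketch = {"punc", "window", "softwindow", "freq", "uniform", "random"}

def softwindow_sketch(dist):
    return 1.0 / (1.0 + dist)

def freq_sketch(normfreq):
    return 1.0 - normfreq

def uniform_sketch():
    return 0.5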
def func(fof1, fof2, ignore=False):
    print("THIS ONLY RETURNS TOKEN LEVEL")
    fnames1 = sorted(getfnames(fof1))
    fnames2 = sorted(getfnames(fof2))
    labels = set()
    gold = []
    pred = []
    total = 0
    for f1, f2 in zip(fnames1, fnames2):
        print(f1, f2)
        try:
            with open(f1) as f:
                lines = f.readlines()
            with open(f2) as f:
                lines2 = f.readlines()
        except IOError as e:
            print(e)
            continue
        i = 0
        j = 0
        while i < len(lines) and j < len(lines2):
            sline = lines[i].split("\t")
            sline2 = lines2[j].split("\t")
            try:
                while "-DOCSTART-" in lines[i] or lines[i].strip() == "":
                    i += 1
                    sline = lines[i].split("\t")
                while "-DOCSTART-" in lines2[j] or lines2[j].strip() == "":
                    j += 1
                    sline2 = lines2[j].split("\t")
            except IndexError:
                break
            if len(sline) < 5:
                # advance both cursors; a bare `continue` here would loop forever.
                i += 1
                j += 1
                continue
            try:
                predweight = sline2[6]
            except Exception:
                predweight = 1.0
            total += 1
            if ignore and float(predweight) == 0.0:
                pass
            else:
                gold.append(sline[0])
                pred.append(sline2[0])
                if sline[0] != "O":
                    labels.add(sline[0])
            i += 1
            j += 1
    labels = list(labels)
    p = precision_score(gold, pred, labels=labels, average="weighted")
    r = recall_score(gold, pred, labels=labels, average="weighted")
    f1 = f1_score(gold, pred, labels=labels, average="weighted")
    print("Scoring: {} lines out of {}".format(len(pred), total))
    print("SCORES: {} {} {}".format(p, r, f1))
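
# Usage sketch (paths are hypothetical): score a folder of predicted
# CoNLL files against gold, skipping tokens whose weight column is 0:
#
#   func("gold_conll/", "pred_conll/", ignore=True)
#
# Note that sklearn's average="weighted" weights per-label scores by
# gold support, and the O tag is excluded from `labels`, so it does not
# dominate the reported numbers.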