def conllup_to_starsem(fname, sherlock_train="data/sherlock/cdt.conllup", semcue=False):
    """Convert a CoNLL-UP file to *SEM-2012-style negation columns on stdout.

    For each sentence, cue columns are detected from the "scope" matrix
    (or from the "sem" matrix when ``semcue`` is true), then one
    (cue, scope, event) column triple is printed per detected cue.
    Sentences without cues get a single ``***`` column; sentences are
    separated by a blank line.

    Args:
        fname: path of the CoNLL-UP file to convert.
        sherlock_train: training file used only to build the scope-relation
            vocabulary (label -> index mapping).
        semcue: if true, detect cues from the "sem" matrix instead of the
            "scope" matrix, excluding multi-word-cue ("mwc") columns.
    """
    sentences = cd.read_col_data(fname)
    # The vocabulary must come from the training data so that label indices
    # match the ones used when the matrices were produced.
    train = cd.read_col_data(sherlock_train)
    _vocabs = vcb.make_vocabs(train, 0)
    vocabs = vcb.Vocabs(*_vocabs)
    w2i = vocabs.scoperels.w2i
    for sentence in sentences:
        story = "_"
        sid = sentence.id.split()[0]
        matrix = sentence.make_matrix("scope", label=True, w2i=w2i)
        if semcue:
            # Cue columns taken from the semantic matrix; skip columns that
            # participate in a multi-word cue.
            cmatrix = sentence.make_matrix("sem", label=True, w2i=w2i)
            cues = [i for i in range(len(cmatrix))
                    if cmatrix[0, i] == w2i["cue"] and not w2i["mwc"] in cmatrix[:, i]]
        else:
            cues = [i for i in range(len(matrix)) if matrix[0, i] == w2i["cue"]]
        if len(cues) > 0:
            # One (cue, scope, event) column triple per detected cue.
            for word in sentence:
                negs = ["_", "_", "_"] * len(cues)
                # FIX: the original computed p = len(cues) - 1 - i and then
                # immediately overwrote it with p = i (dead store); enumerate
                # now yields p directly.
                for p, c in enumerate(cues):
                    myev = word.form
                    if c == word.id:
                        # check_cue splits affixal cues (e.g. "un"+"happy")
                        # into the cue part and the event remainder.
                        is_incue, mycue, myev = check_cue(word.form)
                        if not is_incue:
                            mycue = word.form
                        negs[3*p] = mycue
                    if matrix[c, word.id] == w2i["event"]:
                        negs[3*p+1] = myev  # an event is also in scope
                        negs[3*p+2] = myev
                    elif matrix[c, word.id] == w2i["scope"]:
                        negs[3*p+1] = myev
                    elif matrix[c, word.id] == w2i["mwc"]:
                        negs[3*p] = myev  # multi-word cue part goes in the cue column
                    elif semcue:
                        if cmatrix[c, word.id] == w2i["mwc"]:
                            negs[3*p] = myev
                print("\t".join([story, sid, str(word.id-1), word.form,
                                 word.lemma, word.xpos, "_", *negs]))
        else:
            # No negation in this sentence: single "***" column.
            for word in sentence:
                print("\t".join([story, sid, str(word.id-1), word.form,
                                 word.lemma, word.xpos, "_", "***"]))
        # Blank line terminates the sentence block.
        print()
def conllup_to_epe(fname):
    """Convert a CoNLL-UP file to EPE-style JSON, one JSON object per sentence on stdout.

    Each sentence becomes {"id": ..., "nodes": [...]}; each node carries the
    token's form, character span (start/end over a space-joined rendering),
    xpos/upos/lemma properties, outgoing syntactic/semantic edges, and any
    negation records (cue/scope/event) it participates in.
    """
    import col_data as cd
    # Running character offset across ALL sentences (s is never reset per
    # sentence), so spans are document-global. NOTE(review): confirm this is
    # intended by the EPE consumer.
    s = 0
    for sentence in cd.read_col_data(fname):
        sid, story = sentence.id.split(maxsplit=1)
        epe = {"id": sid, "nodes": []}
        # Maps token id -> negation id; multi-word cues share the id of their
        # head cue token.
        cues = {}
        nodes = epe["nodes"]
        c = 0  # next fresh negation id
        for token in sentence:
            node = {"id": token.id, "form": token.form,
                    "start": s, "end": s + len(token.form),
                    "properties": {"xpos": token.xpos, "upos": token.upos,
                                   "lemma": token.lemma},
                    "edges": [], "negation": []}
            s += len(token.form) + 1  # +1 for the separating space
            nodes.append(node)
            # A plain cue (not part of a multi-word cue) gets a fresh id.
            if "cue" in [l for h, l in token.scope] and not "mwc" in [l for h, l in token.scope]:
                if token.id not in cues:
                    cues[token.id] = c
                    c += 1
            # A cue that is also part of a multi-word cue shares the id of
            # the mwc head; if the head has not been seen yet, allocate the
            # id for the head first, then copy it.
            elif "cue" in [l for h, l in token.scope] and "mwc" in [l for h, l in token.scope]:
                try:
                    cues[token.id] = cues[[h for h, l in token.scope if l == "mwc"][0]]
                except KeyError:
                    cues[[h for h, l in token.scope if l == "mwc"][0]] = c
                    c += 1
                    cues[token.id] = cues[[h for h, l in token.scope if l == "mwc"][0]]
        for token in sentence:
            if token.head == 0:
                nodes[token.id-1]["top"] = True  # root of the syntactic tree
            elif token.head > 0:
                nodes[token.head-1]["edges"].append({"label": token.deprel,
                                                     "target": token.id})
            for h, l in token.deps:
                nodes[h-1]["edges"].append({"label": l, "target": token.id})
            for h, l in token.scope:
                if h == token.id:
                    # Self-referential scope entry: this token is (part of)
                    # the cue itself; split affixal cues into cue + event.
                    is_incue, mycue, myev = check_cue(token.form)
                    if not is_incue:
                        mycue = token.form
                        myev = token.form
                    if l == "scope":
                        nodes[token.id-1]["negation"].append(
                            {"id": cues[token.id], "cue": mycue, "scope": myev})
                    elif l == "event":
                        nodes[token.id-1]["negation"].append(
                            {"id": cues[token.id], "cue": mycue,
                             "scope": myev, "event": myev})
                    elif l == "cue":
                        #print((cues[token.id], [x["id"] for x in nodes[token.id-1]["negation"]]))
                        # Avoid duplicating a negation record for the same id.
                        if not (cues[token.id] in [x["id"] for x in nodes[token.id-1]["negation"]]):
                            nodes[token.id-1]["negation"].append(
                                {"id": cues[token.id], "cue": token.form})
                elif l == "scope":
                    nodes[token.id-1]["negation"].append(
                        {"id": cues[h], "scope": token.form})
                elif l == "event":
                    nodes[token.id-1]["negation"].append(
                        {"id": cues[h], "scope": token.form, "event": token.form})
        # One JSON object per sentence (EPE interchange convention).
        print(json.dumps(epe))
def predict(model, settings, to_predict, elmo, vocabs):
    """Run the model on a column file, report F1, and write parsed output.

    Predictions are written to ``settings.dir + <basename> + ".pred"``.
    The primary target (``settings.pt``) is always scored and written; if the
    model also produced predictions for the other target (``settings.ot``),
    those are scored and applied as a second parse layer.

    Returns True unconditionally.
    """
    pred_path = settings.dir + to_predict.split("/")[-1] + ".pred"
    entries, predicted, other_predicted = model.predict(to_predict, elmo)
    # Score primary-target predictions against the gold matrices stored in
    # each entry (entry[1] indexed by target id, entry[0] is the sentence key).
    f1, _ = sc.score(*zip(*((entry[1][settings.pt].numpy(),
                             predicted[entry[0]].numpy()) for entry in entries)))
    print("F1 is {:.2%}".format(f1))
    if len(other_predicted) > 0:
        other_f1, _ = sc.score(*zip(*((entry[1][settings.ot].numpy(),
                                       other_predicted[entry[0]].numpy()) for entry in entries)))
        print("Other F1 is {:.2%}".format(other_f1))
    with open(pred_path, "w") as fh:
        for sentence in cd.read_col_data(to_predict):
            pred = predicted[sentence.id].numpy()
            if settings.target_style == "scope-":
                # "scope-" predicts scopes without cues: merge the gold cue
                # matrix back in so the output contains both.
                cue_matrix = sentence.make_matrix("cues", True, vocabs[settings.td["cue"]].w2i)
                pred = np.maximum(pred, cue_matrix)
                #pred = other_predicted[sentence.id].numpy()
            sentence.update_parse(pred, settings.target_style, vocabs[settings.pt].i2w)
            if len(other_predicted) > 0:
                pred = other_predicted[sentence.id].numpy()
                # NOTE sem == sem hopefully
                # If both targets share a style, store the secondary parse
                # under "syn" to keep the two layers distinct.
                if settings.target_style == settings.other_target_style:
                    sentence.update_parse(pred, "syn", vocabs[settings.pt].i2w)
                else:
                    sentence.update_parse(pred, settings.other_target_style, vocabs[settings.pt].i2w)
            print(sentence, file=fh)
    return True
def _load_data(self, data_path, pos_style, target_style, other_target_style, elmo):
    """Read column data from *data_path* and build ``self.index_entries``.

    When ``self.use_elmo`` is set, per-sentence ELMo vectors are looked up in
    the HDF5 file at *elmo* (keyed by sentence id) and passed to each
    IndexEntry; otherwise the entries are built without embeddings.
    The ``pos_style``/``target_style``/``other_target_style`` parameters are
    accepted for interface compatibility but not read here.
    """
    print("Loading data from {}".format(data_path))
    data = cd.read_col_data(data_path)
    # Open the embedding file once up front; None signals "no ELMo".
    felmo = h5py.File(elmo, "r") if self.use_elmo else None
    self.index_entries = []
    for sentence in data:
        if felmo is not None:
            entry = IndexEntry(sentence, self.vocabs, self.external,
                               self.settings, felmo[sentence.id], self.vec_dim)
        else:
            entry = IndexEntry(sentence, self.vocabs, self.external,
                               self.settings, None)
        self.index_entries.append(entry)
    if felmo is not None:
        felmo.close()
    print("Done")
def coldata_to_starsem(fn_in: str, fn_out: str) -> None:
    """Convert column data in *fn_in* to *SEM-2012 starsem columns in *fn_out*.

    For each token one tab-separated row is written; per negation three
    columns follow (cue, scope, event-remnant placeholder). Sentences without
    negations get a single ``***`` column. Affixal cues (e.g. "un-" prefixes)
    are split by ``affixer`` into cue substring and scope remainder unless
    listed in ``no_affix``.
    """
    with open(fn_out, "w") as fh_out:
        for sentence in read_col_data(fn_in):
            # print(sentence, file=fh_out)
            negs = negations_from_matrix(negation_matrix(sentence), sentence)
            out = []
            for token in sentence:
                out.append(
                    f"_\t{sentence.id}\t{token.id-1}\t{token.form}\t_\t_\t_\t")
                if not negs:
                    out.append("***\n")
                else:
                    out_neg = []
                    for neg in negs.values():
                        # --- cue column ---
                        if token.id in neg["Cue"]:
                            if token.form not in no_affix:
                                # Affixal cue: only the affix substring is the cue.
                                form, span = affixer(token.form)
                            else:
                                form = token.form
                            out_neg.append(form)
                        else:
                            out_neg.append("_")
                        # --- scope column ---
                        if token.id in neg["Scope"] and \
                                token.id not in neg["Cue"]:
                            # cue in other negation's scope
                            form = token.form
                            out_neg.append(form)
                        elif token.id in neg["Cue"]:
                            # Affixal cue: the non-affix remainder of the word
                            # belongs to the scope.
                            form, (start, end) = affixer(token.form)
                            if token.form in no_affix:
                                form = "_"
                            elif start == 0 and end != len(
                                    token.form):  # prefix
                                # print(sentence.id, end="\t")
                                # print("prefix", token.form, form, span, neg)
                                # print([(t.id, t.form) for t in sentence])
                                form = token.form[end:]
                            elif start != 0 and end == len(
                                    token.form):  # suffix
                                # print(sentence.id, end="\t")
                                # print("suffix", token.form, form, span, neg)
                                # print([(t.id, t.form) for t in sentence])
                                form = token.form[:start]
                            else:
                                form = "_"
                            out_neg.append(form)
                        else:
                            out_neg.append("_")
                        out_neg.append("_")  # event remnant
                    out.append("\t".join(out_neg))
                    out.append("\n")
            print("".join(out), file=fh_out)
# NOTE(review): this chunk begins mid-function — the lines below up to the
# __main__ guard are the tail of a confusion-matrix routine (apparently
# `confuse(gms, pms, i2w)`, called below) whose def line and the definitions
# of n/gl/pl/C lie outside this chunk; indentation here is reconstructed.
for i in range(n):
    for j in range(n):
        # Accumulate gold-label x predicted-label counts cell by cell.
        C[int(gl[i, j]), int(pl[i, j])] += 1
print(C)
for i in range(len(C)):
    # Per-label report: label name followed by its F-score row.
    print(i2w[i])
    fscore(i, C)
    #for j in range(len(C)):
    #    print("\t", i2w[j], C[i,j])


if __name__ == "__main__":
    import col_data as cd
    import vocab as vcb
    import sys
    # Reuse a cached vocabulary if present; otherwise rebuild it from the
    # training file given as argv[1].
    try:
        with open("vocabs.pk", "rb") as fh:
            vocabs = pickle.load(fh)
    except FileNotFoundError:
        train = cd.read_col_data(sys.argv[1])
        _vocabs = vcb.make_vocabs(train, 0)
        vocabs = vcb.Vocabs(*_vocabs)
    # argv[2] = gold column file, argv[3] = predicted column file.
    gold = cd.read_col_data(sys.argv[2])
    pred = cd.read_col_data(sys.argv[3])
    gms = [g.make_matrix("scope", True, vocabs.scoperels.w2i) for g in gold]
    pms = [p.make_matrix("scope", True, vocabs.scoperels.w2i) for p in pred]
    confuse(gms, pms, vocabs.scoperels.i2w)
def run_parser(args):
    """Top-level driver: seed, build/load model and vocabularies, train, predict.

    Mutates *args* in place (device, td/ot/pt target-id maps, helpers, dir).
    """
    # For now, assume there always is train, val, and glove data
    if args.seed == -1:
        args.seed = np.random.randint(1234567890)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device("cuda:0" if torch.cuda.is_available()
                          and not args.force_cpu else "cpu")
    print(device)
    args.device = device
    if torch.cuda.is_available():
        print(torch.cuda.get_device_capability(device))
    # Map target-style names to integer ids used throughout the model.
    args.td = {None: 0, "syn": 1, "sem": 2, "cue": 3, "scope": 4, "scope-": 5}
    args.ot = args.td[args.other_target_style]
    args.pt = args.td[args.target_style]
    args.helpers = None
    if args.help_style:
        args.helpers = [args.td[x] for x in args.help_style.split(",")]
    if not args.dir.endswith("/"):
        args.dir += "/"
    if args.load:
        # Resume from an existing experiment directory: reuse its vocabulary.
        with open(args.dir + "vocabs.pk", "rb") as fh:
            vocabs = pickle.load(fh)
        #args.vocabs = vocabs
        model = ModelInteractor.factory(args, vocabs)
        model.load(args.load)
    else:
        # Fresh run: build (or load a pre-built) vocabulary and cache it.
        sentences = cd.read_col_data(args.train)
        if args.vocab is not None:
            with open(args.vocab, "rb") as fh:
                vocabs = pickle.load(fh)
        else:
            _vocabs = make_vocabs(sentences, 0)
            vocabs = Vocabs(*_vocabs)
        with open(args.dir + "vocabs.pk", "wb") as fh:
            pickle.dump(vocabs, fh)
        #args.vocabs = vocabs
        model = ModelInteractor.factory(args, vocabs)
    if args.recycle is not None:
        # Warm-start selected layers from a previously trained experiment.
        with open(args.recycle + "vocabs.pk", "rb") as fh:
            other_vocabs = pickle.load(fh)
        with open(args.recycle + "settings.json") as fh:
            other_settings = json.load(fh)
        other_settings = Namespace(**other_settings)
        other_settings.device = args.device
        other = ModelInteractor.factory(other_settings, other_vocabs)
        other.load(args.recycle + "best_model.save")
        model.upd_from_other(other, *args.recycle_layers.split(","))
    if args.freeze is not None:
        model.freeze_params(*args.freeze.split(","))
    # Train either on a fresh run (no --load) or when continuing (--load --cont).
    if (args.load and args.cont) or args.load is None:
        model.train()
        # load the best_model.save instead of using the current one
        model = ModelInteractor.factory(args, vocabs)
        model.load(args.dir + "best_model.save")
    # NOTE(review): indentation of the final test-set predict call is
    # reconstructed from a line-mangled source — confirm whether it should
    # run unconditionally or only after training.
    if (args.load and args.cont) or args.load is None:
        predict(model, args, args.val, args.elmo_dev, vocabs)
        predict(model, args, args.predict_file, args.elmo_test, vocabs)
return len(self.rels)  # NOTE(review): tail of a method (presumably __len__) whose def line is outside this chunk

def __getitem__(self, index):
    # Index straight into the underlying relation list.
    return self.rels[index]

def __setitem__(self, index, value):
    self.rels[index] = value

if __name__ == "__main__":
    import sys
    import col_data as cd
    import pickle
    # argv[1] = output pickle path; argv[2:] = input column files.
    sentences = []
    for fn in sys.argv[2:]:
        sentences.extend(cd.read_col_data(fn))
    #sentences = cd.read_col_data(sys.argv[1])
    forms, norms, lemmas, uposs, xposs, synrels, semrels, chars, scoperels = make_vocabs(
        sentences)
    # Report vocabulary sizes for a quick sanity check.
    print([
        len(v.w2i) for v in [
            forms, norms, lemmas, uposs, xposs, synrels, semrels, chars,
            scoperels
        ]
    ])
    vocabs = Vocabs(forms, norms, lemmas, uposs, xposs, synrels, semrels,
                    chars, scoperels)
    #print(synrels.w2i, semrels.w2i, scoperels.w2i)
    with open(sys.argv[1], "wb") as fh:
        pickle.dump(vocabs, fh)
for scope in scopes.values(): scope = sorted(scope) #print(scope) for i, j in enumerate(scope): if i < len(scope) - 1: next_n = scope[i + 1] dist = next_n - j #print(dist) if dist > 1: return True return False if __name__ == "__main__": gold = dict([(l.id, l) for l in cd.read_col_data( "../data/neg_graphs/point_to_root/test.conllu")]) pred = dict([(l.id, l) for l in cd.read_col_data( "../experiments/point_to_root/2/test.conllu.pred")]) gold_neg = get_only_negated(gold) pred_neg = get_only_negated(pred) # which cues in gold but not predicted? print("Which cues in gold but not predicted?") gmissed, gsids = which_gold_cues_are_missed(gold_neg, pred) for cue, count in gmissed.most_common(): print("-- {}:{}".format(cue, count)) # --subquestion: does the model EVER predict these cues? never_predicted = [cue for cue in gmissed] for sent_id, sent in pred_neg.items():
# Find which experiments have been run #experiment_names = set(name_map.keys()) #experiments_run = set(os.listdir(args.preddir)) #to_check = experiment_names.intersection(experiments_run) for setup in args.experiments: metric = [] metric.append("") metric.append(name_map[setup]) #goldfile = os.path.join(args.golddir, setup, "test.conllu") #predfile = os.path.join(args.preddir, setup, "test.conllu.pred") goldfile = os.path.join(args.golddir, setup, "dev.conllu") predfile = os.path.join(args.preddir, setup, "dev.conllu.pred") gold = list(cd.read_col_data(goldfile)) pred = list(cd.read_col_data(predfile)) for label in ["holder", "targ", "exp"]: prec, rec, f1 = span_f1(gold, pred, mapping, test_label=label) metric.append(f1 * 100) #print("{0}: {1:.1f}".format(label, f1 * 100)) lgold = read_labeled(goldfile) lpred = read_labeled(predfile) ugold = read_unlabeled(goldfile) upred = read_unlabeled(predfile) #print("Targeted F1") f1 = targeted_f1(lgold, lpred) metric.append(f1 * 100)