def convert_syn_to_seq(line_data_dict):
    sent_type = "syn"
    sent_no = 0
    res = []
    for line_data_key in sorted(line_data_dict.keys()):
        for line_data in line_data_dict[line_data_key]:
            words = clean_line(line_data[0])
            index = int(line_data[1]) - 1
            if words[index] == "None":
                print(words, index)
                continue
            # only patterns flagged in PATTERN_DICT get the target word tagged as metaphoric
            if PATTERN_DICT[line_data_key] == 1:
                tags = [0 if i != index else 1 for i in range(len(words))]
            else:
                tags = [0] * len(words)
            tagged_words = [words[i] if tags[i] == 0 else "M_" + words[i] for i in range(len(words))]
            poses = [POS_DICT[p[1]] for p in SDP.tag_sentence(words)]
            res.append((sent_type, str(sent_no), " ".join(words), tags, poses, " ".join(tagged_words), sent_type))
            sent_no += 1
            if sent_no % 10 == 0:
                print(sent_no)
    return res
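# Illustrative sketch (not from the original source): convert_syn_to_seq appears to expect
# a dict keyed by pattern name, where each value is a list of (sentence, 1-based target
# index) pairs, and the key must also appear in PATTERN_DICT. The key and sentence below
# are hypothetical.
example_line_data = {"push-dobj": [("they push the argument further", "2")]}
seq_rows = convert_syn_to_seq(example_line_data)
# each row: (sent_type, sent_no, sentence, tags, POS ids, tagged sentence, sent_type)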
def parse_and_save(self, output):
    res = {}
    c = 1
    for sent in self.instances:
        try:
            parse = list(SDP.parse_sentence(sent.text()))[0].to_conll(style=10)
            try:
                # prefer a key that includes the source file; fall back to the bare id
                res[sent.source_file + "-" + sent.id] = parse
            except:
                res[sent.id] = parse
        except Exception as e:
            print(e, sent.text())
        if c % 100 == 0:
            print(str(c) + " done of " + str(len(self.instances)))
        c += 1
    json.dump(res, open(output + ".json", "w"))
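# Hypothetical usage sketch: parse_and_save is assumed to be a method on a corpus-like
# object exposing .instances; it writes one CoNLL parse per instance to <output>.json.
# corpus = Corpus(...)                  # hypothetical constructor
# corpus.parse_and_save("vua_parses")   # produces vua_parses.json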
def get_corpus_frames(corpus):
    fp = FramePredictor()
    data = {}
    count = 0
    for sentence in corpus.instances:
        clauses = SDP.trim_clauses(sentence.text())
        count += 1
        for c in clauses:
            for w in sentence.words:
                if w.text == c[0][1]:
                    if w.lemma not in data:
                        data[w.lemma] = {"L": {k: 0 for k in fp.model.classes_},
                                         "M": {k: 0 for k in fp.model.classes_}}
                    pred = fp.predict(c[1], vnc=w.vnc)
                    if w.met != "N":
                        if pred not in data[w.lemma]["M"]:
                            data[w.lemma]["M"][pred] = 0
                        data[w.lemma]["M"][pred] += 1
                    else:
                        if pred not in data[w.lemma]["L"]:
                            data[w.lemma]["L"][pred] = 0
                        data[w.lemma]["L"][pred] += 1
    # print all the possible frames
    print("-," + ",".join([cl for cl in sorted(fp.model.classes_)]))
    # sum up the counts for each frame
    l_frame_sums = []
    m_frame_sums = []
    for k in sorted(fp.model.classes_):
        l_frame_sums.append(sum([data[key]["L"][k] for key in data.keys()]))
        m_frame_sums.append(sum([data[key]["M"][k] for key in data.keys()]))
    # print sum of the literal and metaphor counts by frame
    print(str(sum(l_frame_sums)) + "," + ",".join([str(s) for s in l_frame_sums]))
    print(str(sum(m_frame_sums)) + "," + ",".join([str(s) for s in m_frame_sums]))
    # sort keys by their total counts, then print their sums and counts by frame
    for key in sorted(list(data.keys()),
                      key=lambda x: sum(data[x]["L"].values()) + sum(data[x]["M"].values()),
                      reverse=True):
        print(key)
        print(str(sum(data[key]["L"].values())) + "," + ",".join([str(data[key]["L"][v]) for v in sorted(fp.model.classes_)]))
        print(str(sum(data[key]["M"].values())) + "," + ",".join([str(data[key]["M"][v]) for v in sorted(fp.model.classes_)]))
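# Hypothetical usage sketch: get_corpus_frames prints CSV-style rows (frame classes as
# columns, literal "L" and metaphoric "M" counts per verb lemma), so the output is
# typically redirected to a file rather than captured as a return value.
# corpus = Corpus(...)          # hypothetical constructor
# get_corpus_frames(corpus)     # e.g. python script.py > frame_counts.csv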
def convert_vn_to_seq(elmo_lines):
    sent_type = "vn"
    sent_no = 0
    all_pos = set()
    csv_out = []
    d = {}
    for line in elmo_lines:
        okay = True
        sent_words, tags, tagged_words = [], [], []
        line_data = line.split()
        for word in line_data:
            if re.match(VN_RE, word):
                word, vnc = word.split("_")
                vnc = vnc.split("-")[0]
                if vnc in METS:
                    tags.append(1)
                    tagged_words.append("M_" + word)
                elif vnc in LITS:
                    tags.append(0)
                    tagged_words.append(word)
                else:
                    okay = False
                    break
            else:
                tags.append(0)
                tagged_words.append(word)
            sent_words.append(word)
        if okay:
            poses = [POS_DICT[p[1]] for p in SDP.tag_sentence(line_data)]
            all_pos |= set(poses)
            csv_out.append((sent_type, str(sent_no), " ".join(sent_words), tags, poses, " ".join(tagged_words), sent_type))
            if sent_no % 10 == 0:
                print(sent_no)
            sent_no += 1
    write_output(csv_out, "VUA_seq_formatted_vn_extra.csv")
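# Illustrative sketch (the input format is an assumption): each line appears to hold
# space-separated tokens, with the target verb carrying a VerbNet class suffix matched
# by VN_RE, e.g. "push_9-1"; the class id (here "9") must be in METS or LITS for the
# sentence to be kept. The example line is hypothetical.
example_lines = ["they push_9-1 the argument further"]
convert_vn_to_seq(example_lines)  # writes VUA_seq_formatted_vn_extra.csv via write_output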
def parse_frames(self, vn, load=True, secondary_information=False, filename="vn_frame_parses.json"):
    if load:
        self.frame_parses = json.load(open(filename))
    else:
        res = {}
        for vnc in vn.get_verb_classes():
            if vnc.numerical_ID not in res:
                res[vnc.numerical_ID] = {}
            for frame in vnc.frames:
                # take out secondary information
                if not secondary_information:
                    label_data = [re.split("[._-]", e)[0] for e in frame.primary
                                  if "recipient" not in e and "topic" not in e and "attribute" not in e]
                else:
                    label_data = frame.primary
                # condense wh- words
                label_data = " ".join(["WH" if w.startswith("wh") else w for w in label_data])
                for e in frame.examples:
                    parse = [item.split("\t") for item in
                             list(SDP.parse_sentence(e))[0].to_conll(style=4).split("\n")]
                    if label_data not in res[vnc.numerical_ID]:
                        res[vnc.numerical_ID][label_data] = []
                    res[vnc.numerical_ID][label_data].append(parse)
        json.dump(res, open(filename, "w"))
        self.frame_parses = res
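# Hypothetical usage sketch: parse_frames is assumed to be a method on an object that
# caches VerbNet frame parses, and vn is assumed to be a VerbNet API object whose
# classes expose .numerical_ID, .frames, .primary, and .examples.
# builder.parse_frames(vn, load=False)   # parse the examples and write vn_frame_parses.json
# builder.parse_frames(vn)               # later runs just reload the cached JSON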
def match_line(patterns, line, verb):
    parse = SDP.parse_sentence(line)
    verb_node = None
    conll_data = [p.split() for p in list(parse)[0].to_conll(style=10).split("\n")]
    # find the CoNLL node for the target verb
    for word_data in conll_data:
        if len(word_data) > 1 and lemmatizer.lemmatize(word_data[1], "v") == verb and word_data[3][0] == "V":
            verb_node = word_data
    if not verb_node:
        return {}
    neg, nmod, adj, pro_vp, adv, nsubj, wh, dobj, subj, to_phrase, passive, verb_compliment, pn_subj_of_complement, to_be_adj, that_comp, by_np, to_be = False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False
    to_np, pro_found, to_found, be_found, that_found, by_found, last_dobj = False, False, False, False, False, False, False
    prep_results = {p: False for p in PREPS}
    for i in range(len(conll_data)):
        word_data = conll_data[i]
        if len(word_data) > 1:
            if word_data[1] == "to":
                to_found = word_data[-4]
            if word_data[1] == "be":
                be_found = word_data[-4]
            if word_data[1] == "that":
                that_found = word_data[-4]
            if word_data[1] == "by":
                by_found = word_data[-4]
            if word_data[3] in ["PRP", "PRP$"]:
                pro_found = word_data[-4]
            if word_data[-4] == verb_node[0]:
                # a direct dependent of the target verb
                if "dobj" == word_data[-3]:
                    dobj = True
                    last_dobj = word_data
                if "nsubj" == word_data[-3]:
                    nsubj = True
                if word_data[3][0] == "W":
                    wh = True
                if word_data[0] == to_found:
                    to_phrase = True
                    if word_data[3][0] == "N":
                        to_np = True
                if word_data[0] == that_found:
                    that_comp = True
                if word_data[0] == by_found:
                    by_np = True
                if "pass" in word_data[-3]:
                    passive = True
                if word_data[3] == "RB":
                    adv = True
                if word_data[3] == "JJ":
                    adj = True
                if word_data[3][0] == "V" and word_data[0] == pro_found:
                    pro_vp = True
                if word_data[-3] == "nmod":
                    nmod = True
                if word_data[-3] == "neg":
                    neg = True
            if last_dobj and word_data[-4] == last_dobj[0]:
                if word_data[0] == to_found and word_data[0] == be_found:
                    to_be = True
                    if word_data[3][0] == "J":
                        to_be_adj = True
            for p in PREPS:
                try:
                    if p == word_data[1] and (word_data[-4] == verb_node[0] or conll_data[int(word_data[-4]) - 1][-4] == verb_node[0]):
                        prep_results[p] = True
                except IndexError as e:
                    print(e)
                    continue
    res = {}
    for pattern_set in patterns:
        matches = True
        if not pattern_set or len(pattern_set) < 1:
            raise Exception("We don't have a rule for this pattern: " + pattern_set)
        for pattern in pattern_set.split():
            if pattern not in RECOGNIZED_PATTERNS:
                raise Exception("We don't have a rule for this pattern: " + pattern)
            if pattern == "to" and not to_phrase:
                matches = False
            if pattern == "~to" and to_phrase:
                matches = False
            if pattern == "pass" and not passive:
                matches = False
            if pattern == "~pass" and passive:
                matches = False
            if pattern == "dobj" and not dobj:
                matches = False
            if pattern == "~dobj" and dobj:
                matches = False
            if pattern == "nsubj" and not nsubj:
                matches = False
            if pattern == "~nsubj" and nsubj:
                matches = False
            if pattern == "to-be-adj" and not to_be_adj:
                matches = False
            if pattern == "that-comp" and not that_comp:
                matches = False
            if pattern == "by-np" and not by_np:
                matches = False
            if pattern == "wh" and not wh:
                matches = False
            if pattern == "to-be" and not to_be:
                matches = False
            if pattern == "adv" and not adv:
                matches = False
            if pattern == "adj" and not adj:
                matches = False
            if pattern == "pro-vp" and not pro_vp:
                matches = False
            if pattern == "nmod" and not nmod:
                matches = False
            if pattern == "to-np" and not to_np:
                matches = False
            if pattern == "neg" and not neg:
                matches = False
            for p in PREPS:
                if pattern == p and not prep_results[p]:
                    matches = False
        if matches:
            res[verb + "-" + pattern_set] = line.strip() + ";;" + verb_node[0] + "\n"
    return res
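# Illustrative sketch (pattern names are assumptions drawn from the checks above, and
# must be listed in RECOGNIZED_PATTERNS): match_line returns
# {"<verb>-<pattern_set>": "<line>;;<verb index>\n"} for every pattern set whose
# space-separated patterns all hold in the dependency parse of the line.
matches = match_line(["dobj ~pass", "nsubj to"], "she pushed the door open", "push")
for key, value in matches.items():
    print(key, value.strip())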
# FITNESS FOR A PARTICULAR PURPOSE.
#
# See the GNU General Public License at <http://www.gnu.org/licenses/gpl.html>
# for more details.
import sys
import commands
import os

sys.path += ["../../../../ERT-Statoil/OldStuff/python/ctypes/SDP"]
import SDP

local_ert = "ert"
svn_version = commands.getoutput("svnversion")
try:
    numeric = int(svn_version)
except ValueError:
    sys.exit("Will not install svn version:%s - must have a pure checkout" % svn_version)

svn_ert = "%s_%s" % (local_ert, svn_version)

(SDP_ROOT, RH_version) = SDP.get_SDP_ROOT()
target_file = "%s/bin/ert_release/%s" % (SDP_ROOT, svn_ert)
ert_link = "%s/bin/ert_latest_and_greatest" % SDP_ROOT

SDP.install_file(local_ert, target_file)
SDP.install_link(target_file, ert_link)
# See the GNU General Public License at <http://www.gnu.org/licenses/gpl.html>
# for more details.
import sys
import os.path
import os
import re
from stat import *
import shutil

sys.path += ["../../../../python/ctypes/SDP"]
import SDP

#################################################################

(SDP_ROOT, RH) = SDP.get_SDP_ROOT()
python_root = "%s/lib/python" % SDP_ROOT
lib_root = "%s/lib/python/lib" % SDP_ROOT

SDP.install_file("../../../../libutil/slib/libutil.so", "%s/libutil.so" % lib_root, strict_exists=False)
SDP.install_file("../../../../libecl/slib/libecl.so", "%s/libecl.so" % lib_root, strict_exists=False)
SDP.install_file("../../../../librms/slib/librms.so", "%s/librms.so" % lib_root, strict_exists=False)
SDP.install_file("../../../../libenkf/slib/libenkf.so", "%s/libenkf.so" % lib_root, strict_exists=False)
SDP.install_file("../../../../libconfig/slib/libconfig.so", "%s/libconfig.so" % lib_root, strict_exists=False)
SDP.install_file("../../../../libjob_queue/slib/libjob_queue.so", "%s/libjob_queue.so" % lib_root, strict_exists=False)
SDP.install_file("../../../../libplot/slib/libplot.so", "%s/libplot.so" % lib_root, strict_exists=False)
SDP.install_file("../../../../libsched/slib/libsched.so", "%s/libsched.so" % lib_root, strict_exists=False)

SDP.make_dir("%s/gert" % python_root)
SDP.install_path("code", "%s/gert" % python_root, root="../", extensions=["py"])
SDP.install_path("help", "%s/gert" % python_root, root="../")