Example No. 1
def convert_syn_to_seq(line_data_dict):
    """Convert syntactic-pattern line data into tagged sequence rows."""
    sent_type = "syn"
    sent_no = 0
    res = []

    for line_data_key in sorted(line_data_dict.keys()):
        for line_data in line_data_dict[line_data_key]:
            words = clean_line(line_data[0])
            index = int(line_data[1]) - 1  # incoming index is 1-based
            if words[index] == "None":     # skip placeholder tokens
                print(words, index)
                continue
            # patterns flagged 1 in PATTERN_DICT mark the target word
            if PATTERN_DICT[line_data_key] == 1:
                tags = [0 if i != index else 1 for i in range(len(words))]
            else:
                tags = [0] * len(words)

            # prefix marked words with "M_"
            tagged_words = [words[i] if tags[i] == 0 else "M_" + words[i]
                            for i in range(len(words))]

            poses = [POS_DICT[p[1]] for p in SDP.tag_sentence(words)]
            res.append((sent_type, str(sent_no), " ".join(words), tags, poses,
                        " ".join(tagged_words), sent_type))
            sent_no += 1
            if sent_no % 10 == 0:  # progress indicator
                print(sent_no)
    return res
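
The input here is a mapping from pattern keys (which must also appear in PATTERN_DICT) to lists of (sentence, 1-based target index) pairs; clean_line, PATTERN_DICT, POS_DICT, and SDP are module globals not shown in this example. A minimal call sketch, with a hypothetical pattern key:

# Minimal usage sketch. "verb-dobj" is a hypothetical pattern key; real keys
# come from PATTERN_DICT, which is defined elsewhere in the module.
line_data_dict = {"verb-dobj": [("He kicked the ball", "2")]}
rows = convert_syn_to_seq(line_data_dict)  # list of sequence-row tuples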
Example No. 2
def parse_and_save(self, output):
    """Dependency-parse every instance and dump the parses to a JSON file."""
    res = {}
    c = 1
    for sent in self.instances:
        try:
            parse = list(SDP.parse_sentence(
                sent.text()))[0].to_conll(style=10)
            try:
                res[sent.source_file + "-" + sent.id] = parse
            except AttributeError:  # instance has no source_file
                res[sent.id] = parse
        except Exception as e:
            print(e, sent.text())
        if c % 100 == 0:  # progress indicator
            print(str(c) + " done of " + str(len(self.instances)))
        c += 1
    with open(output + ".json", "w") as f:
        json.dump(res, f)
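
The method keys each parse by source file and sentence id when the instance has a source_file attribute, falling back to the id alone. A hedged call sketch, assuming a corpus-like object that exposes this method:

# Writes "parses.json", mapping instance ids to CoNLL (style=10) parse strings.
corpus.parse_and_save("parses")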
Example No. 3
def get_corpus_frames(corpus):
    """Count predicted frames per verb lemma, split into literal/metaphoric uses."""
    fp = FramePredictor()
    data = {}
    for sentence in corpus.instances:
        clauses = SDP.trim_clauses(sentence.text())
        for c in clauses:
            for w in sentence.words:
                if w.text == c[0][1]:  # word object matching this clause's target token
                    if w.lemma not in data:
                        data[w.lemma] = {"L": {k: 0 for k in fp.model.classes_},
                                         "M": {k: 0 for k in fp.model.classes_}}

                    pred = fp.predict(c[1], vnc=w.vnc)
                    # "M" bucket for metaphoric uses, "L" for literal ones
                    if w.met != "N":
                        data[w.lemma]["M"][pred] = data[w.lemma]["M"].get(pred, 0) + 1
                    else:
                        data[w.lemma]["L"][pred] = data[w.lemma]["L"].get(pred, 0) + 1

    # print all the possible frames
    print("-," + ",".join(sorted(fp.model.classes_)))

    # sum up the counts for each frame
    l_frame_sums = []
    m_frame_sums = []
    for k in sorted(fp.model.classes_):
        l_frame_sums.append(sum(data[key]["L"][k] for key in data))
        m_frame_sums.append(sum(data[key]["M"][k] for key in data))

    # print sums of the literal and metaphoric counts by frame
    print(str(sum(l_frame_sums)) + "," + ",".join(str(s) for s in l_frame_sums))
    print(str(sum(m_frame_sums)) + "," + ",".join(str(s) for s in m_frame_sums))

    # sort lemmas by their total counts, then print their sums and counts by frame
    for key in sorted(data.keys(),
                      key=lambda x: sum(data[x]["L"].values()) + sum(data[x]["M"].values()),
                      reverse=True):
        print(key)
        print(str(sum(data[key]["L"].values())) + "," +
              ",".join(str(data[key]["L"][v]) for v in sorted(fp.model.classes_)))
        print(str(sum(data[key]["M"].values())) + "," +
              ",".join(str(data[key]["M"][v]) for v in sorted(fp.model.classes_)))
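
The corpus interface is only implied here: each instance must expose text() and words, and each word must carry text, lemma, vnc, and a met flag ("N" marking literal use). A sketch of the assumed per-word shape:

# Hedged sketch of the word attributes this function reads; the real corpus
# classes are defined elsewhere.
from collections import namedtuple
Word = namedtuple("Word", ["text", "lemma", "vnc", "met"])
w = Word(text="kicked", lemma="kick", vnc="18.1", met="N")  # "N" = literal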
Example No. 4
def convert_vn_to_seq(elmo_lines):
    """Convert VerbNet-annotated lines into tagged sequence rows and write a CSV."""
    sent_type = "vn"
    sent_no = 0
    all_pos = set()  # all POS tags seen, kept for inspection
    csv_out = []
    for line in elmo_lines:
        okay = True
        sent_words, tags, tagged_words = [], [], []

        line_data = line.split()
        for word in line_data:
            if re.match(VN_RE, word):
                # annotated token: split off the VerbNet class, keep the name before the dash
                word, vnc = word.split("_")
                vnc = vnc.split("-")[0]

                if vnc in METS:    # metaphoric class
                    tags.append(1)
                    tagged_words.append("M_" + word)
                elif vnc in LITS:  # literal class
                    tags.append(0)
                    tagged_words.append(word)
                else:              # unknown class: drop the whole sentence
                    okay = False
                    break
            else:
                tags.append(0)
                tagged_words.append(word)

            sent_words.append(word)

        if okay:
            poses = [POS_DICT[p[1]] for p in SDP.tag_sentence(line_data)]
            all_pos |= set(poses)
            csv_out.append((sent_type, str(sent_no), " ".join(sent_words), tags,
                            poses, " ".join(tagged_words), sent_type))
            if sent_no % 10 == 0:  # progress indicator
                print(sent_no)
            sent_no += 1
    write_output(csv_out, "VUA_seq_formatted_vn_extra.csv")
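
Annotated tokens are expected in word_class-id form (whatever VN_RE matches); METS and LITS are the sets of metaphoric and literal VerbNet classes, built elsewhere. A hedged input sketch with a hypothetical class label:

# "hit-18.1" is a hypothetical VerbNet annotation; whether "hit" lands in
# METS or LITS depends on how those sets are populated elsewhere.
convert_vn_to_seq(["He kicked_hit-18.1 the ball"])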
Example No. 5
    def parse_frames(self, vn, load=True, secondary_information=False,
                     filename="vn_frame_parses.json"):
        if load:
            with open(filename) as f:
                self.frame_parses = json.load(f)
        else:
            res = {}
            for vnc in vn.get_verb_classes():
                if vnc.numerical_ID not in res:
                    res[vnc.numerical_ID] = {}
                for frame in vnc.frames:
                    # take out secondary information
                    if not secondary_information:
                        label_data = [re.split("[._-]", e)[0] for e in frame.primary
                                      if "recipient" not in e and "topic" not in e
                                      and "attribute" not in e]
                    else:
                        label_data = frame.primary
                    # condense wh- words
                    label_data = " ".join(["WH" if w.startswith("wh") else w
                                           for w in label_data])

                    for e in frame.examples:
                        # parse each example sentence, keeping the CoNLL fields per token
                        parse = [item.split("\t") for item in
                                 list(SDP.parse_sentence(e))[0].to_conll(style=4).split("\n")]
                        if label_data not in res[vnc.numerical_ID]:
                            res[vnc.numerical_ID][label_data] = []
                        res[vnc.numerical_ID][label_data].append(parse)
            with open(filename, "w") as f:
                json.dump(res, f)
            self.frame_parses = res
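
The vn argument is assumed to be a VerbNet API object whose get_verb_classes() yields classes carrying numerical_ID, frames, and per-frame primary labels and examples. A hedged call sketch:

# "obj" stands in for whatever object this method belongs to; build the frame
# parses from scratch instead of loading the cached JSON.
obj.parse_frames(vn, load=False, filename="vn_frame_parses.json")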
Example No. 6
def match_line(patterns, line, verb):
    parse = SDP.parse_sentence(line)
    verb_node = None
    # CoNLL rows: [0]=token id, [1]=form, [3]=POS tag, [-4]=head id, [-3]=relation
    conll_data = [p.split() for p in list(parse)[0].to_conll(style=10).split("\n")]

    # find the parse node for the target verb
    for word_data in conll_data:
        if (len(word_data) > 1 and lemmatizer.lemmatize(word_data[1], "v") == verb
                and word_data[3][0] == "V"):
            verb_node = word_data

    if not verb_node:
        return {}

    # syntactic feature flags, filled in while walking the parse
    neg = nmod = adj = pro_vp = adv = nsubj = wh = dobj = False
    to_phrase = passive = to_be_adj = that_comp = by_np = to_be = to_np = False
    pro_found = to_found = be_found = that_found = by_found = last_dobj = False

    prep_results = {p: False for p in PREPS}

    for word_data in conll_data:
        if len(word_data) > 1:
            # remember the head ids of function words, so we can later tell
            # which word each of them attaches to
            if word_data[1] == "to":
                to_found = word_data[-4]
            if word_data[1] == "be":
                be_found = word_data[-4]
            if word_data[1] == "that":
                that_found = word_data[-4]
            if word_data[1] == "by":
                by_found = word_data[-4]
            if word_data[3] in ["PRP", "PRP$"]:
                pro_found = word_data[-4]

            if word_data[-4] == verb_node[0]:      # a direct dependent of the verb
                if word_data[-3] == "dobj":
                    dobj = True
                    last_dobj = word_data
                if word_data[-3] == "nsubj":
                    nsubj = True
                if word_data[3][0] == "W":
                    wh = True
                if word_data[0] == to_found:
                    to_phrase = True
                    if word_data[3][0] == "N":
                        to_np = True
                if word_data[0] == that_found:
                    that_comp = True
                if word_data[0] == by_found:
                    by_np = True
                if "pass" in word_data[-3]:
                    passive = True
                if word_data[3] == "RB":
                    adv = True
                if word_data[3] == "JJ":
                    adj = True
                if word_data[3][0] == "V" and word_data[0] == pro_found:
                    pro_vp = True
                if word_data[-3] == "nmod":
                    nmod = True
                if word_data[-3] == "neg":
                    neg = True

            # "to be ..." complement hanging off the last direct object
            if last_dobj and word_data[-4] == last_dobj[0]:
                if word_data[0] == to_found and word_data[0] == be_found:
                    to_be = True
                    if word_data[3][0] == "J":
                        to_be_adj = True

            # prepositions attached to the verb or to one of its dependents
            for p in PREPS:
                try:
                    if p == word_data[1] and (word_data[-4] == verb_node[0]
                            or conll_data[int(word_data[-4]) - 1][-4] == verb_node[0]):
                        prep_results[p] = True
                except IndexError as e:
                    print(e)
                    continue

    # map each recognized pattern to the flag it requires ("~" means absent)
    checks = {
        "to": to_phrase, "~to": not to_phrase,
        "pass": passive, "~pass": not passive,
        "dobj": dobj, "~dobj": not dobj,
        "nsubj": nsubj, "~nsubj": not nsubj,
        "to-be-adj": to_be_adj, "that-comp": that_comp, "by-np": by_np,
        "wh": wh, "to-be": to_be, "adv": adv, "adj": adj, "pro-vp": pro_vp,
        "nmod": nmod, "to-np": to_np, "neg": neg,
    }

    res = {}
    for pattern_set in patterns:
        if not pattern_set:
            raise Exception("We don't have a rule for this pattern: " + str(pattern_set))

        matches = True
        for pattern in pattern_set.split():
            if pattern not in RECOGNIZED_PATTERNS:
                raise Exception("We don't have a rule for this pattern: " + pattern)
            if pattern in checks and not checks[pattern]:
                matches = False
            if pattern in prep_results and not prep_results[pattern]:
                matches = False

        if matches:
            res[verb + "-" + pattern_set] = line.strip() + ";;" + verb_node[0] + "\n"

    return res
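
A pattern set is a space-separated conjunction of feature names, with a "~" prefix negating a feature; PREPS, RECOGNIZED_PATTERNS, lemmatizer, and SDP are assumed module globals. A hedged usage sketch:

# Matches only if the parse shows a direct object and no passive; returns
# {} when no pattern set matches.
hits = match_line(["dobj ~pass"], "He kicked the ball.", "kick")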
Example No. 7
#  FITNESS FOR A PARTICULAR PURPOSE.
#
#  See the GNU General Public License at <http://www.gnu.org/licenses/gpl.html>
#  for more details.


import sys
import commands
import os
sys.path += ["../../../../ERT-Statoil/OldStuff/python/ctypes/SDP"]
import SDP

local_ert   = "ert"
svn_version = commands.getoutput("svnversion")

try:
    numeric = int(svn_version)
except ValueError:  # svnversion returns e.g. "1234M" for a modified checkout
    sys.exit("Will not install svn version:%s - must have a pure checkout" % svn_version)

svn_ert     = "%s_%s" % (local_ert, svn_version)
(SDP_ROOT, RH_version) = SDP.get_SDP_ROOT()
target_file = "%s/bin/ert_release/%s" % (SDP_ROOT, svn_ert)
ert_link    = "%s/bin/ert_latest_and_greatest" % SDP_ROOT

SDP.install_file(local_ert, target_file)
SDP.install_link(target_file, ert_link)
Example No. 8
#  See the GNU General Public License at <http://www.gnu.org/licenses/gpl.html> 
#  for more details. 


import sys
import os.path
import os
import re
from   stat import *
import shutil
sys.path += ["../../../../python/ctypes/SDP"]
import SDP

#################################################################

(SDP_ROOT , RH) = SDP.get_SDP_ROOT()
python_root = "%s/lib/python" % SDP_ROOT
lib_root    = "%s/lib/python/lib"  % SDP_ROOT

SDP.install_file("../../../../libutil/slib/libutil.so"           , "%s/libutil.so" % lib_root      , strict_exists = False)
SDP.install_file("../../../../libecl/slib/libecl.so"             , "%s/libecl.so" % lib_root       , strict_exists = False)
SDP.install_file("../../../../librms/slib/librms.so"             , "%s/librms.so" % lib_root       , strict_exists = False)
SDP.install_file("../../../../libenkf/slib/libenkf.so"           , "%s/libenkf.so" % lib_root      , strict_exists = False)
SDP.install_file("../../../../libconfig/slib/libconfig.so"       , "%s/libconfig.so" % lib_root    , strict_exists = False)
SDP.install_file("../../../../libjob_queue/slib/libjob_queue.so" , "%s/libjob_queue.so" % lib_root , strict_exists = False)
SDP.install_file("../../../../libplot/slib/libplot.so"           , "%s/libplot.so" % lib_root      , strict_exists = False)
SDP.install_file("../../../../libsched/slib/libsched.so"         , "%s/libsched.so" % lib_root     , strict_exists = False)

SDP.make_dir( "%s/gert" % python_root )
SDP.install_path( "code" , "%s/gert" % python_root  ,  root = "../" , extensions = ["py"])
SDP.install_path( "help" , "%s/gert" % python_root  ,  root = "../" )
Example No. 9
File: install.py  Project: rolk/ert
#  (at your option) any later version.
#
#  ERT is distributed in the hope that it will be useful, but WITHOUT ANY
#  WARRANTY; without even the implied warranty of MERCHANTABILITY or
#  FITNESS FOR A PARTICULAR PURPOSE.
#
#  See the GNU General Public License at <http://www.gnu.org/licenses/gpl.html>
#  for more details.

import sys
import commands
import os
sys.path += ["../../../../ERT-Statoil/OldStuff/python/ctypes/SDP"]
import SDP

local_ert = "ert"
svn_version = commands.getoutput("svnversion")

try:
    numeric = int(svn_version)
except ValueError:  # svnversion returns e.g. "1234M" for a modified checkout
    sys.exit("Will not install svn version:%s - must have a pure checkout" % svn_version)

svn_ert = "%s_%s" % (local_ert, svn_version)
(SDP_ROOT, RH_version) = SDP.get_SDP_ROOT()
target_file = "%s/bin/ert_release/%s" % (SDP_ROOT, svn_ert)
ert_link = "%s/bin/ert_latest_and_greatest" % SDP_ROOT

SDP.install_file(local_ert, target_file)
SDP.install_link(target_file, ert_link)