Example #1
    def __init__(self, loc=MOHX_LOCATION):
        self.instances, self.words = [], []

        # Each data line holds one annotated sentence; skip the header row.
        for c, line in enumerate(open(loc).readlines()[1:]):
            sentence = Corpus.Sentence()
            data = line.split(",")
            sentence.id = str(c)
            word_data = data[3].split()

            # The second-to-last field gives the index of the target word and
            # the last field gives its metaphoricity label.
            for i, token in enumerate(word_data):
                met = "N"
                if i == int(data[-2]):
                    met = "tag-" + data[-1].strip()
                w = Corpus.Word(text=token,
                                met=met,
                                sentence=sentence,
                                index=i)
                sentence.words.append(w)
                self.words.append(w)

            self.instances.append(sentence)

        Corpus.add_dependencies(self.instances, MOHX_DEPS, lex_field=1)
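
A minimal usage sketch for the loader above, assuming this `__init__` belongs to a corpus class (named `MohXCorpus` here purely for illustration) and that `MOHX_LOCATION` and `MOHX_DEPS` point at local copies of the MOH-X data:

corpus = MohXCorpus()  # hypothetical class name wrapping the __init__ above

print(len(corpus.instances), "sentences,", len(corpus.words), "tokens")
# Print the annotated target word of the first sentence.
for w in corpus.instances[0].words:
    if w.met != "N":
        print(w.index, w.text, w.met)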
Example #2
    def __init__(self):
        super().__init__()
        self.instances, self.words = [], []
        lemmatizer = WordNetLemmatizer()
        cur_verb, cluster = "", ""

        for line in open(TROFI_LOCATION).readlines():
            # Section headers of the form ***verb*** introduce a new target verb.
            if re.match(r"\*\*\*[a-z]", line):
                cur_verb = line.split("***")[1]
                continue
            # Cluster marker lines and blank lines separate the literal and
            # nonliteral clusters of the current verb.
            elif "*" in line or not line.strip():
                # Check "nonliteral" first: it also contains the substring "literal".
                if "nonliteral" in line:
                    cluster = "nonliteral"
                elif "literal" in line:
                    cluster = "literal"
                continue

            # Sentence lines are tab-separated: id, label, tokenized text.
            sentence = Corpus.Sentence()
            data = line.strip().split("\t")
            sentence.id = data[0]

            # TroFi labels: N = nonliteral (metaphorical), L = literal, U = unannotated.
            met = ""
            if "N" in data[1]:
                met = "met"
            if "L" in data[1]:
                met = "N"
            if "U" in data[1]:
                met = "?"

            # Tag the token whose verb lemma matches the current section verb;
            # every other token is labelled "N".
            for i, word in enumerate(data[2].split()):
                v_lem = lemmatizer.lemmatize(word, "v")
                cur_met = "N"
                if v_lem == cur_verb:
                    cur_met = "tag-" + met
                w = Corpus.Word(text=word,
                                met=cur_met,
                                sentence=sentence,
                                index=i)
                sentence.words.append(w)
                self.words.append(w)

            self.instances.append(sentence)

        Corpus.add_dependencies(self.instances, TROFI_DEPS, lex_field=1)
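
The file layout this parser expects can be read off the branching above; the sketch below is reconstructed from that logic rather than from the TroFi release notes, so the concrete ids and sentences are illustrative only:

# ***absorb***                      <- verb section header, matched by r"\*\*\*[a-z]"
# *literal cluster*                 <- cluster marker line
# wsj02:1234<TAB>L<TAB>the sponge absorbed the water
# *nonliteral cluster*
# wsj04:5678<TAB>N<TAB>the firm absorbed the losses
#
# Sentence lines are tab-separated (id, label, tokenized text); the label is
# mapped to "met", "N", or "?" and attached to the token whose verb lemma
# matches the section verb.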
Example #3
    def __init__(self, lcc_instance_node):
        super().__init__()
        # Target conceptual-metaphor label attached to this LCC instance.
        self.target_cm = [lcc_instance_node.get('targetConcept')]
        annotations_element = lcc_instance_node.find(".//Annotations")

        # Average the metaphoricity scores over all annotations.
        met_anns = annotations_element.find(".//MetaphoricityAnnotations")
        self.met_score = sum(float(m.get('score'))
                             for m in met_anns) / len(met_anns)

        # Keep source-concept annotations with a non-negative score.
        cm_source_anns = annotations_element.find(".//CMSourceAnnotations")
        self.source_cm = []
        if cm_source_anns is not None:
            self.source_cm = set([(cm.get("sourceConcept"),
                                   float(cm.get("score")))
                                  for cm in cm_source_anns
                                  if float(cm.get('score')) >= 0])

        self.chain = lcc_instance_node.get('chain')
        self.id = lcc_instance_node.get('id')

        # The instance text is split into the current sentence and its context.
        all_text = lcc_instance_node.find(".//TextContent")
        self.current_text = all_text.find(".//Current")
        self.prev_text = all_text.find(".//Prev")
        self.next_text = all_text.find(".//Next")

        # Surface forms of the linguistic-metaphor source and target spans.
        self.source_lm = self.current_text.find(".//LmSource").text.strip()
        self.target_lm = self.current_text.find(".//LmTarget").text.strip()

        # Tokenize the current sentence, tagging the source and target spans.
        i = 0
        for word_group in self.current_text.itertext():
            if word_group.strip() == self.source_lm:
                met = ["source", self.source_cm, self.met_score]
            elif word_group.strip() == self.target_lm:
                met = ["target", self.target_cm, self.met_score]
            else:
                met = ["N", "", ""]

            for w in [
                    w for w in re.findall(r"[\w']+|[.,?!;:\"']", word_group)
                    if w != "="
            ]:
                self.words.append(
                    Corpus.Word(text=w, met=met, index=i, sentence=self))
                i += 1
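
A hedged sketch of how such objects might be built, assuming the class is called `LCCInstance` (a name chosen here for illustration), that the annotation file is parsed with `xml.etree.ElementTree`, and that the instance nodes carry the tag used in the `iter()` call below, which should be checked against the actual LCC schema:

import xml.etree.ElementTree as ET

tree = ET.parse("lcc_annotations.xml")         # illustrative path
instances = [LCCInstance(node)                 # hypothetical class name
             for node in tree.getroot().iter("LmInstance")]  # tag name assumed

for inst in instances[:3]:
    print(inst.id, inst.target_cm, round(inst.met_score, 2))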
Example #4
    def __init__(self, corpus_location):
        self.instances, self.words = [], []
        data = csv.reader(open(corpus_location))
        next(data)  # skip the header row
        for line in data:
            sentence = Corpus.Sentence()
            sentence.id = line[1]

            # The last two columns hold the target-word index and its label.
            index = int(line[-2])
            tag = int(line[-1])

            # Column 3 is the whitespace-tokenized sentence; mark the target
            # word as metaphorical and everything else as "N".
            for i, word in enumerate(line[3].split()):
                met = "N"
                if i == index:
                    met = "met"
                w = Corpus.Word(text=word, sentence=sentence, met=met, index=i)
                sentence.words.append(w)
                self.words.append(w)

            self.instances.append(sentence)
Example #5
def load_vuamc_csv(filename=VUAMC_CSV):
    with codecs.open(filename, encoding="latin-1", errors='replace') as f:
        data = [line for line in csv.reader(f)]

    sentences = []
    all_words = []

    # Skip the header row; each remaining row describes one sentence.
    for line_data in data[1:]:
        if not line_data:
            continue
        sentence = Corpus.Sentence()

        sentence.source_file = line_data[0]

        # Map the BNC source fragment to one of the four VUAMC genres.
        if sentence.source_file in ACADEMIC:
            sentence.domain = "academic"
        elif sentence.source_file in CONVERSATION:
            sentence.domain = "conversation"
        elif sentence.source_file in FICTION:
            sentence.domain = "fiction"
        elif sentence.source_file in NEWS:
            sentence.domain = "news"
        sentence.id = line_data[1]

        words = line_data[2]

        # Tokens are encoded as "pos;;lemma;;text", with an "M_" prefix on the
        # text of metaphorically used tokens.
        j = 0
        for token in words.split():
            w_data = token.split(";;")
            if "M_" in w_data[-1]:
                met = "met"
                word_text = w_data[-1][2:]
            else:
                met = "N"
                word_text = w_data[-1]

            pos = w_data[0]
            lemma = w_data[1]

            # Multi-word units are joined with "_"; emit one Word per part.
            for extra_words in word_text.split("_"):
                # Insert a placeholder for parts that contain no letters,
                # digits, or punctuation, so token indices stay aligned.
                if not set(extra_words).intersection(
                        string.punctuation + string.ascii_letters +
                        string.digits):
                    sentence.words.append(
                        Corpus.Word(text="none",
                                    met="none",
                                    pos="none",
                                    lemma="none",
                                    sentence=sentence,
                                    index=j))
                    j += 1
                    continue

                word = Corpus.Word(text=extra_words,
                                   met=met,
                                   pos=pos,
                                   lemma=lemma,
                                   sentence=sentence,
                                   index=j)

                sentence.words.append(word)
                all_words.append(word)
                j += 1
        sentences.append(sentence)

    Corpus.add_dependencies(sentences, VUAMC_DEPS)
    Corpus.add_vn_parse(sentences, VUAMC_VN)
    Corpus.add_allen_parse(sentences, VUAMC_ALLEN)
    #Corpus.populate_vn_from_heads(sentences)

    return sentences, all_words
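
A short usage sketch, assuming the module-level constants (`VUAMC_CSV`, `VUAMC_DEPS`, `ACADEMIC`, and so on) are defined alongside the function; the counter uses `getattr` because `domain` is only set for sentences whose source file falls into one of the four genre lists:

from collections import Counter

sentences, words = load_vuamc_csv()
print(len(sentences), "sentences,", len(words), "words")

# Count metaphorically used tokens per genre.
met_by_domain = Counter(getattr(w.sentence, "domain", "other")
                        for w in words if w.met == "met")
print(met_by_domain)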