def __init__(self, loc=MOHX_LOCATION):
    """Load the MOH-X corpus from the comma-separated file at *loc*.

    Each data row (the header row is skipped) yields one ``Corpus.Sentence``;
    the sentence text lives in column 3, the target word index in the
    second-to-last column, and its metaphoricity label in the last column.
    Exactly one word per sentence is tagged ``"tag-<label>"``; all others
    get ``"N"``. Dependency parses are attached afterwards from MOHX_DEPS.

    Fixes vs. original: the input file is now closed via ``with`` (it was
    leaked), and the loop-invariant target index / tag string are computed
    once per row instead of once per word.
    """
    self.instances, self.words = [], []
    with open(loc) as f:
        rows = f.readlines()[1:]  # skip the header row
    for c, line in enumerate(rows):
        sentence = Corpus.Sentence()
        # NOTE(review): naive split on "," — assumes no quoted commas in the
        # data columns we use; matches the original behavior.
        data = line.split(",")
        sentence.id = str(c)
        target_index = int(data[-2])
        met_tag = "tag-" + data[-1].strip()
        for i, text in enumerate(data[3].split()):
            met = met_tag if i == target_index else "N"
            w = Corpus.Word(text=text, met=met, sentence=sentence, index=i)
            sentence.words.append(w)
            self.words.append(w)
        self.instances.append(sentence)
    Corpus.add_dependencies(self.instances, MOHX_DEPS, lex_field=1)
def __init__(self):
    """Load the TroFi corpus from TROFI_LOCATION.

    The file interleaves header lines with data lines:

    - ``***verb***`` lines set the current target verb;
    - other ``*`` (cluster) lines and blank lines are skipped;
    - data lines are tab-separated: id, label field, sentence text.

    A word whose WordNet verb lemma equals the current target verb is
    tagged ``"tag-<met>"`` where ``met`` is derived from the label field
    (``N`` -> "met", ``L`` -> "N", ``U`` -> "?", checked in that order so a
    later match overrides an earlier one); all other words get ``"N"``.

    Fixes vs. original: the input file is closed via ``with`` (it was
    leaked); ``data[2].split()`` is computed once per sentence instead of
    twice per word (accidental O(n^2)); the dead ``cluster`` local (assigned
    but never read) is removed.
    """
    super().__init__()
    self.instances, self.words = [], []
    lemmatizer = WordNetLemmatizer()
    cur_verb = ""
    with open(TROFI_LOCATION) as f:
        lines = f.readlines()
    for line in lines:
        if re.match(r"\*\*\*[a-z]", line):
            # ``***verb***`` -> middle piece is the verb itself.
            cur_verb = line.split("***")[1]
            continue
        elif "*" in line or not line.strip():
            # Cluster headers ("literal"/"nonliteral") and blank lines
            # carry no word data; skip them.
            continue
        sentence = Corpus.Sentence()
        data = line.strip().split("\t")
        sentence.id = data[0]
        met = ""
        if "N" in data[1]:
            met = "met"
        if "L" in data[1]:
            met = "N"
        if "U" in data[1]:
            met = "?"
        tokens = data[2].split()
        for i, word in enumerate(tokens):
            v_lem = lemmatizer.lemmatize(word, "v")
            cur_met = "tag-" + met if v_lem == cur_verb else "N"
            w = Corpus.Word(text=word, met=cur_met, sentence=sentence, index=i)
            sentence.words.append(w)
            self.words.append(w)
        self.instances.append(sentence)
    Corpus.add_dependencies(self.instances, TROFI_DEPS, lex_field=1)
def __init__(self, lcc_instance_node):
    """Build one LCC metaphor instance from its XML ``<Instance>`` node.

    Reads target concept, averaged metaphoricity score, optional source-CM
    annotations (only those with score >= 0), the surrounding text sections,
    and the source/target LM surface strings. Then tokenizes the ``Current``
    text, tagging each token group as "source", "target", or "N" depending
    on whether it equals the source/target LM string.

    NOTE(review): ``self.source_cm`` is an empty *list* when there are no
    CMSource annotations but a *set* of (concept, score) tuples otherwise —
    preserved from the original; callers presumably only iterate it.

    Fixes vs. original: removed the unused ``all_words`` local; replaced
    ``sum([...])`` / ``set([...])`` with comprehension forms; hoisted the
    repeated ``word_group.strip()``.
    """
    super().__init__()
    self.target_cm = [lcc_instance_node.get('targetConcept')]
    annotations_element = lcc_instance_node.find(".//Annotations")
    met_anns = annotations_element.find(".//MetaphoricityAnnotations")
    # Average the score over the element's child annotation nodes.
    # (Assumes at least one child; a node with none would raise
    # ZeroDivisionError — TODO confirm the corpus guarantees this.)
    self.met_score = sum(float(m.get('score')) for m in met_anns) / len(met_anns)
    cm_source_anns = annotations_element.find(".//CMSourceAnnotations")
    self.source_cm = []
    if cm_source_anns is not None:
        self.source_cm = {
            (cm.get("sourceConcept"), float(cm.get("score")))
            for cm in cm_source_anns
            if float(cm.get('score')) >= 0
        }
    self.chain = lcc_instance_node.get('chain')
    self.id = lcc_instance_node.get('id')
    all_text = lcc_instance_node.find(".//TextContent")
    self.current_text = all_text.find(".//Current")
    self.prev_text = all_text.find(".//Prev")
    self.next_text = all_text.find(".//Next")
    self.source_lm = self.current_text.find(".//LmSource").text.strip()
    self.target_lm = self.current_text.find(".//LmTarget").text.strip()
    i = 0
    for word_group in self.current_text.itertext():
        stripped = word_group.strip()
        if stripped == self.source_lm:
            met = ["source", self.source_cm, self.met_score]
        elif stripped == self.target_lm:
            met = ["target", self.target_cm, self.met_score]
        else:
            met = ["N", "", ""]
        # Tokenize into word-ish chunks and standalone punctuation;
        # "=" tokens are markup noise and are dropped.
        for w in [t for t in re.findall(r"[\w']+|[.,?!;:\"']", word_group)
                  if t != "="]:
            # The same ``met`` list object is shared by every word of the
            # group (original behavior — mutating one affects all).
            self.words.append(
                Corpus.Word(text=w, met=met, index=i, sentence=self))
            i += 1
def __init__(self, corpus_location):
    """Load a verb-metaphor CSV corpus from *corpus_location*.

    Each data row (header skipped) yields one ``Corpus.Sentence``: column 1
    is the sentence id, column 3 the whitespace-separated sentence text, and
    the second-to-last column the index of the target word, which is tagged
    ``"met"``; every other word is tagged ``"N"``.

    NOTE(review): the last column (parsed as ``int`` in the original but
    never used) is ignored — the target word is always marked "met"
    regardless of its value. Confirm against the corpus spec whether it
    should gate the tag.

    Fixes vs. original: the file handle is now closed via ``with`` (it was
    leaked into ``csv.reader``); ``newline=""`` is passed per the csv module
    docs; the unused ``tag`` local is removed.
    """
    self.instances, self.words = [], []
    with open(corpus_location, newline="") as f:
        data = csv.reader(f)
        next(data)  # skip the header row
        for line in data:
            sentence = Corpus.Sentence()
            sentence.id = line[1]
            target_index = int(line[-2])
            for i, word in enumerate(line[3].split()):
                met = "met" if i == target_index else "N"
                w = Corpus.Word(text=word, sentence=sentence, met=met, index=i)
                sentence.words.append(w)
                self.words.append(w)
            self.instances.append(sentence)
def load_vuamc_csv(filename=VUAMC_CSV):
    """Load the VUAMC corpus from its CSV export.

    Each data row is: source-file id, sentence id, and a whitespace-separated
    token column where each token is ``pos;;lemma;;text`` and a ``M_`` prefix
    on the text marks a metaphorical word. Multi-word tokens are joined with
    "_" and split back into individual words; a fragment containing no
    letters/digits/punctuation at all becomes a "none" placeholder word.
    The sentence's domain is derived from which file-id set it belongs to.
    Dependency, VerbNet, and AllenNLP parses are attached before returning.

    Returns:
        (sentences, all_words) — the list of ``Corpus.Sentence`` objects and
        the flat list of real (non-placeholder) ``Corpus.Word`` objects.

    Fixes vs. original:
    - BUG: ``for sent_index in range(1, len(data[1:]))`` iterated indices
      1 .. len(data)-2, silently dropping the final CSV row; we now iterate
      every row after the header.
    - ``words.split()`` was recomputed twice per token (accidental O(n^2));
      it is now split once per sentence.
    - The letters/digits/punctuation character set is built once, not per
      word fragment.
    """
    with codecs.open(filename, encoding="latin-1", errors='replace') as f:
        data = [line for line in csv.reader(f)]
    sentences = []
    all_words = []
    # Characters that make a fragment a "real" word.
    valid_chars = set(string.punctuation + string.ascii_letters + string.digits)
    for line_data in data[1:]:  # skip header; include the final row
        if not line_data:
            continue
        sentence = Corpus.Sentence()
        sentence.source_file = line_data[0]
        if sentence.source_file in ACADEMIC:
            sentence.domain = "academic"
        elif sentence.source_file in CONVERSATION:
            sentence.domain = "conversation"
        elif sentence.source_file in FICTION:
            sentence.domain = "fiction"
        elif sentence.source_file in NEWS:
            sentence.domain = "news"
        sentence.id = line_data[1]
        j = 0
        for token in line_data[2].split():
            w_data = token.split(";;")
            if "M_" in w_data[-1]:
                met = "met"
                word_text = w_data[-1][2:]  # strip the "M_" marker
            else:
                met = "N"
                word_text = w_data[-1]
            pos = w_data[0]
            lemma = w_data[1]
            for extra_words in word_text.split("_"):
                if not set(extra_words) & valid_chars:
                    # Content-free fragment: keep a placeholder so word
                    # indices stay aligned with the parses.
                    sentence.words.append(
                        Corpus.Word(text="none", met="none", pos="none",
                                    lemma="none", sentence=sentence, index=j))
                    j += 1
                    continue
                word = Corpus.Word(text=extra_words, met=met, pos=pos,
                                   lemma=lemma, sentence=sentence, index=j)
                sentence.words.append(word)
                all_words.append(word)
                j += 1
        sentences.append(sentence)
    Corpus.add_dependencies(sentences, VUAMC_DEPS)
    Corpus.add_vn_parse(sentences, VUAMC_VN)
    Corpus.add_allen_parse(sentences, VUAMC_ALLEN)
    #Corpus.populate_vn_from_heads(sentences)
    return sentences, all_words