def process_paracl(self, sentence, idx):
    """Realise the acl clause rooted at idx as a standalone sentence and remove it from the tree."""
    tree = sentence.tree
    parent = tree.parents[idx]
    subtree = sorted(tree.get_subtree(idx))
    clause = [
        tree.words[node] for node in subtree
        if tree.dependent_relation(idx, node) != 'mark'
    ]  # drop mark word
    # prepend the governing word together with its conjuncts
    conjunction = tree.get_conjunction(parent)
    conjunction.insert(0, parent)
    conjunction = tree.build_conjunction(conjunction)
    for i, token in enumerate(conjunction):
        # if token.idx + 1 < idx:
        clause.insert(i, token)
    if tree.in_coming_relation(idx) == "nmod":
        # for nmod clauses the gerund may be attached as a "case" child
        for child in tree.children(idx):
            if tree.dependent_relation(idx, child) == "case" and tree.words[child].pos == "VBG":
                idx = child
                break
    if tree.words[idx].pos == "VBG":
        tree.words[idx].word = stem(tree.words[idx].word)
    else:
        clause.insert(len(conjunction), Token("be", "VBZ", -1, -1))
    sent = self.sentence_builder.from_un_parsed_tokens(clause)
    message = ClauseMessage(clause_detection.ACL, None, None)
    sentence.left.append((message, sent))
    tree.delete_subtree(subtree)
def compound_first(self, sentence, restructurer):
    """Split compound clauses first, then handle appositives in each part."""
    tokens = tokenize(sentence)
    tokens = [Token(token, None, i, i) for i, token in enumerate(tokens)]
    sent = self.builder.from_un_parsed_tokens(tokens)
    clauses = list(self.decompose_compound(sent))
    # only recurse when the sentence actually splits into multiple clauses
    if len(clauses) > 1:
        for clause in clauses:
            sent = self.builder.from_un_parsed_tokens(clause)
            for sub_clause in self.appositive_first(sent, restructurer):
                yield sub_clause
def appositive_first(self, sentence, restructurer):
    """
    :param sentence: raw sentence
    :param restructurer: restructurer used to extract appositions before compound decomposition
    :return: generator over decomposed clauses
    """
    tokens = tokenize(sentence)
    tokens = [Token(token, None, i, i) for i, token in enumerate(tokens)]
    sent = self.builder.from_un_parsed_tokens(tokens)
    restructurer.apposition_first(sent)
    for sub_sent in sent.iter_subsentence():
        for clause in self.decompose_compound(sub_sent):
            yield clause
def extract_appositive_clause(self, tree, idx, normal=True):
    """Turn the appositive subtree at idx into a copular clause: "<head noun> be <appositive>"."""
    subtree = sorted(tree.get_subtree(idx))
    clause = [tree.words[node] for node in subtree]
    if normal:
        # use the syntactic parent (and its conjuncts) as the subject
        conjunction = tree.get_conjunction(tree.parents[idx])
        conjunction.insert(0, tree.parents[idx])
        conjunction = tree.build_conjunction(conjunction)
        n = 0
        for i, token in enumerate(conjunction):
            if token.idx + 1 < idx:
                n += 1
                clause.insert(i, token)
        clause.insert(n, Token("be", "VBZ", -1, -1))
    else:
        # get the nearest noun
        flag = False
        for i in range(idx - 1, 0, -1):
            if i in tree.words:
                if tree.words[i].word == ",":
                    flag = True
                elif tree.words[i].pos.startswith("NN") and flag:
                    clause.insert(0, Token("be", "VBZ", -1, -1))
                    clause.insert(0, tree.words[i])
                    break
    return subtree, clause
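# Illustrative sketch of the construction above (the example sentence is an assumption,
# not taken from the code): for an appositive as in "Einstein, a physicist, won the prize.",
# the head noun is prepended to the appositive subtree together with "be", giving roughly
# the token sequence "Einstein be a physicist".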
def _parseline(self, line):
    """Parse one line from a *.merge file."""
    items = line.split("\t")
    tok = Token()
    tok.sidx, tok.tidx = int(items[0]), int(items[1])
    # Without changing the case
    tok.word, tok.lemma = items[2], items[3]
    tok.pos = items[4]
    tok.deplabel = items[5]
    try:
        tok.hidx = int(items[6])
    except ValueError:
        pass
    tok.ner, tok.partialparse = items[7], items[8]
    try:
        tok.eduidx = int(items[9])
    except ValueError:
        print(tok.word, self.fmerge)
        # sys.exit()
    return tok
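# For reference, the *.merge line consumed above carries ten tab-separated fields in this
# order: sidx, tidx, word, lemma, pos, deplabel, hidx, ner, partialparse, eduidx.
# A hypothetical example row (values are made up, columns shown space-separated here):
#   12  3  said  say  VBD  root  0  O  (S(VP*  2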
def process_noun_adjective_phrase(self, sentence, idx):
    """Realise the noun/adjective phrase rooted at idx as "<subject> be <phrase>"."""
    tree = sentence.tree
    parent = tree.parents[idx]
    subtree = sorted(tree.get_subtree(idx))
    clause = [tree.words[node] for node in subtree]
    clause[0].word = clause[0].word.lower()  # lower case
    subjects = tree.get_subjects(parent)
    if not subjects:
        return
    conjunction = tree.build_conjunction(subjects)
    for i, token in enumerate(conjunction):
        clause.insert(i, token)
    clause.insert(len(conjunction), Token("be", "VBZ", -1, -1))
    sent = self.sentence_builder.from_un_parsed_tokens(clause)
    message = ClauseMessage(clause_detection.ADVCL, None, None)
    sentence.left.append((message, sent))
    tree.delete_subtree(subtree)
def process_paradvcl(self, sentence, idx):
    """Realise the advcl clause rooted at idx as a standalone sentence and remove it from the tree."""
    tree = sentence.tree
    parent = tree.parents[idx]
    subtree = sorted(tree.get_subtree(idx))
    clause = [
        tree.words[node] for node in subtree
        if tree.dependent_relation(idx, node) != 'mark'
    ]  # drop mark word
    clause[0].word = clause[0].word.lower()  # lower case
    mark = None
    if subtree[0] != idx:
        mark = clause[0].word
        clause.pop(0)
    subjects = tree.get_subjects(parent)
    subjects = tree.build_conjunction(subjects)
    if tree.words[idx].pos == clause_detection.VBN:
        # past participle: prepend the matrix subject and an auxiliary "be"
        clause.insert(0, Token("be", "VBZ", -1, -1))
        for i, token in enumerate(subjects):
            clause.insert(i, token)
    else:
        tree.words[idx].word = stem(tree.words[idx].word)
        objects = tree.get_objects(parent)
        if not objects:
            for i, token in enumerate(subjects):
                clause.insert(i, token)
        elif idx - 1 in tree.words and tree.words[idx - 1].pos == clause_detection.PREP:
            token = tree.words[idx - 1]
            if token.word == "by":
                for i, token in enumerate(subjects):
                    clause.insert(i, token)
            else:
                objects = tree.build_conjunction(objects)
                for i, token in enumerate(objects):
                    clause.insert(i, token)
        else:
            for i, token in enumerate(subjects):
                clause.insert(i, token)
    sent = self.sentence_builder.from_un_parsed_tokens(clause)
    message = ClauseMessage(clause_detection.ADVCL, mark, tree.words[parent])
    sentence.left.append((message, sent))
    tree.delete_subtree(subtree)
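# Illustrative sketch of the two branches above (the example sentence is an assumption,
# not taken from the code): for a VBN head as in "Built in 1889, the tower attracts tourists.",
# the matrix subject and "be" are prepended, giving roughly "the tower be built in 1889";
# for other heads the verb is stemmed and the matrix subject (or, after a non-"by"
# preposition, the matrix object) supplies the subject of the extracted clause.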
def lexical_simplification_first(self, sentence, restructurer):
    """
    :param sentence: raw sentence
    :param restructurer: restructurer applied after appositive, nmod and adverbial extraction
    :return: original tokens, noun chunks, noun nmods, extra adverbials and decomposed clauses
    """
    original_tokens, tokens, chunks = extract_noun_chunks(sentence)
    tokens = [
        Token(token[0], None, i, token[1])
        for i, token in enumerate(tokens)
    ]
    sentence = self.builder.from_un_parsed_tokens(tokens, False)
    # print(" ".join([token.word for token in sentence.tree.words.values()]))
    restructurer.extract_appositive(sentence)
    nmods = sentence.tree.get_noun_nmods()
    sentence = self.reparse(sentence, False)
    adverbials = sentence.tree.get_extra_adverbial()
    sentence = self.reparse(sentence, False)
    restructurer.restructure(sentence)
    clauses = []
    for sub_sent in sentence.iter_subsentence():
        for clause in self.decompose_compound(sub_sent):
            clauses.append(clause)
    return original_tokens, chunks, nmods, adverbials, clauses
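# Hypothetical usage sketch (the Simplifier and Restructurer names and constructors below
# are assumptions; only lexical_simplification_first itself is defined here):
#
#   simplifier = Simplifier()
#   restructurer = Restructurer()
#   tokens, chunks, nmods, adverbials, clauses = simplifier.lexical_simplification_first(
#       "The tower, built in 1889, attracts tourists, and it is repainted regularly.",
#       restructurer,
#   )
#   for clause in clauses:
#       print(" ".join(token.word for token in clause))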