def merge_NT_Arg(Arg_list, parse_dict, DocID, sent_index):
    """Fill punctuation-only gaps in Arg_list, then strip edge punctuation.

    Arg_list holds sorted token indices into sentence `sent_index`. Any gap
    between two consecutive indices whose intervening tokens are all
    punctuation is filled in; leading/trailing punctuation of the result is
    then removed with util.list_strip_punctuation. Returns the final list
    of token indices.
    """
    punctuation = """!"#&'*+,-..../:;<=>?@[\]^_`|~""" + "``" + "''"
    if len(Arg_list) <= 1:
        return Arg_list
    words = parse_dict[DocID]["sentences"][sent_index]["words"]
    filled = []
    # Scan adjacent index pairs; fill a gap only when every token inside
    # it passes the punctuation test.
    for pos in range(len(Arg_list) - 1):
        curr, nxt = Arg_list[pos], Arg_list[pos + 1]
        filled.append(curr)
        if nxt - curr > 1:
            gap = range(curr + 1, nxt)
            # NOTE: `in punctuation` is a substring test against the string
            # above, matching the convention used throughout this file.
            if all(words[j][0] in punctuation for j in gap):
                filled += gap
    filled.append(Arg_list[-1])
    Arg = [(idx, words[idx][0]) for idx in filled]
    # Remove the leading/trailing punctuation tokens.
    Arg = util.list_strip_punctuation(Arg)
    return [pair[0] for pair in Arg]
def _sent_clauses(doc, sent_index):
    """Split sentence `sent_index` of `doc` into clauses.

    Stage 1 groups tokens into chunks separated by punctuation; stage 2
    splits each chunk once at the first SBAR node (level-order search) of
    its parse subtree.

    Returns [] when the sentence has no usable parse tree; otherwise an
    Arg_Clauses object wrapping (token_index_list, "") pairs.
    """
    relation_ID = -1
    words = doc["sentences"][sent_index]["words"]
    sent_tokens = [(i, words[i][0]) for i in range(len(words))]
    punctuation = "...,:;?!~--"

    # Stage 1: break the token stream on punctuation (substring test).
    raw_chunks = []  # [[(1, "I"), ...], ...]
    current = []
    for i, w in sent_tokens:
        if w in punctuation:
            if current:
                raw_chunks.append(current)
            current = []
        else:
            current.append((i, w))
    if current:
        raw_chunks.append(current)

    chunk_indices_list = []
    for chunk in raw_chunks:
        stripped = util.list_strip_punctuation(chunk)
        if stripped:
            chunk_indices_list.append([pair[0] for pair in stripped])

    # Stage 2: split each chunk at the first SBAR of its parse subtree.
    parse_tree = doc["sentences"][sent_index]["parsetree"].strip()
    # NOTE(review): siblings in this file construct Syntax_tree here —
    # confirm ETESyntax_tree is the intended class name.
    syntax_tree = ETESyntax_tree(parse_tree)
    if syntax_tree.tree == None:
        return []
    clause_list = []
    for indices in chunk_indices_list:
        subtree = _get_subtree(syntax_tree, indices)
        found_sbar = False
        # Level-order (BFS) traversal: the shallowest SBAR wins.
        for node in subtree.tree.traverse(strategy="levelorder"):
            if node.name == "SBAR":
                inside = [leaf.index for leaf in node.get_leaves()]
                outside = sorted(set(indices) - set(inside))
                if not outside:
                    clause_list.append(inside)
                elif inside[0] < outside[0]:
                    clause_list.append(inside)
                    clause_list.append(outside)
                else:
                    clause_list.append(outside)
                    clause_list.append(inside)
                found_sbar = True
                break
        if not found_sbar:
            clause_list.append(indices)

    clauses = [(indices, "") for indices in clause_list]  # [([1,2,3], ""), ...]
    return Arg_Clauses(relation_ID, 'Sentence', sent_index, clauses)
def get_sent_clauses(parse_dict, DocID, sent_index):
    """Split sentence `sent_index` of document `DocID` into clauses.

    Tokens are first grouped into chunks separated by punctuation; each
    chunk is then split once at the first SBAR node (level-order search)
    of its parse subtree.

    Returns a list of clauses (each a list of token indices), or [] when
    the sentence has no usable parse tree.

    NOTE(review): this function is redefined later in the file; the later
    definition shadows this one.
    """
    sent_length = len(parse_dict[DocID]["sentences"][sent_index]["words"])
    sent_tokens = [
        (index, parse_dict[DocID]["sentences"][sent_index]["words"][index][0])
        for index in range(0, sent_length)
    ]
    punctuation = "...,:;?!~--"
    # First split on punctuation (substring membership test, as elsewhere
    # in this file).
    _clause_indices_list = []  # [[(1,"I")..], ..]
    temp = []
    for index, word in sent_tokens:
        if word not in punctuation:
            temp.append((index, word))
        else:
            if temp != []:
                _clause_indices_list.append(temp)
            temp = []
    # BUG FIX: flush the chunk after the last punctuation mark; without
    # this, a sentence that does not end in punctuation lost its final
    # clause (the sibling _sent_clauses does flush it).
    if temp != []:
        _clause_indices_list.append(temp)
    clause_indices_list = []
    for clause_indices in _clause_indices_list:
        temp = util.list_strip_punctuation(clause_indices)
        if temp != []:
            clause_indices_list.append([item[0] for item in temp])
    # Then use the first SBAR tag in the parse tree to split each chunk.
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree == None:
        return []
    clause_list = []
    for clause_indices in clause_indices_list:
        clause_tree = _get_subtree(syntax_tree, clause_indices)
        # Level-order (BFS) traversal: take the shallowest SBAR.
        flag = 0
        for node in clause_tree.tree.traverse(strategy="levelorder"):
            if node.name == "SBAR":
                temp1 = [node.index for node in node.get_leaves()]
                temp2 = sorted(list(set(clause_indices) - set(temp1)))
                if temp2 == []:
                    clause_list.append(temp1)
                else:
                    if temp1[0] < temp2[0]:
                        clause_list.append(temp1)
                        clause_list.append(temp2)
                    else:
                        clause_list.append(temp2)
                        clause_list.append(temp1)
                flag = 1
                break
        if flag == 0:
            clause_list.append(clause_indices)
    return clause_list
def get_sent_clauses(parse_dict, DocID, sent_index):
    """Split sentence `sent_index` of document `DocID` into clauses.

    Tokens are first grouped into chunks separated by punctuation; each
    chunk is then split once at the first SBAR node (level-order search)
    of its parse subtree.

    Returns a list of clauses (each a list of token indices), or [] when
    the sentence has no usable parse tree.

    NOTE(review): this is the second definition of get_sent_clauses in the
    file; it shadows the earlier one.
    """
    sent_length = len(parse_dict[DocID]["sentences"][sent_index]["words"])
    sent_tokens = [
        (index, parse_dict[DocID]["sentences"][sent_index]["words"][index][0])
        for index in range(0, sent_length)
    ]
    punctuation = "...,:;?!~--"
    # First, use punctuation symbols to split the sentence (substring
    # membership test, as elsewhere in this file).
    _clause_indices_list = []  # [[(1,"I")..], ..]
    temp = []
    for index, word in sent_tokens:
        if word not in punctuation:
            temp.append((index, word))
        else:
            if temp != []:
                _clause_indices_list.append(temp)
            temp = []
    # BUG FIX: flush the chunk after the last punctuation mark; without
    # this, a sentence that does not end in punctuation lost its final
    # clause (the sibling _sent_clauses does flush it).
    if temp != []:
        _clause_indices_list.append(temp)
    clause_indices_list = []
    for clause_indices in _clause_indices_list:
        temp = util.list_strip_punctuation(clause_indices)
        if temp != []:
            clause_indices_list.append([item[0] for item in temp])
    # Then use SBAR tag in its parse tree to split each part into clauses.
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree == None:
        return []
    clause_list = []
    for clause_indices in clause_indices_list:
        clause_tree = _get_subtree(syntax_tree, clause_indices)
        # Level-order (BFS) traversal: take the shallowest SBAR.
        flag = 0
        for node in clause_tree.tree.traverse(strategy="levelorder"):
            if node.name == "SBAR":
                temp1 = [node.index for node in node.get_leaves()]
                temp2 = sorted(list(set(clause_indices) - set(temp1)))
                if temp2 == []:
                    clause_list.append(temp1)
                else:
                    if temp1[0] < temp2[0]:
                        clause_list.append(temp1)
                        clause_list.append(temp2)
                    else:
                        clause_list.append(temp2)
                        clause_list.append(temp1)
                flag = 1
                break
        if flag == 0:
            clause_list.append(clause_indices)
    return clause_list
def _ps_arg2_clauses(parse_dict, relation, Arg):
    """Split the sentence containing `relation[Arg]` into candidate clauses.

    The sentence is first split at connective tokens and punctuation; each
    resulting chunk is then split once at the first SBAR node (level-order
    search) of its parse subtree.

    Returns a list of (token_index_list, "") pairs, or [] when the
    sentence has no usable parse tree.

    NOTE(review): this function is redefined later in the file; the later
    definition shadows this one.
    """
    DocID = relation["DocID"]
    relation_ID = relation["ID"]  # unused; kept so a missing "ID" key still raises
    sent_index = relation[Arg]["TokenList"][0][3]
    sent_length = len(parse_dict[DocID]["sentences"][sent_index]["words"])
    sent_tokens = [
        (index, parse_dict[DocID]["sentences"][sent_index]["words"][index][0])
        for index in range(0, sent_length)
    ]
    # First, split the sentence by the connective and punctuation symbols.
    conn_token_indices = [item[4] for item in relation["Connective"]["TokenList"]]
    punctuation = "...,:;?!~--"
    _clause_indices_list = []  # [[(1,"I")..], ..]
    temp = []
    for index, word in sent_tokens:
        if word not in punctuation and index not in conn_token_indices:
            temp.append((index, word))
        else:
            if temp != []:
                _clause_indices_list.append(temp)
            temp = []
    # BUG FIX: flush the final chunk; without this, tokens after the last
    # punctuation/connective token were silently dropped.
    if temp != []:
        _clause_indices_list.append(temp)
    clause_indices_list = []
    for clause_indices in _clause_indices_list:
        temp = util.list_strip_punctuation(clause_indices)
        if temp != []:
            clause_indices_list.append([item[0] for item in temp])
    # Then use SBAR tag in the parse tree to split each part into clauses.
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree == None:
        return []
    clause_list = []
    for clause_indices in clause_indices_list:
        clause_tree = parser_util._get_subtree(syntax_tree, clause_indices)
        # Level-order (BFS) traversal: take the shallowest SBAR.
        flag = 0
        for node in clause_tree.tree.traverse(strategy="levelorder"):
            if node.name == "SBAR":
                temp1 = [node.index for node in node.get_leaves()]
                temp2 = sorted(list(set(clause_indices) - set(temp1)))
                if temp2 == []:
                    clause_list.append(temp1)
                else:
                    if temp1[0] < temp2[0]:
                        clause_list.append(temp1)
                        clause_list.append(temp2)
                    else:
                        clause_list.append(temp2)
                        clause_list.append(temp1)
                flag = 1
                break
        if flag == 0:
            clause_list.append(clause_indices)
    clauses = [(clause_indices, "") for clause_indices in clause_list]  # [([1,2,3], ""), ...]
    return clauses
def _ps_arg2_clauses(parse_dict, relation, Arg):
    """Split the sentence containing `relation[Arg]` into candidate clauses.

    The sentence is first split at connective tokens and punctuation; each
    resulting chunk is then split once at the first SBAR node (level-order
    search) of its parse subtree.

    Returns a list of (token_index_list, "") pairs, or [] when the
    sentence has no usable parse tree.

    NOTE(review): this is the second definition of _ps_arg2_clauses in the
    file; it shadows the earlier one.
    """
    DocID = relation["DocID"]
    relation_ID = relation["ID"]  # unused; kept so a missing "ID" key still raises
    sent_index = relation[Arg]["TokenList"][0][3]
    sent_length = len(parse_dict[DocID]["sentences"][sent_index]["words"])
    sent_tokens = [
        (index, parse_dict[DocID]["sentences"][sent_index]["words"][index][0])
        for index in range(0, sent_length)
    ]
    # First, split the sentence by the connective and the punctuation symbols.
    conn_token_indices = [item[4] for item in relation["Connective"]["TokenList"]]
    punctuation = "...,:;?!~--"
    _clause_indices_list = []  # [[(1,"I")..], ..]
    temp = []
    for index, word in sent_tokens:
        if word not in punctuation and index not in conn_token_indices:
            temp.append((index, word))
        else:
            if temp != []:
                _clause_indices_list.append(temp)
            temp = []
    # BUG FIX: flush the final chunk; without this, tokens after the last
    # punctuation/connective token were silently dropped.
    if temp != []:
        _clause_indices_list.append(temp)
    clause_indices_list = []
    for clause_indices in _clause_indices_list:
        temp = util.list_strip_punctuation(clause_indices)
        if temp != []:
            clause_indices_list.append([item[0] for item in temp])
    # Then use SBAR tag in its parse tree to split each part into clauses.
    parse_tree = parse_dict[DocID]["sentences"][sent_index]["parsetree"].strip()
    syntax_tree = Syntax_tree(parse_tree)
    if syntax_tree.tree == None:
        return []
    clause_list = []
    for clause_indices in clause_indices_list:
        clause_tree = parser_util._get_subtree(syntax_tree, clause_indices)
        # Level-order (BFS) traversal: take the shallowest SBAR.
        flag = 0
        for node in clause_tree.tree.traverse(strategy="levelorder"):
            if node.name == "SBAR":
                temp1 = [node.index for node in node.get_leaves()]
                temp2 = sorted(list(set(clause_indices) - set(temp1)))
                if temp2 == []:
                    clause_list.append(temp1)
                else:
                    if temp1[0] < temp2[0]:
                        clause_list.append(temp1)
                        clause_list.append(temp2)
                    else:
                        clause_list.append(temp2)
                        clause_list.append(temp1)
                flag = 1
                break
        if flag == 0:
            clause_list.append(clause_indices)
    clauses = [(clause_indices, "") for clause_indices in clause_list]  # [([1,2,3], ""), ...]
    return clauses
def _non_explicit_Arg_offset_in_sent(doc, sent_index):
    """Return every token index of sentence `sent_index`, with leading and
    trailing punctuation removed via util.list_strip_punctuation."""
    words = doc["sentences"][sent_index]["words"]
    tokens = [(i, words[i][0]) for i in range(len(words))]
    stripped = util.list_strip_punctuation(tokens)
    return [pair[0] for pair in stripped]