def extract_represent(self, sent_doc, subject, predicate, doc_api_name):
    """Extract an is-a relation from "<subject> represents <object>" sentences.

    Fires only when the sentence text contains ' represent' and the main
    predicate lemma is 'represent'. Returns a list of StatementRecord
    (empty when the pattern does not match, the predicate is negated, or
    no direct object is found).
    """
    statement_record_list = []
    if sent_doc.text.find(' represent') >= 0 and predicate.lemma_ == 'represent':
        subtree_span = DependencyTreeUtil.get_subtree_span_from_one_token_obj(
            sent_doc, predicate)
        # An odd number of "neg" dependents means the statement is negated;
        # negated statements produce no record.
        neg = False
        for f in subtree_span:
            if f.dep_ == "neg":
                neg = not neg
        if neg:
            return statement_record_list
        # Renamed from `object`, which shadowed the builtin.
        verb_object = DependencyTreeUtil.get_object_for_verb(
            sent_doc, predicate)
        if verb_object is None:
            return statement_record_list
        info_from_set = set()
        info_from_set.add((ALLKnowledgeFromType.FROM_Text_Category,
                           sent_doc.text, doc_api_name))
        relation_data_tuple = StatementRecord(
            subject.text,
            RelationNameConstant.Ontology_IS_A_Relation,
            verb_object.text,
            NPEntityType.CategoryType,
            NPEntityType.CategoryType,
            self.extractor_name,
            info_from_set)
        statement_record_list.append(relation_data_tuple)
    return statement_record_list
def extract_for_A_is_xxx(self, sent_doc, subject, predicate, doc_api_name):
    """Extract is-a records from "A is a/an NP" sentences.

    Splits the attribute of the 'be' predicate into parallel spans and, for
    each noun-rooted span, emits one is-a StatementRecord linking the
    subject to the noun phrase (with 'member of'/'part of' prefixes
    stripped). Returns the list of records collected so far.
    """
    statement_record_list = []
    attr_span = DependencyTreeUtil.get_attr_for_be_predicate(
        doc=sent_doc, predicate_token=predicate)
    # Odd count of "neg" dependents under the predicate => negated sentence.
    neg = False
    subtree_span = DependencyTreeUtil.get_subtree_span_from_one_token_obj(
        sent_doc, predicate)
    for f in subtree_span:
        if f.dep_ == "neg":
            neg = not neg
    if attr_span is None:
        return statement_record_list
    attr_spans = DependencyTreeUtil.split_span_into_parallel(
        sent_doc, attr_span)
    for i, attr_span in enumerate(attr_spans):
        if attr_span is None or attr_span.text == "":
            continue
        try:
            if attr_span.root.pos_ == "NOUN" or attr_span.root.pos_ == 'PROPN':
                # NOTE(review): this return aborts the remaining parallel
                # spans, not just the current one — confirm intended.
                if neg:
                    return statement_record_list
                noun_phase_doc = self.nlp(attr_span.text)
                noun_phase, feature_list = DependencyTreeUtil.split_large_noun_phase_span_to_adj_and_np(
                    span=noun_phase_doc)
                # 'member of X' / 'part of X' would be a Derive relation.
                relation = RelationNameConstant.Ontology_IS_A_Relation if not (
                    noun_phase.startswith('member of')
                    or noun_phase.startswith('part of')
                ) else RelationNameConstant.Ontology_Derive_Relation
                if noun_phase.startswith('member of'):
                    noun_phase = noun_phase.replace('member of', '', 1)
                if noun_phase.startswith('part of'):
                    noun_phase = noun_phase.replace('part of', '', 1)
                info_from_set = set()
                info_from_set.add((ALLKnowledgeFromType.FROM_Text_Category,
                                   sent_doc.text, doc_api_name))
                category_name = noun_phase
                # 'base class' categories keep their attached condition text.
                if category_name.lower().find("base class") >= 0:
                    category_name += (
                        " " + DependencyTreeUtil.get_conditions_text_for_token(
                            sent_doc, attr_span.root))
                # NOTE(review): Derive relations are computed above but never
                # emitted — this return discards them and ends the loop early.
                if relation != RelationNameConstant.Ontology_IS_A_Relation:
                    return statement_record_list
                relation_data_tuple = StatementRecord(
                    subject.text,
                    RelationNameConstant.Ontology_IS_A_Relation,
                    category_name,
                    NPEntityType.CategoryType,
                    NPEntityType.CategoryType,
                    self.extractor_name,
                    info_from_set)
                statement_record_list.append(relation_data_tuple)
        except Exception as e:
            # Best-effort extraction: parsing oddities are logged and skipped.
            print(e)
    return statement_record_list
def __init__(self, api_from, full_doc, doc, predicate=None):
    """Wrap one simple sentence with its provenance, predicate and subject.

    Falls back to the dependency-parsed main predicate of *doc* when no
    predicate is supplied by the caller.
    """
    self.api_from = api_from
    self.full_doc = full_doc
    self.doc = doc
    self.predicate = (DependencyTreeUtil.get_main_predicate(doc)
                      if predicate is None else predicate)
    self.subject = DependencyTreeUtil.get_subject(doc)
def extract_for_A_is_xxx(self, sent_doc, subject, predicate, doc_api_name):
    """Handle the 'AE be [a/an] JJ+ NP (COND)' pattern.

    Emits one has-Feature StatementRecord for every adjective-rooted
    parallel attribute span of the 'be' predicate.
    """
    records = []
    raw_attr_span = DependencyTreeUtil.get_attr_for_be_predicate(
        doc=sent_doc, predicate_token=predicate)
    # An odd number of "neg" dependents under the predicate negates it.
    predicate_subtree = DependencyTreeUtil.get_subtree_span_from_one_token_obj(
        sent_doc, predicate)
    neg = sum(1 for tok in predicate_subtree if tok.dep_ == "neg") % 2 == 1
    if raw_attr_span is None:
        return records
    for attr_span in self.new_split_span_into_parallel(sent_doc, raw_attr_span):
        if attr_span is None or not attr_span.text:
            continue
        try:
            if attr_span.root.pos_ != "ADJ":
                continue
            core_text = attr_span.root.text
            feature_name = ('not ' + core_text) if neg else core_text
            condition_text = DependencyTreeUtil.get_conditions_text_for_token(
                sent_doc, attr_span.root)
            if condition_text == '':
                # Fall back to conditions attached to the predicate itself.
                condition_text = DependencyTreeUtil.get_conditions_text_for_token(
                    sent_doc, predicate)
            extra_info = {
                "condition": condition_text,
                "core": core_text,
                "leading_verb": attr_span.root.head.lemma_,
                "compare_subject": '',
                "compare_object": '',
                "neg": neg,
            }
            provenance = set()
            provenance.add((ALLKnowledgeFromType.FROM_Text_Characteristic,
                            sent_doc.text, doc_api_name))
            records.append(StatementRecord(
                subject.text,
                RelationNameConstant.has_Feature_Relation,
                feature_name + ' ' + condition_text,
                NPEntityType.CategoryType,
                NPEntityType.CharacteristicType,
                self.extractor_name,
                provenance,
                **extra_info))
        except Exception as e:
            # Best-effort: log and keep processing the other spans.
            print(e)
    return records
def extract_for_A_could_be_xxx(self, sent_doc, subject, predicate, doc_api_name):
    """Extract a has-Feature record from 'A can/could be xxx' sentences.

    Builds the feature text from the subject's subtree (with the predicate's
    subtree filtered out), moves any condition to the end, and emits a single
    has-Feature StatementRecord. Returns an empty list when the pattern does
    not apply.
    """
    statement_record_list = []
    core_feature = DependencyTreeUtil.get_can_be_string(sent_doc, predicate)
    if core_feature is None:
        return statement_record_list
    feature_span = DependencyTreeUtil.get_subtree_span_from_one_token_filter_another_token_sub_tree(
        sent_doc, subject.root, predicate)
    # BUG FIX: the original guard was `end_char != 0`, which bailed out for
    # every non-empty span and made this extractor a near no-op. Bail out
    # only when the span is missing or empty.
    if feature_span is None or feature_span.end_char == 0 or feature_span.text == '':
        return statement_record_list
    # Odd number of "neg" markers means the feature is negated.
    neg = False
    for f in feature_span:
        if f.dep_ == "neg":
            neg = not neg
    condition_text = DependencyTreeUtil.get_conditions_text_for_token(
        sent_doc, feature_span.root)
    feature_span_swap_text = DependencyTreeUtil.swap_condition_to_end(
        feature_span, predicate)
    # swap_condition_to_end marks the split point with '$$'; drop the first
    # occurrence of the subject from the trailing part and rejoin.
    span_two = feature_span_swap_text.split('$$')
    if len(span_two) > 1:
        second = span_two[1].replace(subject.text, '', 1).strip()
        feature_span_swap_text = span_two[0] + '' + second
    # BUG FIX: str.strip() returns a new string; the original discarded it.
    feature_span_swap_text = feature_span_swap_text.strip()
    extra_info = {
        "condition": condition_text,
        "core": core_feature,
        "leading_verb": predicate.lemma_,
        "neg": neg,
        "compare_subject": '',
        "compare_object": '',
    }
    info_from_set = set()
    info_from_set.add((ALLKnowledgeFromType.FROM_Text_Characteristic,
                       sent_doc.text, doc_api_name))
    relation_data_tuple = StatementRecord(
        subject.text,
        RelationNameConstant.has_Feature_Relation,
        feature_span_swap_text,
        NPEntityType.CategoryType,
        NPEntityType.CharacteristicType,
        self.extractor_name,
        info_from_set,
        **extra_info)
    statement_record_list.append(relation_data_tuple)
    return statement_record_list
def is_A_be_jj_np_template(self, sent_doc, subject, predicate):
    """Return True when the predicate is a past-participle verb (VBN)
    for which a 'can be ...' string can be derived."""
    if predicate.pos_ != "VERB" or predicate.tag_ != "VBN":
        return False
    return DependencyTreeUtil.get_can_be_string(sent_doc, predicate) is not None
def new_split_span_into_parallel(self, doc, span):
    """Split *span* into parallel coordinate/appositive sub-spans.

    Re-parses the span text to locate 'appos'/'cc' dependents, then collects
    the left subtree of each coordination head and of its non-CC right
    children. Falls back to returning [span] when no coordination is found.

    NOTE(review): tokens come from the re-parsed `sub_doc` but are passed to
    DependencyTreeUtil against the original `doc` — confirm the two parses
    are index-compatible.
    """
    sub_doc = self.self_doc(span.text)
    coordination_heads = set()
    pieces = []
    for tok in sub_doc:
        if tok.dep_ in ("appos", "cc"):
            coordination_heads.add(tok.head)
            pieces.append(
                DependencyTreeUtil.get_left_subtree_and_itself_span(
                    doc, tok.head))
    for head in coordination_heads:
        pieces.extend(
            DependencyTreeUtil.get_left_subtree_and_itself_span(doc, right)
            for right in head.rights if right.tag_ != "CC")
    if not coordination_heads:
        pieces.append(span)
    return pieces
def extract_belong_to_like_category(self, sent_doc, subject, predicate, doc_api_name):
    """Extract a Derive relation from 'belongs to ...' / 'has ...' sentences.

    The original contained two byte-identical stanzas for the 'belong' and
    'have' patterns; the shared logic now lives in one nested helper.
    Returns a list of StatementRecord (empty when nothing matches, the
    predicate is negated, or no object can be found).
    """
    statement_record_list = []

    def build_derive_record():
        """Build one Derive StatementRecord for the predicate's object, or
        return None when the sentence is negated or has no usable object."""
        subtree_span = DependencyTreeUtil.get_subtree_span_from_one_token_obj(
            sent_doc, predicate)
        # Odd number of "neg" dependents => negated; emit nothing.
        neg = False
        for f in subtree_span:
            if f.dep_ == "neg":
                neg = not neg
        if neg:
            return None
        verb_object = DependencyTreeUtil.get_object_for_verb(
            sent_doc, predicate)
        prep_object = DependencyTreeUtil.get_object_for_preposition(
            sent_doc, predicate)
        # Prefer the direct object; fall back to the prepositional one.
        if not (verb_object or prep_object):
            return None
        target = verb_object if verb_object else prep_object
        info_from_set = set()
        info_from_set.add((ALLKnowledgeFromType.FROM_Text_Category,
                           sent_doc.text, doc_api_name))
        return StatementRecord(
            subject.text,
            RelationNameConstant.Ontology_Derive_Relation,
            target.text,
            NPEntityType.CategoryType,
            NPEntityType.CategoryType,
            self.extractor_name,
            info_from_set)

    # Pattern 1: '... belongs/belonging to ...' with predicate lemma 'belong'.
    if re.search(r'belong[a-z]* to', sent_doc.text) and predicate.lemma_ == 'belong':
        record = build_derive_record()
        if record is None:
            return statement_record_list
        statement_record_list.append(record)
    # Pattern 2: main verb 'have' (not an auxiliary).
    if predicate.lemma_ == 'have' and not predicate.dep_.startswith('aux'):
        record = build_derive_record()
        if record is None:
            return statement_record_list
        statement_record_list.append(record)
    return statement_record_list
def print_nlp_analysis(sent_doc):
    """Dump a spaCy parse of *sent_doc* for debugging: noun chunks, the
    per-token dependency tree, and the extracted subject/predicate."""
    SEP = " - "
    chunks = list(sent_doc.noun_chunks)
    print("np_chunk_detail for sentence", chunks)
    for chunk in sent_doc.noun_chunks:
        print(chunk.text, SEP, chunk.root.text, SEP, chunk.root.dep_, SEP,
              chunk.root.head.text)
    print("----chunk detail")
    for chunk in sent_doc.noun_chunks:
        for token in chunk:
            print(token.text, SEP, token.pos_, SEP, token.tag_, SEP)
    print("-----------end chunk print----------")
    for token in sent_doc:
        print(
            token.text, SEP,
            token.pos_, SEP,
            token.tag_, SEP,
            token.dep_, SEP,
            token.head.text, SEP,
            token.head.pos_, SEP,
            list(token.children), SEP,
            list(token.lefts), SEP,
            list(token.rights),
        )
    print("-----------end tree print----------")
    print("-----------subtree----------")
    print("subject of is:", DependencyTreeUtil.get_subject(doc=sent_doc),
          DependencyTreeUtil.get_subject_text(sent_doc))
    print("predicate is:", DependencyTreeUtil.get_main_predicate(doc=sent_doc))
    print("-----------end subtree----------")
def create_sentence_list(self, text, api_from=""):
    """Turn raw documentation *text* into validated SimpleSentence objects.

    Normalizes whitespace and the 'is A ' typo, skips text that is not
    API-related, qualifies API names, then splits each parsed sentence
    into clauses and keeps only the ones that pass validity checks.
    """
    result = []
    normalized = ' '.join(text.split()).replace("is A ", "is a ")
    if not self.check_api_related(normalized):
        return result
    normalized = self.modify_api_qualified_name(normalized, api_from)
    parsed = self.self_doc(normalized)
    for sentence in parsed.sents:
        for clause in self.clause_extraction(sentence.as_doc()):
            merged = DependencyTreeUtil.merge_np_of_np(clause)
            candidate = SimpleSentence(api_from, parsed, merged)
            if self.check_simple_sentence_valid(candidate):
                result.append(candidate)
    return result
def extract_simple_sentence(self, simple_sentence: SimpleSentence):
    """Extract relations from one SimpleSentence.

    NOTE(review): as written this body only unpacks the sentence and
    computes the main predicate without using or returning anything —
    it looks truncated or unfinished; verify against the full file.
    """
    doc = simple_sentence.get_doc()
    full_doc = simple_sentence.get_full_doc()
    api_name = simple_sentence.api_from
    main_predicate = DependencyTreeUtil.get_main_predicate(full_doc)
def clause_extraction(self, sent_doc):
    """Recursively split *sent_doc* into simpler clauses.

    Handles three split points, each processed for the first matching token
    and then recursing on the resulting halves:
      * a conjoined verb ('conj') attached to the main predicate,
      * a relative clause ('relcl') modifying a noun,
      * a subordinate clause introduced by a non-filtered 'mark' (IN).
    Returns a list of docs; the sentence itself when nothing splits.
    """
    predicate = DependencyTreeUtil.get_main_predicate(sent_doc)
    subject = DependencyTreeUtil.get_subject(sent_doc)
    sent_doc_list = []
    if predicate is None or subject is None:
        return sent_doc_list
    for i, token in enumerate(sent_doc):
        if token.dep_ == 'conj' and token.pos_ == 'VERB' and token.head == predicate:
            # added = True
            conj_predicate_span = DependencyTreeUtil.get_subtree_span_from_one_token_index(
                sent_doc, i)
            is_complete = False
            # Check whether the conjoined clause already has its own subject.
            for nsubj_token in conj_predicate_span:
                if nsubj_token.dep_.startswith('nsubj'):
                    is_complete = True
                    break
            if is_complete:
                sent_doc_list.append(
                    self.self_doc(conj_predicate_span.text))
            else:
                # Borrow the main clause's subject for the subject-less half.
                clause_text = subject.text + ' ' + conj_predicate_span.text
                sent_doc_list.append(self.self_doc(clause_text))
            # The remainder of the sentence with the conjoined subtree removed.
            sent_doc_list.append(
                self.self_doc(
                    DependencyTreeUtil.
                    get_subtree_span_from_one_token_filter_another_token_sub_tree_for_split_sentence(
                        sent_doc, token, predicate).text))
            # NOTE(review): `list(sent_doc_list[0])` returns the tokens of a
            # single doc rather than a list of docs — likely meant
            # [sent_doc_list[0]]; confirm.
            if sent_doc_list[1] is None or sent_doc_list[
                    0].text == sent_doc_list[1].text:
                return list(sent_doc_list[0])
            # NOTE(review): `sent_doc_list[1] == sent_doc.text` compares a Doc
            # to a str (always False) — probably missing `.text`; confirm.
            if sent_doc_list[0].text == sent_doc.text or sent_doc_list[
                    1] == sent_doc.text:
                return sent_doc_list
            final_sent_list = []
            for sent_doc_l in sent_doc_list:
                final_sent_list.extend(self.clause_extraction(sent_doc_l))
            return final_sent_list
        if token.dep_ == 'relcl' and token.head.pos_ == 'NOUN':
            # added = True
            conj_predicate_span = DependencyTreeUtil.get_subtree_span_from_one_token_index(
                sent_doc, i)
            that_flag = False
            # Rebuild the relative clause as '<head noun> <clause>' with the
            # 'that'/'which' token dropped.
            for index, conj_token in enumerate(conj_predicate_span):
                if (conj_token.tag_ == 'WDT' or conj_token.tag_ == "DT") and (
                        conj_token.text == 'that' or conj_token.text == 'which'):
                    that_flag = True
                    if index > 1:
                        text = token.head.text + ' ' + conj_predicate_span[
                            0:index - 1].text + ' ' + conj_predicate_span[
                            index + 1:len(conj_predicate_span) + 1].text
                    else:
                        text = token.head.text + ' ' + conj_predicate_span[
                            index + 1:len(conj_predicate_span) + 1].text
                    sent_doc_list.append(self.self_doc(text))
            if not that_flag:
                text_all = token.head.text + ' ' + conj_predicate_span.text
                sent_doc_list.append(self.self_doc(text_all))
            # The remainder of the sentence with the relative clause removed.
            sent_doc_list.append(
                self.self_doc(
                    DependencyTreeUtil.
                    get_subtree_span_from_one_token_filter_another_token_sub_tree_for_split_sentence(
                        sent_doc, token, predicate).text))
            # NOTE(review): same two suspect comparisons as the 'conj' branch.
            if sent_doc_list[1] is None or sent_doc_list[
                    0].text == sent_doc_list[1].text:
                return list(sent_doc_list[0])
            if sent_doc_list[0].text == sent_doc.text or sent_doc_list[
                    1] == sent_doc.text:
                return sent_doc_list
            final_sent_list = []
            for sent_doc_l in sent_doc_list:
                final_sent_list.extend(self.clause_extraction(sent_doc_l))
            return final_sent_list
        if token.dep_ == 'mark' and token.tag_ == 'IN' and token.lemma_.lower(
        ) not in self.mark_filter_words:
            # added = True
            conj_predicate_span = DependencyTreeUtil.get_subtree_span_from_one_token_index(
                sent_doc, token.head.i)
            # Strip leading 'mark' tokens (keep scanning while the next token
            # is also a mark) so the clause starts after its subordinator.
            for index, conj_token in enumerate(conj_predicate_span):
                if conj_token.dep_ == 'mark':
                    if index < len(conj_predicate_span
                                   ) - 1 and conj_predicate_span[
                                       index + 1].dep_ == 'mark':
                        continue
                    else:
                        conj_predicate_span = conj_predicate_span[index + 1:]
                        break
                if index < len(
                        conj_predicate_span) - 1 and conj_predicate_span[
                            index + 1].dep_ == 'mark':
                    continue
            doc1 = self.self_doc(conj_predicate_span.text)
            doc2 = DependencyTreeUtil.get_subtree_span_from_one_token_filter_another_token_sub_tree_for_split_sentence(
                sent_doc, token.head, predicate)
            if doc2 is None:
                sent_doc_list.append(doc1)
                return sent_doc_list
            doc2 = self.self_doc(doc2.text)
            if doc1.text == doc2.text:
                sent_doc_list.append(doc1)
                return sent_doc_list
            sent_doc_list.append(doc1)
            sent_doc_list.append(doc2)
            # Stop recursing when a half equals the whole sentence.
            if doc1.text == sent_doc.text or doc2.text == sent_doc.text:
                return sent_doc_list
            final_sent_list = []
            for sent_doc_l in sent_doc_list:
                final_sent_list.extend(self.clause_extraction(sent_doc_l))
            return final_sent_list
    # for mark_token in conj_predicate_span:
    #     if mark_token.text != 'if':
    # No split point found: return the sentence unchanged.
    sent_doc_list.append(sent_doc)
    return sent_doc_list
def self_doc(self, text):
    """Parse *text* with the pipeline's nlp model, then merge noun-phrase
    chunks and 'NP of NP' constructs into single tokens."""
    parsed = self.nlp(text)
    parsed = DependencyTreeUtil.merge_np_chunks(parsed)
    return DependencyTreeUtil.merge_np_of_np(parsed)
def get_feature_for_functionality(self, sent_doc, subject, predicate, doc_api_name):
    """Extract the characteristics a functionality has.

    Two patterns are handled per token of the sentence:
      * an adverb ('ADV', not WRB, not a stop feature) modifying the
        predicate => has-Feature record whose start entity is the sentence
        with the feature text removed;
      * an adverbial clause ('advcl') on the predicate => has-Feature record
        for the subject (with an embedded 'except ...' clause recursively
        re-extracted when it carries its own subject).

    NOTE(review): the early-exit paths return a (list, feature_name) tuple
    while the final return is just the list — callers must handle both;
    confirm this inconsistency is intended.
    """
    # buffering character
    statement_record_list = []
    feature_name = ''
    for i, token in enumerate(sent_doc):
        if token.pos_ == 'ADV' and token.tag_ != 'WRB' and token.lemma_.lower(
        ) not in self.stop_feature and token.head == predicate:
            feature_span = DependencyTreeUtil.get_subtree_span_from_one_token_obj_by_index(
                sent_doc, token, i)
            if feature_span is None:
                return statement_record_list, feature_name
            feature_name = feature_span.text
            # Odd number of "neg" markers => negated feature.
            neg = False
            for f in feature_span:
                if f.dep_ == "neg":
                    neg = not neg
            condition_text = DependencyTreeUtil.get_conditions_text_for_token(
                sent_doc, feature_span.root)
            extra_info = {
                "condition": condition_text,
                "core": feature_span.text,
                "leading_verb": predicate.lemma_,
                "neg": neg,
                "compare_subject": '',
                "compare_object": '',
            }
            info_from_set = set()
            info_from_set.add(
                (ALLKnowledgeFromType.FROM_Text_Characteristic,
                 sent_doc.text, doc_api_name))
            # Start entity = the sentence with the first occurrence of the
            # feature text removed and whitespace re-normalized.
            start_name = " ".join(
                sent_doc.text.replace(feature_name, "", 1).split())
            relation_data_tuple = StatementRecord(
                start_name, RelationNameConstant.has_Feature_Relation,
                feature_name, NPEntityType.FunctionalityType,
                NPEntityType.CharacteristicType, self.extractor_name,
                info_from_set, **extra_info)
            statement_record_list.append(relation_data_tuple)
        if token.dep_ == 'advcl' and token.head == predicate:
            # Skip clauses introduced by filtered markers (e.g. 'if').
            ifflag = False
            if token is None:
                continue
            for sub_token in token.lefts:
                if sub_token.lemma_.lower() in self.mark_filter_words:
                    ifflag = True
            if ifflag:
                continue
            feature_span = DependencyTreeUtil.get_subtree_span_from_one_token_obj_by_index(
                sent_doc, token, i)
            if feature_span is None:
                return statement_record_list, feature_name
            if token.left_edge.text in self.stop_feature:
                return statement_record_list, feature_name
            # 'except ...' clauses that carry their own subject are re-parsed
            # (minus the mark tokens) and fed back through extract_from_text.
            if token.left_edge.text == 'except' and token.left_edge.dep_ == 'mark':
                clause_flag = False
                clause = []
                for span_token in feature_span:
                    if span_token.dep_.startswith('nsubj'):
                        clause_flag = True
                    if span_token.dep_ != 'mark':
                        clause.append(span_token.text)
                if clause_flag:
                    clause_doc = self.nlp(' '.join(clause))
                    clause_doc = DependencyTreeUtil.merge_np_chunks(
                        clause_doc)
                    clause_doc = DependencyTreeUtil.merge_np_of_np(
                        clause_doc)
                    clause_relation_tuple_result = self.extract_from_text(
                        clause_doc.text, doc_api_name)
                    statement_record_list.extend(
                        clause_relation_tuple_result)
            feature_name = feature_span.text
            neg = False
            for f in feature_span:
                if f.dep_ == "neg":
                    neg = not neg
            condition_text = DependencyTreeUtil.get_conditions_text_for_token(
                sent_doc, feature_span.root)
            # Find the clause's root token (its own head) as the leading verb.
            # NOTE(review): this inner loop rebinds `token`, shadowing the
            # outer loop variable for the rest of this iteration; and if no
            # root is found, lead_verb stays '' and `.lemma_` below raises.
            lead_verb = ""
            for token in feature_span:
                if token.head == token:
                    lead_verb = token
            extra_info = {
                "condition": condition_text,
                "core": feature_span.text,
                "leading_verb": lead_verb.lemma_,
                "neg": neg,
                "compare_subject": '',
                "compare_object": '',
            }
            # Normalize the feature's root word to its lemma.
            feature_name = feature_name.replace(feature_span.root.text,
                                                feature_span.root.lemma_)
            info_from_set = set()
            info_from_set.add(
                (ALLKnowledgeFromType.FROM_Text_Characteristic,
                 sent_doc.text, doc_api_name))
            relation_data_tuple = StatementRecord(
                subject.text, RelationNameConstant.has_Feature_Relation,
                feature_name, NPEntityType.CategoryType,
                NPEntityType.CharacteristicType, self.extractor_name,
                info_from_set, **extra_info)
            statement_record_list.append(relation_data_tuple)
    return statement_record_list