Пример #1
0
    def extract_represent(self, sent_doc, subject, predicate, doc_api_name):
        """Extract an is-a statement from a "<subject> represents <object>" sentence.

        :param sent_doc: parsed sentence; its ``text`` must contain " represent".
        :param subject: subject of the sentence; its ``text`` names the source entity.
        :param predicate: predicate token; its lemma must be "represent".
        :param doc_api_name: API name recorded as the origin of the statement.
        :return: list holding at most one ``StatementRecord``. It is empty when
            the sentence does not match, when the predicate subtree carries an
            odd number of ``neg`` dependencies, or when the verb has no object.
        """
        records = []
        if ' represent' not in sent_doc.text or predicate.lemma_ != 'represent':
            return records

        predicate_subtree = DependencyTreeUtil.get_subtree_span_from_one_token_obj(
            sent_doc, predicate)
        # Every ``neg`` dependency flips the polarity: an odd count means negated.
        neg_count = sum(1 for tok in predicate_subtree if tok.dep_ == "neg")
        if neg_count % 2 == 1:
            return records

        verb_object = DependencyTreeUtil.get_object_for_verb(
            sent_doc, predicate)
        if verb_object is None:
            return records

        provenance = {(ALLKnowledgeFromType.FROM_Text_Category, sent_doc.text,
                       doc_api_name)}
        records.append(
            StatementRecord(
                subject.text, RelationNameConstant.Ontology_IS_A_Relation,
                verb_object.text, NPEntityType.CategoryType,
                NPEntityType.CategoryType, self.extractor_name, provenance))
        return records
Пример #2
0
    def extract_for_A_is_xxx(self, sent_doc, subject, predicate, doc_api_name):
        """Extract is-a statements from an "A is <noun phrase>" sentence.

        The attribute of the "be" predicate is split into its parallel parts;
        every part whose root is a noun or proper noun yields one is-a
        ``StatementRecord`` for ``subject``.

        Extraction stops (returning what was collected so far) as soon as a
        noun part is reached while the predicate is negated, or a noun phrase
        starts with "member of" / "part of". Any exception raised while
        handling one part is printed and that part is skipped.

        :return: list of ``StatementRecord`` objects, possibly empty.
        """
        records = []

        be_attribute = DependencyTreeUtil.get_attr_for_be_predicate(
            doc=sent_doc, predicate_token=predicate)
        predicate_subtree = DependencyTreeUtil.get_subtree_span_from_one_token_obj(
            sent_doc, predicate)
        # Every ``neg`` dependency flips the polarity: an odd count means negated.
        negated = sum(
            1 for tok in predicate_subtree if tok.dep_ == "neg") % 2 == 1
        if be_attribute is None:
            return records

        parallel_parts = DependencyTreeUtil.split_span_into_parallel(
            sent_doc, be_attribute)
        for part in parallel_parts:
            if part is None or part.text == "":
                continue
            try:
                if part.root.pos_ not in ("NOUN", "PROPN"):
                    continue
                if negated:
                    return records

                phrase_doc = self.nlp(part.text)
                noun_phrase, _features = DependencyTreeUtil.split_large_noun_phase_span_to_adj_and_np(
                    span=phrase_doc)

                if noun_phrase.startswith(('member of', 'part of')):
                    relation = RelationNameConstant.Ontology_Derive_Relation
                else:
                    relation = RelationNameConstant.Ontology_IS_A_Relation
                if noun_phrase.startswith('member of'):
                    noun_phrase = noun_phrase.replace('member of', '', 1)
                if noun_phrase.startswith('part of'):
                    noun_phrase = noun_phrase.replace('part of', '', 1)

                provenance = {(ALLKnowledgeFromType.FROM_Text_Category,
                               sent_doc.text, doc_api_name)}
                category_name = noun_phrase
                if "base class" in category_name.lower():
                    # "base class" alone is too generic: append its conditions.
                    conditions = DependencyTreeUtil.get_conditions_text_for_token(
                        sent_doc, part.root)
                    category_name += " " + conditions

                # Only plain is-a relations are emitted by this extractor.
                if relation != RelationNameConstant.Ontology_IS_A_Relation:
                    return records

                records.append(
                    StatementRecord(
                        subject.text,
                        RelationNameConstant.Ontology_IS_A_Relation,
                        category_name, NPEntityType.CategoryType,
                        NPEntityType.CategoryType, self.extractor_name,
                        provenance))
            except Exception as e:
                print(e)
        return records
Пример #3
0
    def __init__(self, api_from, full_doc, doc, predicate=None):
        """Store the sentence data and resolve its predicate and subject.

        :param api_from: stored unchanged as ``self.api_from``.
        :param full_doc: stored unchanged as ``self.full_doc``.
        :param doc: stored as ``self.doc``; also the input used to look up
            the predicate and the subject.
        :param predicate: explicit predicate to use. When ``None`` it is
            looked up with ``DependencyTreeUtil.get_main_predicate(doc)``.
        """
        self.api_from, self.full_doc, self.doc = api_from, full_doc, doc
        self.predicate = (DependencyTreeUtil.get_main_predicate(doc)
                          if predicate is None else predicate)
        self.subject = DependencyTreeUtil.get_subject(doc)
Пример #4
0
    def extract_for_A_is_xxx(self, sent_doc, subject, predicate, doc_api_name):
        """Extract has-feature statements from an "A is <adjective>" sentence.

        Pattern: AE be [a/an] JJ+ NP (COND)

        The attribute of the "be" predicate is split into its parallel parts;
        every part whose root is an adjective yields one has-feature
        ``StatementRecord``. When the predicate is negated the feature name is
        prefixed with "not ". Any exception raised while handling one part is
        printed and that part is skipped.

        :return: list of ``StatementRecord`` objects, possibly empty.
        """
        records = []

        be_attribute = DependencyTreeUtil.get_attr_for_be_predicate(
            doc=sent_doc, predicate_token=predicate)
        predicate_subtree = DependencyTreeUtil.get_subtree_span_from_one_token_obj(
            sent_doc, predicate)
        # Every ``neg`` dependency flips the polarity: an odd count means negated.
        negated = sum(
            1 for tok in predicate_subtree if tok.dep_ == "neg") % 2 == 1
        if be_attribute is None:
            return records

        for part in self.new_split_span_into_parallel(sent_doc, be_attribute):
            if part is None or part.text == "":
                continue
            try:
                adjective = part.root
                if adjective.pos_ != "ADJ":
                    continue

                feature_name = adjective.text
                if negated:
                    feature_name = 'not ' + feature_name

                # Prefer conditions attached to the adjective itself and fall
                # back to those attached to the predicate.
                condition_text = DependencyTreeUtil.get_conditions_text_for_token(
                    sent_doc, adjective)
                if condition_text == '':
                    condition_text = DependencyTreeUtil.get_conditions_text_for_token(
                        sent_doc, predicate)

                details = {
                    "condition": condition_text,
                    "core": adjective.text,
                    "leading_verb": adjective.head.lemma_,
                    "compare_subject": '',
                    "compare_object": '',
                    "neg": negated,
                }
                provenance = {(ALLKnowledgeFromType.FROM_Text_Characteristic,
                               sent_doc.text, doc_api_name)}
                records.append(
                    StatementRecord(
                        subject.text,
                        RelationNameConstant.has_Feature_Relation,
                        feature_name + ' ' + condition_text,
                        NPEntityType.CategoryType,
                        NPEntityType.CharacteristicType, self.extractor_name,
                        provenance, **details))
            except Exception as e:
                print(e)
        return records
Пример #5
0
    def extract_for_A_could_be_xxx(self, sent_doc, subject, predicate,
                                   doc_api_name):
        """Extract a has-feature statement from an "A can be <done>" sentence.

        :param sent_doc: parsed sentence.
        :param subject: subject span; its ``root`` and ``text`` are used.
        :param predicate: predicate token of the sentence.
        :param doc_api_name: API name recorded as the origin of the statement.
        :return: list holding at most one ``StatementRecord``. It is empty when
            ``DependencyTreeUtil.get_can_be_string`` finds nothing or when the
            feature span is rejected by the guard below.
        """
        statement_record_list = []
        core_feature = DependencyTreeUtil.get_can_be_string(
            sent_doc, predicate)
        if core_feature is None:
            return statement_record_list
        feature_span = DependencyTreeUtil.get_subtree_span_from_one_token_filter_another_token_sub_tree(
            sent_doc, subject.root, predicate)
        # NOTE(review): the ``end_char != 0`` test is kept exactly as found.
        # If ``end_char`` is the character offset of the span end (as it is
        # for spaCy spans), every non-empty span is rejected here and nothing
        # below ever runs -- confirm whether this guard is intentional.
        if feature_span is None or feature_span.end_char != 0 or feature_span.text == '':
            return statement_record_list

        # Every ``neg`` dependency flips the polarity: an odd count means negated.
        neg = sum(1 for tok in feature_span if tok.dep_ == "neg") % 2 == 1
        condition_text = DependencyTreeUtil.get_conditions_text_for_token(
            sent_doc, feature_span.root)

        feature_text = DependencyTreeUtil.swap_condition_to_end(
            feature_span, predicate)
        # The text is split on the "$$" marker; the subject is removed once
        # from the second part before the parts are joined again.
        parts = feature_text.split('$$')
        if len(parts) > 1:
            second = parts[1].replace(subject.text, '', 1).strip()
            feature_text = parts[0] + second
        # Fix: ``str.strip`` returns a new string; the original call discarded
        # the result, so surrounding whitespace was never removed.
        feature_text = feature_text.strip()

        extra_info = {
            "condition": condition_text,
            "core": core_feature,
            "leading_verb": predicate.lemma_,
            "neg": neg,
            "compare_subject": '',
            "compare_object": '',
        }
        info_from_set = {(ALLKnowledgeFromType.FROM_Text_Characteristic,
                          sent_doc.text, doc_api_name)}

        statement_record_list.append(
            StatementRecord(
                subject.text, RelationNameConstant.has_Feature_Relation,
                feature_text, NPEntityType.CategoryType,
                NPEntityType.CharacteristicType, self.extractor_name,
                info_from_set, **extra_info))
        return statement_record_list
Пример #6
0
 def is_A_be_jj_np_template(self, sent_doc, subject, predicate):
     """Tell whether the sentence matches the "can be <past participle>" template.

     :return: ``True`` only when ``predicate`` is a verb tagged ``VBN`` and
         ``DependencyTreeUtil.get_can_be_string`` finds a string for it.
     """
     if predicate.pos_ != "VERB" or predicate.tag_ != "VBN":
         return False
     can_be_string = DependencyTreeUtil.get_can_be_string(sent_doc, predicate)
     return can_be_string is not None
Пример #7
0
 def new_split_span_into_parallel(self, doc, span):
     """Split ``span`` into its parallel (appositive / coordinated) parts.

     The text of ``span`` is re-parsed with ``self.self_doc``. For every
     token with an ``appos`` or ``cc`` dependency, the left subtree of its
     head is collected; afterwards the left subtree of every right child of
     those heads is collected too, except children tagged ``CC``.

     :return: list of collected spans, or ``[span]`` when the re-parsed text
         has no ``appos`` / ``cc`` token.
     """
     reparsed = self.self_doc(span.text)
     heads = set()
     parts = []
     for tok in reparsed:
         if tok.dep_ in ("appos", "cc"):
             heads.add(tok.head)
             parts.append(
                 DependencyTreeUtil.get_left_subtree_and_itself_span(
                     doc, tok.head))
     for head in heads:
         parts.extend(
             DependencyTreeUtil.get_left_subtree_and_itself_span(doc, child)
             for child in head.rights if child.tag_ != "CC")
     if not heads:
         # Nothing to split: the span is its own single part.
         parts.append(span)
     return parts
    def extract_belong_to_like_category(self, sent_doc, subject, predicate,
                                        doc_api_name):
        """Extract a derive statement from "A belongs to B" / "A has B" sentences.

        A sentence matches when its predicate lemma is "belong" and the text
        contains "belong... to", or when the predicate lemma is "have" and the
        predicate is not an auxiliary. Both cases are handled the same way.

        :return: list holding at most one ``StatementRecord``. It is empty when
            the sentence does not match, when the predicate subtree carries an
            odd number of ``neg`` dependencies, or when neither a verb object
            nor a prepositional object is found.
        """
        # have + belong to
        records = []
        is_belong_to = bool(re.search(
            r'belong[a-z]* to', sent_doc.text)) and predicate.lemma_ == 'belong'
        is_main_have = predicate.lemma_ == 'have' and not predicate.dep_.startswith(
            'aux')
        if not (is_belong_to or is_main_have):
            return records

        predicate_subtree = DependencyTreeUtil.get_subtree_span_from_one_token_obj(
            sent_doc, predicate)
        # Every ``neg`` dependency flips the polarity: an odd count means negated.
        if sum(1 for tok in predicate_subtree if tok.dep_ == "neg") % 2 == 1:
            return records

        verb_object = DependencyTreeUtil.get_object_for_verb(
            sent_doc, predicate)
        prep_object = DependencyTreeUtil.get_object_for_preposition(
            sent_doc, predicate)
        # The direct object wins; the prepositional object is the fallback.
        target = verb_object if verb_object else prep_object
        if not target:
            return records

        provenance = {(ALLKnowledgeFromType.FROM_Text_Category, sent_doc.text,
                       doc_api_name)}
        records.append(
            StatementRecord(
                subject.text, RelationNameConstant.Ontology_Derive_Relation,
                target.text, NPEntityType.CategoryType,
                NPEntityType.CategoryType, self.extractor_name, provenance))
        return records
Пример #9
0
 def print_nlp_analysis(sent_doc):
     """Print a debugging dump of the parse of ``sent_doc``.

     The dump lists the noun chunks, the part-of-speech tags of the tokens
     inside each chunk, one dependency line per token, and finally the
     subject and main predicate found by ``DependencyTreeUtil``.
     """
     sep = " - "
     print("np_chunk_detail for sentence", list(sent_doc.noun_chunks))
     for chunk in sent_doc.noun_chunks:
         chunk_root = chunk.root
         print(chunk.text, sep, chunk_root.text, sep, chunk_root.dep_, sep,
               chunk_root.head.text)
     print("----chunk detail")
     for chunk in sent_doc.noun_chunks:
         for token in chunk:
             print(token.text, sep, token.pos_, sep, token.tag_, sep)
     print("-----------end chunk print----------")
     # ``print`` joins its arguments with one space, so columns end up
     # separated by the separator padded with a space on each side.
     column_gap = " " + sep + " "
     for token in sent_doc:
         head = token.head
         columns = (
             token.text,
             token.pos_,
             token.tag_,
             token.dep_,
             head.text,
             head.pos_,
             list(token.children),
             list(token.lefts),
             list(token.rights),
         )
         print(column_gap.join(str(column) for column in columns))
     print("-----------end tree print----------")
     print("-----------subtree----------")
     print("subject of is:", DependencyTreeUtil.get_subject(doc=sent_doc),
           DependencyTreeUtil.get_subject_text(sent_doc))
     print("predicate is:",
           DependencyTreeUtil.get_main_predicate(doc=sent_doc))
     print("-----------end subtree----------")
Пример #10
0
 def create_sentence_list(self, text, api_from=""):
     """Turn raw text into a list of valid ``SimpleSentence`` objects.

     Whitespace runs are collapsed and "is A " is rewritten to "is a ". Text
     rejected by ``self.check_api_related`` yields an empty list. Otherwise
     the text is parsed, each sentence is split into clauses, and every
     clause accepted by ``self.check_simple_sentence_valid`` is kept.

     :param text: raw text to process.
     :param api_from: passed to ``self.modify_api_qualified_name`` and to
         every ``SimpleSentence``.
     :return: list of ``SimpleSentence`` objects, possibly empty.
     """
     sentences = []
     normalized = ' '.join(text.split()).replace("is A ", "is a ")
     if not self.check_api_related(normalized):
         return sentences

     normalized = self.modify_api_qualified_name(normalized, api_from)
     full_doc = self.self_doc(normalized)
     for sentence_span in full_doc.sents:
         for clause_doc in self.clause_extraction(sentence_span.as_doc()):
             merged_clause = DependencyTreeUtil.merge_np_of_np(clause_doc)
             candidate = SimpleSentence(api_from, full_doc, merged_clause)
             if self.check_simple_sentence_valid(candidate):
                 sentences.append(candidate)
     return sentences
Пример #11
0
 def extract_simple_sentence(self, simple_sentence: SimpleSentence):
     """Gather the parse data of ``simple_sentence`` used for extraction.

     NOTE(review): only these set-up statements are present and nothing is
     returned, so the rest of the routine appears to be missing here --
     confirm against the complete implementation.
     """
     # Values exposed by the sentence object through its accessors.
     doc = simple_sentence.get_doc()
     full_doc = simple_sentence.get_full_doc()
     api_name = simple_sentence.api_from
     # The main predicate is looked up on the full document, not on ``doc``.
     main_predicate = DependencyTreeUtil.get_main_predicate(full_doc)
Пример #12
0
    def clause_extraction(self, sent_doc):
        """Split a sentence into simpler clause documents.

        The tokens of ``sent_doc`` are scanned in order; the first token that
        matches one of the patterns below triggers a split and ends the scan:

        * a verb conjoined (``conj``) to the main predicate,
        * a relative clause (``relcl``) whose head is a noun,
        * a marker (``mark`` tagged ``IN``) whose lemma is not listed in
          ``self.mark_filter_words``.

        A split produces the extracted clause(s) plus the remainder of the
        sentence, each re-parsed with ``self.self_doc``; the pieces are then
        split again recursively (see ``_finish_clause_split``).

        :param sent_doc: parsed sentence to split.
        :return: list of documents. Empty when no main predicate or subject
            is found; ``[sent_doc]`` when no pattern matches.
        """
        predicate = DependencyTreeUtil.get_main_predicate(sent_doc)
        subject = DependencyTreeUtil.get_subject(sent_doc)
        sent_doc_list = []
        if predicate is None or subject is None:
            return sent_doc_list

        for i, token in enumerate(sent_doc):
            if token.dep_ == 'conj' and token.pos_ == 'VERB' and token.head == predicate:
                clause_span = DependencyTreeUtil.get_subtree_span_from_one_token_index(
                    sent_doc, i)
                # Check whether the conjoined clause has a subject of its own;
                # if not, borrow the subject of the main clause.
                has_subject = any(
                    clause_token.dep_.startswith('nsubj')
                    for clause_token in clause_span)
                if has_subject:
                    clause_text = clause_span.text
                else:
                    clause_text = subject.text + ' ' + clause_span.text
                sent_doc_list.append(self.self_doc(clause_text))
                remainder_span = DependencyTreeUtil.get_subtree_span_from_one_token_filter_another_token_sub_tree_for_split_sentence(
                    sent_doc, token, predicate)
                return self._finish_clause_split(sent_doc, sent_doc_list,
                                                 remainder_span)

            if token.dep_ == 'relcl' and token.head.pos_ == 'NOUN':
                clause_span = DependencyTreeUtil.get_subtree_span_from_one_token_index(
                    sent_doc, i)
                found_relative_pronoun = False
                for index, clause_token in enumerate(clause_span):
                    if clause_token.tag_ in ('WDT', 'DT') and clause_token.text in (
                            'that', 'which'):
                        found_relative_pronoun = True
                        # Replace the relative pronoun by the noun it refers to.
                        tail_text = clause_span[index + 1:len(clause_span) +
                                                1].text
                        if index > 1:
                            # NOTE(review): the slice stops at ``index - 1``,
                            # so the token right before the pronoun is dropped
                            # too. Kept as found -- confirm it is intended.
                            text = token.head.text + ' ' + clause_span[
                                0:index - 1].text + ' ' + tail_text
                        else:
                            text = token.head.text + ' ' + tail_text
                        sent_doc_list.append(self.self_doc(text))
                if not found_relative_pronoun:
                    text_all = token.head.text + ' ' + clause_span.text
                    sent_doc_list.append(self.self_doc(text_all))
                remainder_span = DependencyTreeUtil.get_subtree_span_from_one_token_filter_another_token_sub_tree_for_split_sentence(
                    sent_doc, token, predicate)
                return self._finish_clause_split(sent_doc, sent_doc_list,
                                                 remainder_span)

            if token.dep_ == 'mark' and token.tag_ == 'IN' and token.lemma_.lower(
            ) not in self.mark_filter_words:
                clause_span = DependencyTreeUtil.get_subtree_span_from_one_token_index(
                    sent_doc, token.head.i)
                # Drop everything up to and including the first marker that is
                # not directly followed by another marker.
                for index, clause_token in enumerate(clause_span):
                    if clause_token.dep_ != 'mark':
                        continue
                    next_is_mark = index < len(
                        clause_span) - 1 and clause_span[index +
                                                         1].dep_ == 'mark'
                    if not next_is_mark:
                        clause_span = clause_span[index + 1:]
                        break
                sent_doc_list.append(self.self_doc(clause_span.text))
                remainder_span = DependencyTreeUtil.get_subtree_span_from_one_token_filter_another_token_sub_tree_for_split_sentence(
                    sent_doc, token.head, predicate)
                return self._finish_clause_split(sent_doc, sent_doc_list,
                                                 remainder_span)

        sent_doc_list.append(sent_doc)
        return sent_doc_list

    def _finish_clause_split(self, sent_doc, clause_docs, remainder_span):
        """Combine extracted clauses with the sentence remainder and recurse.

        :param sent_doc: the sentence that was split.
        :param clause_docs: non-empty list of documents built from the
            extracted clause(s).
        :param remainder_span: what is left of the sentence once the clause
            is removed, or ``None`` when nothing is left.
        :return: list of documents (never a list of tokens).
        """
        # Without a remainder the extracted clauses are the whole result; the
        # conj / relcl paths used to call ``.text`` on ``None`` here.
        if remainder_span is None:
            return clause_docs
        split_docs = clause_docs + [self.self_doc(remainder_span.text)]
        # As before, only the first two pieces are compared. With several
        # relative pronouns the second piece is another clause, not the
        # remainder.
        first, second = split_docs[0], split_docs[1]
        if first.text == second.text:
            # Fix: wrap the document in a list. ``list(doc)`` iterated the
            # document and returned its tokens instead.
            return [first]
        # Fix: compare texts on both sides. Comparing a document with a
        # string never matched, so a remainder equal to the whole sentence
        # was split again without end.
        if first.text == sent_doc.text or second.text == sent_doc.text:
            return split_docs
        final_sent_list = []
        for split_doc in split_docs:
            final_sent_list.extend(self.clause_extraction(split_doc))
        return final_sent_list
Пример #13
0
 def self_doc(self, text):
     """Parse ``text`` with ``self.nlp`` and merge its noun phrases.

     The parsed document is passed through
     ``DependencyTreeUtil.merge_np_chunks`` and then
     ``DependencyTreeUtil.merge_np_of_np``; the final result is returned.
     """
     parsed = self.nlp(text)
     with_merged_chunks = DependencyTreeUtil.merge_np_chunks(parsed)
     return DependencyTreeUtil.merge_np_of_np(with_merged_chunks)
Пример #14
0
    def get_feature_for_functionality(self, sent_doc, subject, predicate,
                                      doc_api_name):
        """
        Extract the features attached to a functionality.

        Two kinds of modifiers headed by ``predicate`` become has-feature
        ``StatementRecord`` objects:

        * adverbs that are not tagged ``WRB`` and whose lemma is not in
          ``self.stop_feature``;
        * adverbial clauses (``advcl``), unless one of their left children
          has a lemma listed in ``self.mark_filter_words``.

        :param sent_doc: parsed sentence.
        :param subject: subject of the sentence; its ``text`` names the
            source entity of clause features.
        :param predicate: main predicate token of the sentence.
        :param doc_api_name: API name recorded as the origin of statements.
        :return: list of ``StatementRecord`` objects. Every exit path returns
            this list; some early exits used to return a
            ``(list, feature_name)`` tuple instead.
            NOTE(review): confirm no caller relied on the tuple shape.
        """
        # buffering character
        statement_record_list = []
        feature_name = ''
        for i, token in enumerate(sent_doc):
            if token.pos_ == 'ADV' and token.tag_ != 'WRB' and token.lemma_.lower(
            ) not in self.stop_feature and token.head == predicate:
                feature_span = DependencyTreeUtil.get_subtree_span_from_one_token_obj_by_index(
                    sent_doc, token, i)
                if feature_span is None:
                    return statement_record_list
                feature_name = feature_span.text
                # Every ``neg`` dependency flips the polarity.
                neg = sum(1 for f in feature_span if f.dep_ == "neg") % 2 == 1
                condition_text = DependencyTreeUtil.get_conditions_text_for_token(
                    sent_doc, feature_span.root)
                extra_info = {
                    "condition": condition_text,
                    "core": feature_span.text,
                    "leading_verb": predicate.lemma_,
                    "neg": neg,
                    "compare_subject": '',
                    "compare_object": '',
                }
                info_from_set = {
                    (ALLKnowledgeFromType.FROM_Text_Characteristic,
                     sent_doc.text, doc_api_name)
                }
                # The functionality name is the sentence without the feature.
                start_name = " ".join(
                    sent_doc.text.replace(feature_name, "", 1).split())

                statement_record_list.append(
                    StatementRecord(
                        start_name, RelationNameConstant.has_Feature_Relation,
                        feature_name, NPEntityType.FunctionalityType,
                        NPEntityType.CharacteristicType, self.extractor_name,
                        info_from_set, **extra_info))

            if token.dep_ == 'advcl' and token.head == predicate:
                # Clauses introduced by a filtered marker are skipped.
                if any(sub_token.lemma_.lower() in self.mark_filter_words
                       for sub_token in token.lefts):
                    continue
                feature_span = DependencyTreeUtil.get_subtree_span_from_one_token_obj_by_index(
                    sent_doc, token, i)
                if feature_span is None:
                    return statement_record_list
                if token.left_edge.text in self.stop_feature:
                    return statement_record_list
                if token.left_edge.text == 'except' and token.left_edge.dep_ == 'mark':
                    # An "except ..." clause with its own subject is itself a
                    # sentence: extract from it without the marker tokens.
                    has_subject = False
                    clause_words = []
                    for span_token in feature_span:
                        if span_token.dep_.startswith('nsubj'):
                            has_subject = True
                        if span_token.dep_ != 'mark':
                            clause_words.append(span_token.text)
                    if has_subject:
                        clause_doc = self.nlp(' '.join(clause_words))
                        clause_doc = DependencyTreeUtil.merge_np_chunks(
                            clause_doc)
                        clause_doc = DependencyTreeUtil.merge_np_of_np(
                            clause_doc)
                        statement_record_list.extend(
                            self.extract_from_text(clause_doc.text,
                                                   doc_api_name))
                feature_name = feature_span.text
                neg = sum(1 for f in feature_span if f.dep_ == "neg") % 2 == 1
                condition_text = DependencyTreeUtil.get_conditions_text_for_token(
                    sent_doc, feature_span.root)
                # Fix: the leading verb used to default to "" and then
                # ``"".lemma_`` raised AttributeError whenever the span held
                # no self-headed token. The span root is now the fallback.
                # The loop variable also no longer rebinds ``token``.
                lead_verb = feature_span.root
                for span_token in feature_span:
                    if span_token.head == span_token:
                        lead_verb = span_token
                extra_info = {
                    "condition": condition_text,
                    "core": feature_span.text,
                    "leading_verb": lead_verb.lemma_,
                    "neg": neg,
                    "compare_subject": '',
                    "compare_object": '',
                }
                feature_name = feature_name.replace(feature_span.root.text,
                                                    feature_span.root.lemma_)
                info_from_set = {
                    (ALLKnowledgeFromType.FROM_Text_Characteristic,
                     sent_doc.text, doc_api_name)
                }
                statement_record_list.append(
                    StatementRecord(
                        subject.text, RelationNameConstant.has_Feature_Relation,
                        feature_name, NPEntityType.CategoryType,
                        NPEntityType.CharacteristicType, self.extractor_name,
                        info_from_set, **extra_info))

        return statement_record_list