def entity_combined_with_scenario():
    """Scan each scenario sentence for entity mentions and extract relationships.

    For every sentence returned by ``text_into_sentence()`` the tokens accepted
    by ``find_entities`` are collected.  When a sentence yields at least two
    entity hits, exact duplicates are removed with ``set`` and then, for any
    pair where one hit is the lemma of the other, the lemma form is dropped so
    only one spelling of the entity survives.  The remaining entities are
    handed to ``find_relationship`` together with the sentence.

    Side effects: appends the sentence to the module-level
    ``sentence_list_has_entities`` once per entity hit, prints debug output,
    and triggers whatever ``find_relationship`` records.  Returns ``None``.
    """
    sentences = text_into_sentence()
    # NOTE(review): the reason for the fixed 10 s pauses is not visible in this
    # function; they are kept unchanged.
    sleep(10)
    for sentence in sentences:
        entity_list = []
        for word in nltk.word_tokenize(sentence):
            word_new = find_entities(word)
            if word_new is not None:
                sentence_list_has_entities.append(sentence)
                entity_list.append(word_new)

        if len(entity_list) >= 2:
            sleep(10)
            duplicate_removed_entity_list = list(set(entity_list))
            if len(duplicate_removed_entity_list) >= 2:
                # Iterate over a snapshot: removing from the list that is being
                # iterated makes the loop skip the element that follows the
                # removed one, which could leave a singular/plural pair behind.
                for entity in list(duplicate_removed_entity_list):
                    if entity not in duplicate_removed_entity_list:
                        # Already dropped as the lemma of an earlier entity.
                        continue
                    lem_entity = lemmatizer.lemmatize(entity)
                    start = duplicate_removed_entity_list.index(entity) + 1
                    new_list = duplicate_removed_entity_list[start:]
                    print(new_list)
                    for entity_1 in new_list:
                        lem_entity_1 = lemmatizer.lemmatize(entity_1)
                        if entity_1 == lem_entity:
                            # A later hit is the lemma of this one: drop the lemma.
                            duplicate_removed_entity_list.remove(lem_entity)
                            break
                        elif entity == lem_entity_1:
                            # This hit is the lemma of a later one: drop this hit.
                            duplicate_removed_entity_list.remove(entity)
                            break
            # NOTE(review): the source formatting was lost; this call and the
            # separator print are assumed to belong to the ">= 2 hits" branch,
            # the only place where duplicate_removed_entity_list is defined.
            find_relationship(duplicate_removed_entity_list, sentence)
            print("+++++++++")
def find_entities(word):
    """Return *word* if it matches an entity declared in the input XML.

    A match means *word* equals the ``name`` attribute of an ``<entity>``
    element, equals the lemmatized entity name, or lemmatizes to the same
    form as the entity name.

    Args:
        word: a single token from the scenario text.

    Returns:
        *word* unchanged when it matches an entity, otherwise ``None``.
    """
    root = get_root_of_input_xml()
    # Loop-invariant: lemmatize the candidate once instead of once per entity.
    word_singular = lemmatizer.lemmatize(word)
    for entity_ref in root.findall('entity'):
        entity = entity_ref.get('name')
        if entity is None:
            # An <entity> without a name attribute cannot match any word and
            # must not be passed to the lemmatizer.
            continue
        entity_singular = lemmatizer.lemmatize(entity)
        if word == entity or word == entity_singular or word_singular == entity_singular:
            return word
    return None
def get_sentences_match_with_entities_binary(member1, member2, relationship):
    """Collect the scenario sentences that mention both members around the verb.

    *relationship* may be a single word or two words joined by ``_``; in the
    joined case the second word is the one searched for.  A sentence is kept
    when it contains, in order, one member form, the verb (as given or
    lemmatized as a verb) and the other member form, in either direction.
    Matching is case-insensitive.  Every kept sentence is also printed.

    Args:
        member1: first entity name as it appeared in the text.
        member2: second entity name as it appeared in the text.
        relationship: verb, or ``verb1_verb2``.

    Returns:
        List of matching sentences, in the order ``text_into_sentence()``
        yields them.
    """
    found = []
    candidates = text_into_sentence()
    singular_first = lemmatizer.lemmatize(member1)
    singular_second = lemmatizer.lemmatize(member2)

    pieces = relationship.split('_')
    verb = pieces[1] if len(pieces) > 1 else pieces[0]
    verb_lemma = lemmatizer.lemmatize(verb, pos="v")

    # (left entity form, verb form, right entity form), in the order the
    # patterns are tried.
    triples = (
        (member1, verb, member2),
        (member1, verb_lemma, member2),
        (singular_first, verb, member2),
        (singular_first, verb_lemma, member2),
        (singular_first, verb, singular_second),
        (member2, verb, member1),
        (member2, verb_lemma, member1),
        (singular_second, verb, member1),
        (singular_second, verb_lemma, member1),
        (singular_second, verb, singular_first),
    )
    patterns = [
        r"" + re.escape(left) + "(.*)" + re.escape(middle) + "(.*)" + re.escape(right)
        for left, middle, right in triples
    ]
    flags = re.MULTILINE | re.IGNORECASE

    for candidate in candidates:
        if any(re.search(pattern, candidate, flags) for pattern in patterns):
            print(candidate)
            found.append(candidate)
    return found
def remove_duplicate_of_relationship_list_binary():
    """Drop binary relationships that link the same pair of entities.

    Two entries of the module-level ``binary_relationship_dic_list`` are
    treated as duplicates when their ``member1``/``member2`` values match in
    either order, directly or after lemmatization (the exact combinations are
    listed in the condition below).  Of such a pair, the entry currently being
    examined is kept when one of its members is tagged ``NNS`` (plural noun)
    by ``nltk.pos_tag``; otherwise it is the one that is dropped.

    The list is filtered in place and also returned, so existing references
    to ``binary_relationship_dic_list`` see the reduced list.

    Returns:
        ``binary_relationship_dic_list`` after duplicate removal.
    """

    def _is_plural(member):
        # True when the first token of *member* is tagged as a plural noun.
        tagged = nltk.pos_tag(nltk.word_tokenize(member))
        return bool(tagged) and tagged[0][1] == 'NNS'

    # Pre-compute the surface and lemma forms once per entry.
    entries = []
    for dic in binary_relationship_dic_list:
        member1 = dic.get('member1')
        member2 = dic.get('member2')
        entries.append((member1, member2,
                        lemmatizer.lemmatize(member1),
                        lemmatizer.lemmatize(member2)))

    # Entries are tracked by position rather than located with list.index():
    # index() compares by equality, so identical dicts would all resolve to the
    # first occurrence and never be compared with each other.  Collecting the
    # positions to drop also avoids removing from the list while looping over
    # it (skipped elements, ValueError on a second remove of the same entry).
    dropped = set()
    for index, (member1, member2, lem_mem1, lem_mem2) in enumerate(entries):
        if index in dropped:
            continue
        for new_index, (new_member1, new_member2, n_lem_mem1,
                        n_lem_mem2) in enumerate(entries):
            if new_index == index or new_index in dropped:
                continue
            same_pair = (
                (member1 == new_member1 and member2 == new_member2)
                or (member1 == n_lem_mem1 and member2 == n_lem_mem2)
                or (lem_mem1 == new_member1 and lem_mem2 == new_member2)
                or (member2 == new_member1 and member1 == new_member2)
                or (member2 == n_lem_mem1 and member1 == n_lem_mem2)
                or (lem_mem2 == new_member1 and lem_mem1 == new_member2)
                or (lem_mem1 == new_member2 and member2 == n_lem_mem1)
            )
            if not same_pair:
                continue
            if _is_plural(member1) or _is_plural(member2):
                # Keep the entry with plural members, drop the other one.
                dropped.add(new_index)
            else:
                # The current entry loses; nothing more to compare it against.
                dropped.add(index)
                break

    # Slice assignment keeps the same list object that other code references.
    binary_relationship_dic_list[:] = [
        dic for position, dic in enumerate(binary_relationship_dic_list)
        if position not in dropped
    ]
    return binary_relationship_dic_list
def find_primary_key(member):
    """Look up the primary-key attribute name of an entity in the input XML.

    The entity is located by comparing each ``<entity>`` element's ``name``
    with *member* and with the lemmatized *member*.  Inside a matching entity
    the first ``<attribute>`` whose ``value`` is ``"primary_key"`` wins.

    Args:
        member: entity name, in the form it was extracted from the text.

    Returns:
        The ``name`` of the primary-key attribute, or ``None`` when no
        matching entity carries one.
    """
    xml_root = get_root_of_input_xml()
    accepted_names = (member, lemmatizer.lemmatize(member))
    for entity_node in xml_root.findall('entity'):
        if entity_node.get('name') not in accepted_names:
            continue
        for attribute_node in entity_node.findall('attribute'):
            if attribute_node.get('value') == "primary_key":
                return attribute_node.get('name')
    return None
def get_unary_cardinality_list():
    """Build the cardinality records for all recorded unary relationships.

    Each entry of the module-level ``unary_relationship_dic_list`` becomes a
    ``one_to_one`` record in which ``member1`` and ``member2`` both describe
    the same (lemmatized) entity together with its primary key.  The finished
    list is printed and returned.

    Returns:
        A new list of record dicts, one per unary relationship.
    """

    def side(entity_name, key):
        # A fresh dict per side so member1 and member2 are never shared.
        return {
            "@name": entity_name,
            "@cardinality": "one",
            "@primary_key": key
        }

    records = []
    for entry in unary_relationship_dic_list:
        relation = entry.get('relationship')
        singular = lemmatizer.lemmatize(entry.get("member"))
        key = find_primary_key(singular)
        records.append({
            "@name": relation,
            "@degree": "unary",
            "@type": "one_to_one",
            "member1": side(singular, key),
            "member2": side(singular, key)
        })
    print(records)
    return records
def _verbs_in(tagged_words):
    """Return the words whose POS tag matches ``regex_for_verb_tags``."""
    return [item[0] for item in tagged_words
            if re.search(regex_for_verb_tags, item[1])]


def _record_unary_relationship(entity_list, sentence, pos_tag_list):
    """Record a unary relationship for the single entity in *entity_list*."""
    member = lemmatizer.lemmatize(entity_list[0])
    escaped = re.escape(member)
    # Eliminate sentences where the entity name shows up around a comma list
    # or the word "identified" (original comment: "Eliminate entity names as
    # attributes_extractor").
    regex_for_unary_1 = r"(.*)(" + escaped + ")(.*,.*,.*)(" + escaped + ")(.*)"
    regex_for_unary_2 = (r"(.*)(" + escaped + ")(.*)(" + escaped +
                         ")(.*)(,)(.*)(,)(.*)")
    regex_for_unary_3 = (r"(.*)(" + escaped + ")(.*)(identified)(.*)(" +
                         escaped + ")(.*)")
    print("Unary", entity_list, "sentence", sentence)
    exclusions = (regex_for_unary_1, regex_for_unary_2, regex_for_unary_3)
    if any(re.search(pattern, sentence) for pattern in exclusions):
        return

    relationship_list = _verbs_in(pos_tag_list)
    if not relationship_list:
        # No verb was tagged, so there is nothing to name the relationship
        # with; indexing the empty list here used to raise IndexError.
        return
    if len(relationship_list) > 1:
        relationship = random.choice(relationship_list)
    else:
        relationship = relationship_list[0]
    unary_relationship_dic_list.append({
        "member": member,
        "relationship": relationship,
        "sentence": sentence
    })
    print("Unary Relationship List", unary_relationship_dic_list)


def _record_ternary_relationship(entity_list, sentence, pos_tag_list):
    """Record a ternary relationship for the three entities in *entity_list*."""
    member1 = entity_list[0]
    member2 = entity_list[1]
    member3 = entity_list[2]
    alternatives = (re.escape(member1) + "|" + re.escape(member2) + "|" +
                    re.escape(member3))
    regex_for_ternary_sentence_elimination = (
        r"(.*)(" + alternatives + ".*)(,)(.*" + alternatives +
        ".*)(,)(.*)(" + alternatives + ")")
    if re.search(regex_for_ternary_sentence_elimination, sentence):
        return

    # Collect the verbs found between two entity mentions.
    regex_3_for_identify_related_sentence_part = (
        r"(" + alternatives + ")" + "(.*)" + "(" + alternatives + ")")
    matches = re.finditer(regex_3_for_identify_related_sentence_part,
                          sentence, re.MULTILINE | re.IGNORECASE)
    verb_list = []
    for match in matches:
        # The middle group is processed once per match; the previous code
        # repeated this identical work once per capture group.
        relationship_content_sentence_part = match.group(2)
        print(relationship_content_sentence_part)
        pos_tag_list_1 = nltk.pos_tag(
            sentences_into_word(relationship_content_sentence_part))
        print(pos_tag_list_1)
        verb_list.extend(_verbs_in(pos_tag_list_1))

    without_duplicate_verb_list = list(set(verb_list))
    if len(without_duplicate_verb_list) > 1:
        # Build a filtered copy instead of removing while iterating, which
        # skipped the verb following each removed one.
        auxiliaries = ('is', 'has', 'are', 'have')
        meaningful_verbs = [verb for verb in without_duplicate_verb_list
                            if verb not in auxiliaries]
        # Fall back to the unfiltered verbs when only auxiliaries were found.
        ternary_relationship = random.choice(
            meaningful_verbs or without_duplicate_verb_list)
    elif len(without_duplicate_verb_list) == 1:
        ternary_relationship = verb_list[0]
    else:
        # No verb between the entities: consider every verb of the sentence.
        verb_list.extend(_verbs_in(pos_tag_list))
        if len(verb_list) > 1:
            ternary_relationship = random.choice(verb_list)
        elif len(verb_list) == 1:
            ternary_relationship = verb_list[0]
        else:
            ternary_relationship = "relate"

    ternary_relationship_list.append({
        'member1': member1,
        'relationship': ternary_relationship,
        'member2': member2,
        'member3': member3
    })


def _record_binary_relationship(entity_list, sentence, pos_tag_list):
    """Record a binary relationship when exactly two entity mentions are found."""
    entity_forms = [(entity, lemmatizer.lemmatize(entity))
                    for entity in entity_list]
    # (entity, token position) for every token equal to an entity or its lemma.
    # enumerate() gives the real position; list.index() returned the first
    # equal (word, tag) pair and was wrong for repeated tokens.
    entity_and_index_list = []
    for index, data in enumerate(pos_tag_list):
        for entity, lem_entity in entity_forms:
            if data[0] == entity or data[0] == lem_entity:
                entity_and_index_list.append((entity, index))

    if len(entity_and_index_list) != 2:
        return

    first_member, first_index = entity_and_index_list[0]
    second_member, last_index = entity_and_index_list[1]
    second_index = last_index + 1
    regex_1_identify_entities = (r"" + re.escape(first_member) +
                                 " (of each) " + re.escape(second_member))
    regex_2_identify_entities = (r"" + re.escape(second_member) +
                                 " (of each) " + re.escape(first_member))

    relationship_list = []
    for data in pos_tag_list[first_index:second_index]:
        if re.search(regex_for_verb_tags, data[1]):
            relationship_list.append(data[0])
            # NOTE(review): the source indentation was lost. The original
            # `if count < 2` is read here as "on the first verb only", i.e.
            # the sentence is recorded once when a verb is found. Confirm it
            # was not meant to run after the loop instead.
            if len(relationship_list) < 2:
                relationship_identified_sentence_list.append(sentence)

    flags = re.MULTILINE | re.IGNORECASE
    if relationship_list:
        if len(relationship_list) > 1:
            relationship = relationship_list[0] + '_' + relationship_list[1]
        else:
            relationship = relationship_list[0]
    elif (re.search(regex_1_identify_entities, sentence, flags)
          or re.search(regex_2_identify_entities, sentence, flags)):
        relationship = "related_with"
    else:
        return

    # The entity mentioned second in the sentence is stored as member1.
    binary_relationship_dic_list.append({
        'member1': second_member,
        'relationship': relationship,
        'member2': first_member
    })


def find_relationship(entity_list, sentence):
    """Extract a relationship between the entities mentioned in *sentence*.

    The branch is chosen by the number of entities: exactly one is handled as
    a unary relationship, exactly three as a ternary relationship, and any
    other count as a binary relationship (recorded only when exactly two
    entity mentions are located among the tagged tokens).  Results are
    appended to the module-level ``unary_relationship_dic_list``,
    ``ternary_relationship_list`` and ``binary_relationship_dic_list``; the
    binary branch also appends to ``relationship_identified_sentence_list``.
    Relationship names are verbs taken from the POS-tagged sentence; when
    several candidates exist, the unary and ternary branches pick one with
    ``random.choice``.

    Args:
        entity_list: entity names found in *sentence*.
        sentence: the sentence text.

    Returns:
        Tuple ``(binary_relationship_dic_list, ternary_relationship_list,
        unary_relationship_dic_list)`` - the module-level lists themselves.
    """
    pos_tag_list = nltk.pos_tag(sentences_into_word(sentence))

    if len(entity_list) == 1:
        _record_unary_relationship(entity_list, sentence, pos_tag_list)
    elif len(entity_list) == 3:
        _record_ternary_relationship(entity_list, sentence, pos_tag_list)
    else:
        _record_binary_relationship(entity_list, sentence, pos_tag_list)

    print("Binary", binary_relationship_dic_list)
    print("Ternary", ternary_relationship_list)
    print("Unary", unary_relationship_dic_list)
    return binary_relationship_dic_list, ternary_relationship_list, unary_relationship_dic_list
def get_ternary_cardinality_list():
    """Append a cardinality record for every recorded ternary relationship.

    For each entry of the module-level ``ternary_relationship_list`` the
    three members are classified, in order, as ``"many"`` (when
    ``find_cardinality_many`` is truthy) or ``"one"`` (when
    ``find_cardinality_one`` is truthy).  As soon as a member fits neither,
    the relationship is skipped and the remaining members are not examined.
    The record type is ``many_to_many`` when all three are "many",
    ``one_to_one`` when all three are "one", and ``one_to_many`` otherwise.

    Returns:
        The module-level ``ternary_relation_list`` with the new records
        appended.
    """

    def classify(member, sentences, relation):
        # "many" is checked first; "one" is only consulted when that fails.
        if find_cardinality_many(member, sentences):
            return "many"
        if find_cardinality_one(member, sentences, relation):
            return "one"
        return None

    for entry in ternary_relationship_list:
        first = entry.get('member1')
        second = entry.get('member2')
        third = entry.get('member3')
        relation = entry.get('relationship')
        sentence_list = get_sentences_match_with_entities_ternary(
            first, second, third, relation)

        first_key = find_primary_key(first)
        second_key = find_primary_key(second)
        third_key = find_primary_key(third)
        first_name = lemmatizer.lemmatize(first)
        second_name = lemmatizer.lemmatize(second)
        third_name = lemmatizer.lemmatize(third)

        first_card = classify(first, sentence_list, relation)
        if first_card is None:
            continue
        second_card = classify(second, sentence_list, relation)
        if second_card is None:
            continue
        third_card = classify(third, sentence_list, relation)
        if third_card is None:
            continue

        cards = (first_card, second_card, third_card)
        if cards == ("many", "many", "many"):
            relation_type = "many_to_many"
        elif cards == ("one", "one", "one"):
            relation_type = "one_to_one"
        else:
            relation_type = "one_to_many"

        # Key order (member1, member3, member2) mirrors the existing records.
        ternary_relation_list.append({
            "@name": relation,
            "@degree": "ternary",
            "@type": relation_type,
            "member1": {
                "@name": first_name,
                "@cardinality": first_card,
                "@primary_key": first_key
            },
            "member3": {
                "@name": third_name,
                "@cardinality": third_card,
                "@primary_key": third_key
            },
            "member2": {
                "@name": second_name,
                "@cardinality": second_card,
                "@primary_key": second_key
            }
        })
    return ternary_relation_list
def get_binary_cardinality_list():
    """Append a cardinality record for every de-duplicated binary relationship.

    The relationships come from
    ``remove_duplicate_of_relationship_list_binary()``.  For each one the
    supporting sentences are gathered, both members are classified as
    ``"many"`` (``find_cardinality_many`` truthy) or ``"one"``
    (``find_cardinality_one`` truthy), and:

    * a full record is appended to the module-level ``binary_relation_list``;
    * a ``{member1, member2, relationship}`` summary is appended to
      ``many_to_many_relationship_list``, ``one_to_many_relationship_list``
      or ``one_to_one_relationship_list`` according to the type.

    A relationship is skipped as soon as one member fits neither cardinality.
    Each member is classified once and the result feeds both outputs; the
    previous code ran the whole cascade a second time for the summary lists.
    NOTE(review): this assumes find_cardinality_many / find_cardinality_one
    return the same answer when called again with the same arguments.

    Returns:
        The module-level ``binary_relation_list`` with the new records
        appended.
    """

    def classify(member, sentences, relation):
        # "many" is checked first; "one" is only consulted when that fails.
        if find_cardinality_many(member, sentences):
            return "many"
        if find_cardinality_one(member, sentences, relation):
            return "one"
        return None

    summary_lists = {
        "many_to_many": many_to_many_relationship_list,
        "one_to_many": one_to_many_relationship_list,
        "one_to_one": one_to_one_relationship_list,
    }

    new_relationship_dic_list_binary = remove_duplicate_of_relationship_list_binary()
    for dic in new_relationship_dic_list_binary:
        plural_member1 = dic.get('member1')
        plural_member2 = dic.get('member2')
        relationship = dic.get('relationship')
        sentence_list = get_sentences_match_with_entities_binary(
            plural_member1, plural_member2, relationship)
        sentence_set = list(set(sentence_list))

        member1_primary_key = find_primary_key(plural_member1)
        member2_primary_key = find_primary_key(plural_member2)
        singular_member1 = lemmatizer.lemmatize(plural_member1)
        singular_member2 = lemmatizer.lemmatize(plural_member2)

        cardinality1 = classify(plural_member1, sentence_set, relationship)
        if cardinality1 is None:
            continue
        cardinality2 = classify(plural_member2, sentence_set, relationship)
        if cardinality2 is None:
            continue

        if cardinality1 == "many" and cardinality2 == "many":
            relation_type = "many_to_many"
        elif cardinality1 == "one" and cardinality2 == "one":
            relation_type = "one_to_one"
        else:
            relation_type = "one_to_many"

        binary_relation_list.append({
            "@name": relationship,
            "@degree": "binary",
            "@type": relation_type,
            "member1": {
                "@name": singular_member1,
                "@cardinality": cardinality1,
                "@primary_key": member1_primary_key
            },
            "member2": {
                "@name": singular_member2,
                "@cardinality": cardinality2,
                "@primary_key": member2_primary_key
            }
        })
        summary_lists[relation_type].append({
            'member1': plural_member1,
            'member2': plural_member2,
            'relationship': relationship
        })

    print("rel", binary_relation_list)
    return binary_relation_list