def extract():
    """Render entity highlights for text submitted through a POST form.

    Reads ``rawtext`` from the submitted form, makes sure the flower and
    animal entity rulers are registered on ``nlp``, runs the pipeline over
    the text and embeds the displaCy entity markup in ``result.html``.

    Returns:
        The rendered ``result.html`` template for a POST request. For any
        other method the function falls through and returns ``None``
        (unchanged from the previous behaviour).
    """
    if request.method == 'POST':
        raw_text = request.form['rawtext']

        # (pipe name, entity label, phrases) for each custom ruler, in the
        # order they are appended to the pipeline.
        ruler_specs = (
            ('rulerPlants', 'flower', ["rose", "tulip", "african daisy"]),
            # NOTE(review): "artic fox" looks like a typo for "arctic fox";
            # left untouched because it changes which text is matched.
            ('rulerAnimals', 'animal', ["cat", "dog", "artic fox"]),
        )
        for pipe_name, label, phrases in ruler_specs:
            # ``nlp`` is not created in this function, so a ruler added by an
            # earlier call is still registered; adding a second component
            # with the same name raises ValueError.
            if pipe_name in nlp.pipe_names:
                continue
            ruler = EntityRuler(nlp, overwrite_ents=True)
            ruler.add_patterns(
                [{"label": label, "pattern": phrase} for phrase in phrases]
            )
            ruler.name = pipe_name
            nlp.add_pipe(ruler)

        # Run the pipeline and turn the entity markup into the page body.
        docx = nlp(raw_text)
        html = displacy.render(docx, style="ent").replace("\n\n", "\n")
        result = HTML_WRAPPER.format(html)
        return render_template('result.html', rawtext=raw_text, result=result)
def combine_entities_ruler(nlp):
    """Create the EntityRuler that fuses neighbouring location entities.

    The ruler carries four token patterns that match runs of tokens already
    tagged ``GEN_LOC``, ``SPEC_LOC`` or ``LOC_TYPE`` and relabel the whole
    run as one entity.

    Inputs:
        nlp: an nlp object

    Returns:
        combine_ruler: a spaCy EntityRuler object named ``'combine_ruler'``
    """
    def any_location():
        # Fresh token spec on every call: a token tagged as either kind of
        # location entity.
        return {'ENT_TYPE': {'IN': ['GEN_LOC', 'SPEC_LOC']}}

    patterns = [
        # [direction] [of] [the] <location> near <location>
        {
            'label': 'SPEC_LOC',
            'pattern': [
                {'LOWER': {'IN': DIRECTIONS}, 'OP': '?'},
                {'LOWER': 'of', 'OP': '?'},
                {'LOWER': 'the', 'OP': '?'},
                any_location(),
                {'LOWER': 'near'},
                any_location(),
            ],
        },
        # <location> between <location> and <location>
        {
            'label': 'SPEC_LOC',
            'pattern': [
                any_location(),
                {'LOWER': 'between'},
                any_location(),
                {'LOWER': 'and'},
                any_location(),
            ],
        },
        # a GEN_LOC token immediately followed by a SPEC_LOC token
        {
            'label': 'SPEC_LOC',
            'pattern': [{'ENT_TYPE': 'GEN_LOC'}, {'ENT_TYPE': 'SPEC_LOC'}],
        },
        # optional "isolated"/"scattered" followed by one or more LOC_TYPE tokens
        {
            'label': 'LOC_TYPE',
            'pattern': [
                {'LOWER': {'IN': ['isolated', 'scattered']}, 'OP': '?'},
                {'ENT_TYPE': 'LOC_TYPE', 'OP': '+'},
            ],
        },
    ]

    combine_ruler = EntityRuler(nlp, validate=True, overwrite_ents=True)
    combine_ruler.add_patterns(patterns)
    combine_ruler.name = 'combine_ruler'
    return combine_ruler
def preprocess(self):
    """Clean ``self.text`` and (re)register the DEFINITION entity ruler.

    The text is passed through ``clean_abbrev``, ``clean_text`` and
    ``clean_empty_lines`` in that order. A ruler named ``'definition'`` is
    then added to ``self.nlp``; if adding it raises ``ValueError`` the
    existing component of that name is removed and the new one added.
    """
    # Cleaning steps, applied in sequence. Per the original notes they
    # remove dots in abbreviations, special characters and empty lines.
    for step_name in ('clean_abbrev', 'clean_text', 'clean_empty_lines'):
        self.text = getattr(self, step_name)(self.text)

    # A single alphabetic token wrapped in parentheses, e.g. "(Term)".
    parenthesised_word = [{"ORTH": "("}, {'IS_ALPHA': True}, {"ORTH": ")"}]

    definition_ruler = EntityRuler(self.nlp, overwrite_ents=True)
    definition_ruler.name = 'definition'
    definition_ruler.add_patterns([
        {"label": "DEFINITION", "pattern": parenthesised_word},
        # Disabled candidates kept from the original:
        # {"label": "DEFINITION", "pattern": [{"LOWER": "effective"}, {"LOWER": "date"}]},
        # {"label": "GPE", "pattern": [{"LOWER": "united\n"}, {"LOWER": "states"}]}
    ])

    try:
        self.nlp.add_pipe(definition_ruler)
    except ValueError:
        # Replace the component that is already registered under this name.
        self.nlp.remove_pipe(definition_ruler.name)
        self.nlp.add_pipe(definition_ruler)
def add_ruler(entity_name, entity_arr):
    """Build an EntityRuler that tags every item of ``entity_arr``.

    Args:
        entity_name: Used (via ``str``) both as the entity label and as the
            ruler's component name.
        entity_arr: Iterable of values; each is converted with ``str`` and
            registered as a phrase pattern.

    Returns:
        The configured ruler. It is NOT added to the pipeline here.
    """
    label = str(entity_name)
    ruler = EntityRuler(nlp, overwrite_ents=True)

    for phrase in entity_arr:
        entry = {"label": label, "pattern": str(phrase)}
        ruler.add_patterns([entry])

    ruler.name = label
    print('adding ent ', label)
    return ruler
def alsoknownas(sentence):
    """Extract the alias that follows an "also known as" style phrase.

    Punctuation is stripped from ``sentence``. For every alias phrase found
    in the text, up to four words after it are POS-tagged with NLTK and the
    leading run of ``NN``/``NNP`` tokens becomes the candidate alias. The
    alias is then used as the label of an EntityRuler whose patterns are the
    alias phrases themselves, and the label of the first entity found in the
    text is returned.

    Args:
        sentence: Text to inspect.

    Returns:
        The label of the first entity produced by the pipeline, or ``None``
        when the pipeline yields no entity.
    """
    from spacy.pipeline import EntityRuler

    noun_tags = {"NNP", "NN"}
    aka_patterns = [
        'known as',
        'nicknamed',
        'known mononymously as',
        'known professionally as',
    ]

    nlp = spacy.load('en_core_web_sm', disable=['ner'])
    rulerAlKnAs = EntityRuler(nlp, overwrite_ents=True)
    answer = sentence.translate(str.maketrans('', '', string.punctuation))

    label = ""
    for aka in aka_patterns:
        if aka not in answer:
            continue
        # At most the four words that follow the first occurrence of the
        # phrase. Fewer than four used to raise IndexError; they are now
        # handled with whatever is available.
        words = answer.split(aka, 1)[-1].split()[:4]
        if not words:
            continue
        tagged = nltk.pos_tag(nltk.word_tokenize(" ".join(words)))[:4]

        # Count the leading tokens tagged as nouns; the alias is that many
        # of the leading words. A later phrase only overrides an earlier
        # result when it yields at least one noun, as before.
        noun_run = 0
        for _, tag in tagged:
            if tag not in noun_tags:
                break
            noun_run += 1
        if noun_run:
            label = " ".join(words[:noun_run])

    # The alias phrases are the patterns; the extracted alias is the label,
    # so a match on the phrase reports the alias.
    rulerAlKnAs.add_patterns(
        [{"label": label, "pattern": aka} for aka in aka_patterns]
    )
    rulerAlKnAs.name = 'rulerAlKnAs'
    nlp.add_pipe(rulerAlKnAs)

    doc = nlp(answer)
    return next((ent.label_ for ent in doc.ents), None)
def ruler_model(df):
    """Print every ``LOC`` entity found in the ``text`` column of ``df``.

    A ruler named ``'rulerLocations'`` holding a fixed list of place names
    is registered on ``nlp`` (once), then each text is run through the
    pipeline and ``[text, label]`` is printed for every ``LOC`` entity.

    Args:
        df: Object whose ``df['text']`` is an iterable of strings.
    """
    locations = [
        'Mount Kenya', 'Maasai Mara', 'Nyeri', 'Lake Naivasha', 'Mombasa',
        'Nairobi', 'Lake Victoria', 'Mount Elgon', 'Nakuru', 'Kiambu',
        'Olduvai Gorge', 'Zanzibar',
    ]

    # ``nlp`` is not created here, so a previous call may already have added
    # the ruler; adding the same component name twice raises ValueError.
    if 'rulerLocations' not in nlp.pipe_names:
        # The keyword was previously misspelled ("overrite_ents"), which is
        # not the option the ruler reads, so overwriting was never enabled.
        rulerLocations = EntityRuler(nlp, overwrite_ents=True)
        rulerLocations.add_patterns(
            [{'label': 'LOC', 'pattern': place} for place in locations]
        )
        rulerLocations.name = 'rulerLocations'
        nlp.add_pipe(rulerLocations)

    for doc in nlp.pipe(iter(df['text'])):
        for ent in doc.ents:
            if ent.label_ == 'LOC':
                print([ent.text, ent.label_])
def addAddressEntity(self, text):
    """Register one ``ADDRESS`` ruler per address returned for ``text``.

    Each address from ``self.getAddressList(text)`` gets its own ruler named
    ``addr_1``, ``addr_2``, ... in list order. If adding a ruler raises
    ``ValueError``, the component already registered under that name is
    removed and the new ruler is added in its place.

    Args:
        text: Passed unchanged to ``self.getAddressList``.
    """
    for position, address in enumerate(self.getAddressList(text), start=1):
        address_ruler = EntityRuler(self.nlp, overwrite_ents=True)
        address_ruler.name = "addr_{}".format(position)
        address_ruler.add_patterns([{"label": "ADDRESS", "pattern": address}])

        try:
            self.nlp.add_pipe(address_ruler)
        except ValueError:
            # Swap out the ruler left under the same name.
            self.nlp.remove_pipe(address_ruler.name)
            self.nlp.add_pipe(address_ruler)
def nameofperson(sentence):
    """Return the leading part of ``sentence`` as matched by an entity ruler.

    The text before the first ``(`` and then before the first ``,`` is used
    both as the phrase pattern and as its label. The pipeline is run on that
    same text and the label of the first entity is returned.

    Args:
        sentence: Text whose leading part is taken as the name.

    Returns:
        The label of the first entity, or ``None`` if there is no entity.
    """
    from spacy.pipeline import EntityRuler

    pipeline = spacy.load('en_core_web_sm', disable=['ner'])
    name_ruler = EntityRuler(pipeline, overwrite_ents=True)

    # Cut at the first "(" and then at the first ",".
    name_text = sentence.split('(')[0].split(',')[0]

    name_ruler.add_patterns([{"label": name_text, "pattern": name_text}])
    name_ruler.name = 'rulername'
    pipeline.add_pipe(name_ruler)

    for entity in pipeline(name_text).ents:
        return entity.label_
    return None
def birthdate(sentence):
    """Extract a three-word date from the first parenthesised part of a text.

    The text after the first ``(`` is inspected. Depending on which markers
    it contains, three consecutive words from inside the parentheses are
    joined into a date string. That string becomes the label of an
    EntityRuler whose patterns are the markers themselves, and the label of
    the first entity found is returned.

    Args:
        sentence: Text expected to contain something like
            ``"Name (born 1 January 1900) ..."``.

    Returns:
        The label of the first entity found, or ``None`` when the text has
        no ``(``, contains none of the markers, has too few words to form a
        date, or the pipeline yields no entity. (The first three cases used
        to raise ``IndexError`` / ``UnboundLocalError``.)
    """
    from spacy.pipeline import EntityRuler

    date_patterns = ['born', '–', '–']

    pieces = sentence.split('(')
    if len(pieces) < 2:
        return None
    answer = pieces[1]
    words = answer.split(')')[0].split()

    # The three checks run in their original order, so a later match
    # overrides an earlier one.
    # NOTE(review): the first and third checks appear to test the same
    # character, which makes the third always override the first; two
    # different dash characters may have been intended. Confirm.
    date_words = None
    if '–' in answer:
        date_words = words[0:3]
    if 'born' in answer:
        date_words = words[1:4]
    if '–' in answer:
        date_words = words[1:4]

    if date_words is None or len(date_words) < 3:
        return None
    label = " ".join(date_words)

    # Load the model only once there is a date to report.
    nlp = spacy.load('en_core_web_sm', disable=['ner'])
    rulerdate = EntityRuler(nlp, overwrite_ents=True)
    # The markers are the patterns; the date is the label they report.
    for marker in date_patterns:
        rulerdate.add_patterns([{"label": label, "pattern": marker}])
    rulerdate.name = 'rulerdate'
    nlp.add_pipe(rulerdate)

    doc = nlp(answer)
    return next((ent.label_ for ent in doc.ents), None)
def _contract_entity_patterns():
    """Return the contract phrase patterns in their registration order."""
    phrases_by_label = (
        ("TITLE", [
            "Agreement on contract1", "Agreement on contract2",
            "Agreement on contract3", "Agreement on contract4",
            "Agreement on contract5", "Agreement on contract6",
            "Agreement on contract7", "Agreement on contract8",
            "Agreement on contract9",
        ]),
        ("SUPPLIER", [
            "FACEBOOK", "Facebook", "FACEBOOK GLOBAL INVEST AG",
            "Facebook Global Invest AG",
            "BIRD", "BIRDS Deutschland GmbH", "Birds Deutschland GmbH",
            "Google GmbH", "GOOGLE",
            "EBAY Deutschland AG", "EBAY", "ebay",
            "AMAZON SERVICES GMBH", "Amazon Services GmbH",
            "AMAZON LIMITED", "Amazon Limited",
        ]),
        ("CLIENT", [
            "BOL.com", "bol.com", "BOL.COM BUSINESS SERVICES GMBH",
            "BOL.com Business Services GmbH",
        ]),
        ("Effective-DATES", [
            "29 September 2018", "01 January 2015", "01.07.2018",
            " August 2017",
        ]),
        ("Signature-DATES", ["31. July 2018"]),
        ("Termination-DATES", ["termination after a period of 48 months"]),
        ("Commencement-DATES", ["31.01.2017", "31.03.2019", "1 October 2018"]),
        ("END-DATES", ["31.12.2018", "Apr 11th 2023", "19.01.2020"]),
        ("COUNTRIES", [
            "UK", "Germany", "France", "Italy", "Netherlands", "Russia",
            "Hungary", "India",
            "Slovakia", "Czech", "Australia", "Vietnam", "Japan",
            "Philippines", "Romania",
        ]),
    )
    return [
        {"label": label, "pattern": phrase}
        for label, phrases in phrases_by_label
        for phrase in phrases
    ]


def extract_json(raw_text):
    """Score entities in ``raw_text`` and return the kept ones as JSON.

    Makes sure the ``'rulerAll'`` contract ruler is on the pipeline, runs
    the pipeline with ``ner`` disabled, beam-parses the document with the
    entity recogniser and sums the beam scores per ``(start, end, label)``.
    Entities scoring above 0.2 whose label is in a fixed set are kept, one
    row per label (first occurrence), numbered from 1.

    Args:
        raw_text: Text to analyse.

    Returns:
        A JSON string (``orient='index'``) whose rows have the columns
        ``ENT_DETECT`` (entity label), ``ENT_LABEL`` (entity text) and
        ``CONFIDENCE`` (summed beam score).
    """
    # Build the ruler only when it is missing. The previous code constructed
    # it twice on every call and discarded the first instance.
    if 'rulerAll' not in nlp.pipe_names:
        rulerAll = EntityRuler(nlp, overwrite_ents=True)
        rulerAll.add_patterns(_contract_entity_patterns())
        rulerAll.name = 'rulerAll'
        nlp.add_pipe(rulerAll)

    with nlp.disable_pipes('ner'):
        doc = nlp(raw_text)

    threshold = 0.2
    beams = nlp.entity.beam_parse([doc], beam_width=3, beam_density=0.0001)

    # Accumulate the score of each candidate entity across all beam parses.
    entity_scores = defaultdict(float)
    for beam in beams:
        for score, ents in nlp.entity.moves.get_beam_parses(beam):
            for start, end, label in ents:
                entity_scores[(start, end, label)] += score

    # Column names are part of the output format: ENT_DETECT carries the
    # label and ENT_LABEL the matched text.
    rows = [
        (label, str(doc[start:end]), score)
        for (start, end, label), score in entity_scores.items()
        if score > threshold
    ]
    df_ent_score = pd.DataFrame(
        rows, columns=['ENT_DETECT', 'ENT_LABEL', 'CONFIDENCE']
    )

    wanted_labels = [
        "TITLE", "CLIENT", "SUPPLIER", "COUNTRIES",
        "Effective-DATES", "Signature-DATES", "Termination-DATES",
        "Commencement-DATES", "END-DATES",
        "CLIENT_CONTRACT_MANAGER", "SUPPLIER_CONTRACT_MANAGER", "DATE",
    ]
    df_custom_ent = df_ent_score[df_ent_score.ENT_DETECT.isin(wanted_labels)]

    # Keep the first row per label and number the rows from 1.
    df_ent_dup = df_custom_ent.copy()
    df_ent_dup = df_ent_dup.drop_duplicates(subset=["ENT_DETECT"])
    df_ent_dup = df_ent_dup.reset_index(drop=True)
    df_ent_dup.index = df_ent_dup.index + 1

    json_table = df_ent_dup.to_json(orient='index')
    return json_table
datesOrdered = dates == sorted(dates, reverse=True) else: noDates = True # Dates Present if dtFormat == True: dateScore += 3 if datesOrdered == True: dateScore += 1 if noDates == False: dateScore += 6 skill_ruler = EntityRuler( nlp, overwrite_ents=True).from_disk('./skill_patterns.jsonl') verb_ruler = EntityRuler( nlp, overwrite_ents=True).from_disk('./actionVerbs.jsonl') coms_ruler = EntityRuler(nlp, overwrite_ents=True).from_disk('./coms.jsonl') skill_ruler.name = 'skillRuler' verb_ruler.name = 'verbRuler' coms_ruler.name = 'comsRuler' nlp.add_pipe(skill_ruler, after='parser') skillset_dict = create_skillset_dict([nlp(resume_text)]) nlp.disable_pipes('skillRuler') nlp.add_pipe(verb_ruler, after='parser') wordset_dict_verb = create_skillset_dict_verb([nlp(resume_text)]) wordset_dict_sumr = create_skillset_dict_verb( [nlp(' '.join(resume_raw_text.splitlines()[:16]))]) nlp.disable_pipes('verbRuler') nlp.add_pipe(coms_ruler, after='parser') wordset_dict_coms = create_skillset_dict_verb([nlp(resume_text)]) # vacature_skillset = create_skill_set(nlp(resume_text)) repetition = [] singles = []
def main(): """Summary AND NER App""" st.title("LEGAL TECH") activities = ['Extract MetaData From TEXT', "Extract Metadata From Text File", "Extract MetaData from .docx File", "Extract MetaData from .pdf File", "Find key entities in document"] choice = st.sidebar.selectbox("Select Activity", activities) if choice == 'Extract MetaData From TEXT': st.subheader("Extract MetaData from Legal Documents") raw_text = st.text_area("Enter Text Here", "Type Here") # Read text as spacy token instant def analyze_text(text): return nlp(text) # Add all CUSTOM patterns in respective entity # PATTERN FOR TITLES # Applying NLP ideas title1 = ["Agreement on contract1"] title2 = ["Agreement on contract2"] title3 = ["Agreement on contract3"] title4 = ["Agreement on contract4"] title5 = ["Agreement on contract5"] title6 = ["Agreement on contract6"] title7 = ["Agreement on contract7"] title8 = ["Agreement on contract8"] title9 = ["Agreement on contract9"] suppliers1 = ["FACEBOOK", "Facebook", "FACEBOOK GLOBAL INVEST AG", "Facebook Global Invest AG"] suppliers2 = ["BIRD", "BIRDS Deutschland GmbH", "Birds Deutschland GmbH"] suppliers3 = ["Google GmbH", "GOOGLE"] suppliers4 = ["EBAY Deutschland AG", "EBAY", "ebay"] suppliers5 = ["AMAZON SERVICES GMBH", "Amazon Services GmbH", "AMAZON LIMITED", "Amazon Limited"] clients = ["BOL.com", "bol.com", "BOL.COM BUSINESS SERVICES GMBH", "BOL.com Business Services GmbH"] dates1 = ["29 September 2018", "01 January 2015", "01.07.2018", " August 2017"] dates2 = ["31. 
July 2018"] dates3 = ["termination after a period of 48 months"] dates4 = ["31.01.2017", "31.03.2019", "1 October 2018"] dates5 = ["31.12.2018", "Apr 11th 2023", "19.01.2020"] countries1 = ["UK", "Germany", "France", "Italy", "Netherlands", "Russia", "Hungary", "India"] countries2 = ["Slovakia", "Czech", "Australia", "Vietnam", "Japan", "Philippines", "Romania"] # Define rulerAll for all entities rulerAll = EntityRuler(nlp, overwrite_ents=True) # Add all patterns in respective entity for tit1 in title1: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit1}]) for tit2 in title2: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit2}]) for tit3 in title3: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit3}]) for tit4 in title4: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit4}]) for tit5 in title5: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit5}]) for tit6 in title6: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit6}]) for tit7 in title7: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit7}]) for tit8 in title8: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit8}]) for tit9 in title9: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit9}]) # for supplier for s1 in suppliers1: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s1}]) for s2 in suppliers2: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s2}]) for s3 in suppliers3: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s3}]) for s4 in suppliers4: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s4}]) for s5 in suppliers5: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s5}]) # for clients for c1 in clients: rulerAll.add_patterns([{"label": "CLIENT", "pattern": c1}]) # Pattern for DATES for t1 in dates1: rulerAll.add_patterns([{"label": "Effective-DATES", "pattern": t1}]) for t2 in dates2: rulerAll.add_patterns([{"label": "Signature-DATES", "pattern": t2}]) for t3 in dates3: rulerAll.add_patterns([{"label": 
"Termination-DATES", "pattern": t3}]) for t4 in dates4: rulerAll.add_patterns([{"label": "Commencement-DATES", "pattern": t4}]) for t5 in dates5: rulerAll.add_patterns([{"label": "END-DATES", "pattern": t5}]) # for countries for count1 in countries1: rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count1}]) for count2 in countries2: rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count2}]) # Define Ruler for All rulerAll.name = 'rulerAll' nlp.add_pipe(rulerAll) # Getting text input if st.button("Extract"): docx2 = analyze_text(raw_text) html = displacy.render(docx2, style="ent") html = html.replace("\n\n", "\n") # st.write(html, unsafe_allow_html=True) st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True) # FOR SECOND CHALLENGE....reading from file if choice == 'Extract Metadata From Text File': st.subheader("Extract MetaData from Given Text File") def file_selector(folder_path='.'): filenames = os.listdir(folder_path) selected_filename = st.selectbox('Please only Select a Text File', filenames) return os.path.join(folder_path, selected_filename) filename = file_selector() f = open(filename) st.write('You have selected `%s`' % filename) raw=f.read() raw_text2 = st.text_area("your file contains following text", raw) # for storing in raw text # DEFINE ANALYISIS FUNCTION def analyze_text(text): return nlp(text) # Applying NLP ideas title1 = ["Agreement on contract1"] title2 = ["Agreement on contract2"] title3 = ["Agreement on contract3"] title4 = ["Agreement on contract4"] title5 = ["Agreement on contract5"] title6 = ["Agreement on contract6"] title7 = ["Agreement on contract7"] title8 = ["Agreement on contract8"] title9 = ["Agreement on contract9"] suppliers1 = ["FACEBOOK", "Facebook", "FACEBOOK GLOBAL INVEST AG", "Facebook Global Invest AG"] suppliers2 = ["BIRD", "BIRDS Deutschland GmbH", "Birds Deutschland GmbH"] suppliers3 = ["Google GmbH", "GOOGLE"] suppliers4 = ["EBAY Deutschland AG", "EBAY", "ebay"] suppliers5 = ["AMAZON SERVICES 
GMBH", "Amazon Services GmbH", "AMAZON LIMITED", "Amazon Limited"] clients = ["BOL.com", "bol.com", "BOL.COM BUSINESS SERVICES GMBH", "BOL.com Business Services GmbH"] dates1 = ["29 September 2018", "01 January 2015", "01.07.2018", " August 2017"] dates2 = ["31. July 2018"] dates3 = ["termination after a period of 48 months"] dates4 = ["31.01.2017", "31.03.2019", "1 October 2018"] dates5 = ["31.12.2018", "Apr 11th 2023", "19.01.2020"] countries1 = ["UK", "Germany", "France", "Italy", "Netherlands", "Russia", "Hungary", "India"] countries2 = ["Slovakia", "Czech", "Australia", "Vietnam", "Japan", "Philippines", "Romania"] # Define rulerAll for all entities rulerAll = EntityRuler(nlp, overwrite_ents=True) # Add all patterns in respective entity for tit1 in title1: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit1}]) for tit2 in title2: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit2}]) for tit3 in title3: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit3}]) for tit4 in title4: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit4}]) for tit5 in title5: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit5}]) for tit6 in title6: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit6}]) for tit7 in title7: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit7}]) for tit8 in title8: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit8}]) for tit9 in title9: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit9}]) # for supplier for s1 in suppliers1: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s1}]) for s2 in suppliers2: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s2}]) for s3 in suppliers3: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s3}]) for s4 in suppliers4: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s4}]) for s5 in suppliers5: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s5}]) # for clients for c1 in clients: rulerAll.add_patterns([{"label": "CLIENT", 
"pattern": c1}]) # Pattern for DATES for t1 in dates1: rulerAll.add_patterns([{"label": "Effective-DATES", "pattern": t1}]) for t2 in dates2: rulerAll.add_patterns([{"label": "Signature-DATES", "pattern": t2}]) for t3 in dates3: rulerAll.add_patterns([{"label": "Termination-DATES", "pattern": t3}]) for t4 in dates4: rulerAll.add_patterns([{"label": "Commencement-DATES", "pattern": t4}]) for t5 in dates5: rulerAll.add_patterns([{"label": "END-DATES", "pattern": t5}]) # for countries for count1 in countries1: rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count1}]) for count2 in countries2: rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count2}]) # Define Ruler for All rulerAll.name = 'rulerAll' nlp.add_pipe(rulerAll) # analysis from loaded file if st.button("Extract"): document = analyze_text(raw_text2) html = displacy.render(document, style="ent") html = html.replace("\n\n", "\n") # st.write(html, unsafe_allow_html=True) st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True) # FOR THIRD CHALLENGE....reading from file if choice == 'Extract MetaData from .docx File': st.subheader("Extract MetaData from .docx File") def file_selector(folder_path='.'): filenames = os.listdir(folder_path) selected_filename = st.selectbox('Please only Select a .docx File', filenames) return os.path.join(folder_path, selected_filename) filename = file_selector() f = open(filename) st.write('You have selected `%s`' % filename) docx_file = docx2txt.process(filename) #raw = docx_file.read() raw_text3 = st.text_area("your file contains following text", docx_file) # for storing in raw text # DEFINE ANALYISIS FUNCTION def analyze_text(text): return nlp(text) # Applying NLP ideas title1 = ["Agreement on contract1"] title2 = ["Agreement on contract2"] title3 = ["Agreement on contract3"] title4 = ["Agreement on contract4"] title5 = ["Agreement on contract5"] title6 = ["Agreement on contract6"] title7 = ["Agreement on contract7"] title8 = ["Agreement on contract8"] 
title9 = ["Agreement on contract9"] suppliers1 = ["FACEBOOK", "Facebook", "FACEBOOK GLOBAL INVEST AG", "Facebook Global Invest AG"] suppliers2 = ["BIRD", "BIRDS Deutschland GmbH", "Birds Deutschland GmbH"] suppliers3 = ["Google GmbH", "GOOGLE"] suppliers4 = ["EBAY Deutschland AG", "EBAY", "ebay"] suppliers5 = ["AMAZON SERVICES GMBH", "Amazon Services GmbH", "AMAZON LIMITED", "Amazon Limited"] clients = ["BOL.com", "bol.com", "BOL.COM BUSINESS SERVICES GMBH", "BOL.com Business Services GmbH"] dates1 = ["29 September 2018", "01 January 2015", "01.07.2018", " August 2017"] dates2 = ["31. July 2018"] dates3 = ["termination after a period of 48 months"] dates4 = ["31.01.2017", "31.03.2019", "1 October 2018"] dates5 = ["31.12.2018", "Apr 11th 2023", "19.01.2020"] countries1 = ["UK", "Germany", "France", "Italy", "Netherlands", "Russia", "Hungary", "India"] countries2 = ["Slovakia", "Czech", "Australia", "Vietnam", "Japan", "Philippines", "Romania"] # Define rulerAll for all entities rulerAll = EntityRuler(nlp, overwrite_ents=True) # Add all patterns in respective entity for tit1 in title1: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit1}]) for tit2 in title2: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit2}]) for tit3 in title3: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit3}]) for tit4 in title4: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit4}]) for tit5 in title5: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit5}]) for tit6 in title6: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit6}]) for tit7 in title7: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit7}]) for tit8 in title8: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit8}]) for tit9 in title9: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit9}]) # for supplier for s1 in suppliers1: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s1}]) for s2 in suppliers2: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s2}]) for s3 
in suppliers3: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s3}]) for s4 in suppliers4: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s4}]) for s5 in suppliers5: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s5}]) # for clients for c1 in clients: rulerAll.add_patterns([{"label": "CLIENT", "pattern": c1}]) # Pattern for DATES for t1 in dates1: rulerAll.add_patterns([{"label": "Effective-DATES", "pattern": t1}]) for t2 in dates2: rulerAll.add_patterns([{"label": "Signature-DATES", "pattern": t2}]) for t3 in dates3: rulerAll.add_patterns([{"label": "Termination-DATES", "pattern": t3}]) for t4 in dates4: rulerAll.add_patterns([{"label": "Commencement-DATES", "pattern": t4}]) for t5 in dates5: rulerAll.add_patterns([{"label": "END-DATES", "pattern": t5}]) # for countries for count1 in countries1: rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count1}]) for count2 in countries2: rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count2}]) # Define Ruler for All rulerAll.name = 'rulerAll' nlp.add_pipe(rulerAll) # analysis from loaded file if st.button("Extract"): document3 = analyze_text(raw_text3) html = displacy.render(document3, style="ent") html = html.replace("\n\n", "\n") st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True) # FOR FOURTH CHALLENGE....reading from PDF file if choice == 'Extract MetaData from .pdf File': st.subheader("Extract MetaData from .pdf File") def file_selector(folder_path='.'): filenames = os.listdir(folder_path) selected_filename = st.selectbox('Please only Select a .docx File', filenames) return os.path.join(folder_path, selected_filename) filename = file_selector() f = open(filename) st.write('You have selected `%s`' % filename) with open(filename, "rb") as pdf: pdf_file = pdftotext.PDF(pdf) # converting pdftotext.PDF to string type pdf_text = ("\n\n".join(pdf_file)) raw_text4 = st.text_area("your file contains following text", pdf_text) # for storing in raw text # DEFINE ANALYISIS 
FUNCTION def analyze_text(text): return nlp(text) # Applying NLP ideas title1 = ["Agreement on contract1"] title2 = ["Agreement on contract2"] title3 = ["Agreement on contract3"] title4 = ["Agreement on contract4"] title5 = ["Agreement on contract5"] title6 = ["Agreement on contract6"] title7 = ["Agreement on contract7"] title8 = ["Agreement on contract8"] title9 = ["Agreement on contract9"] suppliers1 = ["FACEBOOK", "Facebook", "FACEBOOK GLOBAL INVEST AG", "Facebook Global Invest AG"] suppliers2 = ["BIRD", "BIRDS Deutschland GmbH", "Birds Deutschland GmbH"] suppliers3 = ["Google GmbH", "GOOGLE"] suppliers4 = ["EBAY Deutschland AG", "EBAY", "ebay"] suppliers5 = ["AMAZON SERVICES GMBH", "Amazon Services GmbH", "AMAZON LIMITED", "Amazon Limited"] clients = ["BOL.com", "bol.com", "BOL.COM BUSINESS SERVICES GMBH", "BOL.com Business Services GmbH"] dates1 = ["29 September 2018", "01 January 2015", "01.07.2018", " August 2017"] dates2 = ["31. July 2018"] dates3 = ["termination after a period of 48 months"] dates4 = ["31.01.2017", "31.03.2019", "1 October 2018"] dates5 = ["31.12.2018", "Apr 11th 2023", "19.01.2020"] countries1 = ["UK", "Germany", "France", "Italy", "Netherlands", "Russia", "Hungary", "India"] countries2 = ["Slovakia", "Czech", "Australia", "Vietnam", "Japan", "Philippines", "Romania"] # Define rulerAll for all entities rulerAll = EntityRuler(nlp, overwrite_ents=True) # Add all patterns in respective entity for tit1 in title1: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit1}]) for tit2 in title2: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit2}]) for tit3 in title3: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit3}]) for tit4 in title4: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit4}]) for tit5 in title5: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit5}]) for tit6 in title6: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit6}]) for tit7 in title7: rulerAll.add_patterns([{"label": "TITLE", "pattern": 
tit7}]) for tit8 in title8: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit8}]) for tit9 in title9: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit9}]) # for supplier for s1 in suppliers1: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s1}]) for s2 in suppliers2: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s2}]) for s3 in suppliers3: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s3}]) for s4 in suppliers4: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s4}]) for s5 in suppliers5: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s5}]) # for clients for c1 in clients: rulerAll.add_patterns([{"label": "CLIENT", "pattern": c1}]) # Pattern for DATES for t1 in dates1: rulerAll.add_patterns([{"label": "Effective-DATES", "pattern": t1}]) for t2 in dates2: rulerAll.add_patterns([{"label": "Signature-DATES", "pattern": t2}]) for t3 in dates3: rulerAll.add_patterns([{"label": "Termination-DATES", "pattern": t3}]) for t4 in dates4: rulerAll.add_patterns([{"label": "Commencement-DATES", "pattern": t4}]) for t5 in dates5: rulerAll.add_patterns([{"label": "END-DATES", "pattern": t5}]) # for countries for count1 in countries1: rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count1}]) for count2 in countries2: rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count2}]) # Define Ruler for All rulerAll.name = 'rulerAll' nlp.add_pipe(rulerAll) # analysis from loaded file if st.button("Extract"): document4 = analyze_text(raw_text4) html = displacy.render(document4, style="ent") html = html.replace("\n\n", "\n") st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True) # FOR FIFTH CHALLENGE....reading from file if choice == 'Find key entities in document': st.subheader("Extract MetaData from Given Text File") def file_selector(folder_path='.'): filenames = os.listdir(folder_path) selected_filename = st.selectbox('Please only Select a Text File', filenames) return os.path.join(folder_path, 
selected_filename) filename = file_selector() f = open(filename) st.write('You have selected `%s`' % filename) raw = f.read() raw_text5 = st.text_area("your file contains following text", raw) # for storing in raw text # DEFINE ANALYISIS FUNCTION def analyze_text(text): return nlp(text) # Applying NLP ideas title1 = ["Agreement on contract1"] title2 = ["Agreement on contract2"] title3 = ["Agreement on contract3"] title4 = ["Agreement on contract4"] title5 = ["Agreement on contract5"] title6 = ["Agreement on contract6"] title7 = ["Agreement on contract7"] title8 = ["Agreement on contract8"] title9 = ["Agreement on contract9"] suppliers1 = ["FACEBOOK", "Facebook", "FACEBOOK GLOBAL INVEST AG", "Facebook Global Invest AG"] suppliers2 = ["BIRD", "BIRDS Deutschland GmbH", "Birds Deutschland GmbH"] suppliers3 = ["Google GmbH", "GOOGLE"] suppliers4 = ["EBAY Deutschland AG", "EBAY", "ebay"] suppliers5 = ["AMAZON SERVICES GMBH", "Amazon Services GmbH", "AMAZON LIMITED", "Amazon Limited"] clients = ["BOL.com", "bol.com", "BOL.COM BUSINESS SERVICES GMBH", "BOL.com Business Services GmbH"] dates1 = ["29 September 2018", "01 January 2015", "01.07.2018", " August 2017"] dates2 = ["31. 
July 2018"] dates3 = ["termination after a period of 48 months"] dates4 = ["31.01.2017", "31.03.2019", "1 October 2018"] dates5 = ["31.12.2018", "Apr 11th 2023", "19.01.2020"] countries1 = ["UK", "Germany", "France", "Italy", "Netherlands", "Russia", "Hungary", "India"] countries2 = ["Slovakia", "Czech", "Australia", "Vietnam", "Japan", "Philippines", "Romania"] # Define rulerAll for all entities rulerAll = EntityRuler(nlp, overwrite_ents=True) # Add all patterns in respective entity for tit1 in title1: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit1}]) for tit2 in title2: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit2}]) for tit3 in title3: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit3}]) for tit4 in title4: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit4}]) for tit5 in title5: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit5}]) for tit6 in title6: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit6}]) for tit7 in title7: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit7}]) for tit8 in title8: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit8}]) for tit9 in title9: rulerAll.add_patterns([{"label": "TITLE", "pattern": tit9}]) # for supplier for s1 in suppliers1: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s1}]) for s2 in suppliers2: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s2}]) for s3 in suppliers3: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s3}]) for s4 in suppliers4: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s4}]) for s5 in suppliers5: rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s5}]) # for clients for c1 in clients: rulerAll.add_patterns([{"label": "CLIENT", "pattern": c1}]) # Pattern for DATES for t1 in dates1: rulerAll.add_patterns([{"label": "Effective-DATES", "pattern": t1}]) for t2 in dates2: rulerAll.add_patterns([{"label": "Signature-DATES", "pattern": t2}]) for t3 in dates3: rulerAll.add_patterns([{"label": 
"Termination-DATES", "pattern": t3}]) for t4 in dates4: rulerAll.add_patterns([{"label": "Commencement-DATES", "pattern": t4}]) for t5 in dates5: rulerAll.add_patterns([{"label": "END-DATES", "pattern": t5}]) # for countries for count1 in countries1: rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count1}]) for count2 in countries2: rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count2}]) # Define Ruler for All rulerAll.name = 'rulerAll' nlp.add_pipe(rulerAll) # analysis from loaded file if st.button("Extract"): doc5 = analyze_text(raw_text5) # Adding a Dataframe df_ent = pd.DataFrame({'TEXT': [], 'ENT_LABEL': [], 'START': [], 'END': []}) df_ent['TEXT'] = [x.text for x in doc5.ents] df_ent['ENT_LABEL'] = [x.label_ for x in doc5.ents] df_ent['START'] = [x.start_char for x in doc5.ents] df_ent['END'] = [x.end_char for x in doc5.ents] # Filering entities to be shown df_table = df_ent[ (df_ent.ENT_LABEL == "TITLE") | (df_ent.ENT_LABEL == "CLIENT") | (df_ent.ENT_LABEL == "SUPPLIER") | (df_ent.ENT_LABEL == "COUNTRIES") | (df_ent.ENT_LABEL == "Effective-DATES") | ( df_ent.ENT_LABEL == "Signature-DATES") | (df_ent.ENT_LABEL == "Termination-DATES") | (df_ent.ENT_LABEL == "Commencement-DATES") | ( df_ent.ENT_LABEL == "END-DATES") | (df_ent.ENT_LABEL == "CLIENT_CONTRACT_MANAGER") | (df_ent.ENT_LABEL == "SUPPLIER_CONTRACT_MANAGER")] df_ent_dup = df_table.copy() df_ent_dup = df_ent_dup.drop_duplicates(subset=["ENT_LABEL"]) # Output shown on app st.write('Key Entities Found in Contract : ', df_ent_dup)
rulerAll.add_patterns([{"label": "Commencement-DATES", "pattern": t4}]) for t5 in dates5: rulerAll.add_patterns([{"label": "END-DATES", "pattern": t5}]) # for countries for count1 in countries1: rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count1}]) for count2 in countries2: rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count2}]) # Define Ruler for All rulerAll.name = 'rulerAll' nlp.add_pipe(rulerAll) HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem">{}</div>""" from flaskext.markdown import Markdown app = Flask(__name__) Markdown(app) # def analyze_text(text): # return nlp(text)
def _contract_patterns():
    """Return the EntityRuler phrase patterns for the contract entity labels.

    Each entry pairs an entity label with the literal phrases that should be
    tagged with it. The result is a flat list of ``{"label", "pattern"}``
    dicts in the form accepted by ``EntityRuler.add_patterns``.
    """
    phrases_by_label = [
        ("TITLE", [
            "Agreement on contract1",
            "Agreement on contract2",
            "Agreement on contract3",
            "Agreement on contract4",
            "Agreement on contract5",
            "Agreement on contract6",
            "Agreement on contract7",
            "Agreement on contract8",
            "Agreement on contract9",
        ]),
        ("SUPPLIER", [
            "FACEBOOK",
            "Facebook",
            "FACEBOOK GLOBAL INVEST AG",
            "Facebook Global Invest AG",
            "BIRD",
            "BIRDS Deutschland GmbH",
            "Birds Deutschland GmbH",
            "Google GmbH",
            "GOOGLE",
            "EBAY Deutschland AG",
            "EBAY",
            "ebay",
            "AMAZON SERVICES GMBH",
            "Amazon Services GmbH",
            "AMAZON LIMITED",
            "Amazon Limited",
        ]),
        ("CLIENT", [
            "BOL.com",
            "bol.com",
            "BOL.COM BUSINESS SERVICES GMBH",
            "BOL.com Business Services GmbH",
        ]),
        ("Effective-DATES", [
            "29 September 2018",
            "01 January 2015",
            "01.07.2018",
            # NOTE(review): the leading space is kept exactly as in the
            # original pattern; it looks like a day number may be missing.
            " August 2017",
        ]),
        ("Signature-DATES", ["31. July 2018"]),
        ("Termination-DATES", ["termination after a period of 48 months"]),
        ("Commencement-DATES", ["31.01.2017", "31.03.2019", "1 October 2018"]),
        ("END-DATES", ["31.12.2018", "Apr 11th 2023", "19.01.2020"]),
        ("COUNTRIES", [
            "UK",
            "Germany",
            "France",
            "Italy",
            "Netherlands",
            "Russia",
            "Hungary",
            "India",
            "Slovakia",
            "Czech",
            "Australia",
            "Vietnam",
            "Japan",
            "Philippines",
            "Romania",
        ]),
    ]

    patterns = []
    for label, phrases in phrases_by_label:
        patterns.extend({"label": label, "pattern": phrase} for phrase in phrases)
    return patterns


def extracts(raw_text):
    """Render the contract entities found in ``raw_text`` as displaCy HTML.

    Loads ``en_core_web_sm`` with its statistical NER disabled, appends an
    ``EntityRuler`` named ``'rulerAll'`` carrying the phrase patterns from
    ``_contract_patterns()``, runs the pipeline on ``raw_text`` and wraps the
    displaCy entity markup in ``HTML_WRAPPER``.

    Args:
        raw_text: The contract text to analyse.

    Returns:
        The ``HTML_WRAPPER`` string formatted with the displaCy "ent"
        rendering of the document.
    """
    # The model's own NER is disabled, so the ruler below is the only
    # component that sets entities on the document.
    nlp = spacy.load('en_core_web_sm', disable=['ner'])

    # Build the ruler once and register all patterns in a single call; the
    # pipeline is freshly loaded, so no existing 'rulerAll' pipe can clash.
    ruler = EntityRuler(nlp, overwrite_ents=True)
    ruler.add_patterns(_contract_patterns())
    ruler.name = 'rulerAll'
    nlp.add_pipe(ruler)

    docx = nlp(raw_text)
    html = displacy.render(docx, style="ent")
    # Collapse the blank lines displaCy emits between entity spans.
    html = html.replace("\n\n", "\n")
    return HTML_WRAPPER.format(html)