def extract():
    """Render entity highlights for text submitted through a POST form.

    Reads the ``rawtext`` form field, runs it through the module-level ``nlp``
    pipeline extended with phrase rules for flowers and animals, and renders
    ``result.html`` with the displaCy entity markup.

    Returns:
        The rendered ``result.html`` template for POST requests. For any other
        request method the function falls through and returns ``None``, as it
        did before.
    """
    if request.method != 'POST':
        return None

    raw_text = request.form['rawtext']

    # (pipe name, entity label, phrases) for every rule-based component.
    # NOTE(review): "artic fox" looks like a typo for "arctic fox" — confirm
    # before changing, since it alters which texts match.
    phrase_rules = (
        ('rulerPlants', 'flower', ["rose", "tulip", "african daisy"]),
        ('rulerAnimals', 'animal', ["cat", "dog", "artic fox"]),
    )
    for pipe_name, label, phrases in phrase_rules:
        # ``nlp`` is not local to this function, so rulers added by an earlier
        # call are still registered; adding the same name again would fail.
        if pipe_name in nlp.pipe_names:
            continue
        ruler = EntityRuler(nlp, overwrite_ents=True)
        ruler.add_patterns(
            [{"label": label, "pattern": phrase} for phrase in phrases])
        ruler.name = pipe_name
        nlp.add_pipe(ruler)

    # Annotate the submitted text and wrap the displaCy markup for the page.
    docx = nlp(raw_text)
    html = displacy.render(docx, style="ent")
    html = html.replace("\n\n", "\n")
    result = HTML_WRAPPER.format(html)

    return render_template('result.html', rawtext=raw_text, result=result)
示例#2
0
def combine_entities_ruler(nlp):
    '''
    Build an EntityRuler whose token patterns merge neighbouring entities
    (e.g. a location "near" another location) into a single entity.
    Inputs:
        nlp: an nlp object
    Returns:
        combine_ruler: a spaCy EntityRuler object named 'combine_ruler'
    '''
    def any_location():
        # Token already tagged as a general or a specific location.
        return {'ENT_TYPE': {'IN': ['GEN_LOC', 'SPEC_LOC']}}

    def optional_word(word):
        # Lower-cased literal that may or may not be present.
        return {'LOWER': word, 'OP': '?'}

    combine_ruler = EntityRuler(nlp, validate=True, overwrite_ents=True)

    # (label, token pattern) pairs, in the order they are registered.
    rules = [
        # "[direction] [of] [the] <place> near <place>"
        ('SPEC_LOC', [{'LOWER': {'IN': DIRECTIONS}, 'OP': '?'},
                      optional_word('of'),
                      optional_word('the'),
                      any_location(),
                      {'LOWER': 'near'},
                      any_location()]),
        # "<place> between <place> and <place>"
        ('SPEC_LOC', [any_location(),
                      {'LOWER': 'between'},
                      any_location(),
                      {'LOWER': 'and'},
                      any_location()]),
        # general location immediately followed by a specific one
        ('SPEC_LOC', [{'ENT_TYPE': 'GEN_LOC'},
                      {'ENT_TYPE': 'SPEC_LOC'}]),
        # "[isolated|scattered] <one or more LOC_TYPE tokens>"
        ('LOC_TYPE', [{'LOWER': {'IN': ['isolated', 'scattered']}, 'OP': '?'},
                      {'ENT_TYPE': 'LOC_TYPE', 'OP': '+'}]),
    ]

    combine_ruler.add_patterns(
        [{'label': label, 'pattern': tokens} for label, tokens in rules])
    combine_ruler.name = 'combine_ruler'

    return combine_ruler
示例#3
0
    def preprocess(self):
        """Clean ``self.text`` and register the DEFINITION ruler on ``self.nlp``."""
        # Cleaning passes, applied in this order: dots in abbreviations,
        # special characters, empty lines.
        for clean_step in (self.clean_abbrev, self.clean_text,
                           self.clean_empty_lines):
            self.text = clean_step(self.text)

        # Definition Entity Recognizer: one alphabetic token wrapped in
        # parentheses, e.g. "(Seller)".
        parenthesised_word = [{"ORTH": "("}, {'IS_ALPHA': True}, {"ORTH": ")"}]
        definition_ruler = EntityRuler(self.nlp, overwrite_ents=True)
        definition_ruler.name = 'definition'
        definition_ruler.add_patterns([
            {"label": "DEFINITION", "pattern": parenthesised_word},
            # Disabled patterns kept from the original for reference:
            # {"label": "DEFINITION", "pattern": [{"LOWER": "effective"}, {"LOWER": "date"}]},
            # {"label": "GPE", "pattern": [{"LOWER": "united\n"}, {"LOWER": "states"}]}
        ])

        try:
            self.nlp.add_pipe(definition_ruler)
        except ValueError:
            # Presumably a pipe with this name is already registered: swap the
            # old one for the freshly built ruler.
            self.nlp.remove_pipe(definition_ruler.name)
            self.nlp.add_pipe(definition_ruler)
示例#4
0
 def add_ruler(entity_name, entity_arr):
     """Build an EntityRuler that labels every item of ``entity_arr``.

     Both the label and each pattern are passed through ``str``; the ruler is
     named after the label and returned without being added to the pipeline.
     """
     label = str(entity_name)
     entity_ruler = EntityRuler(nlp, overwrite_ents=True)
     entity_ruler.add_patterns(
         [{"label": label, "pattern": str(item)} for item in entity_arr])
     entity_ruler.name = label
     print('adding ent ', label)
     return entity_ruler
def alsoknownas(sentence):
    """Return the alias that follows an "also known as"-style phrase.

    Punctuation is stripped from ``sentence``, the (up to four) words after
    the alias phrase are POS-tagged with NLTK, and the leading run of noun
    tags (``NN``/``NNP``) becomes the alias. The alias is then attached as the
    entity label of the alias phrase and read back from the spaCy document.

    Args:
        sentence: Text that may contain "known as", "nicknamed",
            "known mononymously as" or "known professionally as".

    Returns:
        The alias (one to four words), or ``None`` when the sentence contains
        no alias phrase, nothing noun-like follows it, or the ruler finds no
        match in the document. Previously the first two cases raised
        ``IndexError`` or registered an empty label.
    """
    noun_tags = {"NNP", "NN"}
    aka_patterns = ('known as', 'nicknamed', 'known mononymously as',
                    'known professionally as')

    answer = sentence.translate(str.maketrans('', '', string.punctuation))

    # Candidate alias words: at most four words after the phrase. When several
    # phrases occur, the one listed last wins, as before.
    words = []
    for aka in aka_patterns:
        if aka in answer:
            following = answer.split(aka, 1)[-1].split()
            if following:
                words = following[:4]
    if not words:
        return None

    # Count how many of the leading candidate words are tagged as nouns.
    tagged = nltk.pos_tag(nltk.word_tokenize(" ".join(words)))
    noun_run = 0
    for _, tag in tagged[:len(words)]:
        if tag not in noun_tags:
            break
        noun_run += 1
    if noun_run == 0:
        return None
    label = " ".join(words[:noun_run])

    # Load the model only once an alias exists; the loading is the costly part.
    from spacy.pipeline import EntityRuler
    nlp = spacy.load('en_core_web_sm', disable=['ner'])
    rulerAlKnAs = EntityRuler(nlp, overwrite_ents=True)
    rulerAlKnAs.add_patterns(
        [{"label": label, "pattern": aka} for aka in aka_patterns])
    rulerAlKnAs.name = 'rulerAlKnAs'
    nlp.add_pipe(rulerAlKnAs)

    doc = nlp(answer)
    for ent in doc.ents:
        return ent.label_
    return None
示例#6
0
def ruler_model(df):
    """Print every ``LOC`` entity found in the ``text`` column of ``df``.

    A phrase-based EntityRuler for a fixed list of East African place names is
    added to the module-level ``nlp`` pipeline, then each text is processed
    and the ``[text, label]`` pair of every ``LOC`` entity is printed.

    Args:
        df: Mapping/DataFrame whose ``'text'`` entry is an iterable of strings.
    """
    locations = [
        'Mount Kenya', 'Maasai Mara', 'Nyeri', 'Lake Naivasha', 'Mombasa',
        'Nairobi', 'Lake Victoria', 'Mount Elgon', 'Nakuru', 'Kiambu',
        'Olduvai Gorge', 'Zanzibar'
    ]

    # Fix: the keyword was misspelled ``overrite_ents``; the option used by
    # every other ruler in this file is ``overwrite_ents``.
    rulerLocations = EntityRuler(nlp, overwrite_ents=True)
    rulerLocations.add_patterns(
        [{'label': 'LOC', 'pattern': place} for place in locations])

    rulerLocations.name = 'rulerLocations'
    nlp.add_pipe(rulerLocations)

    for doc in nlp.pipe(iter(df['text'])):
        for ent in doc.ents:
            if ent.label_ == 'LOC':
                print([ent.text, ent.label_])
示例#7
0
    def addAddressEntity(self, text):
        """Register one ADDRESS ruler per address returned for ``text``.

        Rulers are named ``addr_1``, ``addr_2``, ... in the order
        ``self.getAddressList(text)`` yields the addresses.
        """
        for position, address in enumerate(self.getAddressList(text), start=1):
            address_ruler = EntityRuler(self.nlp, overwrite_ents=True)
            address_ruler.name = "addr_" + str(position)
            address_ruler.add_patterns(
                [{"label": "ADDRESS", "pattern": address}])

            try:
                self.nlp.add_pipe(address_ruler)
            except ValueError:
                # Presumably the name is already taken by an earlier call:
                # replace that pipe with the new ruler.
                self.nlp.remove_pipe(address_ruler.name)
                self.nlp.add_pipe(address_ruler)
def nameofperson(sentence):
    """Return the leading name of ``sentence`` via a one-pattern EntityRuler.

    The name is whatever precedes the first "(" and then the first ",". It is
    registered both as the pattern and as its own entity label, so the label
    of the first entity found is the name itself; ``None`` is returned when
    the document has no entities.
    """
    from spacy.pipeline import EntityRuler

    pipeline = spacy.load('en_core_web_sm', disable=['ner'])
    name_ruler = EntityRuler(pipeline, overwrite_ents=True)

    # Punctuation is deliberately left in place; only the text after the first
    # "(" or "," is cut off.
    person = sentence.partition('(')[0].partition(',')[0]

    name_ruler.add_patterns([{"label": person, "pattern": person}])
    name_ruler.name = 'rulername'
    pipeline.add_pipe(name_ruler)

    return next((ent.label_ for ent in pipeline(person).ents), None)
def birthdate(sentence):
    """Return the birth date written in the first parenthetical of ``sentence``.

    Two layouts are recognised in the text that follows the first "(":

    * ``(born 15 March 1950)`` -> the three words after the first word
    * ``(15 March 1950 – 3 June 2010)`` -> the first three words

    The date is then attached as the entity label of the marker ("born" or
    "–") and read back from the spaCy document.

    Args:
        sentence: Biography-style sentence, e.g. ``"Jane Doe (born ...) is"``.

    Returns:
        The date as ``"<day> <month> <year>"``, or ``None`` when there is no
        "(", neither marker is present, the parenthetical is too short, or
        the ruler finds no match. Previously these cases raised ``IndexError``
        or ``NameError``.
    """
    date_markers = ('born', '–')

    parts = sentence.split('(')
    if len(parts) < 2:
        return None
    answer = parts[1]

    words = answer.split(')')[0].split()
    # The original repeated the "–" test a second time with the "born"
    # offsets, which overwrote the lifespan result; each layout is now handled
    # exactly once, with "born" taking precedence.
    if 'born' in answer:
        date_words = words[1:4]
    elif '–' in answer:
        date_words = words[0:3]
    else:
        return None
    if len(date_words) < 3:
        return None
    label = " ".join(date_words)

    # Load the model only once a date exists; the loading is the costly part.
    from spacy.pipeline import EntityRuler
    nlp = spacy.load('en_core_web_sm', disable=['ner'])
    rulerdate = EntityRuler(nlp, overwrite_ents=True)
    rulerdate.add_patterns(
        [{"label": label, "pattern": marker} for marker in date_markers])
    rulerdate.name = 'rulerdate'
    nlp.add_pipe(rulerdate)

    doc = nlp(answer)
    for ent in doc.ents:
        return ent.label_
    return None
def _contract_entity_patterns():
    """Return the phrase patterns used by the ``rulerAll`` entity ruler."""
    phrases_by_label = [
        ("TITLE", [
            "Agreement on contract1", "Agreement on contract2",
            "Agreement on contract3", "Agreement on contract4",
            "Agreement on contract5", "Agreement on contract6",
            "Agreement on contract7", "Agreement on contract8",
            "Agreement on contract9",
        ]),
        ("SUPPLIER", [
            "FACEBOOK", "Facebook", "FACEBOOK GLOBAL INVEST AG",
            "Facebook Global Invest AG",
            "BIRD", "BIRDS Deutschland GmbH", "Birds Deutschland GmbH",
            "Google GmbH", "GOOGLE",
            "EBAY Deutschland AG", "EBAY", "ebay",
            "AMAZON SERVICES GMBH", "Amazon Services GmbH", "AMAZON LIMITED",
            "Amazon Limited",
        ]),
        ("CLIENT", [
            "BOL.com", "bol.com", "BOL.COM BUSINESS SERVICES GMBH",
            "BOL.com Business Services GmbH",
        ]),
        ("Effective-DATES", [
            "29 September 2018", "01 January 2015", "01.07.2018",
            " August 2017",
        ]),
        ("Signature-DATES", ["31. July 2018"]),
        ("Termination-DATES", ["termination after a period of 48 months"]),
        ("Commencement-DATES", ["31.01.2017", "31.03.2019", "1 October 2018"]),
        ("END-DATES", ["31.12.2018", "Apr 11th 2023", "19.01.2020"]),
        ("COUNTRIES", [
            "UK", "Germany", "France", "Italy", "Netherlands", "Russia",
            "Hungary", "India",
            "Slovakia", "Czech", "Australia", "Vietnam", "Japan",
            "Philippines", "Romania",
        ]),
    ]
    return [{"label": label, "pattern": phrase}
            for label, phrases in phrases_by_label
            for phrase in phrases]


def extract_json(raw_text):
    """Extract contract metadata from ``raw_text`` as a JSON table.

    The module-level ``nlp`` pipeline gets a phrase-based ``rulerAll``
    EntityRuler (added once). The text is processed with the ``ner`` pipe
    disabled, then ``nlp.entity`` beam-parses the document and the scores of
    every ``(start, end, label)`` parse are summed. Entities scoring above
    0.2 and carrying one of the contract labels are kept, one row per label.

    Args:
        raw_text: Contract text to analyse.

    Returns:
        JSON string (``orient='index'``) keyed by a 1-based row number. Each
        row has ``ENT_DETECT`` (the entity label), ``ENT_LABEL`` (the matched
        text) and ``CONFIDENCE`` (the summed beam score).
    """
    # The previous version built this ruler twice and threw the first copy
    # away; build it only when it is not registered yet.
    if 'rulerAll' not in nlp.pipe_names:
        rulerAll = EntityRuler(nlp, overwrite_ents=True)
        rulerAll.add_patterns(_contract_entity_patterns())
        rulerAll.name = 'rulerAll'
        nlp.add_pipe(rulerAll)

    with nlp.disable_pipes('ner'):
        doc = nlp(raw_text)

    # NOTE(review): ``nlp.entity`` looks like the pipeline's statistical NER
    # component — confirm against the spaCy version in use.
    threshold = 0.2
    beams = nlp.entity.beam_parse([doc], beam_width=3, beam_density=0.0001)
    entity_scores = defaultdict(float)
    for beam in beams:
        for score, ents in nlp.entity.moves.get_beam_parses(beam):
            for start, end, label in ents:
                entity_scores[(start, end, label)] += score

    ent_custom = []
    ent_label = []
    ent_score = []
    for (start, end, label), score in entity_scores.items():
        if score > threshold:
            ent_custom.append(label)
            ent_label.append(str(doc[start:end]))
            ent_score.append(score)

    # Column names are historical: ENT_DETECT holds the label and ENT_LABEL
    # holds the matched text.
    df_ent_score = pd.DataFrame({
        'ENT_DETECT': ent_custom,
        'ENT_LABEL': ent_label,
        'CONFIDENCE': ent_score,
    })

    wanted_labels = [
        "TITLE", "CLIENT", "SUPPLIER", "COUNTRIES", "Effective-DATES",
        "Signature-DATES", "Termination-DATES", "Commencement-DATES",
        "END-DATES", "CLIENT_CONTRACT_MANAGER", "SUPPLIER_CONTRACT_MANAGER",
        "DATE",
    ]
    df_ent_dup = df_ent_score[df_ent_score.ENT_DETECT.isin(wanted_labels)].copy()

    # Keep the first hit per label and number the rows from 1.
    df_ent_dup = df_ent_dup.drop_duplicates(subset=["ENT_DETECT"])
    df_ent_dup = df_ent_dup.reset_index(drop=True)
    df_ent_dup.index = df_ent_dup.index + 1
    json_table = df_ent_dup.to_json(orient='index')

    return json_table
示例#11
0
     datesOrdered = dates == sorted(dates, reverse=True)
 else:
     noDates = True  # Dates Present
 if dtFormat == True:
     dateScore += 3
 if datesOrdered == True:
     dateScore += 1
 if noDates == False:
     dateScore += 6
 skill_ruler = EntityRuler(
     nlp, overwrite_ents=True).from_disk('./skill_patterns.jsonl')
 verb_ruler = EntityRuler(
     nlp, overwrite_ents=True).from_disk('./actionVerbs.jsonl')
 coms_ruler = EntityRuler(nlp,
                          overwrite_ents=True).from_disk('./coms.jsonl')
 skill_ruler.name = 'skillRuler'
 verb_ruler.name = 'verbRuler'
 coms_ruler.name = 'comsRuler'
 nlp.add_pipe(skill_ruler, after='parser')
 skillset_dict = create_skillset_dict([nlp(resume_text)])
 nlp.disable_pipes('skillRuler')
 nlp.add_pipe(verb_ruler, after='parser')
 wordset_dict_verb = create_skillset_dict_verb([nlp(resume_text)])
 wordset_dict_sumr = create_skillset_dict_verb(
     [nlp(' '.join(resume_raw_text.splitlines()[:16]))])
 nlp.disable_pipes('verbRuler')
 nlp.add_pipe(coms_ruler, after='parser')
 wordset_dict_coms = create_skillset_dict_verb([nlp(resume_text)])
 # vacature_skillset = create_skill_set(nlp(resume_text))
 repetition = []
 singles = []
示例#12
0
def main():
    """Summary AND NER App"""

    st.title("LEGAL TECH")

    activities = ['Extract MetaData From TEXT',
                  "Extract Metadata From Text File",
                  "Extract MetaData from .docx File",
                  "Extract MetaData from .pdf File",
                  "Find key entities in document"]
    choice = st.sidebar.selectbox("Select Activity", activities)

    if choice == 'Extract MetaData From TEXT':
        st.subheader("Extract MetaData from Legal Documents")
        raw_text = st.text_area("Enter Text Here", "Type Here")

        # Read text as spacy token instant

        def analyze_text(text):
            return nlp(text)

        # Add all CUSTOM patterns in respective entity
        # PATTERN FOR TITLES

        # Applying NLP ideas

        title1 = ["Agreement on contract1"]
        title2 = ["Agreement on contract2"]
        title3 = ["Agreement on contract3"]
        title4 = ["Agreement on contract4"]
        title5 = ["Agreement on contract5"]
        title6 = ["Agreement on contract6"]
        title7 = ["Agreement on contract7"]
        title8 = ["Agreement on contract8"]
        title9 = ["Agreement on contract9"]
        suppliers1 = ["FACEBOOK", "Facebook", "FACEBOOK GLOBAL INVEST AG", "Facebook Global Invest AG"]
        suppliers2 = ["BIRD", "BIRDS Deutschland GmbH", "Birds Deutschland GmbH"]
        suppliers3 = ["Google GmbH", "GOOGLE"]
        suppliers4 = ["EBAY Deutschland AG", "EBAY", "ebay"]
        suppliers5 = ["AMAZON SERVICES GMBH", "Amazon Services GmbH", "AMAZON LIMITED", "Amazon Limited"]
        clients = ["BOL.com", "bol.com", "BOL.COM BUSINESS SERVICES GMBH", "BOL.com Business Services GmbH"]
        dates1 = ["29 September 2018", "01 January 2015", "01.07.2018", " August 2017"]
        dates2 = ["31. July 2018"]
        dates3 = ["termination after a period of 48 months"]
        dates4 = ["31.01.2017", "31.03.2019", "1 October 2018"]
        dates5 = ["31.12.2018", "Apr 11th 2023", "19.01.2020"]
        countries1 = ["UK", "Germany", "France", "Italy", "Netherlands", "Russia", "Hungary", "India"]
        countries2 = ["Slovakia", "Czech", "Australia", "Vietnam", "Japan", "Philippines", "Romania"]


        # Define rulerAll for all entities
        rulerAll = EntityRuler(nlp, overwrite_ents=True)

        # Add all patterns in respective entity
        for tit1 in title1:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit1}])

        for tit2 in title2:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit2}])

        for tit3 in title3:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit3}])

        for tit4 in title4:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit4}])

        for tit5 in title5:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit5}])

        for tit6 in title6:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit6}])

        for tit7 in title7:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit7}])

        for tit8 in title8:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit8}])

        for tit9 in title9:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit9}])

        # for supplier

        for s1 in suppliers1:
            rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s1}])

        for s2 in suppliers2:
            rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s2}])

        for s3 in suppliers3:
            rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s3}])

        for s4 in suppliers4:
            rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s4}])

        for s5 in suppliers5:
            rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s5}])

        # for clients

        for c1 in clients:
            rulerAll.add_patterns([{"label": "CLIENT", "pattern": c1}])

        # Pattern for DATES

        for t1 in dates1:
            rulerAll.add_patterns([{"label": "Effective-DATES", "pattern": t1}])

        for t2 in dates2:
            rulerAll.add_patterns([{"label": "Signature-DATES", "pattern": t2}])

        for t3 in dates3:
            rulerAll.add_patterns([{"label": "Termination-DATES", "pattern": t3}])

        for t4 in dates4:
            rulerAll.add_patterns([{"label": "Commencement-DATES", "pattern": t4}])

        for t5 in dates5:
            rulerAll.add_patterns([{"label": "END-DATES", "pattern": t5}])

        # for countries

        for count1 in countries1:
            rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count1}])

        for count2 in countries2:
            rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count2}])

        # Define Ruler for All

        rulerAll.name = 'rulerAll'
        nlp.add_pipe(rulerAll)

        # Getting text input

        if st.button("Extract"):
            docx2 = analyze_text(raw_text)
            html = displacy.render(docx2, style="ent")
            html = html.replace("\n\n", "\n")
            # st.write(html, unsafe_allow_html=True)
            st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True)

# FOR SECOND CHALLENGE....reading from file

    if choice == 'Extract Metadata From Text File':
        st.subheader("Extract MetaData from Given Text File")
        def file_selector(folder_path='.'):
            filenames = os.listdir(folder_path)
            selected_filename = st.selectbox('Please only Select a Text File', filenames)
            return os.path.join(folder_path, selected_filename)

        filename = file_selector()


        f = open(filename)
        st.write('You have selected `%s`' % filename)

        raw=f.read()
        raw_text2 = st.text_area("your file contains following text", raw)  # for storing in raw text

        # DEFINE ANALYISIS FUNCTION

        def analyze_text(text):
            return nlp(text)

        # Applying NLP ideas

        title1 = ["Agreement on contract1"]
        title2 = ["Agreement on contract2"]
        title3 = ["Agreement on contract3"]
        title4 = ["Agreement on contract4"]
        title5 = ["Agreement on contract5"]
        title6 = ["Agreement on contract6"]
        title7 = ["Agreement on contract7"]
        title8 = ["Agreement on contract8"]
        title9 = ["Agreement on contract9"]
        suppliers1 = ["FACEBOOK", "Facebook", "FACEBOOK GLOBAL INVEST AG", "Facebook Global Invest AG"]
        suppliers2 = ["BIRD", "BIRDS Deutschland GmbH", "Birds Deutschland GmbH"]
        suppliers3 = ["Google GmbH", "GOOGLE"]
        suppliers4 = ["EBAY Deutschland AG", "EBAY", "ebay"]
        suppliers5 = ["AMAZON SERVICES GMBH", "Amazon Services GmbH", "AMAZON LIMITED", "Amazon Limited"]
        clients = ["BOL.com", "bol.com", "BOL.COM BUSINESS SERVICES GMBH", "BOL.com Business Services GmbH"]
        dates1 = ["29 September 2018", "01 January 2015", "01.07.2018", " August 2017"]
        dates2 = ["31. July 2018"]
        dates3 = ["termination after a period of 48 months"]
        dates4 = ["31.01.2017", "31.03.2019", "1 October 2018"]
        dates5 = ["31.12.2018", "Apr 11th 2023", "19.01.2020"]
        countries1 = ["UK", "Germany", "France", "Italy", "Netherlands", "Russia", "Hungary", "India"]
        countries2 = ["Slovakia", "Czech", "Australia", "Vietnam", "Japan", "Philippines", "Romania"]

        # Define rulerAll for all entities
        rulerAll = EntityRuler(nlp, overwrite_ents=True)

        # Add all patterns in respective entity
        for tit1 in title1:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit1}])

        for tit2 in title2:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit2}])

        for tit3 in title3:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit3}])

        for tit4 in title4:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit4}])

        for tit5 in title5:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit5}])

        for tit6 in title6:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit6}])

        for tit7 in title7:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit7}])

        for tit8 in title8:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit8}])

        for tit9 in title9:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit9}])

        # for supplier

        for s1 in suppliers1:
            rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s1}])

        for s2 in suppliers2:
            rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s2}])

        for s3 in suppliers3:
            rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s3}])

        for s4 in suppliers4:
            rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s4}])

        for s5 in suppliers5:
            rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s5}])

        # for clients

        for c1 in clients:
            rulerAll.add_patterns([{"label": "CLIENT", "pattern": c1}])

        # Pattern for DATES

        for t1 in dates1:
            rulerAll.add_patterns([{"label": "Effective-DATES", "pattern": t1}])

        for t2 in dates2:
            rulerAll.add_patterns([{"label": "Signature-DATES", "pattern": t2}])

        for t3 in dates3:
            rulerAll.add_patterns([{"label": "Termination-DATES", "pattern": t3}])

        for t4 in dates4:
            rulerAll.add_patterns([{"label": "Commencement-DATES", "pattern": t4}])

        for t5 in dates5:
            rulerAll.add_patterns([{"label": "END-DATES", "pattern": t5}])

        # for countries

        for count1 in countries1:
            rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count1}])

        for count2 in countries2:
            rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count2}])

        # Define Ruler for All

        rulerAll.name = 'rulerAll'
        nlp.add_pipe(rulerAll)

        # analysis from loaded file

        if st.button("Extract"):
            document = analyze_text(raw_text2)
            html = displacy.render(document, style="ent")
            html = html.replace("\n\n", "\n")
            # st.write(html, unsafe_allow_html=True)
            st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True)

    # FOR THIRD CHALLENGE....reading from file

    if choice == 'Extract MetaData from .docx File':
        st.subheader("Extract MetaData from .docx File")

        def file_selector(folder_path='.'):
            filenames = os.listdir(folder_path)
            selected_filename = st.selectbox('Please only Select a .docx File', filenames)
            return os.path.join(folder_path, selected_filename)

        filename = file_selector()

        f = open(filename)
        st.write('You have selected `%s`' % filename)
        docx_file = docx2txt.process(filename)
        #raw = docx_file.read()
        raw_text3 = st.text_area("your file contains following text", docx_file)  # for storing in raw text

        # DEFINE ANALYISIS FUNCTION

        def analyze_text(text):
            return nlp(text)

        # Applying NLP ideas

        title1 = ["Agreement on contract1"]
        title2 = ["Agreement on contract2"]
        title3 = ["Agreement on contract3"]
        title4 = ["Agreement on contract4"]
        title5 = ["Agreement on contract5"]
        title6 = ["Agreement on contract6"]
        title7 = ["Agreement on contract7"]
        title8 = ["Agreement on contract8"]
        title9 = ["Agreement on contract9"]
        suppliers1 = ["FACEBOOK", "Facebook", "FACEBOOK GLOBAL INVEST AG", "Facebook Global Invest AG"]
        suppliers2 = ["BIRD", "BIRDS Deutschland GmbH", "Birds Deutschland GmbH"]
        suppliers3 = ["Google GmbH", "GOOGLE"]
        suppliers4 = ["EBAY Deutschland AG", "EBAY", "ebay"]
        suppliers5 = ["AMAZON SERVICES GMBH", "Amazon Services GmbH", "AMAZON LIMITED", "Amazon Limited"]
        clients = ["BOL.com", "bol.com", "BOL.COM BUSINESS SERVICES GMBH", "BOL.com Business Services GmbH"]
        dates1 = ["29 September 2018", "01 January 2015", "01.07.2018", " August 2017"]
        dates2 = ["31. July 2018"]
        dates3 = ["termination after a period of 48 months"]
        dates4 = ["31.01.2017", "31.03.2019", "1 October 2018"]
        dates5 = ["31.12.2018", "Apr 11th 2023", "19.01.2020"]
        countries1 = ["UK", "Germany", "France", "Italy", "Netherlands", "Russia", "Hungary", "India"]
        countries2 = ["Slovakia", "Czech", "Australia", "Vietnam", "Japan", "Philippines", "Romania"]

        # Define rulerAll for all entities
        rulerAll = EntityRuler(nlp, overwrite_ents=True)

        # Add all patterns in respective entity
        for tit1 in title1:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit1}])

        for tit2 in title2:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit2}])

        for tit3 in title3:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit3}])

        for tit4 in title4:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit4}])

        for tit5 in title5:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit5}])

        for tit6 in title6:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit6}])

        for tit7 in title7:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit7}])

        for tit8 in title8:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit8}])

        for tit9 in title9:
            rulerAll.add_patterns([{"label": "TITLE", "pattern": tit9}])

        # for supplier

        for s1 in suppliers1:
            rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s1}])

        for s2 in suppliers2:
            rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s2}])

        for s3 in suppliers3:
            rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s3}])

        for s4 in suppliers4:
            rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s4}])

        for s5 in suppliers5:
            rulerAll.add_patterns([{"label": "SUPPLIER", "pattern": s5}])

        # for clients

        for c1 in clients:
            rulerAll.add_patterns([{"label": "CLIENT", "pattern": c1}])

        # Pattern for DATES

        for t1 in dates1:
            rulerAll.add_patterns([{"label": "Effective-DATES", "pattern": t1}])

        for t2 in dates2:
            rulerAll.add_patterns([{"label": "Signature-DATES", "pattern": t2}])

        for t3 in dates3:
            rulerAll.add_patterns([{"label": "Termination-DATES", "pattern": t3}])

        for t4 in dates4:
            rulerAll.add_patterns([{"label": "Commencement-DATES", "pattern": t4}])

        for t5 in dates5:
            rulerAll.add_patterns([{"label": "END-DATES", "pattern": t5}])

        # for countries

        for count1 in countries1:
            rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count1}])

        for count2 in countries2:
            rulerAll.add_patterns([{"label": "COUNTRIES", "pattern": count2}])

        # Define Ruler for All

        rulerAll.name = 'rulerAll'
        nlp.add_pipe(rulerAll)


    # analysis from loaded file


        if st.button("Extract"):

            document3 = analyze_text(raw_text3)
            html = displacy.render(document3, style="ent")
            html = html.replace("\n\n", "\n")
            st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True)


    # FOR FOURTH CHALLENGE....reading from PDF file

    if choice == 'Extract MetaData from .pdf File':
        # PDF branch: let the user pick a file from the working directory, extract
        # its text with pdftotext, and render the rule-based contract entities with
        # displaCy once "Extract" is pressed.
        st.subheader("Extract MetaData from .pdf File")

        def file_selector(folder_path='.'):
            """Show a select box listing `folder_path` and return the chosen path."""
            filenames = os.listdir(folder_path)
            # This branch parses PDFs, so the prompt asks for a .pdf file.
            selected_filename = st.selectbox('Please only Select a .pdf File', filenames)
            return os.path.join(folder_path, selected_filename)

        filename = file_selector()
        st.write('You have selected `%s`' % filename)

        # The file is opened exactly once, in binary mode, and the handle is
        # released as soon as pdftotext has parsed it.
        with open(filename, "rb") as pdf:
            pdf_file = pdftotext.PDF(pdf)
        # Join the extracted page strings into a single text blob.
        pdf_text = "\n\n".join(pdf_file)
        # The (possibly user-edited) text area content is what gets analysed.
        raw_text4 = st.text_area("your file contains following text", pdf_text)

        def analyze_text(text):
            """Run `text` through the shared `nlp` pipeline and return the result."""
            return nlp(text)

        # Phrase patterns grouped by entity label, listed in registration order.
        # NOTE(review): " August 2017" carries a leading space in the source data;
        # it is kept as-is - confirm whether that is intentional.
        phrases_by_label = [
            ("TITLE", ["Agreement on contract%d" % n for n in range(1, 10)]),
            ("SUPPLIER", [
                "FACEBOOK", "Facebook", "FACEBOOK GLOBAL INVEST AG", "Facebook Global Invest AG",
                "BIRD", "BIRDS Deutschland GmbH", "Birds Deutschland GmbH",
                "Google GmbH", "GOOGLE",
                "EBAY Deutschland AG", "EBAY", "ebay",
                "AMAZON SERVICES GMBH", "Amazon Services GmbH", "AMAZON LIMITED", "Amazon Limited",
            ]),
            ("CLIENT", ["BOL.com", "bol.com", "BOL.COM BUSINESS SERVICES GMBH",
                        "BOL.com Business Services GmbH"]),
            ("Effective-DATES", ["29 September 2018", "01 January 2015", "01.07.2018", " August 2017"]),
            ("Signature-DATES", ["31. July 2018"]),
            ("Termination-DATES", ["termination after a period of 48 months"]),
            ("Commencement-DATES", ["31.01.2017", "31.03.2019", "1 October 2018"]),
            ("END-DATES", ["31.12.2018", "Apr 11th 2023", "19.01.2020"]),
            ("COUNTRIES", [
                "UK", "Germany", "France", "Italy", "Netherlands", "Russia", "Hungary", "India",
                "Slovakia", "Czech", "Australia", "Vietnam", "Japan", "Philippines", "Romania",
            ]),
        ]

        # Flatten the table into EntityRuler pattern dicts and register them with
        # one batched call instead of one call per phrase.
        patterns = []
        for label, phrases in phrases_by_label:
            patterns.extend([{"label": label, "pattern": phrase} for phrase in phrases])

        rulerAll = EntityRuler(nlp, overwrite_ents=True)
        rulerAll.add_patterns(patterns)

        # Name the ruler and append it to the pipeline.
        # NOTE(review): `nlp` is created outside this branch; if it outlives a
        # rerun, adding a second component named 'rulerAll' may fail - confirm.
        rulerAll.name = 'rulerAll'
        nlp.add_pipe(rulerAll)

        # Analyse the text and show the highlighted entities on demand.
        if st.button("Extract"):
            document4 = analyze_text(raw_text4)
            html = displacy.render(document4, style="ent")
            html = html.replace("\n\n", "\n")
            st.markdown(HTML_WRAPPER.format(html), unsafe_allow_html=True)

        # FOR FIFTH CHALLENGE....reading from file

    if choice == 'Find key entities in document':
        # Text-file branch: let the user pick a file from the working directory,
        # tag its text with the rule-based contract entities and show one row per
        # entity label in a table once "Extract" is pressed.
        st.subheader("Extract MetaData from Given Text File")

        def file_selector(folder_path='.'):
            """Show a select box listing `folder_path` and return the chosen path."""
            filenames = os.listdir(folder_path)
            selected_filename = st.selectbox('Please only Select a Text File', filenames)
            return os.path.join(folder_path, selected_filename)

        filename = file_selector()

        # Context manager so the handle is closed once the content has been read.
        with open(filename) as f:
            st.write('You have selected `%s`' % filename)
            raw = f.read()
        # The (possibly user-edited) text area content is what gets analysed.
        raw_text5 = st.text_area("your file contains following text", raw)

        def analyze_text(text):
            """Run `text` through the shared `nlp` pipeline and return the result."""
            return nlp(text)

        # Phrase patterns grouped by entity label, listed in registration order.
        # NOTE(review): " August 2017" carries a leading space in the source data;
        # it is kept as-is - confirm whether that is intentional.
        phrases_by_label = [
            ("TITLE", ["Agreement on contract%d" % n for n in range(1, 10)]),
            ("SUPPLIER", [
                "FACEBOOK", "Facebook", "FACEBOOK GLOBAL INVEST AG", "Facebook Global Invest AG",
                "BIRD", "BIRDS Deutschland GmbH", "Birds Deutschland GmbH",
                "Google GmbH", "GOOGLE",
                "EBAY Deutschland AG", "EBAY", "ebay",
                "AMAZON SERVICES GMBH", "Amazon Services GmbH", "AMAZON LIMITED", "Amazon Limited",
            ]),
            ("CLIENT", ["BOL.com", "bol.com", "BOL.COM BUSINESS SERVICES GMBH",
                        "BOL.com Business Services GmbH"]),
            ("Effective-DATES", ["29 September 2018", "01 January 2015", "01.07.2018", " August 2017"]),
            ("Signature-DATES", ["31. July 2018"]),
            ("Termination-DATES", ["termination after a period of 48 months"]),
            ("Commencement-DATES", ["31.01.2017", "31.03.2019", "1 October 2018"]),
            ("END-DATES", ["31.12.2018", "Apr 11th 2023", "19.01.2020"]),
            ("COUNTRIES", [
                "UK", "Germany", "France", "Italy", "Netherlands", "Russia", "Hungary", "India",
                "Slovakia", "Czech", "Australia", "Vietnam", "Japan", "Philippines", "Romania",
            ]),
        ]

        # Flatten the table into EntityRuler pattern dicts and register them with
        # one batched call instead of one call per phrase.
        patterns = []
        for label, phrases in phrases_by_label:
            patterns.extend([{"label": label, "pattern": phrase} for phrase in phrases])

        rulerAll = EntityRuler(nlp, overwrite_ents=True)
        rulerAll.add_patterns(patterns)

        # Name the ruler and append it to the pipeline.
        # NOTE(review): `nlp` is created outside this branch; if it outlives a
        # rerun, adding a second component named 'rulerAll' may fail - confirm.
        rulerAll.name = 'rulerAll'
        nlp.add_pipe(rulerAll)

        # Analyse the text and tabulate the entities on demand.
        if st.button("Extract"):
            doc5 = analyze_text(raw_text5)
            # One row per recognised entity span.
            df_ent = pd.DataFrame({
                'TEXT': [ent.text for ent in doc5.ents],
                'ENT_LABEL': [ent.label_ for ent in doc5.ents],
                'START': [ent.start_char for ent in doc5.ents],
                'END': [ent.end_char for ent in doc5.ents],
            })
            # Only these labels are shown; the two *_CONTRACT_MANAGER labels are
            # accepted by the filter although no pattern in this branch emits them.
            shown_labels = [
                "TITLE", "CLIENT", "SUPPLIER", "COUNTRIES",
                "Effective-DATES", "Signature-DATES", "Termination-DATES",
                "Commencement-DATES", "END-DATES",
                "CLIENT_CONTRACT_MANAGER", "SUPPLIER_CONTRACT_MANAGER",
            ]
            df_table = df_ent[df_ent.ENT_LABEL.isin(shown_labels)]
            # Keep only the first occurrence of each label.
            df_ent_dup = df_table.drop_duplicates(subset=["ENT_LABEL"])
            # Output shown in the app.
            st.write('Key Entities Found in Contract : ', df_ent_dup)
示例#13
0
	rulerAll.add_patterns([{"label": "Commencement-DATES", "pattern": t4}])

# Register the remaining phrase lists one phrase at a time: the end dates first,
# then both country lists under the shared COUNTRIES label.
for label, phrases in (
    ("END-DATES", dates5),
    ("COUNTRIES", countries1),
    ("COUNTRIES", countries2),
):
    for phrase in phrases:
        rulerAll.add_patterns([{"label": label, "pattern": phrase}])

# Name the ruler and append it to the pipeline.
rulerAll.name = 'rulerAll'
nlp.add_pipe(rulerAll)




# HTML template for a bordered, horizontally scrollable <div>; the `{}`
# placeholder is where the content to be wrapped is inserted.
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem">{}</div>"""

from flaskext.markdown import Markdown

# Create the Flask application and hand it to the flaskext.markdown extension.
app = Flask(__name__)
Markdown(app)


# def analyze_text(text):
# 	return nlp(text)
示例#14
0
def extracts(raw_text):
    """Tag contract metadata in `raw_text` and return it as wrapped displaCy HTML.

    A fresh ``en_core_web_sm`` pipeline is loaded with its statistical NER
    disabled, and a rule-based ``EntityRuler`` holding the phrase patterns
    from ``_contract_patterns`` is added to it, so the only entities in the
    result are the ones matched by those phrases.

    Args:
        raw_text: The text to analyse.

    Returns:
        The displaCy "ent" rendering of the analysed text, with blank lines
        collapsed, inserted into ``HTML_WRAPPER``.
    """
    # NOTE(review): the model is loaded on every call; consider loading it once
    # at module level if this function sits on a hot path.
    nlp = spacy.load('en_core_web_sm', disable=['ner'])

    # Build the ruler once and register every pattern in one batched call.
    rulerAll = EntityRuler(nlp, overwrite_ents=True)
    rulerAll.add_patterns(_contract_patterns())

    # The pipeline was created just above, so it cannot already contain a
    # component with this name; no membership check is needed before adding.
    rulerAll.name = 'rulerAll'
    nlp.add_pipe(rulerAll)

    # Analyse the text and render the entities as HTML.
    docx = nlp(raw_text)
    html = displacy.render(docx, style="ent")
    html = html.replace("\n\n", "\n")
    return HTML_WRAPPER.format(html)


def _contract_patterns():
    """Return the EntityRuler phrase patterns for contract metadata, in registration order."""
    # NOTE(review): " August 2017" carries a leading space in the source data;
    # it is kept as-is - confirm whether that is intentional.
    phrases_by_label = [
        ("TITLE", ["Agreement on contract%d" % n for n in range(1, 10)]),
        ("SUPPLIER", [
            "FACEBOOK", "Facebook", "FACEBOOK GLOBAL INVEST AG",
            "Facebook Global Invest AG",
            "BIRD", "BIRDS Deutschland GmbH", "Birds Deutschland GmbH",
            "Google GmbH", "GOOGLE",
            "EBAY Deutschland AG", "EBAY", "ebay",
            "AMAZON SERVICES GMBH", "Amazon Services GmbH", "AMAZON LIMITED",
            "Amazon Limited",
        ]),
        ("CLIENT", [
            "BOL.com", "bol.com", "BOL.COM BUSINESS SERVICES GMBH",
            "BOL.com Business Services GmbH",
        ]),
        ("Effective-DATES", [
            "29 September 2018", "01 January 2015", "01.07.2018", " August 2017",
        ]),
        ("Signature-DATES", ["31. July 2018"]),
        ("Termination-DATES", ["termination after a period of 48 months"]),
        ("Commencement-DATES", ["31.01.2017", "31.03.2019", "1 October 2018"]),
        ("END-DATES", ["31.12.2018", "Apr 11th 2023", "19.01.2020"]),
        ("COUNTRIES", [
            "UK", "Germany", "France", "Italy", "Netherlands", "Russia", "Hungary",
            "India",
            "Slovakia", "Czech", "Australia", "Vietnam", "Japan", "Philippines",
            "Romania",
        ]),
    ]
    patterns = []
    for label, phrases in phrases_by_label:
        patterns.extend([{"label": label, "pattern": phrase} for phrase in phrases])
    return patterns