def main():
    """Smoke-test the preprocessing path: load a test document, clean it,
    then hand it off to the NLP object processor."""
    # test input files (tender_txt / erp_csv kept for manual switching)
    tender_txt = 'io/input/marq/tender.txt'
    tender_csv = 'io/input/marq/tender.csv'
    erp_csv = 'io/input/test/erp10_master.csv'

    # --------------------------------------
    # load a document (function path: preprocessor > loader.py > load_doc(d))
    print("\nHere's the input doc after initial loading:\n")
    doc = loader.load_doc(tender_csv)
    print(doc)

    # normalize the raw text
    print("\nHere's the input doc after string cleaning:\n")
    doc = string_cleaner.clean_doc(doc)
    print(doc)

    # rem in matcher: if distanceEncoder.levenshtein(d, d1) == 0 then 100% match
    # forward the cleaned doc to the nlp object processor
    nlp_object_processor.process_nlp_object(doc)

    # end program
    print('\nDone')
def main():
    """Build a single NLP string from the IESA short descriptions, register
    custom stop words, and print the spaCy view selected by `nlp_features`.

    Fixes: no longer shadows builtin `str`; description string is built with
    `join` instead of quadratic `+=`; redundant manual index removed;
    `== True/False` comparisons replaced with truthiness.
    """
    stopSuppliers = False  # when True, supplier names are also treated as stop words

    # get iesa descriptions from external file
    p_strings = []
    with open('iesa_short_descriptions.txt', 'r') as infile:
        for line in infile:
            p_strings.append(re.sub(r'^\s+$', '', line))

    # get suppliers and iesa stop words from external file and add to iesa_stop_words[]
    iesa_stop_words = []
    if stopSuppliers:
        with open('iesa_suppliers.txt', 'r') as infile:
            for word in infile:
                iesa_stop_words.append(word.lower().strip())
    with open('iesa_stop_words.txt', 'r') as infile:
        for word in infile:
            iesa_stop_words.append(word.lower().strip())

    # program controller ------------------------------------------------------
    nlp_switch = 'on'
    nlp_features = 't'  # s, t, st, nc, m -> sents, toks, sents + toks, noun-chunks, mmat
    # --------------------------------------------------------------------------

    # build the string to use for nlp based on iesa short description;
    # every description is prefixed with the sentinel token 'root'
    nlpstr = ''.join('root ' + item + ' ' for item in p_strings)
    nlpstr = string_cleaner.clean_doc(nlpstr)

    spacer = '----------------------'

    # STRING TESTING -----------------------------------------------------------
    def nlp_stuff(nlpstr, feature):
        """Create the spaCy pipeline and print the view chosen by `feature`."""
        # make nlp object
        nlp = spacy.load('en_core_web_sm', disable=['parser'])
        nlp.add_pipe(sentence_segmenter, before='ner')
        merge_nchunks = nlp.create_pipe('merge_noun_chunks')
        nlp.add_pipe(merge_nchunks)

        # add iesa stop words
        for word in iesa_stop_words:
            nlp.vocab[word].is_stop = True

        # create nlp object
        d = nlp(nlpstr)
        print(nlp.pipe_names)  # test print

        # print sentences to console
        if feature in ('s', 'st'):
            for sent in d.sents:
                for tok in sent:
                    if not tok.is_stop:
                        print(tok.text, end=' ')
                print('\n')

        # print tokens to console
        # NOTE(review): original collapsed source is ambiguous about whether
        # the summary print ran per sentence or once; printing once assumed.
        if feature in ('t', 'st'):
            parts = []
            for sent in d.sents:
                for tok in sent:
                    if not tok.is_stop:
                        parts.append(tok.text + ' ')
            token_str = ''.join(parts)  # renamed: previously shadowed builtin `str`
            print('STRING: ', token_str)

        # print noun chunks to console
        if feature == 'nc':
            print(spacer)
            for nc in d.noun_chunks:
                print(nc.text)
            print(spacer)

        # print mMat candidates (non-stop NOUN/NUM/ADJ tokens) to console
        if feature == 'm':
            print(spacer)
            for sent in d.sents:
                for tok in sent:
                    if (not tok.is_stop and tok.text != 'root'
                            and tok.pos_ in ['NOUN', 'NUM', 'ADJ']):
                        print(tok.text, tok.pos_, tok.tag_)
            print(spacer)
    # --------------------------------------------------------------------------

    if nlp_switch == 'on':
        nlp_stuff(nlpstr, nlp_features)

    # end program
    print('\nDone.')
def main():
    """NERS Demo, IESA Sample Data.

    Tags a tender with SUPPLIER/PRODUCT/MPN/SKU entities, prints per-label
    counts, optionally saves the model, and renders a displaCy HTML page.

    Fixes: parallel-index bookkeeping replaced with `len`/`zip`; entity
    bucketing uses a dispatch dict instead of an if/elif chain; dead
    triple-quoted code block removed.
    """
    # CONFIG ------------------------
    model = 'pre'  # pre -> use non-trained model / post -> use trained model
    ruler = 'off'
    cleaner = 'off'
    number_tagger = 'off'
    stemmer = 'off'
    # --------------------------------

    # load model
    if model == 'pre':
        # load a language and invoke the entity ruler
        nlp = spacy.load('en_core_web_sm', disable=['parser'])
    elif model == 'post':
        update_meta_json.update_meta()
        nlp = spacy.load('model')

    # add pipes
    if ruler == 'on':
        if "entity_ruler" not in nlp.pipe_names:
            nu_ruler = EntityRuler(nlp).from_disk('ners_patterns_all.jsonl')
            # putting the ner before ruler will override/favor ner decisions
            if model == 'pre':
                nlp.add_pipe(nu_ruler, before='ner')
            else:
                nlp.add_pipe(nu_ruler, after='ner')
    nlp.add_pipe(sentence_segmenter, before='ner')

    # show pipeline components:
    print(nlp.pipe_names)

    # import test tender and clean it up
    tender = import_csv('iesa_tender.csv')  # import
    if cleaner == 'on':
        tender = string_cleaner.clean_doc(tender)  # clean
    doc = nlp(tender)

    # this nlp tender obj is cleaned -- pickle it
    make_pickle(doc)

    # CONSOLE OUTPUT
    print('\n')
    labels = ['SUPPLIER', 'PRODUCT', 'MPN', 'SKU']
    alt_labels = ['Supplier', 'Product', 'MfrPartNo', 'SkuID']
    total_found = []
    total_unique_found = []
    for label in labels:
        print('Results for {} --------------'.format(label))
        tot_num = 0
        unique = []
        for ent in doc.ents:
            if ent.label_ == label:
                if ent.text not in unique:
                    unique.append(ent.text)
                print([ent.text, ent.label_], end='')
                tot_num += 1
        print('\nFound {} total, {} unique.\n'.format(tot_num, len(unique)))
        total_found.append(tot_num)
        total_unique_found.append(len(unique))

    # save model with entity pattern updates made by the entity ruler
    if ruler == "on":
        output_dir = Path('model')
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    # TEST -----------------------------
    suppliers = []
    products = []
    skus = []
    mpns = []
    # dispatch table replaces the if/elif label chain
    buckets = {'SUPPLIER': suppliers, 'PRODUCT': products,
               'SKU': skus, 'MPN': mpns}
    # print(doc)
    for ent in doc.ents:
        if ent.label_ in buckets:
            buckets[ent.label_].append([ent.label_, ent.text])
    for group in (suppliers, products, mpns, skus):
        print('--------------------------')
        for i in group:
            print(i)

    # DISPLACY VISUALIZER
    # get results for html doc
    results = ''
    for item, tot, unq in zip(alt_labels, total_found, total_unique_found):
        results = results + '{}: {} tot {} unq\n'.format(item, tot, unq)

    # store nlp object as string in html var
    spacer = '---------------------------------------------------------\n'
    header = 'IESA Named Entities Found in Tender\n'
    doc = nlp(header + spacer + results + spacer + tender)
    html = displacy.render(doc, style="ent", page=True)  # use the entity visualizer

    # write the html string to the xampp folder and launch in browser through localhost port
    with open('C:/xampp/htdocs/mySites/wrWx_NERS/index.html', 'w') as data:
        data.write(html)

    print('\n' + results)

    # TEST -----------------------------
    # end program
    print('Done.')
def main():
    """NERS Demo w/ Sample Data.

    Tags a tender with MMAT and/or MANUF entities, reports counts, exports
    per-record codes to Excel, saves the model, and renders displaCy HTML.

    Fixes: the `mmat` config flag is no longer clobbered by reuse as an
    entity-text accumulator; `labels`/`alt_labels`/`total_*` are initialized
    so the visualizer section cannot raise NameError when both flags are
    'off'; the three duplicated report sections and two duplicated
    per-record collectors are folded into nested helpers; unused `mmats`
    list removed.
    """
    # CONFIG ---------------------- \\
    # -------------------------------- \\
    model = 'pre'  # pre -> use non-trained model / post -> use trained model
    mmat = 'off'  # on/off
    manuf = 'on'  # on/off
    ruler = 'on'
    cleaner = 'on'
    number_tagger = 'off'
    # rem if stemmer is turned on after model does P2 training, then
    # you will need to use POS tag to detect nouns in products
    # then create new generator patterns for all.json
    # then run entity ruler again
    stemmer = 'off'

    # declare inputs / outputs
    manuf_pandas_file = r'C:\Users\stacy\My GitHub\wxMatchingEngine\store\model\brmr_erp1\hayley\out_manuf_pandas.xlsx'  # output
    mmat_pandas_file = r'C:\Users\stacy\My GitHub\wxMatchingEngine\store\model\brmr_erp1\hayley\out_mmat_pandas.xlsx'  # output
    mmat_file = r'C:\Users\stacy\My GitHub\wxMatchingEngine\store\model\brmr_erp1\hayley\out_mmat_patterns.jsonl'  # input
    manuf_file = r'C:\Users\stacy\My GitHub\wxMatchingEngine\store\model\brmr_erp1\hayley\out_manuf_patterns.jsonl'  # input

    # choose the patterns file from the enabled entity types
    # (off/off and on/off both used mmat_file in the original)
    if manuf == 'off':
        patterns_file = mmat_file
    elif mmat == 'off':
        patterns_file = manuf_file
    else:
        patterns_file = combine_pattern_files(mmat_file, manuf_file)

    tender_file = r'C:\Users\stacy\My GitHub\wxMatchingEngine\store\model\brmr_erp1\hayley\in_description2.csv'
    #output_file = 'demo_ners_output_nonstock.txt'
    write_type = 'w'
    # -------------------------------- //
    # ------------------------------ //

    # load model
    if model == 'pre':
        # load a language and invoke the entity ruler
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    elif model == 'post':
        nlp = spacy.load('model_entRuler')
    nlp.add_pipe(sentence_segmenter, after='tagger')

    # add pipes
    if ruler == 'on':
        # rem if model is post then the entity ruler is already in the model
        if model == 'pre':
            # load patterns from external file only if model is not already trained
            nu_ruler = EntityRuler(nlp).from_disk(patterns_file)
            # putting the ruler before ner will override ner decisions in favor of ruler patterns
            nlp.add_pipe(nu_ruler)  # , before='ner')
        # remember to swap precedence between ruler and ner after model training
        if model == 'post':
            # load patterns from external file only if model is not already trained
            if "entity_ruler" not in nlp.pipe_names:
                nu_ruler = EntityRuler(nlp).from_disk(patterns_file)
                # putting the ner before ruler will override/favor ner decisions
                nlp.add_pipe(nu_ruler)  # , before='ner')

    # show pipeline components:
    print(nlp.pipe_names)

    # import test tender and clean it up
    tender = import_csv(tender_file)  # import
    if cleaner == 'on':
        tender = string_cleaner.clean_doc(tender)  # clean
    doc = nlp(tender)

    # CONSOLE OUTPUT ---------------------------------------------------------
    def report(report_labels):
        """Print per-label entity hits; return (total, unique) count lists."""
        totals, uniques = [], []
        print('\n')
        for label in report_labels:
            print('Results for {} --------------'.format(label))
            tot_num = 0
            unique = []
            for ent in doc.ents:
                if ent.label_ == label:
                    if ent.text not in unique:
                        unique.append(ent.text)
                    print([ent.text, ent.label_], end='')
                    tot_num += 1
            print('\nFound {} total, {} unique.\n'.format(tot_num, len(unique)))
            totals.append(tot_num)
            uniques.append(len(unique))
        return totals, uniques

    # BUG FIX: defaults so the visualizer section cannot hit undefined names
    # when both mmat and manuf are 'off'
    labels, alt_labels = [], []
    total_found, total_unique_found = [], []
    if mmat == 'on' and manuf == 'off':
        labels = ['MMAT']  # , 'PRODUCT', 'MPN', 'SKU']
        alt_labels = ['Mmat']  # , 'Product', 'MfrPartNo', 'SkuID']
        total_found, total_unique_found = report(labels)
    if mmat == 'off' and manuf == 'on':
        labels = ['MANUF']
        alt_labels = ['Manuf']
        total_found, total_unique_found = report(labels)
    if mmat == 'on' and manuf == 'on':
        labels = ['MANUF', 'MMAT']
        alt_labels = ['Manuf', 'Mmat']
        total_found, total_unique_found = report(labels)

    # per-record collection ---------------------------------------------------
    # This technique isolates entities on a sentence-by-sentence basis, which
    # allows matching entities on a record-by-record basis.
    def collect_by_record(label, codes_tag, alts_tag):
        """Return (primary codes, comma-joined alternates) per record,
        skipping the header row (j == 0). BUG FIX: uses a local accumulator
        instead of clobbering the `mmat` config flag."""
        codes, code_alts = [], []
        unique = []
        primary = ''
        alts = ''
        for j, sent in enumerate(doc.sents):
            seen_first = False
            for ent in sent.ents:
                # ignore header record
                if j > 0 and ent.label_ == label:
                    if not seen_first:
                        # first matching label in the record is the primary code
                        primary = ent.text
                        unique.append(ent.text)
                        seen_first = True
                    elif ent.text not in unique:
                        # subsequent distinct values become alternates
                        unique.append(ent.text)
                        alts = ent.text if alts == '' else alts + ', ' + ent.text
            # store ent results for each record, ignoring the headers
            if j > 0:
                codes.append(primary.upper())
                code_alts.append(alts.upper())
                # test ---------------
                print('str ', j, codes_tag, codes)
                print('str ', j, alts_tag, code_alts)
                # test ---------------
            # reset vars for next record
            unique.clear()
            primary = ''
            alts = ''
        return codes, code_alts

    # pandas output for mmats ------------------------------------------------
    if mmat == 'on':
        w_MmatCodes, w_MmatCode_Alts = collect_by_record(
            'MMAT', 'w_MmatCodes: ', 'w_MmatCode_Alts: ')
        df = pd.DataFrame({
            'w_MmatCodes': w_MmatCodes,
            'w_MmatCode_Alts': w_MmatCode_Alts
        })
        writer = pd.ExcelWriter(mmat_pandas_file)
        df.to_excel(writer, 'NERS_MMATs', index=False)
        writer.save()

    # pandas output for manufs ------------------------------------------------
    if manuf == 'on':
        w_Manufs, w_Manuf_Alts = collect_by_record(
            'MANUF', 'w_Manufs: ', 'w_Manuf_Alts: ')
        df2 = pd.DataFrame({
            'w_Manufs': w_Manufs,
            'w_Manuf_Alts': w_Manuf_Alts
        })
        writer2 = pd.ExcelWriter(manuf_pandas_file)
        df2.to_excel(writer2, 'NERS_Manufs', index=False)
        writer2.save()

    # save the model --------------------------------------------------------
    # save model with entity pattern updates made by the entity ruler
    if ruler == "on":
        output_dir = Path('model_entRuler')
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    # DISPLACY VISUALIZER -----------------------------------------------------
    # get results for html doc
    results = ''
    for item, tot, unq in zip(alt_labels, total_found, total_unique_found):
        results = results + '{}: {} tot {} unq\n'.format(item, tot, unq)

    # store nlp object as string in html var
    spacer = '---------------------------------------------------------\n'
    header = 'Named Entities Found in Tender\n'
    doc = nlp(header + spacer + results + spacer + tender)

    colors = {
        "MMAT": "#C3FFA1",
        "MANUF": "#FFDDA1",
    }
    options = {"ents": ["MMAT", "MANUF"], "colors": colors}
    # displacy.serve(doc, style="ent", options=options)
    html = displacy.render(doc, style="ent", page=True, options=options)  # use the entity visualizer

    # write the html string to the xampp folder and launch in browser through localhost port
    with open('C:/Users/stacy/My Localhost/index.html', 'w') as data:
        data.write(html)

    print('\n' + results)

    # end program
    print('Done.')
def main():
    """MMAT NER demo: tag a tender CSV, report entity counts, export
    per-record codes to Excel, and render a displaCy HTML page.

    BUG FIX: the first MMAT of each record was assigned to `manuf` while
    `w_ManfCodes.append(mmat.upper())` read the never-set `mmat`, so every
    exported primary code was an empty string. The accumulator is now one
    consistently named variable.
    """
    # CONFIG ------------------------
    model = 'pre'  # pre -> use non-trained model / post -> use trained model
    ruler = 'on'
    cleaner = 'on'
    number_tagger = 'off'
    stemmer = 'off'

    # IO ----------------------------
    patterns_file = 'demo_ners_patterns_mmat_test.jsonl'
    tender_file = 'demo_ners_descriptions_nonstock_test.csv'  # iesa descriptions
    pandas_file = 'demo_ners_output_nonstock_mmat_pandas.xlsx'  # output
    # output_file = 'demo_ners_output_nonstock.txt'
    write_type = 'w'
    # --------------------------------

    # load model
    if model == 'pre':
        # load a language and invoke the entity ruler
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    elif model == 'post':
        nlp = spacy.load('model_entRuler')
    nlp.add_pipe(sentence_segmenter, after='tagger')

    # add pipes
    if ruler == 'on':
        # rem if model is post then the entity ruler is already in the model
        if model == 'pre':
            # load patterns from external file only if model is not already trained
            nu_ruler = EntityRuler(nlp).from_disk(patterns_file)
            # putting the ruler before ner will override ner decisions in favor of ruler patterns
            nlp.add_pipe(nu_ruler)  # , before='ner')
        # remember to swap precedence between ruler and ner after model training
        if model == 'post':
            # load patterns from external file only if model is not already trained
            if "entity_ruler" not in nlp.pipe_names:
                nu_ruler = EntityRuler(nlp).from_disk(patterns_file)
                # putting the ner before ruler will override/favor ner decisions
                nlp.add_pipe(nu_ruler)  # , before='ner')

    # write tagger into pipeline in the meta json file
    # STOPPED HERE ------------------------

    # show pipeline components:
    print(nlp.pipe_names)

    # import test tender and clean it up
    tender = import_csv(tender_file)  # import
    if cleaner == 'on':
        tender = string_cleaner.clean_doc(tender)  # clean
    doc = nlp(tender)

    # GENERATE CONSOLE OUTPUT
    print('\n')
    labels = ['MMAT']  # , 'PRODUCT', 'MPN', 'SKU']
    alt_labels = ['MMat']  # , 'Product', 'MfrPartNo', 'SkuID']
    total_found = []
    total_unique_found = []
    for label in labels:
        print('Results for {} --------------'.format(label))
        tot_num = 0
        unique = []
        for ent in doc.ents:
            if ent.label_ == label:
                if ent.text not in unique:
                    unique.append(ent.text)
                print([ent.text, ent.label_], end='')
                tot_num += 1
        print('\nFound {} total, {} unique.\n'.format(tot_num, len(unique)))
        total_found.append(tot_num)
        total_unique_found.append(len(unique))

    # GENERATE MODEL
    if ruler == "on":
        output_dir = Path('model_entRuler')
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    # TEST -----------------------------
    # This technique allows you to isolate entities on
    # a sentence-by-sentence basis, which will allow
    # for matching entities on a record-by-record basis
    w_ManfCodes = []
    w_ManfCode_Alts = []
    unique = []
    mmat_code = ''
    alts = ''
    j = 0
    for sent in doc.sents:
        i = 0
        for ent in sent.ents:
            # ignore header record
            if j > 0:
                if ent.label_ == 'MMAT':
                    if i == 0:
                        # first MMAT in the record is the primary code
                        # (BUG FIX: was stored in `manuf` and never read)
                        mmat_code = ent.text
                        unique.append(ent.text)
                        i += 1
                    else:
                        # subsequent distinct MMATs become alternates
                        if ent.text not in unique:
                            unique.append(ent.text)
                            if alts == '':
                                alts = ent.text
                            else:
                                alts = alts + ', ' + ent.text
        # store ent results for each record, ignoring the headers
        if j > 0:
            w_ManfCodes.append(mmat_code.upper())
            w_ManfCode_Alts.append(alts.upper())
            # test ---------------
            print('str ', j, 'w_ManfCodes: ', w_ManfCodes)
            print('str ', j, 'w_ManfCode_Alts: ', w_ManfCode_Alts)
            # test ---------------
        # reset vars for next record
        unique.clear()
        mmat_code = ''
        alts = ''
        j += 1

    df = pd.DataFrame({
        'w_ManfCodes': w_ManfCodes,
        'w_ManfCode_Alts': w_ManfCode_Alts
    })
    writer = pd.ExcelWriter(pandas_file)
    df.to_excel(writer, 'NERS_Manufs', index=False)
    writer.save()

    #for sent in doc.sents: print(sent)
    print('nu----------------')
    #for sent in doc.sents:
    #    print(sent)

    # GENERATE DISPLACY VISUALIZER
    # get results for html doc
    results = ''
    for item, tot, unq in zip(alt_labels, total_found, total_unique_found):
        results = results + '{}: {} tot {} unq\n'.format(item, tot, unq)

    # store nlp object as string in html var
    spacer = '---------------------------------------------------------\n'
    header = 'IESA Named Entities Found in Tender\n'
    doc = nlp(header + spacer + results + spacer + tender)
    html = displacy.render(doc, style="ent", page=True)  # use the entity visualizer

    # write the html string to the xampp folder and launch in browser through localhost port
    with open('C:/Users/stacy/My Localhost/index.html', 'w') as data:
        data.write(html)

    print('\n' + results)

    # TEST -----------------------------
    # end program
    print('Done.')
def main():
    """NERS Demo w/ Sample Data.

    Runs the trained MMAT pipeline over a tender file, prints per-label
    counts, saves the model, writes de-duplicated MMAT codes to a text
    file, and renders a displaCy HTML page.
    """
    # CONFIG ---------------------- \\
    # -------------------------------- \\
    model = 'post'  # pre -> use non-trained model / post -> use trained model
    ruler = 'on'
    cleaner = 'on'
    number_tagger = 'off'
    # if stemmer is turned on after model does P2 training, then
    # you will need to use POS tag to detect nouns in products
    # then create new generator patterns for all.json
    # then run entity ruler again
    stemmer = 'off'

    patterns_file = 'nu_demo_mmat.jsonl'
    tender_file = 'demo_ners_descriptions_nonstock_test.csv'  # iesa descriptions
    output_file = 'demo_ners_output_nonstock.txt'
    write_type = 'w'
    # -------------------------------- //
    # ------------------------------ //

    # pick the model: untrained base language vs. previously saved model
    if model == 'pre':
        # load a language and invoke the entity ruler
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    elif model == 'post':
        nlp = spacy.load('model_entRuler')
    nlp.add_pipe(sentence_segmenter, after='tagger')

    # wire in the entity ruler when requested
    if ruler == 'on':
        # rem if model is post then the entity ruler is already in the model
        if model == 'pre':
            # load patterns from external file only if model is not already trained
            nu_ruler = EntityRuler(nlp).from_disk(patterns_file)
            # putting the ruler before ner will override ner decisions in favor of ruler patterns
            nlp.add_pipe(nu_ruler)  # , before='ner')
        # remember to swap precedence between ruler and ner after model training
        if model == 'post':
            # only add if the saved model does not already contain it
            if "entity_ruler" not in nlp.pipe_names:
                nu_ruler = EntityRuler(nlp).from_disk(patterns_file)
                # putting the ner before ruler will override/favor ner decisions
                nlp.add_pipe(nu_ruler)  # , before='ner')

    # show pipeline components:
    print(nlp.pipe_names)

    # load the tender and optionally normalize it
    tender = import_csv(tender_file)  # import
    if cleaner == 'on':
        tender = string_cleaner.clean_doc(tender)  # clean
    doc = nlp(tender)

    # CONSOLE OUTPUT
    print('\n')
    labels = ['MMAT']  # , 'PRODUCT', 'MPN', 'SKU']
    alt_labels = ['Mmat']  # , 'Product', 'MfrPartNo', 'SkuID']
    total_found = []
    total_unique_found = []
    for label in labels:
        print('Results for {} --------------'.format(label))
        found_count = 0
        seen = []
        for ent in doc.ents:
            if ent.label_ == label:
                if ent.text not in seen:
                    seen.append(ent.text)
                print([ent.text, ent.label_], end='')
                found_count += 1
        print('\nFound {} total, {} unique.\n'.format(found_count, len(seen)))
        total_found.append(found_count)
        total_unique_found.append(len(seen))

    # persist the model (entity pattern updates made by the entity ruler)
    if ruler == "on":
        output_dir = Path('model_entRuler')
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    # TEST -----------------------------
    mmats = []
    # print(doc)
    print('--------------------------')
    #for sent in doc.sents: print(sent)
    print('nu----------------')
    #for sent in doc.sents:
    #    print(sent)

    # stream MMAT codes to the output file, collapsing consecutive
    # duplicates; 'WRWX' acts as the record-separator pseudo-label
    with open(output_file, write_type) as outfile:
        last_text = ''
        prev_label = 'WRWX'
        for ent in doc.ents:
            if ent.label_ in ['MMAT', 'WRWX']:
                if ent.label_ == 'WRWX':
                    if prev_label == 'WRWX':
                        print('.')
                        outfile.write('.\n')
                    else:  # ie prev_label == 'MANUF'
                        #print('\n')
                        outfile.write('\n')
                    prev_label = 'WRWX'
                if ent.label_ == 'MMAT':
                    # remember the hit
                    mmats.append([ent.text])
                    last_text = ent.text
                    if prev_label == 'WRWX':
                        # echo to console and file
                        print(last_text.upper())
                        outfile.write(last_text.upper())
                        prev_label = 'MMAT'
                    elif prev_label == 'MMAT':
                        pass  # don't write again

    # DISPLACY VISUALIZER
    # assemble the summary block for the html doc
    results = ''
    for item, tot, unq in zip(alt_labels, total_found, total_unique_found):
        results = results + '{}: {} tot {} unq\n'.format(item, tot, unq)

    # store nlp object as string in html var
    spacer = '---------------------------------------------------------\n'
    header = 'IESA Named Entities Found in Tender\n'
    doc = nlp(header + spacer + results + spacer + tender)
    html = displacy.render(doc, style="ent", page=True)  # use the entity visualizer

    # write the html string to the xampp folder and launch in browser through localhost port
    with open('C:/Users/stacy/My Localhost/index.html', 'w') as data:
        data.write(html)

    print('\n' + results)

    # end program
    print('Done.')
def main():
    """NERS Demo, IESA Sample Data.

    Tags short descriptions with MMAT entities, prints per-label and
    per-sentence results, writes de-duplicated codes to a text file, and
    renders a displaCy HTML page.

    Fixes: the `elif prev_label == 'MMAT':` branch previously carried a
    large triple-quoted string of dead supplier-handling code as its body —
    replaced with an explicit `pass` and the dead code removed;
    `== False` comparisons replaced with `not`; manual index bookkeeping
    replaced with `len`/`zip`.
    """
    # CONFIG ------------------------
    model = 'pre'  # pre -> use non-trained model / post -> use trained model
    ruler = 'on'
    cleaner = 'on'
    number_tagger = 'off'
    # if stemmer is turned on after model does P2 training, then
    # you will need to use POS tag to detect nouns in products
    # then create new generator patterns for all.json
    # then run entity ruler again
    stemmer = 'off'

    patterns_file = 'demo_ners_patterns_mmat.jsonl'
    tender_file = 'demo_short_descriptions_mmat.txt'  # 'iesa_long_descriptions_39468.csv'
    output_file = 'demo_shortd_mmat_output.txt'
    write_type = 'w'
    # --------------------------------

    # load model
    if model == 'pre':
        # load a language and invoke the entity ruler
        nlp = spacy.load('en_core_web_sm', disable=['parser'])
    elif model == 'post':
        nlp = spacy.load('model_entRuler')
    nlp.add_pipe(sentence_segmenter, before='ner')

    # add pipes
    if ruler == 'on':
        # rem if model is post then the entity ruler is already in the model
        if model == 'pre':
            # load patterns from external file only if model is not already trained
            nu_ruler = EntityRuler(nlp).from_disk(patterns_file)
            # putting the ruler before ner will override ner decisions in favor of ruler patterns
            nlp.add_pipe(nu_ruler, before='ner')
        # remember to swap precedence between ruler and ner after model training
        if model == 'post':
            # load patterns from external file only if model is not already trained
            if "entity_ruler" not in nlp.pipe_names:
                nu_ruler = EntityRuler(nlp).from_disk(patterns_file)
                # putting the ner before ruler will override/favor ner decisions
                nlp.add_pipe(nu_ruler, before='ner')

    # write tagger into pipeline in the meta json file
    # STOPPED HERE ------------------------

    # show pipeline components:
    print(nlp.pipe_names)

    # import test tender and clean it up
    tender = import_csv(tender_file)  # import
    if cleaner == 'on':
        tender = string_cleaner.clean_doc(tender)  # clean
    #print(tender)
    doc = nlp(tender)

    # CONSOLE OUTPUT
    print('\n')
    labels = ['MMAT']  # , 'PRODUCT', 'MPN', 'SKU']
    alt_labels = ['mMat']  # , 'Product', 'MfrPartNo', 'SkuID']
    total_found = []
    total_unique_found = []
    for label in labels:
        print('Results for {} --------------'.format(label))
        tot_num = 0
        unique = []
        for ent in doc.ents:
            if ent.label_ == label:
                if ent.text not in unique:
                    unique.append(ent.text)
                print([ent.text, ent.label_], end='')
                tot_num += 1
        print('\nFound {} total, {} unique.\n'.format(tot_num, len(unique)))
        total_found.append(tot_num)
        total_unique_found.append(len(unique))

    # save model with entity pattern updates made by the entity ruler
    if ruler == "on":
        output_dir = Path('model_entRuler')
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    # TEST -----------------------------
    # This technique allows you to isolate entities on
    # a sentence-by-sentence basis, which will allow
    # for matching entities on a record-by-record basis
    ent_exists = False
    for sent in doc.sents:
        for ent in sent.ents:
            if ent.label_ == 'MMAT':
                print(ent.label_, ': ', ent.text)
                ent_exists = True
        if not ent_exists:
            print('None')
            print(sent)
        ent_exists = False
        print('\n')

    # -------------------------------------
    # write MMAT codes to the output file, collapsing consecutive
    # duplicates; 'AJAX' acts as the record-separator pseudo-label
    mmats = []
    with open(output_file, write_type) as outfile:
        s = ''
        prev_label = 'AJAX'
        for ent in doc.ents:
            if ent.label_ in ['MMAT', 'AJAX']:
                if ent.label_ == 'AJAX':
                    if prev_label == 'AJAX':
                        print('.')
                        outfile.write('.\n')
                    else:  # ie prev_label == 'MMAT'
                        #print('\n')
                        outfile.write('\n')
                    prev_label = 'AJAX'
                if ent.label_ == 'MMAT':
                    # write to mmats[]
                    mmats.append([ent.text])
                    s = ent.text
                    if prev_label == 'AJAX':
                        # write to console
                        print(s.upper())
                        # write to outfile
                        outfile.write(s.upper())
                        prev_label = 'MMAT'
                    elif prev_label == 'MMAT':
                        # duplicate within the same run: deliberately skipped
                        # (dead alternate-write/supplier code removed)
                        pass

    # DISPLACY VISUALIZER
    # get results for html doc
    results = ''
    for item, tot, unq in zip(alt_labels, total_found, total_unique_found):
        results = results + '{}: {} tot {} unq\n'.format(item, tot, unq)

    # store nlp object as string in html var
    spacer = '---------------------------------------------------------\n'
    header = 'Named Entities Resolution for Tender Data\n'
    doc = nlp(header + spacer + results + spacer + tender)
    html = displacy.render(doc, style="ent", page=True)  # use the entity visualizer

    # write the html string to the xampp folder and launch in browser through localhost port
    with open('C:/xampp/htdocs/mySites/wrWx_NERS/index.html', 'w') as data:
        data.write(html)

    print('\n' + results)

    # TEST -----------------------------
    # end program
    print('Done.')
def main():
    """Rule-based Matcher demo.

    Runs two toy token patterns over a sample sentence, then loads and
    cleans the mini tender and ERP CSVs and dumps their sentences,
    entities and non-stop tokens to the console.
    """
    global row_heads

    # Load the pre-trained model that already contains the entity ruler.
    nlp = spacy.load('model')

    # Insert the custom sentence segmenter ahead of the entity ruler.
    nlp.add_pipe(sentence_segmenter, before='entity_ruler')
    #tagger = nlp.create_pipe("tagger")
    #tagger = Tagger(nlp.vocab)
    #nlp.add_pipe(tagger, after='sentence_segmenter')

    # The Matcher must share its vocab with the documents it operates on.
    matcher = Matcher(nlp.vocab)

    print(nlp.pipe_names)  # show pipeline components

    # Register two token patterns under one rule ID, with no on_match
    # callback (a callback would be invoked on each successful match).
    # Each dict in a pattern describes exactly one token.
    hello_punct_world = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
    hello_world = [{"LOWER": "hello"}, {"LOWER": "world"}]
    matcher.add("HelloWorld", None, hello_punct_world, hello_world)

    toy_doc = nlp(u"Hello, world! Hello! Hello world! Hello, there, world!")
    for rule_hash, first, last in matcher(toy_doc):
        rule_name = nlp.vocab.strings[rule_hash]  # readable rule ID
        hit = toy_doc[first:last]                 # the matched span
        print(rule_hash, rule_name, first, last, hit.text)

    # Load and clean the two sample files, echoing row_heads after each
    # load (presumably refreshed by loader.load_doc — confirm there).
    tender = nlp(
        string_cleaner.clean_doc(loader.load_doc('matcher/mini_tender.csv')))
    print('Tender row_heads: ', row_heads)
    erp = nlp(
        string_cleaner.clean_doc(loader.load_doc('matcher/mini_erp.csv')))
    print('ERP row_heads: ', row_heads)

    print('\nERP: -------------------\\')
    print(erp)
    print('\nTENDER: -------------------\\')
    print(tender)

    print('\nERP SENTS: -------------------')
    for sentence in erp.sents:
        print(sentence.text, end='')
    print('\nTENDER SENTS: -------------------')
    for sentence in tender.sents:
        print(sentence.text, end='')

    print('\nTENDER ENTS: -------------------')
    for sentence in tender.sents:
        for entity in sentence.ents:
            print(entity.text, entity.label_)

    print('\nTENDER TOKENS: -------------------')
    for sentence in tender.sents:
        for tok in sentence:
            if not tok.is_stop:
                print(tok.text, tok.ent_type_)

    # By default the matcher only returns matches and nothing else.
    # To merge entities, assign labels, or similar, pass a callback as
    # the on_match argument to add(), ie:
    # matcher.add("StringID", myCallBack, pattern)

    # end program
    print('\n\nDone.')
def main():
    """NERS Demo — IESA Sample Data.

    Loads a spaCy model (optionally extended with an EntityRuler pattern
    file), runs it over a tender CSV, prints per-label entity totals and
    unique counts, saves the updated model to disk, buckets the found
    entities by label, and renders the result to an HTML page via the
    displaCy entity visualizer.
    """
    # CONFIG ------------------------
    model = 'pre'           # pre -> use non-trained model / post -> use trained model
    ruler = 'on'            # attach the EntityRuler pipe
    cleaner = 'on'          # run string_cleaner over the imported tender
    number_tagger = 'off'   # placeholder flag, not read below
    # If the stemmer is turned on after the model does P2 training, POS
    # tags will be needed to detect nouns in products, new generator
    # patterns must be created for all.json, and the entity ruler re-run.
    stemmer = 'off'         # placeholder flag, not read below
    patterns_file = 'iesa_ners_patterns_supplier.jsonl'
    tender_file = 'iesa_short_descriptions_39468.csv'
    # --------------------------------

    # Load model.
    if model == 'pre':
        # Fresh language model; the parser is not needed here.
        nlp = spacy.load('en_core_web_sm', disable=['parser'])
    elif model == 'post':
        nlp = spacy.load('model_entRuler')
    else:
        # Fail fast on a bad config value instead of hitting an
        # UnboundLocalError on `nlp` further down.
        raise ValueError('unknown model setting: {!r}'.format(model))

    # Add pipes. Rem: if model is 'post' the entity ruler may already be
    # baked into the saved model.
    if ruler == 'on':
        if model == 'pre':
            # Load patterns from the external file; putting the ruler
            # before ner overrides ner decisions in favor of ruler
            # patterns. Remember to swap precedence between ruler and
            # ner after model training.
            nu_ruler = EntityRuler(nlp).from_disk(patterns_file)
            nlp.add_pipe(nu_ruler, before='ner')
        elif model == 'post' and 'entity_ruler' not in nlp.pipe_names:
            # Trained model: putting ner before the ruler favors ner
            # decisions.
            nu_ruler = EntityRuler(nlp).from_disk(patterns_file)
            nlp.add_pipe(nu_ruler, after='ner')

    # show pipeline components:
    print(nlp.pipe_names)

    # Import the test tender and optionally clean it up.
    tender = import_csv(tender_file)
    if cleaner == 'on':
        tender = string_cleaner.clean_doc(tender)
    doc = nlp(tender)

    # CONSOLE OUTPUT: per-label totals and unique counts.
    print('\n')
    labels = ['SUPPLIER', 'PRODUCT', 'MPN', 'SKU']
    alt_labels = ['Supplier', 'Product', 'MfrPartNo', 'SkuID']
    total_found = []
    total_unique_found = []
    for label in labels:
        print('Results for {} --------------'.format(label))
        tot_num = 0
        seen = set()  # set membership is O(1) vs. the previous list scan
        for ent in doc.ents:
            if ent.label_ == label:
                seen.add(ent.text)
                print([ent.text, ent.label_], end='')
                tot_num += 1
        unique_num = len(seen)
        print('\nFound {} total, {} unique.\n'.format(tot_num, unique_num))
        total_found.append(tot_num)
        total_unique_found.append(unique_num)

    # Save model with entity pattern updates made by the entity ruler.
    if ruler == 'on':
        output_dir = Path('model_entRuler')
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    # Bucket every recognised entity by label (dict dispatch replaces
    # the previous if/elif chain).
    suppliers = []
    products = []
    skus = []
    mpns = []
    buckets = {'SUPPLIER': suppliers, 'PRODUCT': products,
               'SKU': skus, 'MPN': mpns}
    for ent in doc.ents:
        if ent.label_ in buckets:
            buckets[ent.label_].append([ent.label_, ent.text])
    for bucket in (suppliers, products, mpns, skus):
        print('--------------------------')
        for entry in bucket:
            print(entry)

    # DISPLACY VISUALIZER: build the per-label summary, then render the
    # whole annotated document to HTML.
    results = ''.join(
        '{}: {} tot {} unq\n'.format(item, tot, unq)
        for item, tot, unq in zip(alt_labels, total_found, total_unique_found))
    spacer = '---------------------------------------------------------\n'
    header = 'IESA Named Entities Found in Tender\n'
    doc = nlp(header + spacer + results + spacer + tender)
    html = displacy.render(doc, style="ent", page=True)  # use the entity visualizer
    # Write the html string to the xampp folder so it can be launched in
    # a browser through the localhost port (hard-coded deployment path).
    with open('C:/xampp/htdocs/mySites/wrWx_NERS/index.html', 'w') as data:
        data.write(html)
    print('\n' + results)

    # end program
    print('Done.')