import spacy


def main():
    global s1
    global s2

    # Get the English language model
    # and remove the dependency-parsing pipeline component.
    nlp = spacy.load('en_core_web_sm', disable=['parser'])

    # Add custom pipe components to create the following pipeline:
    # tokenizer -> tagger -> custom_sentencizer -> ner -> common_key_tagger
    # Consider adding: entity_ruler, merge_noun_chunks
    # https://spacy.io/usage/processing-pipelines/
    nlp.add_pipe(custom_sentencizer, before="ner")  # insert before the ner component
    nlp.add_pipe(common_key_tagger, name="common_key_tagger", last=True)
    print(nlp.pipe_names)

    # Add the sentencizer component to the pipeline.
    # Rem: this component splits sentences on punctuation such as . ! ?
    # Plugging it into the pipeline gives just the sentence boundaries
    # without the dependency parse.
    # sentencizer = nlp.create_pipe("sentencizer")
    # nlp.add_pipe(sentencizer)

    st1 = preprocessor.string_cleaner(s1)
    st2 = preprocessor.string_cleaner(s2)
    print(st1)
    print('\n\n')

    row1 = nlp(st1)
    row2 = nlp(st2)
    print(row1.text)

    # Print sentence segmentation:
    print('\nshow sentence segmentation\n')
    for sent in row1.sents:
        print(sent.text)

    # Print token attributes:
    print('\nshow token attributes:\n')
    for token in row1:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
              token.shape_, token.is_alpha, token.is_stop)

    # Print entity attributes:
    print('\nshow entity attributes:\n')
    for ent in row1.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
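# The custom_sentencizer added to the pipeline above is not shown in this file.
# A minimal sketch of what such a spaCy v2-style function component could look
# like, judging from the comments (an assumption, not the project's actual
# implementation): mark a sentence start after each terminal punctuation token,
# so sentence boundaries are available without the dependency parse.
def custom_sentencizer(doc):
    for i, token in enumerate(doc[:-1]):
        # Begin a new sentence right after ., !, or ?
        doc[i + 1].is_sent_start = token.text in ('.', '!', '?')
    return doc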
import csv

import spacy
from spacy.tokens import Span


def main():
    # Get the English language model.
    nlp = spacy.load('en_core_web_sm')  # , disable=['parser'])

    # Want to add custom pipe components to create the following pipeline:
    # tokenizer -> custom_sentencizer -> ner ->
    #     custom_colname_tagger -> common_key_tagger
    # Consider adding: entity_ruler, merge_noun_chunks
    # (https://spacy.io/usage/processing-pipelines/)
    nlp.add_pipe(custom_sentencizer, before="tagger")  # insert before the tagger
    # nlp.add_pipe(field_val_tagger, name="field_val_tagger", last=True)
    nlp.add_pipe(common_key_tagger, name="common_key_tagger", last=True)
    print(nlp.pipe_names)  # test print

    # Get the product group data file.
    txt_obj = ''
    with open('../store/model/erp10/pumps/prod_pumps_erp10.csv') as data:
        data = csv.reader(data, delimiter='|')
        i = 0
        header = []
        global product_ids
        for row in data:
            if i == 0:
                header.append(row)
            else:
                # Rem: string_cleaner normalizes all text in the text object,
                # so all metadata must also be normalized to enable matching.
                product_id = row[0].lower()
                product_ids.append(product_id)
            # Create the text object with a '.' at the end of each record.
            if i != 0:
                txt_obj = txt_obj + ' '.join(row) + '.\n'
            i += 1
        # print('\n\n# of sents: ', i)  # test print

    # test print
    print('\n\ncontents of header and product_ids[]:\n')
    print(header)
    print(product_ids)

    # Clean the text object.
    txt_obj = preprocessor.string_cleaner(txt_obj)

    # Create the NLP object ----------------------
    # print('\n\n', txt_obj)  # test print
    nlp_obj = nlp(txt_obj)

    print('\n\n')  # test print
    i = 0
    for sent in nlp_obj.sents:
        print(sent.text, '**end**')
        i += 1

    # --------------------------------------
    # testing: setting entity annotations
    # --------------------------------------
    '''
    # find token positions
    print('\n\nfind token positions:\n')
    for token in nlp_obj:
        print(token.text, token.i)

    PRODUCT = nlp_obj.vocab.strings[u"PRODUCT"]  # get hash value of entity label
    igp = Span(nlp_obj, 1, 4, label=PRODUCT)     # create a Span for the new entity
    nlp_obj.ents = list(nlp_obj.ents) + [igp]
    '''

    # STOPPED HERE
    PRODUCTID = nlp_obj.vocab.strings[u"PRODUCTID"]
    for token in nlp_obj:
        for id in product_ids:
            if token.text == id:
                start = token.i
                end = token.i + 1  # one-token span; token.nbor() fails on the last token
                print('token: {}, token start: {}, token end: {}'.format(
                    token.text, start, end))
                pid = Span(nlp_obj, start, end, label=PRODUCTID)
                nlp_obj.ents = list(nlp_obj.ents) + [pid]
    # print('found {} product ids.\n'.format(found))

    print('\nAfter')
    ents = [(e.text, e.start_char, e.end_char, e.label_) for e in nlp_obj.ents]
    for e in nlp_obj.ents:
        print(e.text, e.label_)
    print('\n\n')
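# Assigning to Doc.ents token-by-token, as in the PRODUCTID loop above, raises
# a ValueError as soon as two candidate spans overlap or duplicate an existing
# entity. A hedged alternative sketch: collect the candidate spans first, drop
# overlaps with spacy.util.filter_spans (available since spaCy v2.1.4), and
# assign once. tag_product_ids is a hypothetical helper name, not part of this
# project.
from spacy.tokens import Span
from spacy.util import filter_spans


def tag_product_ids(doc, ids, label="PRODUCTID"):
    wanted = set(ids)
    # One-token spans for every token that matches a known product ID.
    spans = [Span(doc, t.i, t.i + 1, label=label) for t in doc if t.text in wanted]
    # filter_spans keeps the longest non-overlapping spans, preferring earlier ones.
    doc.ents = filter_spans(list(doc.ents) + spans)
    return doc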
import csv

from spacy.lang.en import English


def main():
    # Get just the language, with no model.
    nlp = English()
    # nlp = spacy.load('en_core_web_sm')

    # Add the sentencizer component to the pipeline.
    # Rem: this component splits sentences on punctuation such as . ! ?
    # Plugging it into the pipeline gives just the sentence boundaries
    # without the dependency parse.
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)

    '''
    Fails with: Model for component 'ner' not initialized. Did you forget to
    load a model, or forget to call begin_training()?
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    '''

    # Get the product group data file and feed the info into arrays
    # that will be used later to create custom tags for the nlp object.
    txt_obj = ''
    with open('../store/model/erp10/pumps/prod_pumps_erp10.csv') as data:
        data = csv.reader(data, delimiter='|')
        headers = []
        productIDs = []
        products = []
        suppliers = []
        mpns = []

        # TEST print -----------------------
        # print('contents of arrays for tagging:\n')
        testList = [headers, productIDs, products, suppliers, mpns]

        i = 0
        for row in data:
            if i == 0:
                headers.append(row)
            else:
                productID = row[0]
                product = row[1]
                supplier = row[2]
                mpn = row[3]
                productIDs.append(productID)
                products.append(product)
                suppliers.append(supplier)
                mpns.append(mpn)
            # Create the text object.
            # Rem: add a period at the end so that the spaCy sentencizer
            # knows how to detect the end of each record,
            # and add all rows to the text object except for the header row.
            if i != 0:
                txt_obj = txt_obj + ' '.join(row) + '.\n'
            i += 1

    # TEST print -----------------------
    print('testList items:\n')
    for item in testList:
        print(item)

    # Clean the text object.
    txt_obj = preprocessor.string_cleaner(txt_obj)

    # TEST PRINT -----------------------
    print('\n\ntxt_obj after cleaning:\n', txt_obj)

    # Create the nlp object:
    pumps_erp10 = nlp(txt_obj)

    # TEST print -----------------------
    print('\n\npumps_erp10 after sentencizer:\n')
    for sent in pumps_erp10.sents:
        print(sent.text, '**end row**', end='')

    # TEST print -----------------------
    print('\n\ntoken.like_num in nlp obj:\n')
    for token in pumps_erp10:
        print(token.like_num, ',', end='')

    # Stuff we get:
    # token, .text, .i, .idx, .tag_, .lemma_,
    # .is_punct, .is_space, .like_num
    print('\nDone.')

    # Stuff we don't get:
    # pos, ent, chunking,
    # LU
    # textcat (TextCategorizer, Doc.cats)
    # custom components (Doc._.xxx, Token._.xxx, Span._.xxx)
    # create_pipe, add_pipe

    # TEST print -----------------------
    print('\n', nlp.pipeline)
    print('\n', nlp.pipe_names)
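# preprocessor.string_cleaner is called throughout these experiments but
# defined elsewhere in the project. A hypothetical sketch of what it might do,
# judging from the observations below (unclean text collapses into a single
# sentence): lower-case the text and strip characters that confuse the
# sentencizer. This is an assumption about the real helper, not its source.
import re


def string_cleaner(text):
    text = text.lower()
    # Keep word characters, whitespace, and the sentence-final periods added
    # when building txt_obj; replace everything else with a space.
    text = re.sub(r'[^\w\s.]', ' ', text)
    # Collapse the runs of spaces and tabs introduced by the substitution.
    return re.sub(r'[ \t]+', ' ', text)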
import csv

import spacy
from spacy.tokens import Span


def main():
    # Get the English language model
    # and remove the dependency-parsing pipeline component.
    nlp = spacy.load('en_core_web_sm', disable=['parser'])

    # Want to add custom pipe components to create the following pipeline:
    # tokenizer -> tagger -> custom_sentencizer -> ner -> common_key_tagger
    # Consider adding: entity_ruler, merge_noun_chunks
    # (https://spacy.io/usage/processing-pipelines/)
    nlp.add_pipe(custom_sentencizer, before="ner")  # insert before the ner component
    nlp.add_pipe(common_key_tagger, name="common_key_tagger", last=True)
    print(nlp.pipe_names)

    # Get the product group data file.
    txt_obj = ''
    with open('../store/model/erp10/pumps/prod_pumps_erp10.csv') as data:
        data = csv.reader(data, delimiter='|')
        i = 0
        for row in data:
            # Create the text object with a '.' at the end of each record.
            if i != 0:
                txt_obj = txt_obj + ' '.join(row) + '.\n'
            i += 1

    print('\n\n', txt_obj)
    nlp_obj = nlp(txt_obj)

    print('\n\n')
    i = 0
    for sent in nlp_obj.sents:
        print(sent.text, '**end**')
        i += 1
    print('\n\n# of sents: ', i)

    # Observations:
    # If you don't clean the txt_obj, spaCy sees all records as one sentence.

    # Clean the text object.
    txt_obj = preprocessor.string_cleaner(txt_obj)

    # TEST PRINT -----------------------
    print('\n\ntxt_obj after cleaning:\n', txt_obj)

    # Create the nlp object:
    nlp_obj_cln = nlp(txt_obj)

    # TEST print -----------------------
    print('\n\ntxt_obj_cln:\n')
    i = 0
    for sent in nlp_obj_cln.sents:
        print(sent.text, '**end**')
        i += 1
    print('\n\n# of sents: ', i)

    # --------------------------------------
    # testing: setting entity annotations
    # --------------------------------------
    ents = [(e.text, e.start_char, e.end_char, e.label_) for e in nlp_obj_cln.ents]
    print('\nBefore', ents)
    # The model didn't recognise "FB" as an entity :(

    # Find token positions.
    print('\n\nfind token positions:\n')
    for token in nlp_obj_cln:
        print(token.text, token.i)

    PRODUCT = nlp_obj_cln.vocab.strings[u"PRODUCT"]  # get hash value of entity label
    igp = Span(nlp_obj_cln, 1, 4, label=PRODUCT)     # create a Span for the new entity
    nlp_obj_cln.ents = list(nlp_obj_cln.ents) + [igp]

    ents = [(e.text, e.start_char, e.end_char, e.label_) for e in nlp_obj_cln.ents]
    print('\nAfter', ents)

    for e in nlp_obj_cln.ents:
        if e.text == 'internal gear pump':
            print(e.text, ' is a ', e.label_)
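# The Span above is built from hard-coded token offsets (1, 4), which breaks
# as soon as the cleaned text changes. A sketch of a less brittle variant
# using Doc.char_span; label_phrase is a hypothetical helper, and this assumes
# the phrase occurs at most once in the text.
def label_phrase(doc, phrase, label):
    start = doc.text.find(phrase)
    if start == -1:
        return None  # phrase not present in the text
    span = doc.char_span(start, start + len(phrase), label=label)
    if span is not None:  # char_span returns None if offsets don't align with token bounds
        doc.ents = list(doc.ents) + [span]
    return span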
import csv

import spacy


def main():
    nlp = spacy.load('en_core_web_sm')

    # Import csv -- first pass.
    # Get the product group data file and feed the info into arrays
    # that will be used later to create custom tags for the nlp object.
    with open('../store/model/erp10/pumps/prod_pumps_erp10.csv') as data:
        data = csv.reader(data, delimiter='|')
        headers = []
        productIDs = []
        products = []
        suppliers = []
        mpns = []

        # TEST list
        testList = [headers, productIDs, products, suppliers, mpns]

        i = 0
        for row in data:
            if i == 0:
                headers.append(row)
            else:
                productID = row[0]
                product = row[1]
                supplier = row[2]
                mpn = row[3]
                productIDs.append(productID)
                products.append(product)
                suppliers.append(supplier)
                mpns.append(mpn)
            i += 1

    # TEST print
    for item in testList:
        print(item)

    # Import csv -- second pass.
    # Get the product group data file and feed the info into a text object.
    csv_path = '../store/model/erp10/pumps/prod_pumps_erp10.csv'
    txt_obj = '.'
    with open(csv_path) as data:
        reader = csv.reader(data, delimiter=' ', quotechar=' ',
                            quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            txt_obj = txt_obj + '\'' + ' '.join(row) + '\'' + '\n'

    # Clean the text object.
    txt_obj = preprocessor.string_cleaner(txt_obj)
    print(txt_obj)  # TEST

    # Write txt_obj to a txt file.
    txt_path = '../store/model/erp10/pumps/prod_pumps_erp10.txt'
    with open(txt_path, 'wt') as outfile:
        outfile.write(txt_obj)  # the with block flushes and closes the file

    # Read the file back into a text object.
    txt_obj = ''
    txt_obj = loader.import_txt(txt_path)
    # txt_obj = preprocessor.remove_whitespace(txt_obj)
    print(txt_obj)  # TEST

    # Create the nlp object:
    pumps_erp10 = nlp(txt_obj)
    for sent in pumps_erp10.sents:
        print(sent.text)

    print('Done.')
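# loader.import_txt is defined elsewhere in the project. A minimal sketch of
# the behaviour the write/read round-trip above relies on (an assumption about
# the real helper, not its source): read the whole file back into one string.
def import_txt(path):
    with open(path, encoding='utf-8') as f:
        return f.read()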
import csv

import spacy


def main():
    # Get the English language model
    # (the dependency-parsing pipeline component is left enabled here).
    nlp = spacy.load('en_core_web_sm')  # , disable=['parser'])

    # Want to add custom pipe components to create the following pipeline:
    # tokenizer -> tagger -> custom_sentencizer -> ner -> common_key_tagger
    # Consider adding: entity_ruler, merge_noun_chunks
    # (https://spacy.io/usage/processing-pipelines/)
    nlp.add_pipe(custom_sentencizer, before="tagger")  # insert before the tagger
    nlp.add_pipe(field_val_tagger, name="field_val_tagger", last=True)
    nlp.add_pipe(common_key_tagger, name="common_key_tagger", last=True)
    print(nlp.pipe_names)  # test print

    # Get the product group data file.
    txt_obj = ''
    with open('../store/model/erp10/pumps/prod_pumps_erp10.csv') as data:
        data = csv.reader(data, delimiter='|')
        i = 0
        header = []
        global product_ids
        for row in data:
            if i == 0:
                header.append(row)
            else:
                product_id = row[0]
                product_ids.append(product_id)
            # Create the text object with a '.' at the end of each record.
            if i != 0:
                txt_obj = txt_obj + ' '.join(row) + '.\n'
            i += 1
        # print('\n\n# of sents: ', i)  # test print

    # test print
    print('\n\ncontents of header and product_ids[]:\n')
    print(header)
    print(product_ids)

    # Clean the text object.
    txt_obj = preprocessor.string_cleaner(txt_obj)

    # Create the NLP object ----------------------
    # print('\n\n', txt_obj)  # test print
    nlp_obj = nlp(txt_obj)

    print('\n\n')  # test print
    i = 0
    for sent in nlp_obj.sents:
        print(sent.text, '**end**')
        i += 1

    # Observations:
    # If you don't clean the txt_obj, spaCy sees all records as one sentence.

    # TEST PRINT -----------------------
    # print('\n\ntxt_obj after cleaning:\n', txt_obj)  # test print

    print('\n\ntxt_obj:')
    i = 0
    for sent in nlp_obj.sents:
        print(sent.text, '**end**\n\n')
        i += 1

    # TEST print text attributes -----------------------
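# common_key_tagger and field_val_tagger are added to the pipeline above but
# not shown in this file. A hedged sketch of the shape such a spaCy v2-style
# component might take; the "common_keys" Doc extension is an assumption for
# illustration, not this project's actual attribute.
from spacy.tokens import Doc

if not Doc.has_extension('common_keys'):
    Doc.set_extension('common_keys', default=[])


def common_key_tagger(doc):
    # Record tokens that match the product IDs loaded from the CSV above.
    doc._.common_keys = [t.text for t in doc if t.text in product_ids]
    return doc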