import spacy


def main():
    global s1
    global s2

    # Get the English language model
    # and remove the dependency-parsing pipeline component.
    nlp = spacy.load('en_core_web_sm', disable=['parser'])

    # Add custom pipe components to create the following pipeline:
    # tokenizer -> tagger -> custom_sentencizer -> ner -> common_key_tagger
    # Consider adding: entity_ruler, merge_noun_chunks
    # https://spacy.io/usage/processing-pipelines/
    nlp.add_pipe(custom_sentencizer, before="ner")  # insert before the ner component
    nlp.add_pipe(common_key_tagger, name="common_key_tagger", last=True)
    print(nlp.pipe_names)

    # Add the sentencizer component to the pipeline.
    # Rem: this component splits sentences on punctuation such as . ! ?
    # Plugging it into the pipeline gives just the sentence boundaries
    # without the dependency parse.
    # sentencizer = nlp.create_pipe("sentencizer")
    # nlp.add_pipe(sentencizer)

    st1 = preprocessor.string_cleaner(s1)
    st2 = preprocessor.string_cleaner(s2)
    print(st1)
    print('\n\n')

    row1 = nlp(st1)
    row2 = nlp(st2)
    print(row1.text)

    # Print sentence segmentation:
    print('\nshow sentence segmentation\n')
    for sent in row1.sents:
        print(sent.text)

    # Print token attributes:
    print('\nshow token attributes:\n')
    for token in row1:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
              token.shape_, token.is_alpha, token.is_stop)

    # Print entity attributes:
    print('\nshow entity attributes:\n')
    for ent in row1.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
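# The custom_sentencizer added to the pipeline above is not shown in this file.
# A minimal sketch of what such a spaCy v2-style function component could look
# like, judging from the comments (an assumption, not the project's actual
# implementation): mark a sentence start after each terminal punctuation token,
# so sentence boundaries are available without the dependency parse.
def custom_sentencizer(doc):
    for i, token in enumerate(doc[:-1]):
        # Begin a new sentence right after ., !, or ?
        doc[i + 1].is_sent_start = token.text in ('.', '!', '?')
    return doc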
import csv

import spacy
from spacy.tokens import Span


def main():
    # Get the English language model.
    nlp = spacy.load('en_core_web_sm')  # , disable=['parser'])

    # Want to add custom pipe components to create the following pipeline:
    # tokenizer -> custom_sentencizer -> ner ->
    #     custom_colname_tagger -> common_key_tagger
    # Consider adding: entity_ruler, merge_noun_chunks
    # (https://spacy.io/usage/processing-pipelines/)
    nlp.add_pipe(custom_sentencizer, before="tagger")  # insert before the tagger
    # nlp.add_pipe(field_val_tagger, name="field_val_tagger", last=True)
    nlp.add_pipe(common_key_tagger, name="common_key_tagger", last=True)
    print(nlp.pipe_names)  # test print

    # Get the product group data file.
    txt_obj = ''
    with open('../store/model/erp10/pumps/prod_pumps_erp10.csv') as data:
        data = csv.reader(data, delimiter='|')
        i = 0
        header = []
        global product_ids
        for row in data:
            if i == 0:
                header.append(row)
            else:
                # Rem: string_cleaner normalizes all text in the text object,
                # so all metadata must also be normalized to enable matching.
                product_id = row[0].lower()
                product_ids.append(product_id)
            # Create the text object with a '.' at the end of each record.
            if i != 0:
                txt_obj = txt_obj + ' '.join(row) + '.\n'
            i += 1
        # print('\n\n# of sents: ', i)  # test print

    # test print
    print('\n\ncontents of header and product_ids[]:\n')
    print(header)
    print(product_ids)

    # Clean the text object.
    txt_obj = preprocessor.string_cleaner(txt_obj)

    # Create the NLP object ----------------------
    # print('\n\n', txt_obj)  # test print
    nlp_obj = nlp(txt_obj)

    print('\n\n')  # test print
    i = 0
    for sent in nlp_obj.sents:
        print(sent.text, '**end**')
        i += 1

    # --------------------------------------
    # testing: setting entity annotations
    # --------------------------------------
    '''
    # find token positions
    print('\n\nfind token positions:\n')
    for token in nlp_obj:
        print(token.text, token.i)

    PRODUCT = nlp_obj.vocab.strings[u"PRODUCT"]  # get hash value of entity label
    igp = Span(nlp_obj, 1, 4, label=PRODUCT)     # create a Span for the new entity
    nlp_obj.ents = list(nlp_obj.ents) + [igp]
    '''

    # STOPPED HERE
    PRODUCTID = nlp_obj.vocab.strings[u"PRODUCTID"]
    for token in nlp_obj:
        for id in product_ids:
            if token.text == id:
                start = token.i
                end = token.i + 1  # one-token span; token.nbor() fails on the last token
                print('token: {}, token start: {}, token end: {}'.format(
                    token.text, start, end))
                pid = Span(nlp_obj, start, end, label=PRODUCTID)
                nlp_obj.ents = list(nlp_obj.ents) + [pid]
    # print('found {} product ids.\n'.format(found))

    print('\nAfter')
    ents = [(e.text, e.start_char, e.end_char, e.label_) for e in nlp_obj.ents]
    for e in nlp_obj.ents:
        print(e.text, e.label_)
    print('\n\n')
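# Assigning to Doc.ents token-by-token, as in the PRODUCTID loop above, raises
# a ValueError as soon as two candidate spans overlap or duplicate an existing
# entity. A hedged alternative sketch: collect the candidate spans first, drop
# overlaps with spacy.util.filter_spans (available since spaCy v2.1.4), and
# assign once. tag_product_ids is a hypothetical helper name, not part of this
# project.
from spacy.tokens import Span
from spacy.util import filter_spans


def tag_product_ids(doc, ids, label="PRODUCTID"):
    wanted = set(ids)
    # One-token spans for every token that matches a known product ID.
    spans = [Span(doc, t.i, t.i + 1, label=label) for t in doc if t.text in wanted]
    # filter_spans keeps the longest non-overlapping spans, preferring earlier ones.
    doc.ents = filter_spans(list(doc.ents) + spans)
    return doc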
import csv

from spacy.lang.en import English


def main():
    # Get just the language, with no model.
    nlp = English()
    # nlp = spacy.load('en_core_web_sm')

    # Add the sentencizer component to the pipeline.
    # Rem: this component splits sentences on punctuation such as . ! ?
    # Plugging it into the pipeline gives just the sentence boundaries
    # without the dependency parse.
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)

    '''
    Fails with: Model for component 'ner' not initialized. Did you forget to
    load a model, or forget to call begin_training()?
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    '''

    # Get the product group data file and feed the info into arrays
    # that will be used later to create custom tags for the nlp object.
    txt_obj = ''
    with open('../store/model/erp10/pumps/prod_pumps_erp10.csv') as data:
        data = csv.reader(data, delimiter='|')
        headers = []
        productIDs = []
        products = []
        suppliers = []
        mpns = []

        # TEST print -----------------------
        # print('contents of arrays for tagging:\n')
        testList = [headers, productIDs, products, suppliers, mpns]

        i = 0
        for row in data:
            if i == 0:
                headers.append(row)
            else:
                productID = row[0]
                product = row[1]
                supplier = row[2]
                mpn = row[3]
                productIDs.append(productID)
                products.append(product)
                suppliers.append(supplier)
                mpns.append(mpn)
            # Create the text object.
            # Rem: add a period at the end so that the spaCy sentencizer
            # knows how to detect the end of each record,
            # and add all rows to the text object except for the header row.
            if i != 0:
                txt_obj = txt_obj + ' '.join(row) + '.\n'
            i += 1

    # TEST print -----------------------
    print('testList items:\n')
    for item in testList:
        print(item)

    # Clean the text object.
    txt_obj = preprocessor.string_cleaner(txt_obj)

    # TEST PRINT -----------------------
    print('\n\ntxt_obj after cleaning:\n', txt_obj)

    # Create the nlp object:
    pumps_erp10 = nlp(txt_obj)

    # TEST print -----------------------
    print('\n\npumps_erp10 after sentencizer:\n')
    for sent in pumps_erp10.sents:
        print(sent.text, '**end row**', end='')

    # TEST print -----------------------
    print('\n\ntoken.like_num in nlp obj:\n')
    for token in pumps_erp10:
        print(token.like_num, ',', end='')

    # Stuff we get:
    # token, .text, .i, .idx, .tag_, .lemma_,
    # .is_punct, .is_space, .like_num
    print('\nDone.')

    # Stuff we don't get:
    # pos, ent, chunking,
    # LU
    # textcat (TextCategorizer, Doc.cats)
    # custom components (Doc._.xxx, Token._.xxx, Span._.xxx)
    # create_pipe, add_pipe

    # TEST print -----------------------
    print('\n', nlp.pipeline)
    print('\n', nlp.pipe_names)
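# preprocessor.string_cleaner is called throughout these experiments but
# defined elsewhere in the project. A hypothetical sketch of what it might do,
# judging from the observations below (unclean text collapses into a single
# sentence): lower-case the text and strip characters that confuse the
# sentencizer. This is an assumption about the real helper, not its source.
import re


def string_cleaner(text):
    text = text.lower()
    # Keep word characters, whitespace, and the sentence-final periods added
    # when building txt_obj; replace everything else with a space.
    text = re.sub(r'[^\w\s.]', ' ', text)
    # Collapse the runs of spaces and tabs introduced by the substitution.
    return re.sub(r'[ \t]+', ' ', text)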
import csv

import spacy
from spacy.tokens import Span


def main():
    # Get the English language model
    # and remove the dependency-parsing pipeline component.
    nlp = spacy.load('en_core_web_sm', disable=['parser'])

    # Want to add custom pipe components to create the following pipeline:
    # tokenizer -> tagger -> custom_sentencizer -> ner -> common_key_tagger
    # Consider adding: entity_ruler, merge_noun_chunks
    # (https://spacy.io/usage/processing-pipelines/)
    nlp.add_pipe(custom_sentencizer, before="ner")  # insert before the ner component
    nlp.add_pipe(common_key_tagger, name="common_key_tagger", last=True)
    print(nlp.pipe_names)

    # Get the product group data file.
    txt_obj = ''
    with open('../store/model/erp10/pumps/prod_pumps_erp10.csv') as data:
        data = csv.reader(data, delimiter='|')
        i = 0
        for row in data:
            # Create the text object with a '.' at the end of each record.
            if i != 0:
                txt_obj = txt_obj + ' '.join(row) + '.\n'
            i += 1

    print('\n\n', txt_obj)
    nlp_obj = nlp(txt_obj)

    print('\n\n')
    i = 0
    for sent in nlp_obj.sents:
        print(sent.text, '**end**')
        i += 1
    print('\n\n# of sents: ', i)

    # Observations:
    # If you don't clean the txt_obj, spaCy sees all records as one sentence.

    # Clean the text object.
    txt_obj = preprocessor.string_cleaner(txt_obj)

    # TEST PRINT -----------------------
    print('\n\ntxt_obj after cleaning:\n', txt_obj)

    # Create the nlp object:
    nlp_obj_cln = nlp(txt_obj)

    # TEST print -----------------------
    print('\n\ntxt_obj_cln:\n')
    i = 0
    for sent in nlp_obj_cln.sents:
        print(sent.text, '**end**')
        i += 1
    print('\n\n# of sents: ', i)

    # --------------------------------------
    # testing: setting entity annotations
    # --------------------------------------
    ents = [(e.text, e.start_char, e.end_char, e.label_) for e in nlp_obj_cln.ents]
    print('\nBefore', ents)
    # The model didn't recognise "FB" as an entity :(

    # Find token positions.
    print('\n\nfind token positions:\n')
    for token in nlp_obj_cln:
        print(token.text, token.i)

    PRODUCT = nlp_obj_cln.vocab.strings[u"PRODUCT"]  # get hash value of entity label
    igp = Span(nlp_obj_cln, 1, 4, label=PRODUCT)     # create a Span for the new entity
    nlp_obj_cln.ents = list(nlp_obj_cln.ents) + [igp]

    ents = [(e.text, e.start_char, e.end_char, e.label_) for e in nlp_obj_cln.ents]
    print('\nAfter', ents)

    for e in nlp_obj_cln.ents:
        if e.text == 'internal gear pump':
            print(e.text, ' is a ', e.label_)
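# The Span above is built from hard-coded token offsets (1, 4), which breaks
# as soon as the cleaned text changes. A sketch of a less brittle variant
# using Doc.char_span; label_phrase is a hypothetical helper, and this assumes
# the phrase occurs at most once in the text.
def label_phrase(doc, phrase, label):
    start = doc.text.find(phrase)
    if start == -1:
        return None  # phrase not present in the text
    span = doc.char_span(start, start + len(phrase), label=label)
    if span is not None:  # char_span returns None if offsets don't align with token bounds
        doc.ents = list(doc.ents) + [span]
    return span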
import csv

import spacy


def main():
    nlp = spacy.load('en_core_web_sm')

    # Import csv -- first pass.
    # Get the product group data file and feed the info into arrays
    # that will be used later to create custom tags for the nlp object.
    with open('../store/model/erp10/pumps/prod_pumps_erp10.csv') as data:
        data = csv.reader(data, delimiter='|')
        headers = []
        productIDs = []
        products = []
        suppliers = []
        mpns = []

        # TEST list
        testList = [headers, productIDs, products, suppliers, mpns]

        i = 0
        for row in data:
            if i == 0:
                headers.append(row)
            else:
                productID = row[0]
                product = row[1]
                supplier = row[2]
                mpn = row[3]
                productIDs.append(productID)
                products.append(product)
                suppliers.append(supplier)
                mpns.append(mpn)
            i += 1

    # TEST print
    for item in testList:
        print(item)

    # Import csv -- second pass.
    # Get the product group data file and feed the info into a text object.
    csv_path = '../store/model/erp10/pumps/prod_pumps_erp10.csv'
    txt_obj = '.'
    with open(csv_path) as data:
        reader = csv.reader(data, delimiter=' ', quotechar=' ',
                            quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            txt_obj = txt_obj + '\'' + ' '.join(row) + '\'' + '\n'

    # Clean the text object.
    txt_obj = preprocessor.string_cleaner(txt_obj)
    print(txt_obj)  # TEST

    # Write txt_obj to a txt file.
    txt_path = '../store/model/erp10/pumps/prod_pumps_erp10.txt'
    with open(txt_path, 'wt') as outfile:
        outfile.write(txt_obj)  # the with block flushes and closes the file

    # Read the file back into a text object.
    txt_obj = ''
    txt_obj = loader.import_txt(txt_path)
    # txt_obj = preprocessor.remove_whitespace(txt_obj)
    print(txt_obj)  # TEST

    # Create the nlp object:
    pumps_erp10 = nlp(txt_obj)
    for sent in pumps_erp10.sents:
        print(sent.text)

    print('Done.')
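# loader.import_txt is defined elsewhere in the project. A minimal sketch of
# the behaviour the write/read round-trip above relies on (an assumption about
# the real helper, not its source): read the whole file back into one string.
def import_txt(path):
    with open(path, encoding='utf-8') as f:
        return f.read()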
import csv

import spacy


def main():
    # Get the English language model
    # (the dependency-parsing pipeline component is left enabled here).
    nlp = spacy.load('en_core_web_sm')  # , disable=['parser'])

    # Want to add custom pipe components to create the following pipeline:
    # tokenizer -> tagger -> custom_sentencizer -> ner -> common_key_tagger
    # Consider adding: entity_ruler, merge_noun_chunks
    # (https://spacy.io/usage/processing-pipelines/)
    nlp.add_pipe(custom_sentencizer, before="tagger")  # insert before the tagger
    nlp.add_pipe(field_val_tagger, name="field_val_tagger", last=True)
    nlp.add_pipe(common_key_tagger, name="common_key_tagger", last=True)
    print(nlp.pipe_names)  # test print

    # Get the product group data file.
    txt_obj = ''
    with open('../store/model/erp10/pumps/prod_pumps_erp10.csv') as data:
        data = csv.reader(data, delimiter='|')
        i = 0
        header = []
        global product_ids
        for row in data:
            if i == 0:
                header.append(row)
            else:
                product_id = row[0]
                product_ids.append(product_id)
            # Create the text object with a '.' at the end of each record.
            if i != 0:
                txt_obj = txt_obj + ' '.join(row) + '.\n'
            i += 1
        # print('\n\n# of sents: ', i)  # test print

    # test print
    print('\n\ncontents of header and product_ids[]:\n')
    print(header)
    print(product_ids)

    # Clean the text object.
    txt_obj = preprocessor.string_cleaner(txt_obj)

    # Create the NLP object ----------------------
    # print('\n\n', txt_obj)  # test print
    nlp_obj = nlp(txt_obj)

    print('\n\n')  # test print
    i = 0
    for sent in nlp_obj.sents:
        print(sent.text, '**end**')
        i += 1

    # Observations:
    # If you don't clean the txt_obj, spaCy sees all records as one sentence.

    # TEST PRINT -----------------------
    # print('\n\ntxt_obj after cleaning:\n', txt_obj)  # test print

    print('\n\ntxt_obj:')
    i = 0
    for sent in nlp_obj.sents:
        print(sent.text, '**end**\n\n')
        i += 1

    # TEST print text attributes -----------------------
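# common_key_tagger and field_val_tagger are added to the pipeline above but
# not shown in this file. A hedged sketch of the shape such a spaCy v2-style
# component might take; the "common_keys" Doc extension is an assumption for
# illustration, not this project's actual attribute.
from spacy.tokens import Doc

if not Doc.has_extension('common_keys'):
    Doc.set_extension('common_keys', default=[])


def common_key_tagger(doc):
    # Record tokens that match the product IDs loaded from the CSV above.
    doc._.common_keys = [t.text for t in doc if t.text in product_ids]
    return doc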