Example #1
import time
from collections import defaultdict

from pymongo import MongoClient

# get_db_context, get_token_dict, Tokenizer, pickle_bi_gram_freqs and
# delete_low_counts are helpers defined elsewhere in the project.

def extract_grams():

    client = MongoClient()
    db = client.news_tfidf

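    # One (source_tag, query) pair per source; the two LEFT JOIN queries
    # use an IS NULL anti-join to pull only in-sample articles that have
    # no matching row in the corresponding *_excel_articles table.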
    select_strs = [
        ("se" ,"select ID, story from swift_excel_articles;"),
        ("ne", "select ID, body from nsc_excel_articles;"),
        ("sw", '''select sw.ID, sw.story 
                from swift_articles sw left join swift_excel_articles se on sw.ID = se.swiftID 
                where se.swiftID is null and in_sample=1;'''),
        ("ns", '''select ns.ID, concat(ns.lead_parag, ' ', ns.body) 
                from nsc_articles ns left join nsc_excel_articles ne on ns.ID = ne.nscID 
                where ne.nscID is null and in_sample=1;''')
    ]

    token_ids = get_token_dict(db)
    bi_gram_freqs = defaultdict(int)
    
    st = time.time()   # overall start time
    t = time.time()    # per-batch progress timer
    row_count = 0
    
    for source_id, query in select_strs:

        print query

        cnx = get_db_context()
        select_cur = cnx.cursor()
        select_cur.execute(query)
        
        for article_id, article in select_cur:

            row_count += 1
            if row_count % 10000 == 0:
                print 'processed', row_count, 'rows in', (time.time() - t) / 60, 'minutes'
                t = time.time()
                # Periodically prune rare bi-grams to cap memory use.
                delete_low_counts(bi_gram_freqs)
                
            if isinstance(article, basestring) and len(article) > 0:
            
                tokenizer = Tokenizer(article)
                bi_gram_gen = tokenizer.gen_n_grams(n=2)
                counted = set()
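                # Count each bi-gram at most once per article, so the
                # totals are document frequencies rather than raw counts.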
                
                for bi_gram in bi_gram_gen:
                    
                    bg_id_tup = (token_ids[bi_gram[0]], token_ids[bi_gram[1]])
                    
                    if bg_id_tup not in counted:
                        bi_gram_freqs[bg_id_tup] += 1
                        counted.add(bg_id_tup)

            # Flush partial counts to disk before the dict gets too large.
            if len(bi_gram_freqs) > 1000000:
                pickle_bi_gram_freqs(bi_gram_freqs, source_id)
                bi_gram_freqs = defaultdict(int)
                        
        # Flush whatever remains for this source, then clean up.
        pickle_bi_gram_freqs(bi_gram_freqs, source_id)
        bi_gram_freqs = defaultdict(int)
        select_cur.close()
        cnx.close()

    print 'parsing time:', (time.time() - st) / 60, 'minutes'
    # All counts have been pickled and the dict reset, so this is empty.
    return bi_gram_freqs
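
Both examples call module-level helpers that are not shown. Here is a minimal
sketch of two of them, assuming per-source pickle files and a simple frequency
floor; the file layout, the floor of 2, and the bodies are assumptions, not
the project's actual code:

import pickle
import time

def pickle_bi_gram_freqs(bi_gram_freqs, source_id):
    # Hypothetical: dump the partial counts to a timestamped pickle so
    # each flush for a source lands in its own file.
    path = 'bi_grams_%s_%d.pkl' % (source_id, int(time.time()))
    with open(path, 'wb') as f:
        pickle.dump(dict(bi_gram_freqs), f)

def delete_low_counts(bi_gram_freqs, floor=2):
    # Hypothetical: drop bi-grams seen fewer than `floor` times so the
    # in-memory dict stays bounded between flushes.
    for key in [k for k, c in bi_gram_freqs.items() if c < floor]:
        del bi_gram_freqs[key]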
Example #2
import time

from pymongo import MongoClient

# gen_ids, get_db_context, get_token_dict, Tokenizer and
# insert_bi_gram_freqs are helpers defined elsewhere in the project.

def extract_grams():

    client = MongoClient()
    db = client.news_tfidf

    select_strs = [
        # ("se" ,"select ID, story from swift_excel_articles;"),
        # ("ne", "select ID, body from nsc_excel_articles;"),
        ("sw", "select sw.ID, sw.story from swift_articles sw left join swift_excel_articles se on sw.ID = se.swiftID where se.swiftID is null;"),
        ("ns", "select ns.ID, concat(ns.lead_parag, ' ', ns.body) from nsc_articles ns left join nsc_excel_articles ne on ns.ID = ne.nscID where ne.nscID  is null;")
    ]
    
    id_gen = gen_ids()   # yields fresh integer ids for unseen tokens

    token_ids = get_token_dict(db)

    # token_freqs = dict()
    bi_gram_freqs = dict()
    
    doc_counts = list()
    
    st = time.time()   # overall start time
    t = time.time()    # per-batch progress timer
    row_count = 0
    
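    # In-place counters that build Mongo-ready documents: each entry keeps
    # a count 'c' plus its identifying fields. incr_token_dict is only used
    # by the commented-out token-frequency pass below.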
    def incr_token_dict(t_dict, token):
        if token in t_dict:
            t_dict[token]['c'] += 1
        else:
            t_dict[token] = {'i': next(id_gen), 't': token, 'c': 1}

    
    def incr_bi_gram_dict(bg_dict, bg, source_id=None):
        if bg in bg_dict:
            bg_dict[bg]['c'] += 1
        else:
            if source_id:
                bg_dict[bg] = {'g1': bg[0], 'g2': bg[1], 'c': 1, 'source_id': source_id}
            else:
                bg_dict[bg] = {'g1': bg[0], 'g2': bg[1], 'c': 1}

    
    for source_id, query in select_strs:

        print query

        cnx = get_db_context()
        select_cur = cnx.cursor()
        select_cur.execute(query)
        
        for article_id, article in select_cur:

            row_count += 1
            if row_count % 5000 == 0:
                print 'processed', row_count, 'rows in', (time.time() - t) / 60, 'minutes'
                t = time.time()

            if isinstance(article, basestring) and len(article) > 0:
            
                tokenizer = Tokenizer(article)
                # token_gen = tokenizer.gen_tokens()
                bi_gram_gen = tokenizer.gen_n_grams(n=2)
                
                # token_doc_freqs = dict()
                bi_gram_doc_freqs = dict()
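                # Per-article counts feed the doc_freq collection, while
                # bi_gram_freqs keeps the corpus-wide running totals.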
                
                # for token in token_gen:
                    
                #     incr_token_dict(token_freqs, token)
                #     incr_token_dict(token_doc_freqs, token)

                for bi_gram in bi_gram_gen:
                    
                    gram1_id = token_ids[bi_gram[0]]
                    gram2_id = token_ids[bi_gram[1]]

                    incr_bi_gram_dict(bi_gram_doc_freqs, (gram1_id, gram2_id))
                    incr_bi_gram_dict(bi_gram_freqs, (gram1_id, gram2_id), source_id)
                    
                doc_counts.append({
                    'sql_id': article_id,
                    'sql_tbl_id': source_id,
                    # 't_counts': token_doc_freqs.values(),
                    'bg_counts': bi_gram_doc_freqs.values()
                })
                
                # Batch the writes: flush to MongoDB every 1000 documents.
                if len(doc_counts) > 1000:
                    db_t = time.time()
                    print 'updating db...'
                    db.doc_freq.insert_many(doc_counts)
                    doc_counts = list()

                    # update_doc_bg_counts(db, doc_counts)
                    # doc_counts = list()

                    insert_bi_gram_freqs(db, source_id, bi_gram_freqs)
                    bi_gram_freqs = dict()

                    print 'db done in', (time.time() - db_t) / 60, 'minutes'
                    
        # refresh_token_freqs(db, token_freqs)
        # Flush whatever remains for this source, then clean up.
        insert_bi_gram_freqs(db, source_id, bi_gram_freqs)
        bi_gram_freqs = dict()

        select_cur.close()
        cnx.close()

    print 'parsing time:', (time.time() - st) / 60, 'minutes'
    print 'updating...'

    # Final flush; insert_many raises InvalidOperation on an empty list.
    if doc_counts:
        db.doc_freq.insert_many(doc_counts)
    # update_doc_bg_counts(db, doc_counts)
    insert_bi_gram_freqs(db, source_id, bi_gram_freqs)
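
Example #2 persists its counts through insert_bi_gram_freqs, which is also not
shown. A minimal sketch, assuming a bi_gram_freq collection and $inc upserts so
repeated flushes for the same key accumulate; the collection name and update
strategy are assumptions:

from pymongo import UpdateOne

def insert_bi_gram_freqs(db, source_id, bi_gram_freqs):
    # Hypothetical: upsert each partial count so successive flushes for
    # the same (g1, g2, source_id) key add up instead of overwriting.
    if not bi_gram_freqs:
        return
    ops = [
        UpdateOne(
            {'g1': doc['g1'], 'g2': doc['g2'], 'source_id': source_id},
            {'$inc': {'c': doc['c']}},
            upsert=True,
        )
        for doc in bi_gram_freqs.values()
    ]
    db.bi_gram_freq.bulk_write(ops, ordered=False)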