Example #1
import time
from collections import defaultdict

from pymongo import MongoClient

# get_db_context, get_token_dict, Tokenizer, pickle_bi_gram_freqs and
# delete_low_counts are helpers defined elsewhere in the project.

def extract_grams():

    client = MongoClient()
    db = client.news_tfidf

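    # One (source_tag, query) pair per source; the two LEFT JOIN queries
    # use an IS NULL anti-join to pull only in-sample articles that have
    # no matching row in the corresponding *_excel_articles table.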
    select_strs = [
        ("se" ,"select ID, story from swift_excel_articles;"),
        ("ne", "select ID, body from nsc_excel_articles;"),
        ("sw", '''select sw.ID, sw.story 
                from swift_articles sw left join swift_excel_articles se on sw.ID = se.swiftID 
                where se.swiftID is null and in_sample=1;'''),
        ("ns", '''select ns.ID, concat(ns.lead_parag, ' ', ns.body) 
                from nsc_articles ns left join nsc_excel_articles ne on ns.ID = ne.nscID 
                where ne.nscID is null and in_sample=1;''')
    ]

    token_ids = get_token_dict(db)
    bi_gram_freqs = defaultdict(int)
    
    st = time.time()   # overall start time
    t = time.time()    # per-batch progress timer
    row_count = 0
    
    for source_id, query in select_strs:

        print query

        cnx = get_db_context()
        select_cur = cnx.cursor()
        select_cur.execute(query)
        
        for article_id, article in select_cur:

            row_count += 1
            if row_count % 10000 == 0:
                print 'processed', row_count, 'rows in', (time.time() - t) / 60, 'minutes'
                t = time.time()
                # Periodically prune rare bi-grams to cap memory use.
                delete_low_counts(bi_gram_freqs)
                
            if isinstance(article, basestring) and len(article) > 0:
            
                tokenizer = Tokenizer(article)
                bi_gram_gen = tokenizer.gen_n_grams(n=2)
                counted = set()
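                # Count each bi-gram at most once per article, so the
                # totals are document frequencies rather than raw counts.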
                
                for bi_gram in bi_gram_gen:
                    
                    bg_id_tup = (token_ids[bi_gram[0]], token_ids[bi_gram[1]])
                    
                    if bg_id_tup not in counted:
                        bi_gram_freqs[bg_id_tup] += 1
                        counted.add(bg_id_tup)

            # Flush partial counts to disk before the dict gets too large.
            if len(bi_gram_freqs) > 1000000:
                pickle_bi_gram_freqs(bi_gram_freqs, source_id)
                bi_gram_freqs = defaultdict(int)
                        
        # Flush whatever remains for this source, then clean up.
        pickle_bi_gram_freqs(bi_gram_freqs, source_id)
        bi_gram_freqs = defaultdict(int)
        select_cur.close()
        cnx.close()

    print 'parsing time:', (time.time() - st) / 60, 'minutes'
    # All counts have been pickled and the dict reset, so this is empty.
    return bi_gram_freqs
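
Both examples call module-level helpers that are not shown. Here is a minimal
sketch of two of them, assuming per-source pickle files and a simple frequency
floor; the file layout, the floor of 2, and the bodies are assumptions, not
the project's actual code:

import pickle
import time

def pickle_bi_gram_freqs(bi_gram_freqs, source_id):
    # Hypothetical: dump the partial counts to a timestamped pickle so
    # each flush for a source lands in its own file.
    path = 'bi_grams_%s_%d.pkl' % (source_id, int(time.time()))
    with open(path, 'wb') as f:
        pickle.dump(dict(bi_gram_freqs), f)

def delete_low_counts(bi_gram_freqs, floor=2):
    # Hypothetical: drop bi-grams seen fewer than `floor` times so the
    # in-memory dict stays bounded between flushes.
    for key in [k for k, c in bi_gram_freqs.items() if c < floor]:
        del bi_gram_freqs[key]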
Example #2
import time

from pymongo import MongoClient

# gen_ids, get_db_context, get_token_dict, Tokenizer and
# insert_bi_gram_freqs are helpers defined elsewhere in the project.

def extract_grams():

    client = MongoClient()
    db = client.news_tfidf

    select_strs = [
        # ("se" ,"select ID, story from swift_excel_articles;"),
        # ("ne", "select ID, body from nsc_excel_articles;"),
        ("sw", "select sw.ID, sw.story from swift_articles sw left join swift_excel_articles se on sw.ID = se.swiftID where se.swiftID is null;"),
        ("ns", "select ns.ID, concat(ns.lead_parag, ' ', ns.body) from nsc_articles ns left join nsc_excel_articles ne on ns.ID = ne.nscID where ne.nscID  is null;")
    ]
    
    id_gen = gen_ids()   # yields fresh integer ids for unseen tokens

    token_ids = get_token_dict(db)

    # token_freqs = dict()
    bi_gram_freqs = dict()
    
    doc_counts = list()
    
    st = time.time()   # overall start time
    t = time.time()    # per-batch progress timer
    row_count = 0
    
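    # In-place counters that build Mongo-ready documents: each entry keeps
    # a count 'c' plus its identifying fields. incr_token_dict is only used
    # by the commented-out token-frequency pass below.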
    def incr_token_dict(t_dict, token):
        if token in t_dict:
            t_dict[token]['c'] += 1
        else:
            t_dict[token] = {'i': next(id_gen), 't': token, 'c': 1}

    
    def incr_bi_gram_dict(bg_dict, bg, source_id=None):
        if bg in bg_dict:
            bg_dict[bg]['c'] += 1
        else:
            if source_id:
                bg_dict[bg] = {'g1': bg[0], 'g2': bg[1], 'c': 1, 'source_id': source_id}
            else:
                bg_dict[bg] = {'g1': bg[0], 'g2': bg[1], 'c': 1}

    
    for source_id, query in select_strs:

        print query

        cnx = get_db_context()
        select_cur = cnx.cursor()
        select_cur.execute(query)
        
        for article_id, article in select_cur:

            row_count += 1
            if row_count % 5000 == 0:
                print 'processed', row_count, 'rows in', (time.time() - t) / 60, 'minutes'
                t = time.time()

            if isinstance(article, basestring) and len(article) > 0:
            
                tokenizer = Tokenizer(article)
                # token_gen = tokenizer.gen_tokens()
                bi_gram_gen = tokenizer.gen_n_grams(n=2)
                
                # token_doc_freqs = dict()
                bi_gram_doc_freqs = dict()
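                # Per-article counts feed the doc_freq collection, while
                # bi_gram_freqs keeps the corpus-wide running totals.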
                
                # for token in token_gen:
                    
                #     incr_token_dict(token_freqs, token)
                #     incr_token_dict(token_doc_freqs, token)

                for bi_gram in bi_gram_gen:
                    
                    gram1_id = token_ids[bi_gram[0]]
                    gram2_id = token_ids[bi_gram[1]]

                    incr_bi_gram_dict(bi_gram_doc_freqs, (gram1_id, gram2_id))
                    incr_bi_gram_dict(bi_gram_freqs, (gram1_id, gram2_id), source_id)
                    
                doc_counts.append({
                    'sql_id': article_id,
                    'sql_tbl_id': source_id,
                    # 't_counts': token_doc_freqs.values(),
                    'bg_counts': bi_gram_doc_freqs.values()
                })
                
                # Batch the writes: flush to MongoDB every 1000 documents.
                if len(doc_counts) > 1000:
                    db_t = time.time()
                    print 'updating db...'
                    db.doc_freq.insert_many(doc_counts)
                    doc_counts = list()

                    # update_doc_bg_counts(db, doc_counts)
                    # doc_counts = list()

                    insert_bi_gram_freqs(db, source_id, bi_gram_freqs)
                    bi_gram_freqs = dict()

                    print 'db done in', (time.time() - db_t) / 60, 'minutes'
                    
        # refresh_token_freqs(db, token_freqs)
        # Flush whatever remains for this source, then clean up.
        insert_bi_gram_freqs(db, source_id, bi_gram_freqs)
        bi_gram_freqs = dict()

        select_cur.close()
        cnx.close()

    print 'parsing time:', (time.time() - st) / 60, 'minutes'
    print 'updating...'

    # Final flush; insert_many raises InvalidOperation on an empty list.
    if doc_counts:
        db.doc_freq.insert_many(doc_counts)
    # update_doc_bg_counts(db, doc_counts)
    insert_bi_gram_freqs(db, source_id, bi_gram_freqs)
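
Example #2 persists its counts through insert_bi_gram_freqs, which is also not
shown. A minimal sketch, assuming a bi_gram_freq collection and $inc upserts so
repeated flushes for the same key accumulate; the collection name and update
strategy are assumptions:

from pymongo import UpdateOne

def insert_bi_gram_freqs(db, source_id, bi_gram_freqs):
    # Hypothetical: upsert each partial count so successive flushes for
    # the same (g1, g2, source_id) key add up instead of overwriting.
    if not bi_gram_freqs:
        return
    ops = [
        UpdateOne(
            {'g1': doc['g1'], 'g2': doc['g2'], 'source_id': source_id},
            {'$inc': {'c': doc['c']}},
            upsert=True,
        )
        for doc in bi_gram_freqs.values()
    ]
    db.bi_gram_freq.bulk_write(ops, ordered=False)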