/
make_index.py
74 lines (61 loc) · 2.17 KB
/
make_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import os
from os import path
import argparse
import logging
import time
from progressbar import ProgressBar, Percentage, Bar
from make_dic import noun_list
import sqlconfig
from sqltostc import read_table
from index import Indexer
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('make_index')


def _str2bool(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` must not be used with argparse: it calls ``bool()`` on the
    raw string, so every non-empty value -- including ``"False"`` -- becomes
    ``True``.  This converter recognises the usual spellings instead.

    Raises:
        argparse.ArgumentTypeError: if *value* is not a recognised boolean
            spelling, so that argparse reports a normal usage error.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    # The empty string is kept falsy, as it was under ``bool('')``.
    if token in ('false', 'f', 'no', 'n', '0', 'off', ''):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got {!r}'.format(value))


parser = argparse.ArgumentParser()
parser.add_argument('--file-path', type=str, default='./index.pkl')
# nargs='?' + const=True: both "--overwrite" and "--overwrite True" enable
# the flag, while "--overwrite False" now really disables it.
parser.add_argument('--overwrite', type=_str2bool, nargs='?', const=True,
                    default=False)
parser.add_argument('--only-reply', type=_str2bool, nargs='?', const=True,
                    default=False)
args = parser.parse_args()
def get_query(only_reply):
    """Return the SQL text used to fetch ``(item_id, text)`` rows.

    When *only_reply* is falsy, every successful row of the table named by
    ``sqlconfig.tweet_table_name`` is selected.  Otherwise only rows of
    ``stc_tweets`` that are joined through ``stc_tweet_ids.reply_id`` are
    selected.
    """
    if not only_reply:
        table_name = sqlconfig.tweet_table_name
        return "SELECT item_id, text FROM " + table_name + " WHERE success = 1"

    reply_query = "SELECT t.item_id, t.text FROM stc_tweet_ids AS ids INNER JOIN stc_tweets AS t ON ids.reply_id = t.item_id WHERE t.success = 1"
    return reply_query
def save_index(file_path):
    """Fetch rows, feed them into an ``Indexer`` and save it to *file_path*.

    The query comes from ``get_query`` and is selected by the module-level
    ``args.only_reply`` flag.  For every fetched row, ``row[0]`` is passed to
    ``Indexer.add`` together with ``noun_list(row[1])``.  The duration of the
    SQL phase and of the indexing phase is logged, and a progress bar tracks
    the indexing loop.
    """
    sql = get_query(args.only_reply)
    logger.info('query: {}'.format(sql))

    # Phase 1: run the query and time it.
    logger.info("SQL running...")
    sql_began = time.time()
    records = read_table(sql)
    logger.info("sql_time:{0}[sec]".format(time.time() - sql_began))

    # Phase 2: add every record to the index and time it.
    logger.info("Indexing...")
    indexing_began = time.time()
    bar_widgets = [Percentage(), Bar()]
    progress = ProgressBar(widgets=bar_widgets, maxval=len(records)).start()
    indexer = Indexer()
    # Counting from 1 so the counter is directly the number of rows done.
    for done, record in enumerate(records, 1):
        indexer.add(record[0], noun_list(record[1]))
        progress.update(done)
    progress.finish()
    logger.info("indexing_time:{0}[sec]".format(time.time() - indexing_began))

    # Phase 3: persist the finished index.
    logger.info("Saving...")
    indexer.save(file_path)
    logger.info('Done')
def _missing_parent_dir(file_path):
    """Return the parent directory of *file_path* if it has to be created.

    Returns ``None`` when *file_path* has no directory component (a bare
    file name, for which ``path.dirname`` yields ``''``) or when the parent
    directory already exists.
    """
    dirname = path.dirname(file_path)
    if dirname and not path.isdir(dirname):
        return dirname
    return None


if __name__ == "__main__":
    # With --only-reply the output path is fixed and --file-path is not used.
    if args.only_reply:
        file_path = './only_reply_index.pkl'
    else:
        file_path = args.file_path

    if path.isfile(file_path):
        # An existing index is only rebuilt when explicitly requested.
        logger.info('{} already exists'.format(file_path))
        if args.overwrite:
            logger.info('overwrite flag is True')
            save_index(file_path)
    else:
        missing_dir = _missing_parent_dir(file_path)
        if missing_dir is not None:
            logger.info('create {}'.format(missing_dir))
            # makedirs rather than mkdir so that nested output directories
            # (e.g. out/run1/index.pkl) can be created in one go.
            os.makedirs(missing_dir)
        save_index(file_path)