Example 1
def main():
    logging.info("start")
    parser = util.get_dates_range_parser()
    args = parser.parse_args()

    ind = Indexer(DB_DIR)

    cur = stats.get_main_cursor(DB_DIR)
    cur_word_cnt = stats.get_cursor(DB_DIR + "/word_cnt.db")

    used_nouns = get_used_nouns(cur)

    nouns = stats.get_nouns(cur, used_nouns)
    noun_trend = stats.get_noun_trend(cur)
    # "__total__" is a synthetic noun; its md5 digest is used as the key.
    # total_md5 was otherwise undefined here (see Example 3).
    total_md5 = util.digest("__total__")
    nouns[total_md5] = "__total__"
    noun_trend[total_md5] = 0.0
    logging.info("nouns len %s" % len(nouns))

    logging.info("get sim_dict")
    sim_dict = get_sims(cur)

    cl = get_clusters(sim_dict, nouns, noun_trend)

    with open("./clusters_raw.json", "w") as out:
        json.dump(cl, out, indent=2)

    logging.info("Done")
Example 2
def main():
    logging.info("Start")

    parser = util.get_dates_range_parser()
    parser.add_argument("-i", "--in-file")
    args = parser.parse_args()

    ind = Indexer(DB_DIR)
    cur = stats.get_main_cursor(DB_DIR)

    stats.create_given_tables(cur, ["noun_similarity"])
    cur.execute(
        "create table if not exists noun_sim_new as select * from noun_similarity limit 0"
    )
    cur.execute("delete from noun_sim_new")

    sims = []
    with open(args.in_file, "r") as in_file:
        for line in in_file:
            # Strip the trailing newline before splitting, otherwise the last
            # field of every row carries a "\n".
            sims.append(line.rstrip("\n").split(";"))
            if len(sims) > 20000:
                save_sims(cur, sims)
                sims = []

    save_sims(cur, sims)

    cur.execute("begin transaction")

    cur.execute("delete from noun_similarity")
    cur.execute(
        "insert or ignore into noun_similarity select * from noun_sim_new")

    cur.execute("commit")

    logging.info("Done")
Example 3
def main():
    logging.info("start")
    parser = util.get_dates_range_parser()
    parser.add_argument("-i")
    args = parser.parse_args()

    ind = Indexer(DB_DIR)

    cur = stats.get_main_cursor(DB_DIR)
    cur_word_cnt = stats.get_cursor(DB_DIR + "/word_cnt.db")
    words_db = DB_DIR + "/tweets_lemma.db"
    bigram_db = DB_DIR + "/tweets_bigram.db"

    used_nouns = get_used_nouns(cur)

    total_md5 = util.digest("__total__")

    nouns = stats.get_nouns(cur, used_nouns)
    noun_trend = stats.get_noun_trend(cur)
    nouns[total_md5] = "__total__"
    # Key the synthetic "__total__" entry by its md5, consistent with nouns
    # and post_cnt here and with Example 1.
    noun_trend[total_md5] = 0.0
    logging.info("nouns len %s" % len(nouns))
    post_cnt = stats.get_noun_cnt(cur_word_cnt)
    post_cnt[total_md5] = 0

    logging.info("get sim_dict")
    sim_dict = get_sims(cur)

    cl = get_clusters(args, sim_dict, nouns, noun_trend, post_cnt)

    with open("./clusters_raw.json", "w") as out:
        json.dump(cl, out, indent=2)

    logging.info("Done")
Example 4
def main():
    logging.info("start")
    parser = util.get_dates_range_parser()
    parser.add_argument("-c", "--clear", action="store_true")
    parser.add_argument("-p", "--profiles-table", default="post_reply_cnt")
    parser.add_argument("-o", "--out-file")
    args = parser.parse_args()

    cur = stats.get_cursor(DB_DIR + "/word_cnt.db")

    profiles_dict = stats.setup_noun_profiles(
        cur, {}, {},
        post_min_freq=POST_MIN_FREQ,
        blocked_nouns=BLOCKED_NOUNS,
        nouns_limit=NOUNS_LIMIT,
        db_dir=DB_DIR,
        profiles_table=args.profiles_table,
        trash_words=settings["trash_words"],
        swear_words=settings["swear_words"])

    logging.info("profiles len %s" % len(profiles_dict))
    profiles_dump = {}
    for p in profiles_dict:
        profiles_dump[p] = profiles_dict[p].replys

    with open(args.out_file, "w") as out:
        json.dump(profiles_dump, out)
Example 5
def main():
    parser = util.get_dates_range_parser()
    args = parser.parse_args()

    ind = Indexer(DB_DIR)

    logging.info("Clear words for date %s" % args.start)
    cur = ind.get_db_for_date(args.start)

    #cur.execute("begin transaction")
    cur.execute("delete from tomita_progress")
    cur.execute("delete from tweets_nouns")
    cur.execute("delete from tweets_words")
    #cur.execute("delete from word_pairs")

    #cur.execute("commit")
    logging.info("Done")
Example 6
def main():
    parser = util.get_dates_range_parser()
    args = parser.parse_args()

    ind = Indexer(DB_DIR)

    for date in sorted(ind.dates_dbs.keys()):
        if args.start is not None and date < args.start:
            continue
        if args.end is not None and date > args.end:
            continue

        cur = ind.get_db_for_date(date)
        # Probe for the nouns table; skip this date's db if it is missing.
        try:
            cur.execute("select 1 from nouns")
        except Exception:
            logging.info("Skip date %s" % date)
            continue

        stats.create_tables(cur)

        stats.fill_tweet_chains(cur)

    logging.info("Done")
Example 7
import logging, logging.config
import json
import os  # needed for os.environ["MOLVA_DIR"] below

from molva.Indexer import Indexer
import molva.util as util


logging.config.fileConfig("logging.conf")

settings = {}
try:
    settings = json.load(open('global-settings.json', 'r'))
except Exception as e:
    logging.warning(e)

DB_DIR = settings["db_dir"] if "db_dir" in settings else os.environ["MOLVA_DIR"]

if __name__ == '__main__':
    parser = util.get_dates_range_parser()
    args = parser.parse_args()

    ind = Indexer(DB_DIR)
    for date in sorted(ind.dates_dbs.keys()):
        if args.start is not None and date < args.start:
            continue
        if args.end is not None and date > args.end:
            continue
        ind.prepare_tweet_index_for_date(date)

    logging.info("Done")