Example #1
def main():
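    # Build the display clusters JSON: load raw clusters, score each with
    # put_trend, keep the 15 highest-trend clusters, attach relevant tweets
    # from today's and yesterday's tweet DBs, and write the result to --clusters-out.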
    parser = argparse.ArgumentParser()

    parser.add_argument("--dir")
    parser.add_argument("--num")
    parser.add_argument("--clusters")
    parser.add_argument("--clusters-out")

    args = parser.parse_args()

    f_out = codecs.open(args.clusters_out, 'w', encoding="utf8")

    today = date.today().strftime('%Y%m%d')
    ystd = (date.today() - timedelta(1)).strftime('%Y%m%d')

    cl = json.load(codecs.open(args.clusters, 'r', encoding="utf8"))

    today_time = (datetime.utcnow()).strftime("%Y%m%d%H%M%S")
    update_time = (datetime.now()).strftime("%Y-%m-%d %H:%M:%S")

    cur1 = stats.get_cursor("%s/tweets_%s.db" % (args.dir, today))
    cur2 = stats.get_cursor("%s/tweets_%s.db" % (args.dir, ystd))

    rel_tweets = []
    for x in cl:
        put_trend(x)
    filtered_cl = [x for x in cl if x["trend"] > 0.0]
    logging.info("Filtered out %d of %d (trend > 0.0)" %
                 (len(cl) - len(filtered_cl), len(cl)))

    top_cl = sorted(filtered_cl, key=lambda x: x["trend"], reverse=True)[:15]
    tw_with_embed_cnt = 0
    for cluster in top_cl:
        r = get_relevant_tweets(cur1, cur2, cluster)
        rel_tweets.append(r)
        cluster["topic_density"] = r["density"]

    logging.info("Have %d topics with tweets embeds out of %d" %
                 (tw_with_embed_cnt, len(top_cl)))

    cur_rel = stats.get_cursor("%s/tweets_relevant.db" % args.dir)
    stats.create_given_tables(cur_rel, ["relevant"])
    save_relevant(cur_rel, today_time, rel_tweets)

    final_cl = {
        "clusters": top_cl,
        "update_time": update_time,
        "cluster_id": today_time
    }
    json.dump(final_cl, f_out)
    f_out.close()

    return
Example #2
def main():
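    # Cluster nouns: load used nouns and their trend values from the main DB,
    # compute pairwise similarities, build clusters and dump them to ./clusters_raw.json.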
    logging.info("start")
    parser = util.get_dates_range_parser()
    args = parser.parse_args()

    ind = Indexer(DB_DIR)

    cur = stats.get_main_cursor(DB_DIR)
    cur_word_cnt = stats.get_cursor(DB_DIR + "/word_cnt.db")

    used_nouns = get_used_nouns(cur)

    nouns = stats.get_nouns(cur, used_nouns)
    noun_trend = stats.get_noun_trend(cur)
    total_md5 = util.digest("__total__")
    nouns[total_md5] = "__total__"
    noun_trend[total_md5] = 0.0
    logging.info("nouns len %s" % len(nouns))

    logging.info("get sim_dict")
    sim_dict = get_sims(cur)

    cl = get_clusters(sim_dict, nouns, noun_trend)

    with open("./clusters_raw.json", "w") as f_raw:
        json.dump(cl, f_raw, indent=2)

    logging.info("Done")
Example #3
def main2():
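    # Parse a query string into words, resolve their lemmas from the --db database
    # and look up matching tweets for the last two days.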
    parser = argparse.ArgumentParser()

    parser.add_argument("--dir")
    parser.add_argument("--query")

    args = parser.parse_args()

    words = get_words_from_query(args.query.decode('utf8'))

    print_list(words)

    if args.db is None:
        print "Need --db"
        return

    cur = stats.get_cursor(args.db)

    fill_lemmas(cur, words)

    print_list(words)

    tweets = lookup_two_days(cur, cur, words)
    # len(tweets[x].words),
    #for t in sorted(tweets.keys(), key=lambda x: (tweets[x].created_at), reverse=True)[:10]:
    #    print tweets[t].__str__()

    return
Example #4
def main():
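    # Cluster nouns using their trend values and per-noun post counts,
    # then dump the raw clusters to ./clusters_raw.json.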
    logging.info("start")
    parser = util.get_dates_range_parser()
    parser.add_argument("-i")
    args = parser.parse_args()

    ind = Indexer(DB_DIR)

    cur = stats.get_main_cursor(DB_DIR)
    cur_word_cnt = stats.get_cursor(DB_DIR + "/word_cnt.db")
    words_db = DB_DIR + "/tweets_lemma.db"
    bigram_db = DB_DIR + "/tweets_bigram.db"

    used_nouns = get_used_nouns(cur)        

    total_md5 = util.digest("__total__")

    nouns = stats.get_nouns(cur, used_nouns)
    noun_trend = stats.get_noun_trend(cur)
    nouns[total_md5] = "__total__"
    noun_trend["__total__"] = 0.0  
    logging.info("nouns len %s" % len(nouns))
    post_cnt = stats.get_noun_cnt(cur_word_cnt)
    post_cnt[total_md5] = 0
    
    logging.info("get sim_dict")
    sim_dict = get_sims(cur) 

    cl = get_clusters(args, sim_dict, nouns, noun_trend, post_cnt)

    with open("./clusters_raw.json", "w") as f_raw:
        json.dump(cl, f_raw, indent=2)

    logging.info("Done")
Example #5
def main():
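    # Build noun reply profiles from word_cnt.db and dump each noun's reply
    # counts to --out-file as JSON.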
    logging.info("start")
    parser = util.get_dates_range_parser()
    parser.add_argument("-c", "--clear", action="store_true")
    parser.add_argument("-p", "--profiles-table", default="post_reply_cnt")
    parser.add_argument("-o", "--out-file")
    args = parser.parse_args()

    cur = stats.get_cursor(DB_DIR + "/word_cnt.db")

    profiles_dict = stats.setup_noun_profiles(
        cur, {}, {},
        post_min_freq=POST_MIN_FREQ,
        blocked_nouns=BLOCKED_NOUNS,
        nouns_limit=NOUNS_LIMIT,
        db_dir=DB_DIR,
        profiles_table=args.profiles_table,
        trash_words=settings["trash_words"],
        swear_words=settings["swear_words"])

    logging.info("profiles len %s" % len(profiles_dict))
    profiles_dump = {}
    for p in profiles_dict:
        profiles_dump[p] = profiles_dict[p].replys

    with open(args.out_file, 'w') as f_out:
        json.dump(profiles_dump, f_out)
Example #6
    def get_clusters(self, skip, before, date):
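        # Return the stored cluster JSON blob: the newest one at or before `date`,
        # strictly before `before`, or the newest one after skipping `skip` rows.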
        cur = stats.get_cursor(settings["db_dir"] + "/tweets_display.db")
        if date is not None:
            cur.execute("""
                select cluster
                from clusters
                where cluster_date <= ?
                order by cluster_date desc
                limit 1
            """, (date,))
        elif before is not None:
            cur.execute("""
                select cluster
                from clusters
                where cluster_date < ?
                order by cluster_date desc
                limit 1
            """, (before,))
        else:
            cur.execute("""
                select cluster
                from clusters
                order by cluster_date desc
                limit 1
                offset ?
            """, (int(skip),))
        res = cur.fetchone()[0]

        return res
Example #7
def get_trending_words(db_dir, word_cnt_tuples):
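    # Combine trend values from the noun_trend table with tf-idf ranks of the
    # word counts and return up to 2000 best-ranked word md5s (as strings).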
    cur = stats.get_cursor(db_dir + "/tweets_display.db")

    stats.create_given_tables(cur, ["noun_trend"])
    cur.execute("""
        select noun_md5, trend 
        from noun_trend
        order by trend desc
        limit 2000
    """)
    word_trends = map(lambda x: (int(x[0]), float(x[1])), cur.fetchall())

    word_ranks = make_tf_idf_ranks(word_cnt_tuples)

    for w in word_trends:
        word, trend = w
        if word not in word_ranks:
            logging.warn("No such word_md5 at word_ranks %s" % word)
            continue
        word_ranks[word].trend.value = trend

    Rank.weight_ranks(map(lambda x: x.trend, word_ranks.values()))
    Rank.weight_ranks(map(lambda x: x.cnt, word_ranks.values()))

    words = []
    for word_rank in sorted(word_ranks.values(),
                            key=lambda x: x.cnt.rank + x.trend.rank)[:2000]:
        words.append(str(word_rank.word))

    return words
Example #8
def main():
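    # Compute word and bigram statistics for the given clusters file and write
    # the bigram stats (with readable word/source texts attached) to --out-bigram-stats.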
    parser = argparse.ArgumentParser()

    parser.add_argument("--clusters")
    parser.add_argument("--out-bigram-stats")
    args = parser.parse_args()

    cur_display = stats.get_cursor(DB_DIR + "/tweets_display.db")

    cl = json.load(open(args.clusters, 'r'))

    word_stats = get_word_stats(cl)
    bigram_stats = get_bigram_stats(cl, word_stats)

    nouns = get_nouns(cl)
    sources = get_sources(bigram_stats)

    bigram_stats2 = apply_word_text(bigram_stats, nouns, sources)

    logging.info("Got stats for: %d words; %d bigrams" %
                 (len(word_stats), len(bigram_stats)))

    with codecs.open(args.out_bigram_stats, 'w', encoding="utf8") as f_out:
        json.dump(bigram_stats2, f_out, indent=2, ensure_ascii=False)
Example #9
    def get_db_for_filename(self, filename):
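        # Return a cached cursor for the given DB file, opening (and caching)
        # a new connection if none exists or the cached one is no longer alive.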
        if filename in self.db_curs and self._check_cursor_alive(self.db_curs[filename]):
            return self.db_curs[filename]
        else:
            self.log.info("Setup db connection  " + filename)
            cur = stats.get_cursor(filename)
            self.db_curs[filename] = cur

            return cur
Example #10
    def get_db_for_date(self, date):
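        # Return a cached cursor for the per-day tweets_<date>.db database,
        # creating the connection and its "tweets" table on first use.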
        date = date[:8]  # assume date format %Y%m%d_%H%M%S

        if date in self.dates_db:
            return self.dates_db[date]
        else:
            self.log.info("Setup db connection for date " + date)
            cur = stats.get_cursor(self.db_dir + "/tweets_" + date + ".db")
            self.dates_db[date] = cur
            stats.create_given_tables(cur, ["tweets"])

            return cur
Example #11
    def get_relevant(self, date):
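        # Fetch the stored relevant-tweets blob for the given cluster_date
        # from tweets_relevant.db.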
        cur = stats.get_cursor(settings["db_dir"] + "/tweets_relevant.db")
        if date is not None:
            cur.execute("""
                select relevant
                from relevant
                where cluster_date = ?
            """, (date,))

        res = cur.fetchone()[0]

        return res
Example #12
def main():
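    # Print quality marks stored in quality_marks.db, optionally filtering
    # spam and showing usernames.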
    parser = argparse.ArgumentParser()

    parser.add_argument("--filter-spam", action="store_true")
    parser.add_argument("--show-name", action="store_true")

    args = parser.parse_args()

    cur = stats.get_cursor(settings["db_dir"] + "/quality_marks.db")
    
    m = get_marks(cur, args.filter_spam, args.show_name)
    print m

    return
Example #13
def main():
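    # Load the final clusters JSON and store it in the clusters table of
    # tweets_display.db, keyed by its cluster_id as cluster_date.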
    parser = argparse.ArgumentParser()

    parser.add_argument("--clusters")
    args = parser.parse_args()

    cur_display = stats.get_cursor(DB_DIR + "/tweets_display.db")

    with codecs.open(args.clusters, 'r', encoding="utf8") as f_in:
        final_cl_raw = f_in.read()
    final_cl = json.loads(final_cl_raw)

    cur_display.execute("""
        replace into clusters (cluster_date, cluster)
        values (?, ?)
    """, (final_cl["cluster_id"], final_cl_raw))
Example #14
    def post(self):
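        # Handle a POST with a JSON body of quality marks: store the marks together
        # with username, update time and experiment name/description in quality_marks.db.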
        req_data = None
        try:
            req_data = json.loads(self.request.body)

            if req_data is not None:
                cur = stats.get_cursor(settings["db_dir"] +
                                       "/quality_marks.db")
                stats.create_given_tables(cur, ["quality_marks"])
                username = ""
                if "username" in req_data and req_data["username"] is not None:
                    username = req_data["username"]
                update_time = ""
                if "update_time" in req_data and req_data[
                        "update_time"] is not None:
                    update_time = req_data["update_time"]
                    update_time = int(re.sub('[-\s:]', '', update_time))
                exp_name = ""
                if "experiment_name" in req_data and req_data[
                        "experiment_name"] is not None:
                    exp_name = req_data["experiment_name"]
                exp_descr = ""
                if "experiment_descr" in req_data and req_data[
                        "experiment_descr"] is not None:
                    exp_descr = req_data["experiment_descr"]

                cur.execute(
                    """
                    insert into quality_marks 
                    (update_time, username, exp_name, exp_descr,  marks) 
                    values (?, ?, ?, ?, ?)
                """, (update_time, username, exp_name, exp_descr,
                      json.dumps(req_data["marks"])))

        except Exception as e:
            logging.error(e)
            raise

        self.write("")
Example #15
def build_post_cnt(db_dir):
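    # Rebuild word_cnt.db: write word counts into a temporary DB, pick trending
    # words, count current mentions via MySQL, then replace the old DB file via os.rename.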
    utc_now = datetime.utcnow()
    word_cnt = stats.get_word_cnt(db_dir)
    word_cnt_tuples = map(lambda x: (int(x), int(word_cnt[x])),
                          word_cnt.keys())

    f_tmp = db_dir + "/word_cnt.db.tmp"
    f = db_dir + "/word_cnt.db"

    util.delete_if_exists(f_tmp)

    cur = stats.get_cursor(f_tmp)
    stats.create_given_tables(cur,
                              ["chains_nouns", "post_cnt", "post_reply_cnt"])

    save_word_cnt(cur, word_cnt_tuples)
    words = get_trending_words(db_dir, word_cnt_tuples)

    mcur = stats.get_mysql_cursor(settings)
    count_currents2(cur, mcur, utc_now, words)

    os.rename(f_tmp, f)
Example #16
def parse_facts_file(tweet_index, facts, date):
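    # Stream-parse the facts XML (ElementTree.iterparse), matching documents to the
    # tweet index by position; collect noun/source lemmas per tweet and periodically
    # flush noun sources, time/hour counts, bigrams and word mates to SQLite and MySQL.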
    ind = Indexer(DB_DIR)

    cur = ind.get_db_for_date(date)
    cur_main = stats.get_main_cursor(DB_DIR)
    cur_bigram = stats.get_cursor(DB_DIR + "/tweets_bigram.db")

    mcur = stats.get_mysql_cursor(settings)
    word_time_cnt_table = "word_time_cnt_%s" % date
    word_hour_cnt_table = "word_hour_cnt_%s" % date
    word_mates_table = "word_mates_%s" % date
    bigram_table = "bigram_%s" % date
    stats.create_mysql_tables(
        mcur, {
            word_time_cnt_table: "word_time_cnt",
            word_hour_cnt_table: "word_hour_cnt",
            word_mates_table: "word_mates",
            bigram_table: "bigram_day"
        })

    stats.create_given_tables(
        cur, ["nouns", "tweets_nouns", "tweets_words", "lemma_word_pairs"])
    stats.create_given_tables(cur_bigram, ["lemma_word_pairs"])
    stats.create_given_tables(cur, {"sources": "nouns"})
    stats.create_given_tables(cur_main, ["nouns"])
    stats.create_given_tables(cur_main, {"sources": "nouns"})

    logging.info("Parse index: %s; facts: %s" % (tweet_index, facts))

    ids = []
    with open(tweet_index, 'r') as f:
        for line in f.read().split("\n"):
            if line == '':
                break
            tw_id, created_at = line.split("\t")
            ids.append((tw_id, created_at))

    logging.info("Got tweet %s ids" % (len(ids)))

    tree = ElementTree.iterparse(facts, events=('start', 'end'))

    # set larger cache, default 2000 * 1024, this 102400*1024
    #cur_bigram.execute("pragma cache_size = -102400")

    nouns_total = set()
    sources_total = set()
    noun_sources = []
    tweets_nouns = []
    lemma_word_pairs = []
    word_mates = []
    word_cnt = []

    match_type_cnt = MatchTypeCnt()

    for event, elem in tree:
        if event == 'end' and elem.tag == 'document':
            cur_doc = int(elem.attrib['di'])
            post_id, create_time = ids[cur_doc - 1]
            nouns_preps = get_nouns_preps(elem)
            match_type_cnt.add_cnt(nouns_preps)
            lemmas = []
            nouns = []
            for np in nouns_preps:
                try:
                    lemmas.append(util.digest(np.with_prep()))
                    nouns.append(util.digest(np.noun_lemma))
                    nouns_total.add(np.noun_lemma)
                    sources_total.add(np.with_prep())

                    noun_sources.append((post_id, util.digest(np.noun_lemma),
                                         util.digest(np.with_prep())))
                    word_cnt.append((util.digest(np.noun_lemma),
                                     cut_to_tenminute(create_time)))
                except Exception as e:
                    traceback.print_exc()
                    logging.error(e)

            lemma_word_pairs += make_lemma_word_pairs(
                nouns, lemmas, cut_to_tenminute(create_time))
            word_mates += make_word_pairs_with_time(nouns,
                                                    create_time,
                                                    bag_size=BAG_SIZE)

            if len(noun_sources) > 10000:
                logging.info("seen %s docid" % (cur_doc))
                save_tweet_nouns(cur, noun_sources)
                save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
                save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
                noun_sources = []
                word_cnt = []

            if len(lemma_word_pairs) >= CHUNK_SIZE:
                save_bigram_day(mcur, lemma_word_pairs, bigram_table)
                lemma_word_pairs = []

            if len(word_mates) >= CHUNK_SIZE:
                logging.info("save %s word_mates" % len(word_mates))
                save_word_mates2(mcur, word_mates, word_mates_table)
                word_mates = []

            elem.clear()

    save_tweet_nouns(cur, noun_sources)
    save_word_time_cnt2(mcur, word_cnt, word_time_cnt_table)
    save_word_hour_cnt(mcur, word_cnt, word_hour_cnt_table)
    save_bigram_day(mcur, lemma_word_pairs, bigram_table)
    save_word_mates2(mcur, word_mates, word_mates_table)

    save_nouns(cur, nouns_total)
    save_nouns(cur, sources_total, table="sources")
    save_nouns(cur_main, nouns_total)
    save_nouns(cur_main, sources_total, table="sources")

    logging.info(str(match_type_cnt))

    return
Example #17
def main():
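    # Compute per-word trends: aggregate hourly counts for the last 3 days from the
    # MySQL word_time_cnt_* tables, normalize each word's series, fit a least-squares
    # line and store the 2000 steepest slopes in noun_trend (main and display DBs).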
    parser = argparse.ArgumentParser()

    parser.add_argument("--db-dir", default=DB_DIR)
    args = parser.parse_args()

    cur_display = stats.get_cursor(args.db_dir + "/tweets_display.db")
    cur_main = stats.get_cursor(args.db_dir + "/tweets.db")
    #cur_main = stats.get_cursor(args.db_dir + "/tweets_20150221.db")
    #nouns = stats.get_nouns(cur_main)

    #logging.info(type(nouns.keys()[0]))

    utc_now = datetime.utcnow()
    date_3day = (utc_now - timedelta(3)).strftime("%Y%m%d%H%M%S")
    date_3day_tenminute = date_3day[:11]
    logging.info("Time left bound: %s" % date_3day_tenminute)
    hour_word_cnt = {}
    word_cnt = {}
    for day in [3, 2, 1, 0]:
        date = (utc_now - timedelta(day)).strftime("%Y%m%d")
        word_time_cnt_table = "word_time_cnt_%s" % date
        mcur = stats.get_mysql_cursor(settings)
        stats.create_mysql_tables(mcur, {word_time_cnt_table: "word_time_cnt"})
        mcur.execute("""
                select word_md5, substr(tenminute, 1, 10) as hour, sum(cnt) 
                from %s
                where tenminute > %s
                group by word_md5, hour
        """ % (word_time_cnt_table, date_3day_tenminute))

        row_cnt = 0
        while True:
            res = mcur.fetchone()
            if res is None:
                break
            word_md5, hour, cnt = map(int, res)
            if hour not in hour_word_cnt:
                hour_word_cnt[hour] = {}
            hour_word_cnt[hour][word_md5] = cnt
            if word_md5 not in word_cnt:
                word_cnt[word_md5] = 0
            word_cnt[word_md5] += cnt

            row_cnt += 1
            if row_cnt % 100000 == 0:
                logging.info('Seen %s rows' % row_cnt)

    word_series = []
    hours = sorted(hour_word_cnt.keys())
    for word in word_cnt.keys():
        series = []
        series_max = 0
        for hour in hours:
            if word in hour_word_cnt[hour]:
                series.append(hour_word_cnt[hour][word])
                if hour_word_cnt[hour][word] > series_max:
                    series_max = hour_word_cnt[hour][word]
            else:
                series.append(0)
        # normalize by maxfreq in series
        if series_max > 0:
            series = [(float(x) / series_max) * 100 for x in series]
        approx = least_squares(series)
        a, b, app_ser = approx
        word_series.append({
            "word_md5": word,
            "word_cnt": word_cnt[word],
            "line_c": a,
            "slope": b,
            "delta": app_ser[-1] - app_ser[0]
        })

    word_series = sorted(word_series, key=lambda x: x["slope"],
                         reverse=True)[:2000]

    for cur in [cur_main, cur_display]:
        stats.create_given_tables(cur, {"noun_trend_new": "noun_trend"})
        cur.execute("begin transaction")
        for s in word_series:
            cur.execute("insert into noun_trend_new values (%s, %s)" %
                        (s["word_md5"], s["slope"]))

        cur.execute("drop table noun_trend")
        cur.execute("alter table noun_trend_new rename to noun_trend")
        cur.execute("commit")

    logging.info("Done")