def feature(t_begin, t_end, screen_names):
    """Collect per-user feature counts from items posted in a time window.

    For every user in *screen_names*, scans items created in
    (t_begin, t_end) from the module-level ``db`` collection and extracts
    three feature families: character n-grams of the text (with the text of
    the replied-to status appended, when present), hashtags, and URLs.

    Parameters:
        t_begin, t_end -- window bounds, compared against ``created_at``
                          with Mongo-style ``$gt`` / ``$lt`` operators.
        screen_names   -- iterable of user screen names to process.

    Returns:
        (ngram, table) where
          ngram -- dict: feature -> {screen_name: occurrence count}
          table -- dict: feature -> set of item ids containing it
    """
    ngram = {}
    table = {}

    def _record(key, user, item_id):
        # Count one (feature, user) occurrence and remember which item it
        # came from. Shared by the n-gram, hashtag and URL passes below.
        if key not in ngram:
            ngram[key] = {}
        utils.count(ngram[key], user)
        try:
            table[key].add(item_id)
        except KeyError:
            table[key] = set([item_id])

    for j, u in enumerate(screen_names):
        query = {
            'created_at': {'$gt': t_begin, '$lt': t_end},
            'screen_name': u,
        }
        for item in db.find(query):
            text = item['text']
            item_id = item['id']  # renamed from `id` to avoid shadowing the builtin
            try:
                replied_id = item['in_reply_to_status_id']
                if replied_id:
                    # Append the replied-to status text so its words are
                    # attributed to this item as context.
                    for ii in db.find({'id': replied_id}):
                        text += u'。%s' % ii['text']
            except KeyError:
                # Item has no reply field; treat as a standalone post.
                pass

            for w in set(extractd.getngram(text)):
                uw = unicode(w)  # hoisted: original converted up to 3 times
                if len(uw) < 2:
                    continue
                # Skip words consisting entirely of hiragana. Guard against
                # an empty findall() result: the original indexed [0]
                # unconditionally, which raises IndexError for words with no
                # hiragana match (unless patterns.hiragana matches the empty
                # string — TODO confirm against its definition).
                hira = patterns.hiragana.findall(uw)
                if hira and len(hira[0]) == len(uw):
                    continue
                if w in patterns.english_words:
                    continue
                _record(w, u, item_id)

            for t in set(extractd.gethashtags(item)):
                _record(t, u, item_id)

            for l in set(extractd.geturls(item)):
                _record(l, u, item_id)

        # Progress indicator: users processed so far out of total.
        print('%d/%d' % (j, len(screen_names)))
    return ngram, table
if __name__ == "__main__": args = parse_args() db = Corpus(database=args.database, collection=args.items) db_stats = Corpus(database=args.database, collection=args.itemstats) try: latstats = db_stats.findsorted({}, key="id")[0]["id"] except IndexError: latstats = 0L for i, item in enumerate(db.find({ "id": { "$gt": latstats }})): words = extractd.getwords(item) messages = extractd.getmessages(item) tags = extractd.gethashtags(item) urls = extractd.geturls(item) db_stats.append({ "screen_name": item["screen_name"] , "words": words , "messages": messages , "hashtags": tags , "urls": urls , "created_at": item["created_at"] , "id": item["id"] }) print(i, item["id"])