示例#1
0
文件: feature.py 项目: takayuk/says
def _record_occurrence(ngram, table, key, screen_name, item_id):
    """Register one occurrence of `key` by `screen_name` in item `item_id`.

    Makes sure `ngram[key]` exists, hands it to `utils.count` together with
    the screen name, and adds the item id to the set kept at `table[key]`.
    """
    utils.count(ngram.setdefault(key, {}), screen_name)
    table.setdefault(key, set()).add(item_id)


def feature(t_begin, t_end, screen_names):
    """Index n-grams, hashtags and URLs of the items posted by some users.

    For each screen name, `db` is queried for items whose `created_at` lies
    strictly between `t_begin` and `t_end`. The terms returned by
    `extractd.getngram`, `extractd.gethashtags` and `extractd.geturls` are
    then recorded per item.

    Args:
        t_begin: exclusive lower bound on `created_at` (sent as `$gt`).
        t_end: exclusive upper bound on `created_at` (sent as `$lt`).
        screen_names: sequence of screen names to scan; must support `len()`.

    Returns:
        A tuple `(ngram, table)` where `ngram` maps each term to the dict
        that `utils.count` updates with the screen name, and `table` maps
        each term to the set of item ids in which it was seen.
    """
    ngram = {}
    table = {}

    for j, u in enumerate(screen_names):
        query = {'created_at': {'$gt': t_begin, '$lt': t_end}, 'screen_name': u}
        for item in db.find(query):
            text = item['text']
            item_id = item['id']

            # Best effort: when the item is a reply, append the text of the
            # replied-to item(s) so n-grams are taken from both. A missing
            # reply field, or a replied-to item without 'text', just leaves
            # the text as accumulated so far.
            try:
                replied_id = item['in_reply_to_status_id']
                if replied_id:
                    for replied in db.find({'id': replied_id}):
                        text += u'。%s' % replied['text']
            except KeyError:
                pass

            for w in set(extractd.getngram(text)):
                word = unicode(w)
                # Ignore single-character terms.
                if len(word) < 2:
                    continue
                # Ignore terms whose first hiragana match spans the whole
                # term (presumably all-hiragana words; depends on
                # `patterns.hiragana`). An empty match list means the test
                # does not apply, so guard it instead of indexing blindly.
                hits = patterns.hiragana.findall(word)
                if hits and len(hits[0]) == len(word):
                    continue
                if w in patterns.english_words:
                    continue

                _record_occurrence(ngram, table, w, u, item_id)

            for t in set(extractd.gethashtags(item)):
                _record_occurrence(ngram, table, t, u, item_id)

            for l in set(extractd.geturls(item)):
                _record_occurrence(ngram, table, l, u, item_id)

        # Progress report: zero-based index of the user just processed.
        print('%d/%d' % (j, len(screen_names)))
    return ngram, table
示例#2
0
文件: extractor.py 项目: takayuk/says
if __name__ == "__main__":
    # Copy per-item extraction results from the items collection into the
    # item-stats collection, resuming after the id already stored there.

    args = parse_args()

    db = Corpus(database=args.database, collection=args.items)
    db_stats = Corpus(database=args.database, collection=args.itemstats)

    # Resume point: the id of the first record returned by findsorted
    # (presumably the most recent one; the sort order is defined by Corpus).
    try:
        latstats = db_stats.findsorted({}, key="id")[0]["id"]
    except IndexError:
        # No stats stored yet: process every item. A plain 0 is used rather
        # than the Python 2-only long literal so the file parses everywhere.
        latstats = 0

    for i, item in enumerate(db.find({"id": {"$gt": latstats}})):

        words = extractd.getwords(item)
        messages = extractd.getmessages(item)
        tags = extractd.gethashtags(item)
        urls = extractd.geturls(item)

        db_stats.append({
            "screen_name": item["screen_name"],
            "words": words,
            "messages": messages,
            "hashtags": tags,
            "urls": urls,
            "created_at": item["created_at"],
            "id": item["id"],
        })

        # Progress report: running index and the id just stored.
        print(i, item["id"])