示例#1
0
文件: feature.py 项目: takayuk/says
def getdf_from(items):
    """Index items by the n-gram features found in their text.

    Args:
        items: iterable of mappings; each one is read for its 'text' value
            (handed to ``extractd.getngram``) and its 'id' value.

    Returns:
        dict mapping every feature produced by ``extractd.getngram`` to the
        set of ids of the items whose text produced it.
    """
    index = {}
    for entry in items:
        for gram in extractd.getngram(entry['text']):
            # 'id' is read once per feature, so an entry that yields no
            # features is never asked for its id.
            if gram in index:
                index[gram].add(entry['id'])
            else:
                index[gram] = set([entry['id']])
    return index
示例#2
0
文件: feature.py 项目: takayuk/says
def _record_feature(ngram, table, key, user, status_id):
    """Register one sighting of `key` by `user` in the item `status_id`."""
    if key not in ngram:
        ngram[key] = {}
    # utils.count is given the per-feature dict and the user; presumably it
    # increments that user's counter -- confirm against utils.
    utils.count(ngram[key], user)
    table.setdefault(key, set()).add(status_id)


def feature(t_begin, t_end, screen_names):
    """Collect n-gram, hashtag and URL features for a set of users.

    For every name in `screen_names`, the items returned by ``db.find`` whose
    'created_at' lies strictly between `t_begin` and `t_end` are scanned.
    When an item has a truthy 'in_reply_to_status_id', the text of every item
    found with that id is appended (separated by u'。') before n-grams are
    extracted. Hashtags and URLs are extracted from the item itself.

    Args:
        t_begin: exclusive lower bound used for the 'created_at' '$gt' filter.
        t_end: exclusive upper bound used for the 'created_at' '$lt' filter.
        screen_names: sequence of 'screen_name' values to query; must support
            ``len()`` because a progress line is printed per name.

    Returns:
        A ``(ngram, table)`` tuple:
        ngram -- dict mapping each feature to the dict that ``utils.count``
                 is called on with the screen name.
        table -- dict mapping each feature to the set of item ids in which
                 the feature was seen.
    """
    ngram = {}
    table = {}

    for j, u in enumerate(screen_names):
        query = {'created_at': {'$gt': t_begin, '$lt': t_end}, 'screen_name': u}
        for item in db.find(query):
            text = item['text']
            status_id = item['id']

            # Best-effort enrichment with the replied-to text. The handler
            # covers both a missing 'in_reply_to_status_id' key and a
            # replied-to item lacking 'text'; in either case whatever text
            # has been gathered so far is used.
            try:
                replied_id = item['in_reply_to_status_id']
                if replied_id:
                    for parent in db.find({'id': replied_id}):
                        text += u'。%s' % parent['text']
            except KeyError:
                pass

            for w in set(extractd.getngram(text)):
                word = unicode(w)
                if len(word) < 2:
                    continue
                # Skip words whose first hiragana match spans the whole word.
                # An empty result from findall means nothing matched, so the
                # word is kept instead of failing on the [0] lookup.
                kana_runs = patterns.hiragana.findall(word)
                if kana_runs and len(kana_runs[0]) == len(word):
                    continue
                if w in patterns.english_words:
                    continue
                _record_feature(ngram, table, w, u, status_id)

            for t in set(extractd.gethashtags(item)):
                _record_feature(ngram, table, t, u, status_id)

            for l in set(extractd.geturls(item)):
                _record_feature(ngram, table, l, u, status_id)

        print('%d/%d' % (j, len(screen_names)))
    return ngram, table