示例#1
0
def get_training_data():
    print '获取样本...'
    Draft = namedtuple('Draft', 'id, sid, pic, snum, lnum, author, text, utime, ctime, status')
    conn = engine.connect()

    rs = conn.execute('select * from sample where status=0')
    trash_tweets = map(Draft._make, rs)

    count_trash = len(trash_tweets)
    print '垃圾样本数量'

    rs = conn.execute('select * from entry')
    snap_tweets = map(Draft._make, rs)


    tweets = trash_tweets + snap_tweets
    print '随机打乱样本顺序...'
    random.shuffle(tweets, random.random)

    print '获取特征表...'
    top = get_top_list(tweets=(t.text for t in tweets))
    words = [w for w, v in top]

    print '特征表存入redis...'
    db = redis.StrictRedis()
    db.set('features', simplejson.dumps(words))

    print '特征表写入本地文件...'
    with open('feature_words.txt', 'w') as f:
        f.writelines((w + '\n' for w in words))
        f.close()

    def build_x(text):
        text = url_re.sub('', text)
        w_list = seg.cut(text.strip())
        w_list.reverse()
        w_list = [w.encode('utf-8') for w in w_list]
        fs = []
        for w in words:
            if w in w_list:
                fs.append(1)
            else:
                fs.append(0)
        return fs

    fx = []
    fy = []
    fd = []

    print '构建fx, fy, fd'
    for t in tweets:
        features = build_x(t.text)
        fx.append(features)
        status = 1
        if not t.status:
            status = -1
        fy.append(status)
        item = (t.pic, str(t.sid))
        fd.append(item)
    return fy, fx, fd
示例#2
0
文件: cron.py 项目: senryxie/GN-Drive
def download_snap_timeline():
    page = 1
    all=[]

    tweets = client.trends__statuses(trend_name='街拍', page=page)
    while tweets:
        for i in tweets:
            id = i['id']
            pic = i.get('bmiddle_pic')
            author = i['user']['id']
            text = i['text'].encode('utf-8')
            retweet = i.get('retweeted_status')
            if retweet:
                id = retweet['id']
                text = retweet['text'].encode('utf-8')
                author = retweet['user']['id']
            if pic:
                text = text.replace("'", '"')
                pic = pic.encode('utf-8')
                feature = (id, pic, author, text)
                all.append(feature)
        page += 1
        tweets = []
        try:
            tweets = client.trends__statuses(trend_name='街拍', page=page)
        except HTTPError as e:
            if str(e.code) in ('400', '403', '401'):
                '''bad request or forbidden'''
                print '微博接口%s错误', e.code
            else:
                raise

        if page > 5:
            break

    #
    #
    #

    #mysql engine
    conn = engine.connect()

    #svm predict
    dup = 0
    selected = set()
    for t in all:
        is_ban = False
        for ban in baned_list:
            if ban in t[3]:
                is_ban = True
                break

        #是否ban掉
        if is_ban:
            continue

        rs = conn.execute('select * from draft where sid=%s' % t[0])
        rs = map(Draft._make, rs)
        if len(rs):
            dup += 1
            continue

        text = t[3]
        if predict(text):
            id, pic, author, text = t
            passed = True

            try:
                r = urllib2.urlopen(pic.replace('bmiddle', 'thumbnail'))
                im = open_pic(r.read())
                width, height = im.size
                if height / width > 1.7777 or width / height > 1.7777:
                    passed = False
            except :
                pass
                #print '抓取、分析图片异常'
                #import traceback; traceback.print_exc()
            if passed:
                selected.add(t)

    #save to draft
    count = 0
    for line in selected:
        id, pic, author, text = line
        try:
            conn.execute('insert into draft (sid, pic, author, text, create_time) \
              values(%s,"%s",%s,"%s", now())' % line)
            print '入库:', id, pic, author, text
            count += 1
        except:
            pass
            #print '重复入库:', id, pic, author, text
    conn.close()
    return len(all), len(selected), dup