# Example #1
# -*- coding: utf8 -*-

import traceback
from time import sleep

from MySQLServer import MySQLServer

# NOTE(review): main() also uses TwitterCrawler and TwitterError, which are
# project/third-party names not imported here -- confirm where they come from.

if __name__ == '__main__':
    # Connect to MySQL with the credentials stored in the JSON config.
    database = MySQLServer('./config/mysql_account.json')
    database.start()

    # Create the working database and switch to it.
    database.create_db('twitter')
    database.use_db('twitter')

    # Create each table from its JSON schema definition, in order.
    for table_config in ('./config/table_User.json',
                         './config/table_TimeLine.json',
                         './config/table_Image.json'):
        database.create_table(table_config)

    # Shut the connection down.
    database.stop()
def main():
    """
    Crawl one seed Twitter user plus their followers into MySQL.

    Tables involved (schemas in ./config/table_*.json):
        User      -- profile stored as TEXT, keyed by user_id
        TimeLine  -- tweets stored as TEXT, keyed by (user_id, twitter_id)
        Image     -- tweet images (not populated by this function)

    Flow:
        1. Fetch the seed user's profile and store it as TEXT.
        2. Fetch up to ``limit`` pages of the seed user's timeline
           (retweets included); tweets containing images are intended to
           have those images downloaded as well.
        3. Repeat until the follower cursor is exhausted
           (``crawler.follower_is_none()``): fetch a batch of followers,
           store each profile, then fetch up to ``_limit`` timeline pages
           per follower.  (Comment fetching is noted as an open issue.)

    NOTE(review): ``TwitterCrawler``, ``TwitterError``, ``traceback`` and
    ``sleep`` are used but not imported in this file as shown -- confirm
    they are provided elsewhere.

    :return: None; returns early if fetching the seed user fails.
    """
    # database -- connect with stored credentials and select the DB
    database = MySQLServer('./config/mysql_account.json')
    database.start()
    database.use_db('twitter')

    # crawler
    crawler = TwitterCrawler()
    user_id = '813286'  # hard-coded seed user id
    # Store the seed user's profile as TEXT.
    try:
        user = crawler.get_user(user_id=user_id)
    except:
        # Best-effort: any failure fetching the seed user aborts the crawl.
        traceback.print_exc()
        return None
    # Escape double quotes and semicolons so the raw text survives being
    # embedded in the INSERT statement.  NOTE(review): assumes get_user()
    # returns a str -- confirm; followers below are passed through str() first.
    user = user.replace('\"', '\\\"')
    user = user.replace('\;', '\\\;')
    database.insert(table_name='User', values={
        'user_id': user_id,
        'content': user
    })
    # Fetch up to `limit` pages of the seed user's timeline (retweets
    # included) and store every tweet as TEXT; tweets with images are
    # intended to have the images downloaded.
    num = 1
    limit = 20
    while num <= limit:
        # Inner loop retries the same page until one fetch succeeds.
        while True:
            twitters = None
            try:
                twitters = crawler.get_twitter_page(user_id=user_id,
                                                    page=num)
            except:
                # Transient/rate-limit failure: back off and retry this page.
                traceback.print_exc()
                sleep(65)
                continue
            # Store each tweet of this page as escaped TEXT.
            for twitter in twitters:
                twitter_str = str(twitter)
                twitter_str = twitter_str.replace('\"', '\\\"').replace('\;', '\\\;')
                twitter_dict = twitter.AsDict()
                database.insert('TimeLine', values={
                    'user_id': str(user_id),
                    'twitter_id': str(twitter_dict['id']),
                    'content': twitter_str
                })
            # An empty page means the timeline is exhausted; force the outer
            # loop to terminate after this iteration (num += 1 below pushes
            # it past `limit`).
            if len(twitters) == 0:
                num = limit
            break
        print('get twitter page : ' + str(num))
        num += 1
        sleep(65)

    # Walk the follower list batch by batch.
    while True:
        # Fetch one batch of followers and store each profile.
        print('start get follows')
        followers = crawler.get_followers(user_id=user_id)
        print('end get follows')
        for follower in followers:
            follower_str = str(follower).replace('\"', '\\\"').replace('\;', '\\\;')
            follower_dict = follower.AsDict()
            database.insert('User', values={
                'user_id': str(follower_dict['id']),
                'content': follower_str
            })

            # Fetch up to `_limit` timeline pages for this follower.
            # (Original note: comment fetching still has open issues.)
            _num = 1
            _limit = 5
            while _num <= _limit:
                # Retry loop, same pattern as the seed-user timeline above.
                while True:
                    # _twitters = None
                    try:
                        print('start get twitter page')
                        _twitters = crawler.get_twitter_page(user_id=follower_dict['id'],
                                                             page=_num)
                        print('end get twitter page')
                    except TwitterError:
                        # API-level error: give up on this follower's
                        # remaining pages (same sentinel trick as above).
                        traceback.print_exc()
                        _num = _limit
                        break
                    except:
                        # Transient failure: back off and retry this page.
                        traceback.print_exc()
                        sleep(65)
                        continue
                    # Store each tweet of this page as escaped TEXT.
                    for twitter in _twitters:
                        twitter_str = str(twitter)
                        twitter_str = twitter_str.replace('\"', '\\\"').replace('\;', '\\\;')
                        twitter_dict = twitter.AsDict()
                        database.insert('TimeLine', values={
                            'user_id': str(follower_dict['id']),
                            'twitter_id': str(twitter_dict['id']),
                            'content': twitter_str
                        })
                        # print('twitter_id : ' + str(twitter_dict['id']))
                    print('twitter num : ' + str(len(_twitters)))
                    # Empty page -> this follower's timeline is exhausted.
                    if len(_twitters) == 0:
                        _num = _limit
                    break
                print(str(follower_dict['id']) + ' get twitter page : ' + str(_num))
                _num += 1
                print('start sleep')
                sleep(65)
                print('end sleep')

        # Stop once the follower cursor reports no more batches.
        if crawler.follower_is_none():
            break