Exemplo n.º 1
0
    def run(self):
        """Process queued history tasks from ``cache_history`` forever.

        Each loop iteration:

        1. Selects the oldest ``cache_history`` row whose ``is_dealing`` is
           null (sleeping one second when there is none) and stamps
           ``is_dealing`` for that ``container_id`` with the current time.
        2. Reads the matching pieces from the MongoDB collection
           ``microblog_spider.assemble_factory`` and compares their count
           with the ``total_num`` stored on the first piece.
        3. When enough pieces are present, drops duplicate ``current_id``
           pieces, concatenates their ``data`` lists, inserts a row into
           ``accuracy_table`` and, if the user has no ``update_time`` yet,
           stores the data through ``save_data_seperately`` and updates
           ``user_info_table``.
        4. Otherwise (incomplete package, or a preparation step failed)
           only resets ``isGettingBlog`` for the user.
        5. Finally removes the pieces from ``assemble_factory`` and deletes
           the ``cache_history`` row.

        The method never returns.
        """
        # One client for the whole worker: building a new MongoClient for
        # every task would leave an unclosed client behind per processed row.
        mongo_client = MongoClient('localhost', 27017)
        assemble_table = mongo_client['microblog_spider'].assemble_factory

        while True:
            start_time = time.time()
            dbi = MySQL_Interface()
            col_info = dbi.get_col_name('cache_history')
            query = 'select * from cache_history where is_dealing is null order by checkin_timestamp limit 1'

            mysql_res = dbi.select_asQuery(query)
            if not mysql_res:
                # Nothing queued: sleep one second and poll again.
                time.sleep(1)
                continue

            mysql_res = mysql_res[0]

            # TODO: remove this debug output
            print('debug->start to deal with a new task')
            print('debug->mysql_res: ')
            print(mysql_res)

            container_id = mysql_res[col_info.index('container_id')]
            print('debug->container_id: {cid}'.format(cid=container_id))
            latest_time = mysql_res[col_info.index('latest_time')]
            time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(time.time()))
            query = "update cache_history set is_dealing='{time}' where container_id={cid}".format(
                time=time_stick, cid=container_id)
            # TODO: remove this debug output
            print('debug->query1 : {q}'.format(q=query))
            dbi.update_asQuery(query)

            # Query used on every path that does not store the data, so the
            # user is no longer flagged as "being fetched".
            release_query = "update user_info_table set isGettingBlog=null where container_id='{cid}'" \
                .format(cid=container_id)

            pieces = list(assemble_table.find(
                {'container_id': container_id},
                {'current_id': 1, 'total_num': 1}))
            id_list = [piece['current_id'] for piece in pieces]
            # No piece at all means there is no total_num to read; treat the
            # package as incomplete instead of failing on an empty list.
            num = int(pieces[0]['total_num']) if pieces else 0
            # TODO: remove this debug output
            print('debug->id_list_len: {len}'.format(len=len(id_list)))
            print('debug->num: {n}'.format(n=num))

            # Check whether every piece of the package has arrived.
            check_state = bool(pieces) and len(id_list) >= num
            if not check_state:
                print(
                    'server->HistoryReport:The package is not complete, retry to catch data'
                )

            # Bound on every path so the summary print below never sees an
            # unbound name or the previous task's data.
            data_final = []

            if check_state:
                # All pieces collected: move the data to the formal MongoDB
                # collection and update update_time / latest_blog in MySQL.
                prepared = True
                user_info = None
                col_name = None
                data_list = None

                # Load the user's row from MySQL.
                try:
                    query = "select * from user_info_table where container_id='{cid}'" \
                        .format(cid=container_id)
                    user_info = dbi.select_asQuery(query)[0]
                    # TODO: remove this debug output
                    print('task {cid} :debug->query2: {q}'.format(
                        q=query, cid=container_id))
                    print('task {cid} debug->user_info:'.format(
                        cid=container_id))
                    print(user_info)
                    col_name = dbi.get_col_name('user_info_table')
                except Exception as e:
                    prepared = False
                    print(
                        'task {cid} :Error:server-HistoryReturn:'
                        'No such user in MySQL.user_info_table,Reason:'.format(
                            cid=container_id))
                    print(e)

                # Pull the pieces' payloads out of the assemble factory.
                if prepared:
                    try:
                        documents = list(assemble_table.find(
                            {'container_id': container_id},
                            {'data': 1, 'current_id': 1}))
                        data_list = [doc['data'] for doc in documents]
                        id_list = [doc['current_id'] for doc in documents]
                        documents = None
                        # TODO: remove this debug output
                        print('task {cid} debug->datalist: {len}'.format(
                            len=len(data_list), cid=container_id))
                    except Exception as e:
                        prepared = False
                        print(
                            'Error:server-HistoryReturn:'
                            'Unable to get data from MongoDB, assemble factory,Reason:'
                        )
                        print(e)

                # More pieces than expected means duplicates: keep the first
                # piece seen for each current_id (compared as strings).
                if prepared and len(id_list) > num:
                    seen_ids = set()
                    unique_data_list = []
                    for piece_id, piece_data in zip(id_list, data_list):
                        key = str(piece_id)
                        if key in seen_ids:
                            continue
                        seen_ids.add(key)
                        unique_data_list.append(piece_data)
                    data_list = unique_data_list

                # Concatenate the pieces into one list.
                if prepared:
                    try:
                        for piece_data in data_list:
                            data_final.extend(piece_data)
                        # TODO: remove this debug output
                        print('task {cid} debug->数据拼接完毕,len {len}'.format(
                            len=len(data_final), cid=container_id))
                    except Exception as e:
                        prepared = False
                        data_final = []
                        print(
                            'Error:server-HistoryReport:'
                            'Unable to contact the pieces of information,Reason:')
                        print(e)

                if prepared:
                    # Record this run in accuracy_table for later analysis.
                    blog_len = len(data_final)
                    wanted_blog_len = user_info[col_name.index('blog_num')]
                    time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime(time.time()))
                    # A zero or missing blog_num has no meaningful ratio;
                    # skip the statistics row instead of dividing by zero.
                    if wanted_blog_len:
                        blog_accuracy = blog_len / wanted_blog_len
                        query = "insert into accuracy_table values ({acc},'{t_s}',{num}) ;" \
                            .format(acc=blog_accuracy, t_s=time_stick, num=wanted_blog_len)
                        dbi.insert_asQuery(query)

                    # Store the data in MongoDB and update MySQL.
                    try:
                        if not user_info[col_name.index('update_time')]:
                            # Save into the formal MongoDB collection.
                            save_data_seperately(data_final)
                            print(
                                'task {cid} Success: Data has saved in Mongodb, size is {size}'
                                .format(size=sys.getsizeof(data_final),
                                        cid=container_id))

                            # Record the key fields in MySQL and clear the
                            # isGettingBlog flag.
                            query = "update user_info_table set " \
                                    "update_time='{up_time}'," \
                                    "latest_blog='{latest_blog}'," \
                                    "isGettingBlog=null " \
                                    "where container_id='{cid}';" \
                                .format(up_time=time_stick, latest_blog=latest_time, cid=container_id)
                            dbi.update_asQuery(query)
                            print(
                                'task {cid} Success: insert user into MongoDB, the num of data is {len}'
                                .format(len=blog_len, cid=container_id))
                        else:
                            dbi.update_asQuery(release_query)

                    except Exception as e:
                        print('task {cid} Error:server->HistoryReport:'
                              'Reason:'.format(cid=container_id))
                        print(e)
                else:
                    # A preparation step failed: nothing trustworthy to
                    # store, so only release the user.
                    dbi.update_asQuery(release_query)
            else:
                # Package incomplete: release the user; the pieces are
                # discarded below.
                print(
                    'task {cid} :Error: the package is not complete ,{a} of {b}'
                    .format(a=len(id_list), b=num, cid=container_id))
                dbi.update_asQuery(release_query)

            # Remove the pieces from the assemble factory.
            assemble_table.remove({'container_id': container_id})
            print(
                'task {cid} Success: Data has been removed from assemble factory'
                .format(cid=container_id))

            # Delete the cache_history row: the task has been handled.
            query = "delete from cache_history where container_id='{cid}'" \
                .format(cid=container_id)
            dbi.update_asQuery(query)

            end_time = time.time()
            deal_time = end_time - start_time
            print(
                'task {cid} :Success : the user {cid} is completed, length is {len}, use {t} seconds'
                .format(cid=container_id,
                        len=len(data_final),
                        t=deal_time))
    def post(self):
        """Receive one user's blog history from a client and store it.

        Reads the request arguments ``user_history``, ``latest_time``,
        ``latest_timestamp``, ``container_id`` and ``isDivided`` (plus
        ``block_num`` and ``current_block`` when ``isDivided`` is 1), answers
        the client immediately, then:

        * loads the user's row from ``user_info_table``;
        * if the row has no ``update_time``, saves the history through
          ``save_data_inMongo`` and updates ``update_time`` /
          ``latest_blog``; otherwise only resets ``isGettingBlog``;
        * inserts a row into ``accuracy_table``.

        Every failure is printed; nothing is raised to the caller.
        """
        # Read the data sent by the client.
        try:
            user_history = self.get_argument('user_history')
            latest_time = self.get_argument('latest_time')
            # Fetched only so the request is rejected when it is missing.
            self.get_argument('latest_timestamp')
            container_id = self.get_argument('container_id')
            isDivided = self.get_argument('isDivided')
            # SECURITY NOTE(review): eval() executes arbitrary code taken
            # from the HTTP request. Left unchanged because the client's
            # payload format is not visible here; switch to a safe parser
            # (e.g. json.loads or ast.literal_eval) once it is confirmed.
            user_history = eval(user_history)
            if isDivided == 1 or isDivided == '1':
                # Fetched only so the request is rejected when they are
                # missing; the values are not used below.
                self.get_argument('block_num')
                self.get_argument('current_block')
            self.write('success to return user history')
            self.finish()
            print('Success: to get data from web')
        except Exception as e:
            self.write('fail to return user history')
            self.finish()
            print('Error:server-HistoryReturn:'
                  'Unable to get value from http package,Reason:')
            print(e)
            return

        # Connect to MySQL; without a connection nothing below can work.
        try:
            dbi = MySQL_Interface()
        except Exception as e:
            print('Error:server-HistoryReturn:'
                  'Unable to connect to MySQL')
            print(e)
            return

        # Load this user's row from MySQL.
        try:
            query = "select * from user_info_table where container_id='{cid}'" \
                .format(cid=container_id)
            user_info = dbi.select_asQuery(query)[0]
            col_name = dbi.get_col_name('user_info_table')
        except Exception as e:
            print('Error:server-HistoryReturn:'
                  'No such user in MySQL.user_info_table,Reason:')
            print(e)
            return

        # Store the data in MongoDB, then record the result in MySQL.
        try:
            blog_len = len(user_history)
            wanted_blog_len = user_info[col_name.index('blog_num')]
            time_stick = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            if not user_info[col_name.index('update_time')]:
                save_data_inMongo(user_history)
                # TODO: this update deliberately leaves isGettingBlog
                # untouched; an earlier variant also set isGettingBlog=null.
                query = "update user_info_table set " \
                        "update_time='{up_time}'," \
                        "latest_blog='{latest_blog}' " \
                        "where container_id='{cid}';" \
                    .format(up_time=time_stick, latest_blog=latest_time, cid=container_id)
                dbi.update_asQuery(query)
            else:
                query = "update user_info_table set isGettingBlog=null where container_id='{cid}'" \
                    .format(cid=container_id)
                dbi.update_asQuery(query)

            # A zero or missing blog_num has no meaningful ratio; skip the
            # statistics row instead of letting a division error abort the
            # whole step.
            if wanted_blog_len:
                blog_accuracy = blog_len / wanted_blog_len
                # NOTE(review): other code in this file inserts three values
                # into accuracy_table; confirm the column count against the
                # table schema.
                query = "insert into accuracy_table values ({acc},'{t_s}') ;" \
                    .format(acc=blog_accuracy, t_s=time_stick)
                dbi.insert_asQuery(query)

            print('Success: insert user into MongoDB, the num of data is {len}'
                  .format(len=blog_len))
        except Exception as e:
            print('Error:server-HistoryReturn:'
                  'Unable to update data in MySQL.user_info_tabe,Reason:')
            print(e)
    def run(self):
        """Process queued history tasks from ``cache_history`` forever.

        Each loop iteration:

        1. Selects the oldest ``cache_history`` row whose ``is_dealing`` is
           null (sleeping one second when there is none) and stamps
           ``is_dealing`` for that ``container_id`` with the current time.
        2. Reads the matching pieces from the MongoDB collection
           ``microblog_spider.assemble_factory`` and compares their count
           with the ``total_num`` stored on the first piece.
        3. When enough pieces are present, drops duplicate ``current_id``
           pieces, concatenates their ``data`` lists, inserts a row into
           ``accuracy_table`` and, if the user has no ``update_time`` yet,
           stores the data through ``save_data_seperately`` and updates
           ``user_info_table``.
        4. Otherwise (incomplete package, or a preparation step failed)
           only resets ``isGettingBlog`` for the user.
        5. Finally removes the pieces from ``assemble_factory`` and deletes
           the ``cache_history`` row.

        The method never returns.
        """
        # One client for the whole worker: building a new MongoClient for
        # every task would leave an unclosed client behind per processed row.
        mongo_client = MongoClient('localhost', 27017)
        assemble_table = mongo_client['microblog_spider'].assemble_factory

        while True:
            start_time = time.time()
            dbi = MySQL_Interface()
            col_info = dbi.get_col_name('cache_history')
            query = 'select * from cache_history where is_dealing is null order by checkin_timestamp limit 1'

            mysql_res = dbi.select_asQuery(query)
            if not mysql_res:
                # Nothing queued: sleep one second and poll again.
                time.sleep(1)
                continue

            mysql_res = mysql_res[0]

            # TODO: remove this debug output
            print('debug->start to deal with a new task')
            print('debug->mysql_res: ')
            print(mysql_res)

            container_id = mysql_res[col_info.index('container_id')]
            print('debug->container_id: {cid}'.format(cid=container_id))
            latest_time = mysql_res[col_info.index('latest_time')]
            time_stick = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            query = "update cache_history set is_dealing='{time}' where container_id={cid}".format(time=time_stick, cid=container_id)
            # TODO: remove this debug output
            print('debug->query1 : {q}'.format(q=query))
            dbi.update_asQuery(query)

            # Query used on every path that does not store the data, so the
            # user is no longer flagged as "being fetched".
            release_query = "update user_info_table set isGettingBlog=null where container_id='{cid}'" \
                .format(cid=container_id)

            pieces = list(assemble_table.find({'container_id': container_id}, {'current_id': 1, 'total_num': 1}))
            id_list = [piece['current_id'] for piece in pieces]
            # No piece at all means there is no total_num to read; treat the
            # package as incomplete instead of failing on an empty list.
            num = int(pieces[0]['total_num']) if pieces else 0
            # TODO: remove this debug output
            print('debug->id_list_len: {len}'.format(len=len(id_list)))
            print('debug->num: {n}'.format(n=num))

            # Check whether every piece of the package has arrived.
            check_state = bool(pieces) and len(id_list) >= num
            if not check_state:
                print('server->HistoryReport:The package is not complete, retry to catch data')

            # Bound on every path so the summary print below never sees an
            # unbound name or the previous task's data.
            data_final = []

            if check_state:
                # All pieces collected: move the data to the formal MongoDB
                # collection and update update_time / latest_blog in MySQL.
                prepared = True
                user_info = None
                col_name = None
                data_list = None

                # Load the user's row from MySQL.
                try:
                    query = "select * from user_info_table where container_id='{cid}'" \
                        .format(cid=container_id)
                    user_info = dbi.select_asQuery(query)[0]
                    # TODO: remove this debug output
                    print('task {cid} :debug->query2: {q}'.format(q=query, cid=container_id))
                    print('task {cid} debug->user_info:'.format(cid=container_id))
                    print(user_info)
                    col_name = dbi.get_col_name('user_info_table')
                except Exception as e:
                    prepared = False
                    print('task {cid} :Error:server-HistoryReturn:'
                          'No such user in MySQL.user_info_table,Reason:'.format(cid=container_id))
                    print(e)

                # Pull the pieces' payloads out of the assemble factory.
                if prepared:
                    try:
                        documents = list(assemble_table.find({'container_id': container_id}, {'data': 1, 'current_id': 1}))
                        data_list = [doc['data'] for doc in documents]
                        id_list = [doc['current_id'] for doc in documents]
                        documents = None
                        # TODO: remove this debug output
                        print('task {cid} debug->datalist: {len}'.format(len=len(data_list), cid=container_id))
                    except Exception as e:
                        prepared = False
                        print('Error:server-HistoryReturn:'
                              'Unable to get data from MongoDB, assemble factory,Reason:')
                        print(e)

                # More pieces than expected means duplicates: keep the first
                # piece seen for each current_id (compared as strings).
                if prepared and len(id_list) > num:
                    seen_ids = set()
                    unique_data_list = []
                    for piece_id, piece_data in zip(id_list, data_list):
                        key = str(piece_id)
                        if key in seen_ids:
                            continue
                        seen_ids.add(key)
                        unique_data_list.append(piece_data)
                    data_list = unique_data_list

                # Concatenate the pieces into one list.
                if prepared:
                    try:
                        for piece_data in data_list:
                            data_final.extend(piece_data)
                        # TODO: remove this debug output
                        print('task {cid} debug->数据拼接完毕,len {len}'.format(len=len(data_final), cid=container_id))
                    except Exception as e:
                        prepared = False
                        data_final = []
                        print('Error:server-HistoryReport:'
                              'Unable to contact the pieces of information,Reason:')
                        print(e)

                if prepared:
                    # Record this run in accuracy_table for later analysis.
                    blog_len = len(data_final)
                    wanted_blog_len = user_info[col_name.index('blog_num')]
                    time_stick = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                    # A zero or missing blog_num has no meaningful ratio;
                    # skip the statistics row instead of dividing by zero.
                    if wanted_blog_len:
                        blog_accuracy = blog_len / wanted_blog_len
                        query = "insert into accuracy_table values ({acc},'{t_s}',{num}) ;" \
                            .format(acc=blog_accuracy, t_s=time_stick, num=wanted_blog_len)
                        dbi.insert_asQuery(query)

                    # Store the data in MongoDB and update MySQL.
                    try:
                        if not user_info[col_name.index('update_time')]:
                            # Save into the formal MongoDB collection.
                            save_data_seperately(data_final)
                            print('task {cid} Success: Data has saved in Mongodb, size is {size}'
                                  .format(size=sys.getsizeof(data_final), cid=container_id))

                            # Record the key fields in MySQL and clear the
                            # isGettingBlog flag.
                            query = "update user_info_table set " \
                                    "update_time='{up_time}'," \
                                    "latest_blog='{latest_blog}'," \
                                    "isGettingBlog=null " \
                                    "where container_id='{cid}';" \
                                .format(up_time=time_stick, latest_blog=latest_time, cid=container_id)
                            dbi.update_asQuery(query)
                            print('task {cid} Success: insert user into MongoDB, the num of data is {len}'
                                  .format(len=blog_len, cid=container_id))
                        else:
                            dbi.update_asQuery(release_query)

                    except Exception as e:
                        print('task {cid} Error:server->HistoryReport:'
                              'Reason:'.format(cid=container_id))
                        print(e)
                else:
                    # A preparation step failed: nothing trustworthy to
                    # store, so only release the user.
                    dbi.update_asQuery(release_query)
            else:
                # Package incomplete: release the user; the pieces are
                # discarded below.
                print('task {cid} :Error: the package is not complete ,{a} of {b}'
                      .format(a=len(id_list), b=num, cid=container_id))
                dbi.update_asQuery(release_query)

            # Remove the pieces from the assemble factory.
            assemble_table.remove({'container_id': container_id})
            print('task {cid} Success: Data has been removed from assemble factory'
                  .format(cid=container_id))

            # Delete the cache_history row: the task has been handled.
            query = "delete from cache_history where container_id='{cid}'" \
                .format(cid=container_id)
            dbi.update_asQuery(query)

            end_time = time.time()
            deal_time = end_time - start_time
            print('task {cid} :Success : the user {cid} is completed, length is {len}, use {t} seconds'
                  .format(cid=container_id, len=len(data_final), t=deal_time))