Пример #1
0
    def receive_seed_and_start_crawl(self, seed):
        """Crawl one seed: obtain a cookie, request the seed's URL, report back.

        Steps, as implemented below:
          1. Feed the seed back with ``cookie=False``; this doubles as the
             request for a cookie.
          2. Block on ``wait_for_msg_long(ssn_que_p)`` until a cookie arrives.
          3. Request the URL through ``self.user_define_request``.
          4. With usable data: feed back, store the data under ``'data'`` and
             push the JSON-encoded seed onto ``psm_que``.
             Otherwise: set ``'cookie_status'`` to 1, attach the cookie that
             was used and feed the seed back.

        :param seed: mapping providing at least ``'url'``; updated in place.
        """
        print('\n开始抓取')
        target_url = seed.get('url')

        # This feedback is what asks for a cookie.
        print('请求种子和cookie')
        self.feed_back_seed(seed, cookie=False)
        cookie = wait_for_msg_long(ssn_que_p)

        data = self.user_define_request(target_url, cookie)

        # Empty data or the ['null'] marker both count as "nothing usable".
        if not data or data == ['null']:
            # Report the cookie as invalid and say which cookie it was.
            seed.update({'cookie_status': 1, 'cookie': cookie})
            self.feed_back_seed(seed, cookie=True)
            print('完成反馈')
            return

        # Feed back first, then attach the data and queue it for persistence.
        self.feed_back_seed(seed, cookie=True)
        seed.update({'data': data})
        redis_cli.lpush(psm_que, dumps_json(seed))
        print('有数据,放入持久化队列')
Пример #2
0
    def lets_do_spider(self):
        """Run the three lookups in a worker pool and return the merged JSON (v2.0).

        Reads the ``pname`` and ``cardNum`` request arguments, submits one
        ``run(duty, pname, cardNum)`` job per duty to a pool of three workers,
        waits for all of them, and stores each job's result under
        ``api_info['data'][duty]``.

        :return: ``dumps_json`` of a deep copy of ``self.api_demo`` with the
            three results filled in.
        """
        api_info = deepcopy(self.api_demo)
        # Argument validation is omitted here.
        pname = self.get_argument('pname')
        card_num = self.get_argument('cardNum')

        duties = ('zhixing', 'shixin', 'baidu')

        # Fan the duties out across the worker pool.
        workers = Pool(3)
        pending = [
            workers.apply_async(run, (duty, pname, card_num))
            for duty in duties
        ]
        workers.close()
        workers.join()

        # Collect each result under the key of the duty that produced it.
        for duty, async_result in zip(duties, pending):
            api_info['data'][duty] = async_result.get()

        return dumps_json(api_info)
Пример #3
0
    def lets_fuck_recycle(self, data):
        """Re-request previously failed records (the "twice-cooked pork" pass).

        For every record the request parameters are rebuilt from the record's
        last element and a GET request is issued.  A response that passes
        ``verify_json_text`` is collected and appended to this spider's info
        file; otherwise ``feedback()`` and ``pop_captcha_info()`` are called
        and the record is kept for another round.

        :param data: iterable of records; ``each[-1]`` is handed to
            ``construct_params_info``.
        :return: ``(info, recycle)`` - the verified JSON dicts and the records
            that failed again.
        """
        info = []
        recycle = []

        info_file = config.info_file.get(self.name)
        url_info = config.url_info_dict.get(self.name)

        # Work on a copy: updating ``config.headers_info`` in place would leak
        # this spider's headers into every other user of the shared config.
        headers_info = dict(config.headers_info)
        # A spider without specific headers simply uses the shared ones.
        headers_info.update(config.headers_info_dict.get(self.name) or {})

        api = RequestAPI()

        for each in data:
            params_info = self.construct_params_info(each[-1])
            json_text = api.receive_and_request(url=url_info,
                                                headers=headers_info,
                                                params=params_info,
                                                method='GET')
            # Check whether the response actually carries data.
            js_dict = self.verify_json_text(json_text)
            if js_dict is None:
                self.feedback()
                self.pop_captcha_info()
                recycle.append(each)
                continue

            # Hand the data back and record it on disk.
            info.append(js_dict)
            write_2_file(info_file, dumps_json(js_dict))

        return info, recycle
Пример #4
0
 def push_seed_2_queue(self, seed):
     """Push the JSON-encoded ``seed`` onto ``config.task_que``.

     :param seed: the seed to enqueue.
     :return: None
     """
     target_que = config.task_que
     payload = dumps_json(seed)
     redis_cli.lpush(target_que, payload)
Пример #5
0
 def put_cookie_2_que(self, cookie):
     """Push a cookie onto ``config.ssn_2_slv``.

     The cookie is decoded with ``loads_json`` and re-encoded with
     ``dumps_json`` before it is pushed.

     :param cookie: JSON text of the cookie.
     """
     target_que = config.ssn_2_slv
     payload = dumps_json(loads_json(cookie))
     redis_cli.lpush(target_que, payload)
Пример #6
0
 def sendall(self, msg, ip, port=9000, close=True):
     """Send ``msg`` as JSON over a new TCP connection to ``(ip, port)``.

     The JSON text is terminated with ``\\r\\n\\r\\n``.

     :param msg: object serialisable by ``utils.dumps_json``.
     :param ip: host to connect to.
     :param port: TCP port to connect to.
     :param close: when true, the socket is handed to
         ``utils.close_connection`` after sending.
     :return: the socket on success, or ``None`` when connecting or sending
         fails with ``TimeoutError``/``OSError``.
     """
     sock = None
     try:
         sock = socket.socket()
         sock.connect((ip, port))
         sock.sendall(utils.dumps_json(msg).encode() + b'\r\n\r\n')
         if close:
             utils.close_connection(sock)
         return sock
     except (TimeoutError, OSError):
         # Best effort: failures are not reported, but the half-opened socket
         # must not be leaked.
         if sock is not None:
             sock.close()
         return None
Пример #7
0
    def feed_back_seed(self, ctx, session, seed):
        """Push the JSON-encoded ``ctx`` to the seed and/or session queues.

        * ``session`` truthy: ``ctx`` is pushed onto ``slv_2_ssn``.
        * ``seed`` truthy, or ``session`` falsy: ``ctx`` is pushed onto
          ``slv_2_sed`` (so with neither flag set it still goes to
          ``slv_2_sed``).

        When both queues are targeted, ``slv_2_sed`` is written first.

        :param ctx: object to serialise with ``dumps_json``.
        :param session: whether session management should be notified.
        :param seed: whether seed management should be notified.
        """
        if seed or not session:
            redis_cli.lpush(slv_2_sed, dumps_json(ctx))
        if session:
            redis_cli.lpush(slv_2_ssn, dumps_json(ctx))


# if __name__ == '__main__':
#     seed = {"brand_id": "1199", "brand": "奥迪", "serise_id": "2614", "serise": "奥迪A5", "p_type": "合资", "url": "https://www.guazi.com/xinyang/dealrecord?tag_id=22288&date=2017100", "check_city": "xinyang", "date": "2018-1", "cookie": {}, "data": [], "cookie_status": 0, "epoh": 0}
#     cookie = {"clueSourceCode": "%2A%2300", "preTime": "%7B%22last%22%3A1537928136%2C%22this%22%3A1537928136%2C%22pre%22%3A1537928136%7D", "GZ_TOKEN": "ef52toYlCiG36xYV8f3011%2BZVJgkcTK8eTkkn31WYGulmX9gKIByhmHZp1d6sg%2BtwJ3L0CbW2avGHetiKQLSM5EvM90l2XbOHFMZs97irvp8flsdbMTJlK1okNg8BAtx6RkhoQ%2BhbwYPAaLDLw", "guaZiUserInfo": "0MSnBkg0hdYQNXvlLOYi2", "userid": "620499844"}
#     sh = SpiderHandler('1')
#     # sh.receive_seed_and_start_crawl(seed)
#     sh.demo(seed, cookie)
Пример #8
0
    def receive_seed_and_start_crawl(self, seed):
        """Crawl one seed: obtain a cookie, request the seed's URL, report back.

        Outcome handling, by the value returned from ``user_define_request``:
          * ``['redirect']`` - the cookie is treated as invalid: a deep copy of
            the seed gets ``'cookie_status'`` 1 and the cookie attached, and is
            fed back to both session and seed management.
          * ``['null']`` or ``[]`` - no data: the seed is fed back to seed
            management only.
          * anything else - the seed is fed back to seed management, the data
            is stored under ``'data'`` and the seed is pushed onto
            ``slv_2_psm``.

        :param seed: mapping providing at least ``'url'``; updated in place
            when data was found.
        """
        print('\nspider获取任务')
        target_url = seed.get('url')

        # The session-only feedback is what asks for a cookie.
        print('请求种子和cookie')
        self.feed_back_seed(seed, session=True, seed=False)

        # Block until a cookie arrives.
        cookie = wait_for_msg_long(ssn_2_slv)

        # Request/parse helper created for this spider's mark.
        data = RequestModel(self.mark).user_define_request(target_url, cookie)

        if data == ['redirect']:
            # Invalid cookie: both queues must hear about it, including which
            # cookie was used.
            expired = deepcopy(seed)
            expired.update({'cookie_status': 1, 'cookie': cookie})
            self.feed_back_seed(expired, session=True, seed=True)
            print('cookie失效,完成反馈\n')
            return

        # In every other case the cookie is still considered valid, so only
        # seed management is notified.
        self.feed_back_seed(seed, session=False, seed=True)

        if data == ['null'] or data == []:
            print('没有数据, 完成反馈\n')
            return

        # Attach the data and queue the seed for persistence.
        seed.update({'data': data})
        print('有数据,放入持久化队列\n')
        redis_cli.lpush(slv_2_psm, dumps_json(seed))
Пример #9
0
    def discover(self, localhost_only=False):
        """Announce this node with a ``DISCOVER`` (join) datagram.

        :param localhost_only: when true, the datagram is sent to every port
            in 8000-8010 on ``127.0.0.1`` except this node's own port;
            otherwise it is broadcast to ``255.255.255.255:8081``.
        """
        broadcast_msg = {
            'operation': 'DISCOVER',
            'join': True,
            'sender': list(self.node),
            'key': utils.generate_random_id(),
        }
        payload = utils.dumps_json(broadcast_msg).encode()

        # The socket is only used for sending; close it deterministically
        # instead of leaving it to the garbage collector.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
            if localhost_only:
                for port in range(8000, 8011):
                    if port != self.node.port:
                        sock.sendto(payload, ('127.0.0.1', port))
            else:
                sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
                sock.sendto(payload, ('255.255.255.255', 8081))
Пример #10
0
 def parase_html(self, html):
     """Extract result records from a JSON response and persist each one.

     The response is decoded with ``loads_json``.  When its ``'data'`` entry
     is a list, every item found under each entry's ``'result'`` is collected
     and appended to ``self.baidu_list`` via ``write_2_file``.

     :param html: JSON text of the response.
     :return: ``(info, total_num)`` - the collected records, and the
         ``'dispNum'`` of the last ``data`` entry (1 when absent or when there
         was nothing to parse).
     """
     info = []
     total_num = 1

     js_dict = loads_json(html)
     if js_dict is None:
         return info, total_num

     data = js_dict.get('data')
     if not isinstance(data, list):
         return info, total_num

     for entry in data:
         # An entry without 'result' contributes no records instead of
         # raising TypeError on iteration.
         for each in entry.get('result') or []:
             info.append(each)
             # Persist the record.
             write_2_file(self.baidu_list, dumps_json(each))
         # Used by the caller to decide whether there is a next page.
         total_num = entry.get('dispNum', 1)
     return info, total_num
Пример #11
0
def insert_seed_save(url, brand, serise, city, date, brand_id, serise_id, p_type):
    """Build a seed from ``seed_demo`` and append it to ``seed_file``.

    A deep copy of ``seed_demo`` is filled with the given values and written
    as JSON through ``write_2_file``.

    :param url: stored under ``'url'``.
    :param brand: stored under ``'brand'``.
    :param serise: stored under ``'serise'``.
    :param city: stored under ``'check_city'``.
    :param date: stored under ``'date'``.
    :param brand_id: stored under ``'brand_id'``.
    :param serise_id: stored under ``'serise_id'``.
    :param p_type: stored under ``'p_type'``.
    """
    new_seed = deepcopy(seed_demo)
    new_seed.update(
        url=url,
        brand_id=brand_id,
        brand=brand,
        serise_id=serise_id,
        serise=serise,
        check_city=city,
        date=date,
        p_type=p_type,
    )
    write_2_file(seed_file, dumps_json(new_seed))
Пример #12
0
def start_node_in_order(cmd):
    """Queue the start command for one management node and wait for feedback.

    The command payload ``{'command': cmd}`` is pushed onto
    ``config.task_que``; ``wait_feed_back()`` is then called before returning.

    :param cmd: one of ``'ssnm'``, ``'sedm'`` or ``'psm'``.
    """
    que = config.task_que
    display_names = (
        ('ssnm', 'SessionMangement'),
        ('sedm', 'SeedMangement'),
        ('psm', 'PersistenceMangement'),
    )
    order = {
        key: {'cmd': {'command': key}, 'name': name}
        for key, name in display_names
    }

    entry = order.get(cmd)
    print('启动\t{0}'.format(entry.get('name')))
    redis_cli.lpush(que, dumps_json(entry.get('cmd')))
    # Wait for the node's feedback.
    wait_feed_back()
    print('完成启动')
Пример #13
0
def lets_get_cookie(cookies):
    """Pick the login cookies out of ``cookies`` and save them.

    Cookies named ``userid``, ``guaZiUserInfo`` or ``GZ_TOKEN`` are collected
    as ``{name: value}``.  Finding at least one of them is taken as proof of
    a successful login, and the collected values are appended as JSON to
    ``DB/user_info.txt``; otherwise only a failure message is printed.

    :param cookies: iterable of mappings with ``'name'`` and ``'value'``.
    """
    wanted = ('userid', 'guaZiUserInfo', 'GZ_TOKEN')
    user_info = {
        key: cookie.get('value')
        for cookie in cookies
        for key in wanted
        if cookie.get('name') == key
    }

    if not user_info:
        print('登录失败,未能保存用户cookie信息')
        return

    # Save the account.
    write_2_file('DB/user_info.txt', dumps_json(user_info))
    print('该用户信息已经保存')
Пример #14
0
 def __delete_cookie(self, user_id):
     """Remove one user's cookie by rewriting the cookie file without it.

     Every stored cookie is decoded; those whose ``'userid'`` differs from
     ``user_id`` are kept and written back to ``config.user_info_file`` after
     ``initial_file`` has been called on it (presumably resetting the file -
     confirm against ``initial_file``).

     :param user_id: the ``userid`` of the cookie to delete.
     """
     survivors = [
         parsed
         for parsed in map(loads_json, self.load_cookies_list())
         if not parsed.get('userid') == user_id
     ]

     # Rewrite the file with the remaining cookies.
     initial_file(config.user_info_file)
     for parsed in survivors:
         write_2_file(config.user_info_file, dumps_json(parsed))
Пример #15
0
def listn_the_psm_que():
    """Announce start-up, then poll ``psm_que`` forever and persist each message.

    On start ``{'psm': 'done'}`` is pushed onto ``config.task_que_fb``.  The
    loop then checks the queue every 0.1 s; whenever it exists, one message is
    popped, decoded and handed to ``do_persistence`` together with a freshly
    built token set.  This function never returns.
    """
    print('持久化队列启动')
    # Report that this worker has started.
    feedback_que = config.task_que_fb
    started = dumps_json({'psm': 'done'})
    redis_cli.lpush(feedback_que, started)

    while True:
        if not redis_cli.exists(psm_que):
            time.sleep(0.1)
            continue

        # Something is waiting: build the token set, then take one message.
        token_set = make_set(token, blank='', index='')
        raw = redis_cli.rpop(psm_que)
        seed = loads_json(translate_2_json_dict(raw))
        stamp = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
        print('{0}\t收到数据'.format(stamp))
        # Persist it.
        do_persistence(seed, token_set)
        time.sleep(0.1)
Пример #16
0
    def seed_main_logic(self):
        """Main loop of seed management; never returns.

        Start-up: report ``{'sedm': 'done'}`` on ``config.task_que_fb``, call
        ``update_brands_serise()``, produce the seeds and push an initial
        batch with ``decide_push_seed_2_queue(0)``.

        Loop: wait for feedback on ``config.slv_2_sed``.  When
        ``deal_feed_back`` returns a truthy value, the seed's
        ``'cookie_status'`` is reset to 0 and the seed is queued again;
        otherwise a new seed is released with ``decide_push_seed_2_queue(1)``.
        """
        print('seed管理已启动')
        # Report that this node has started.
        feedback_que = config.task_que_fb
        started = dumps_json({'sedm': 'done'})
        redis_cli.lpush(feedback_que, started)

        # Refresh the car series, then produce the seeds.
        update_brands_serise()
        print('生产种子')
        self.seeds_maker()
        # Push the initial batch of seeds.
        self.decide_push_seed_2_queue(0)

        # Listen for feedback and keep releasing seeds.
        incoming_que = config.slv_2_sed
        while True:
            feedback = wait_for_msg_long(incoming_que)
            stamp = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
            print('{0}\t接收反馈'.format(stamp))

            if not feedback:
                time.sleep(0.1)
                continue

            # The decision is driven by the feedback's cookie_status.
            if self.deal_feed_back(feedback):
                # Reset the status and put the same seed back in the queue.
                feedback.update({'cookie_status': 0})
                self.push_seed_2_queue(feedback)
                continue

            # Feedback accepted: release one new seed.
            self.decide_push_seed_2_queue(1)
            print('完成推送')
            time.sleep(0.1)
Пример #17
0
    def session_main_logic(self):
        """
        Main loop of session management (the part invoked by the slave).

        After start-up it logs in via ``logic_add_cookie``, reports
        ``{'ssnm': 'done'}`` on ``config.task_que_fb`` and then waits for
        feedback on ``config.slv_2_ssn`` forever.

        Original author notes:
          * deleting and importing the cookie list is done through messages;
          * a shutdown module still needs to be added;
          * 09-05 bug fix, waiting mechanism: triggered by the first request,
            count the requests, then put cookies in the queue.
        """
        print('session管理已启动')
        # Log in to collect the cookies.
        self.logic_add_cookie()
        # # Push every cookie onto the message queue.
        # self.decide_psuh_cookie_2_que(0)

        # Report that this node has started.
        que = config.task_que_fb
        ctx = dumps_json({'ssnm': 'done'})
        redis_cli.lpush(que, ctx)
        # Start listening on the feedback queue.
        print('开始监听ssn_req队列')
        slv_2_ssn = config.slv_2_ssn
        while True:
            msg_list = []
            msg = wait_for_msg_long(slv_2_ssn)
            print(datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
                  '\t接受反馈')
            # deal_feed_back is given a one-element list holding the message.
            msg_list.append(msg)
            # Handle the message first; unless deal_feed_back returns a truthy
            # value, hand out one cookie.
            is_deal = self.deal_feed_back(msg_list)
            if not is_deal:
                self.decide_psuh_cookie_2_que(1)
            print('完成cookie派发\n')
            """
Пример #18
0
        def attend(client):
            """Serve one client connection until it stops sending messages.

            Each message read with ``self.recvall`` is decoded as JSON and
            dispatched on its ``method`` field (PUBLISH, LOOKUP, PING, STORE,
            FIND_VALUE, UPDATE, FIND_NODE).  Payloads that are not valid JSON,
            that do not decode to an object, or that carry no known method
            are ignored.  ``exit_thread()`` is called once the loop ends.

            :param client: connected socket-like object offering ``send``,
                ``sendall`` and ``close``.
            """
            while True:
                msg = self.recvall(client)

                if not msg:
                    break

                # Undecodable input falls back to "no method" and is skipped;
                # only decoding errors are swallowed, not every exception.
                data = {'method': None}
                try:
                    data = json.loads(msg)
                except (ValueError, TypeError, RecursionError):
                    pass
                if not isinstance(data, dict):
                    data = {'method': None}
                method = data.get('method')

                if method == 'PUBLISH':
                    node = self.node.asTuple()
                    self.publish(data, node, node)
                elif method == 'LOOKUP':
                    answer = self.lookup_value(data['id'])

                    founded, result, file_bytes = answer[0], answer[1], answer[2]
                    if founded and result['value_type'] == 'file':
                        # The file content may still have to be received.
                        if not file_bytes:
                            file_bytes = self.recv_file()
                        client.sendall(file_bytes)
                    else:
                        if not founded:
                            result = None
                        client.sendall(utils.dumps_json(result).encode() + b'\r\n\r\n')
                    # NOTE(review): the loop keeps running after this close, so
                    # the next recvall() is issued on a closed client - confirm
                    # that recvall copes with that.
                    client.close()

                elif method == 'PING':
                    client.send(b'PING')
                elif method == 'STORE':
                    key, value = data['store_key'], data['store_value']
                    publisher, sender = tuple(data['publisher']), tuple(data['sender'])

                    # File values arrive separately from the JSON message.
                    real_value = None
                    if data['value_type'] == 'file':
                        real_value = self.recv_file()

                    self.node.STORE(key, value, publisher, sender, data['value_type'], real_value, data['to_update'])

                elif method == 'FIND_VALUE':
                    founded, result, file_bytes = self.node.FIND_VALUE(data['id'])
                    answer = {'operation': 'RESPONSE', 'result': (founded, result, file_bytes),
                                'key': data['key'], 'sender': [self.node.ID, self.node.ip, self.node.port] }

                    answer = utils.dumps_json(answer)
                    client.sendall(answer.encode() + b'\r\n\r\n')
                    if founded and result['value_type'] == 'file':
                        self._send_file(file_bytes, data['sender'][1])

                    if not Node.Equals(data['sender'], self.node):
                        self.update(data['sender'])

                elif method == 'UPDATE':
                    self._update(data['store_key'], data['store_value'], data['publisher'], data['sender'])

                elif method == 'FIND_NODE':
                    result = self.node.FIND_NODE(data['id'])
                    answer = {'operation': 'RESPONSE', 'result': result,
                                'key': data['key'], 'sender': [self.node.ID, self.node.ip, self.node.port] }

                    answer = utils.dumps_json(answer)
                    client.sendall(answer.encode() + b'\r\n\r\n')

                    if not Node.Equals(data['sender'], self.node):
                        self.update(data['sender'])
            exit_thread()
Пример #19
0
def download_img_and_ocr(type):
    """Download every listed captcha image, OCR it and queue the results.

    The captcha ids are read, one per line, from the captcha file configured
    for ``type``.  Each image is downloaded, base64-encoded and posted to
    ``config.url_svm``; answers with ``status_code`` 200 are collected as
    ``[captcha_id, captcha_text]`` pairs.  A non-empty result list is pushed
    as JSON onto the queue configured for ``type``.

    The original author's note (2018-10-29) mentions emptying the file at the
    end of each run; this function does not do that itself.

    NOTE(review): ``config.headers_CaptchaHandler`` and
    ``config.params_captcha`` are updated in place, as before.

    :param type: ``'zhixing'``; any other value selects the ``shixin``
        settings.
    :return: ``True`` when at least one captcha was recognised, else ``False``.
    """
    is_go_on = False

    headers = config.headers_CaptchaHandler
    captcha_list = []
    if type == 'zhixing':
        captcha = config.file_captcha_zhixing
        headers.update({'Host': config.host_zs.get('zhixing')})
    else:
        captcha = config.file_captcha_shixin
        headers.update({'Host': config.host_zs.get('shixin')})

    # Read the ids up front so the file handle is closed before the slow
    # network round-trips start (it used to be left open and never closed).
    with open(captcha, 'r', encoding='utf8') as captcha_file:
        captcha_ids = [line.strip() for line in captcha_file]

    # The download URL only depends on ``type``, not on the captcha id.
    captcha_url = (config.url_captcha_zhixing if type == 'zhixing'
                   else config.url_captcha_shixin)

    for captcha_id in captcha_ids:
        params = config.params_captcha
        params.update({'captchaId': captcha_id, 'random': random.random()})

        # Download the image.
        logger.info('下载验证码图片\t{0}\t{1}'.format(type, captcha_id))
        di = Download_img()
        img = di.receive_and_request(url=captcha_url,
                                     headers=headers,
                                     params=params,
                                     method='GET')
        if img == 'null_html':
            logger.warning('下载验证码图片失败\t{0}\t{1}'.format(type, captcha_id))
            continue

        # Run the OCR service on the base64-encoded image.
        img_d = base64.b64encode(img)
        payloads = {'pic': img_d, 'type': 'pste'}
        result = di.receive_and_request(url=config.url_svm,
                                        payloads=payloads,
                                        method='POST')
        try:
            result_dict = loads_json(result)
            if result_dict.get('status_code') == 200:
                captcha_list.append(
                    [captcha_id,
                     result_dict.get('data').get('captcha')])
                logger.info('完成图片ocr\t{0}\t{1}'.format(type, captcha_id))
                is_go_on = True
        except Exception as e:
            # One unreadable OCR answer must not abort the whole batch.
            logger.warning('ocr识别失败\t{0}'.format(e))

    # Only push when there is something to report.
    if captcha_list:
        que = config.que.get(type)
        logger.debug(captcha_list)
        push_2_que(que, dumps_json(captcha_list))
        logger.debug('ocr结果推入队列')

    return is_go_on
Пример #20
0
 def feed_back_seed(self, seed, cookie):
     """Push the JSON-encoded ``seed`` to the feedback queues.

     The seed always goes onto ``ssn_que``; when ``cookie`` is truthy it is
     pushed onto ``sed_que`` as well.

     :param seed: object to serialise with ``dumps_json``.
     :param cookie: whether ``sed_que`` should receive the seed too.
     """
     redis_cli.lpush(ssn_que, dumps_json(seed))
     if not cookie:
         return
     redis_cli.lpush(sed_que, dumps_json(seed))
Пример #21
0
    def proccess_message(self, data, addr, running_in_thread=False):
        """Handle one decoded message according to ``data['operation']``.

        Operations handled:
          * ``DISCOVER`` - with ``join`` set, answer with a ``CONTACT``
            message and record the sender; otherwise send this node's TCP
            server address to the ``ip``/``port`` named in the message.
            Nothing is done when ``addr`` is this node's own address.
          * ``CONTACT`` - record the sender, then look up this node's own ID.
          * ``EXECUTE`` - run the requested method locally and, when it
            produced a result, send a ``RESPONSE`` back.
          * ``RPC`` - build the requested message and send it to the
            ``ip``/``port`` named in ``data``.
          * ``RESPONSE`` - store the answer under its key and record the
            sender.

        Any other operation is only printed.

        :param data: decoded message; a dict with at least ``'operation'``.
        :param addr: ``(ip, port)`` pair; compared with this node's own
            address and used as the reply target.
        :param running_in_thread: when true, ``exit_thread()`` is called once
            the message has been handled (not reached by the ``EXECUTE`` /
            ``FIND_VALUE`` branch, which returns early).
        """
        operation = data['operation']
        if operation != 'DISCOVER':
            print("Data received: " + str(data))

        if operation == 'DISCOVER':
            if data['join']:
                if addr != (self.node.ip, self.node.port):
                    answer = { 'operation': 'CONTACT', 'sender': list(self.node),
                            'key': data['key'] }

                    self.send_udp_msg(json.dumps(answer).encode(), addr)
                    self.update(tuple(data['sender']))
                    print(f"{data['sender']} joined")

            else:
                if addr != (self.node.ip, self.node.port):
                    ip, port = str(data['ip']), int(data['port'])
                    server_addr = (self.node.ip, self.tcp_server_port)
                    # Best effort, but let KeyboardInterrupt/SystemExit through.
                    try:
                        self.sendall(server_addr, ip, port)
                    except Exception:
                        pass

        elif operation == 'CONTACT':
            contact = tuple(data['sender'])
            self.update(contact)
            self.lookup_node(self.node.ID)

        # A peer has to perform a method specified by other peer by RPC
        elif operation == 'EXECUTE':
            result = None
            if data['method'] == 'FIND_NODE':
                result =  self.node.FIND_NODE(data['id'])
            elif data['method'] == 'FIND_VALUE':
                result = self.node.FIND_VALUE(data['id'])
                answer = {'operation': 'RESPONSE', 'result': result,
                            'key': data['key'], 'sender': [self.node.ID, self.node.ip, self.node.port] }
                # sendall takes (msg, ip, ...) as in the DISCOVER branch above,
                # so the IP half of addr is passed (addr[1] is the port).
                # NOTE(review): the default TCP port of sendall is used -
                # confirm that this is the intended target port.
                self.sendall(answer, addr[0])
                if 'sender' in data:
                    self.update(tuple(data['sender']))
                return

            elif data['method'] == 'PING':
                result = self.node.PING()
            elif data['method'] == 'STORE':
                key, value = data['store_key'], data['store_value']
                publisher, sender = tuple(data['publisher']), tuple(data['sender'])
                result = self.node.STORE(key, value, publisher, sender, to_update=data['to_update'])
            elif data['method'] == 'LOOKUP':
                result = self.lookup_value(data["id"])
            elif data['method'] == 'PUBLISH':
                node = self.node.asTuple()
                self.publish(data, node, node)

            if result is not None:
                answer = {'operation': 'RESPONSE', 'result': result,
                            'key': data['key'], 'sender': [self.node.ID, self.node.ip, self.node.port] }
                answer = utils.dumps_json(answer).encode()
                self.send_udp_msg(answer, addr)

                if 'sender' in data:
                    self.update(tuple(data['sender']))

        # A peer is requested to perform a RPC to other peer
        elif operation == 'RPC':
            msg = None

            if data['method'] == 'FIND_NODE':
                msg =  utils.build_FIND_NODE_msg(data['id'], self.node)
            elif data['method'] == 'FIND_VALUE':
                msg =  utils.build_FIND_VALUE_msg(data['id'], self.node)
            elif data['method'] == 'PING':
                msg =  utils.build_PING_msg(self.node)
            elif data['method'] == 'STORE':
                # NOTE(review): 'storeKey' here vs 'store_key' in the EXECUTE
                # branch - confirm which key RPC requests really carry.
                msg = utils.build_STORE_msg(data['storeKey'], data['store_value'], self.node, self.node)

            if msg is not None:
                # The address of the remote peer which will be used as the target of the RPC
                addr = (data['ip'], data['port'])
                msg = utils.dumps_json(msg).encode()
                self.send_udp_msg(msg, addr)

        # The peer receives the answer of a RPC made before
        elif operation == 'RESPONSE':
            self.set_response(data['key'], data)
            if not Node.Equals(data['sender'], self.node):
                self.update(data['sender'])
        if running_in_thread:
            exit_thread()