Example No. 1
 def put_queue_list(self,
                    queue_name=None,
                    message_list=None,
                    print_info=True,
                    exchange=''):
     '''Submit exception data to the queue list'''
     if not queue_name and not exchange:
         return None
     try:
         if not message_list:
             return None
         if isinstance(message_list, dict):
             message_list = [message_list]
         self.declare(queue_name, exchange=exchange)
         for message in message_list:
             if print_info:
                 if 'abbreviation' in message:
                     print('abbreviation : %s data submitted to queue %s' %
                           (Util.binary_type(
                               message['abbreviation']), queue_name))
                 elif 'cp_id' in message:
                     print('ID : %s data submitted to queue %s' %
                           (Util.binary_type(message['cp_id']), queue_name))
             message = json.dumps(message)
             self.channel.basic_publish(
                 exchange=exchange,
                 routing_key=queue_name,
                 body=message,
                 properties=pika.BasicProperties(
                      delivery_mode=2,  # persistent message
                 ))
         self.close()
     except Exception as e:
         print(e)
         return None
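
A minimal standalone sketch of the durable-publish pattern this method relies on; the queue name and payload are made up, and it assumes the declare() call above creates the queue as durable so that delivery_mode=2 actually persists messages:

import json

import pika

# Illustrative connection settings; adjust host/credentials for a real broker.
connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
channel = connection.channel()

# A durable queue plus delivery_mode=2 (persistent) is what lets messages
# survive a broker restart.
channel.queue_declare(queue='exception_queue', durable=True)
channel.basic_publish(
    exchange='',
    routing_key='exception_queue',
    body=json.dumps({'cp_id': 1}),
    properties=pika.BasicProperties(delivery_mode=2))
connection.close()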
Example No. 2
def main(**kwargs):
    sd = kwargs.get('sd', '')
    ed = kwargs.get('ed', '')
    interval = kwargs.get('interval', 60)
    date_list = util.specified_date(sd, ed)

    data = [{'url': '1'}, {'url': '2'}]
    while 1:
        proxy = util.get_prolist(10)
        for _data in data:
            url = _data.get('url', '')
            if not url:
                continue
            fetch_data(url=url, proxy=proxy, headers=default_headers, **kwargs)

            '''
            # Control crawling according to the URL pattern
            '''
            for str_time in date_list:
                pass

        if not interval:
            break
        print('-------------- sleep %s sec -------------' % interval)
        time.sleep(interval)
Example No. 3
def parse_more(item=None, response=None):
    if not item or not response:
        return -404
    root = lxml.html.fromstring(response.text.encode('utf-8'))
    data = {}
    # family_sn
    match = family_sn_pattern.search(response.url)
    data['family_sn'] = match.group(1) if match else item['goods_name']
    # catlog
    breadcrumb = root.xpath('//p[@class="breadcrumb"]/a')
    data['catlog'] = []
    for catlog in breadcrumb:
        catlog_name = util.cleartext(catlog.text_content())
        catlog_url = util.urljoin(response.url, catlog.xpath('./@href')[0])
        if catlog_name and catlog_url:
            data['catlog'].append([catlog_name, catlog_url])
        else:
            data['catlog'] = []
            break
    else:
        data['catlog'].append([data['family_sn'], response.url])
    # doc
    doc = root.xpath('//li[@class="pdf"]/a[@class="doclink"]/@title')
    data['doc'] = "http://cds.linear.com/docs/en/datasheet/{title}".format(
        title=doc[0]) if doc else ''

    item.update(data)
    return item
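
The for ... else over the breadcrumb links above is easy to misread, so here is a minimal, self-contained sketch of the idiom with invented data: the else clause runs only when the loop finishes without hitting break.

def collect_catlog(pairs, fallback):
    # Mirrors parse_more: discard everything on the first bad entry,
    # otherwise append the fallback [name, url] pair at the end.
    catlog = []
    for name, url in pairs:
        if name and url:
            catlog.append([name, url])
        else:
            catlog = []
            break
    else:
        catlog.append(list(fallback))
    return catlog

print(collect_catlog([('Home', '/'), ('ICs', '/ics')], ('LT1234', '/p/lt1234')))
# -> [['Home', '/'], ['ICs', '/ics'], ['LT1234', '/p/lt1234']]
print(collect_catlog([('Home', '/'), ('', '/ics')], ('LT1234', '/p/lt1234')))
# -> []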
Example No. 4
    def process_item(self, item, spider):
        """保存数据"""
        demo = item.get('demo')
        if not demo:
            raise DropItem("item data type error")
        # self.put_queue(item)
        data = copy.deepcopy(dict(item))
        if not data:
            raise DropItem("item data is empty")

        # info = self.mongo.find_one({'demo': data['demo']})
        demo_test = item.get('demo_test', '')
        if not demo_test:
            raise DropItem("demo_test is empty")
            # return
        condition = {'demo': demo}
        try:
            info = self.mysql.select(demo_test, condition=condition, limit=1)
            if not info:
                # self.mongo.insert(data)
                item['create_time'] = util.date()
                item['update_time'] = util.date()
                self.mysql.insert(demo_test, data=item)
                # _logger.info('success insert mysql : %s' % data['demo'])
            else:
                item['create_time'] = info['create_time']
                item['update_time'] = util.date()
                # self.mongo.update({'_id': info['_id']}, {"$set": data})
                self.mysql.update(demo_test, condition=condition, data=item)
                # _logger.info('success update mysql : %s' % data['demo'])
        except Exception as e:
            _logger.info('error op mysql : {0}  : e {1}'.format(
                data['demo'], e))
        raise DropItem('success process')
Example No. 5
def fetch_data(url, proxy=None, headers=None, **kwargs):
    """获取页面数据

    @param proxy    代理ip,[代理数量,代理列表]
    @param headers  头部信息,如user_agent
    @param kwargs   扩展参数,如fetch_update其表示是否为获取更新


    @return
        获取数据异常时返回信息为负值,成功为字典类型数据
    """
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        # Will be retried, safe to ignore
        logger.debug('STATUS:-400 ; INFO:data request exception, %s ; URL:%s' %
                     (util.traceback_info(e), url))
        return -400

    # Force utf-8
    resp.encoding = 'utf-8'
    if '404.html' in resp.url:
        return 404
    return _parse_detail_data(resp, headers=_headers, **kwargs)
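
The proxy argument follows a [proxy count, proxy list] convention from which a requests-style proxies mapping is drawn at random; a small sketch of that convention with invented addresses:

import random

# proxy follows the [proxy count, proxy list] convention used above.
proxy = [2, ['10.0.0.1:8888', '10.0.0.2:8888']]

i = random.randint(0, proxy[0] - 1)
proxies = {
    'http': 'http://' + proxy[1][i],
    'https': 'https://' + proxy[1][i],
}
print(proxies)
# e.g. requests.get(url, proxies=proxies, timeout=30)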
Example No. 6
 def parse_price(self, resp):
     """解析库存价格数据"""
     items = resp.meta.get('items')
     if not items:
         logger.error('request meta data error, url: %s', resp.url)
         return
     prices = {}
     try:
         data = json.loads(resp.body)
         for entprice in data['EntitledPrice']:
             tiered = []
             if 'RangePrice' not in entprice:
                 entprice['RangePrice'] = []
             for vo in entprice['RangePrice']:
                 qty = util.intval(vo['minimumQuantity']['value']) if 'minimumQuantity' in vo else 1
                 price = util.floatval(vo['priceInRange']['value']) if 'priceInRange' in vo else 0
                 if not qty or (tiered and qty < tiered[-1][0]):
                     continue
                 tiered.append([qty, price])
             if not tiered:
                 tiered.append([0, 0.0])
             prices[entprice['productId']] = tiered
     except:
         logger.exception('parse stock price error, url: {0}---price_Json_error---{1}'.format(resp.url, resp.body) )
     for item in items:
         if item['goods_sn'] in prices:
             item['tiered'] = prices[item['goods_sn']]
         yield item
Example No. 7
    def fetch_update_data(self, data_list=[], proxy=None, **kwargs):
        '''Fetch update data

        @return
            status values recorded in data_list:
                0       empty (ignored)
                -401    error (retry; program, syntax, or unexpected-deletion error, check the program)
                -402    data exception (retry; verify how the data was fetched)
                -400    proxy exception (retry; can be ignored)
                -200    non-200 status, proxy or data exception (retry; watch that this does not loop forever)
                200     normal status (not the HTTP status code)
                404     product does not exist / has been deleted

        '''
        # Determine the supplier site from the url, then call that site's crawler module

        update_url = kwargs.get('update_url', '')
        if not update_url:
            return
        if '360' in update_url:
            return
        supplier_name = update_url.split('.')[1]
        if supplier_name is None:
            return None
        headers = {
            'user-agent': random.choice(config.USER_AGENT_LIST),
        }
        try:
            if not hasattr(supplier, supplier_name):
                module_name = 'supplier.{0}'.format(supplier_name)
                if module_name not in sys.modules:
                    __import__(module_name)
                obj = sys.modules[module_name]
            else:
                obj = getattr(supplier, supplier_name)
            if 'fetch_update_data' in dir(obj):
                _fetch_update_data = getattr(obj, 'fetch_update_data')
            else:
                kwargs['status'] = -401
                data_list.append(kwargs)
                return None
        except Exception as e:
            config.LOG.exception('STATUS: -401, ID: {0} import error, will retry: {1}'.format(kwargs['id'], e))
            kwargs['status'] = -401
            data_list.append(kwargs)
            return None
        try:
            kwargs['headers'] = headers
            kwargs['proxy'] = proxy
            data_list.append(_fetch_update_data(**kwargs))
        except Exception as e:
            kwargs['status'] = -402
            if 'headers' in kwargs:
                del kwargs['headers']
            if 'proxy' in kwargs:
                del kwargs['proxy']
            data_list.append(kwargs)
            config.LOG.exception('STATUS: -402, ID: %(id)s error: %(e)s',
                                 {'id': util.u2b(kwargs['id']), 'e': util.traceback_info(e)})
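
A compact sketch of the dynamic-dispatch pattern used above (resolving a crawler module by name and calling a well-known function on it); the package and function names here are hypothetical, not the real supplier package:

import importlib

def call_crawler(package, module_name, func_name, **kwargs):
    # Import package.module_name on demand and look up func_name on it,
    # mirroring the __import__/getattr logic in fetch_update_data.
    module = importlib.import_module('{0}.{1}'.format(package, module_name))
    func = getattr(module, func_name, None)
    if func is None:
        return None  # caller would record a -401 style status instead
    return func(**kwargs)

# Hypothetical usage:
# data = call_crawler('supplier', 'somevendor', 'fetch_update_data', id=1, update_url='...')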
Example No. 8
 def parse_detail(self, data, category=None):
     """解析系列型号数据"""
     if category is None:
         category = {}
     item = GoodsItem()
     item['url'] = urlparse.urljoin(self.base_url, data['avn_pdp_seo_path'])
     item['goods_sn'] = data['uniqueID']
     item['goods_name'] = data['mfPartNumber_ntk'].upper()
     if not item['goods_name']:
         return None
     if 'packageTypeCode' in data:
         item['goods_other_name'] = '{0}/{1}'.format(item['goods_name'], data['packageTypeCode']).upper()
     item['provider_name'] = data['manufacturer']
     item['provider_url'] = ''
     item['goods_desc'] = data['shortDescription'] if 'shortDescription' in data else ''
     if 'avn_thumbnail' in data and data['avn_thumbnail']:
         item['goods_thumb'] = util.urljoin(self.base_url, data['avn_thumbnail'])
     else:
         item['goods_thumb'] = ''
     item['goods_img'] = item['goods_thumb'].replace('icon_thumb', 'icon_web')
     if 'auxDescription2' in data and data['auxDescription2']:
         item['doc'] = data['auxDescription2']
     else:
         item['doc'] = ''
     min_qty = int(data['xcatField1']) if 'xcatField1' in data else 1
     if 'multQuantity' in data:
         increment = int(data['multQuantity'])
     else:
         increment = 1
     if 'inv_strlocqty' in data:
         stock_qty = util.intval(data['inv_strlocqty'])
     else:
         stock_qty = 0
     item['rohs'] = 1 if 'ROHSComplianceCode' in data and data['ROHSComplianceCode'] == 'Y' else 0
     item['tiered'] = [[0, 0.0]]
     item['stock'] = [stock_qty, min_qty]  # stock
     item['increment'] = increment
     # attributes
     item['attr'] = []
     if 'attributes' not in data:
         data['attributes'] = []
     for vo in data['attributes']:
         try:
             item['attr'].append([vo['name'], vo['values'][0]['value']])
         except:
             pass
     # categories
     item['catlog'] = []
     catelogs = data['parentCatgroup_id_path'].split('_')[-1].split(':')
     for vo in catelogs:
         if vo not in category:
             continue
         item['catlog'].append((category[vo], vo))
     item['region'] = 'AMERICAS'
     item['id'] = 16
     return item
Example No. 9
 def flush(self, table_name):
     '''
     Refresh the cached field data
     '''
     fields = self.get_fields(table_name)
     if not fields:
         return False
     self.fields[table_name] = fields
     if self.db_fields_cache:
         Util.file(
             '.fields/%s/%s_%s_%s' %
             (self.host, self.__class__.__name__, self.dbname, table_name),
             fields)
Example No. 10
def exception_notice(etype=''):
    """异常通知"""
    now_minuter = util.date(format='%Y-%m-%d %H:%M')
    subject = '【HQChip】合作库存 %s 数据更新异常通知 %s' % (PN2, now_minuter)
    if etype == 'mysql':
        except_msg = 'mysql数据库连接异常'
    elif etype == 'mongo':
        except_msg = 'mongodb 数据库连接异常'
    else:
        except_msg = '数据获取异常'
    body = "合作库存 %s 数据更新数据获取异常, 异常原因:%s,请注意检查!" % (PN2, except_msg)
    util.sendmail(config.EMAIL_NOTICE.get(
        'accept_list'), subject=subject, body=body)
Example No. 11
def fetch_data(url, proxy=None, headers=None, **kwargs):
    """获取页面数据

    @param proxy    代理ip,[代理数量,代理列表]
    @param headers  头部信息,如user_agent
    @param kwargs   扩展参数,如fetch_update其表示是否为获取更新


    @return
        获取数据异常时返回信息为负值,成功为字典类型数据
        :param url:


    """
    if 'goods_sn' in kwargs:
        del kwargs['goods_sn']
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    if url[0:2] == '//':
        url = 'http:' + url
    try:
        proxies = None
        if proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {'http': 'http://' + proxy[1][i]}
        ti_domain = urlparse.urlsplit(url)[1]
        if 'www.ti.com.cn' == ti_domain:
            product_path_pattern = re.compile(r'/cn/(.*)', re.IGNORECASE)
            product_path = product_path_pattern.search(url)
            if product_path:
                url = "http://www.ti.com/product/{path}".format(
                    path=product_path.group(1))
        elif 'store.ti.com' in ti_domain:
            kwargs['proxies'] = proxies
            return _parse_store_ti_com(url, **kwargs)
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        # Will be retried, safe to ignore
        logger.debug('STATUS:-400 ; INFO:data request exception, %s ; URL:%s' %
                     (util.traceback_info(e), url))
        return -400
    # Should a check for HTTP 500 be added here?
    # Force utf-8
    resp.encoding = 'utf-8'
    if '404.html' in resp.url:
        return 404
    if '/tool/' in resp.url:
        return _parse_tool_detail(resp, **kwargs)
    kwargs['proxies'] = proxies
    return _parse_detail_data(resp, headers=_headers, **kwargs)
Example No. 12
    def load_js(self):
        '''
        Load the js file
        :return:
        '''
        file_path = util.get_static_file(self.js_file)
        #
        try:
            with open(file_path, 'r', encoding='utf-8') as fp:
                js_str = fp.read()
        except Exception as e:
            _logger.info('INFO: error loading js file {0}'.format(util.traceback_info(e)))
            js_str = ''

        return js_str
Example No. 13
    def parse_resp(self, resp):
        '''
        First-level processing: after the categories are obtained, crawl the next level
        :param resp:
        :return:
        '''
        item = GoodsItem()
        category = []
        date_list = util.specified_date(self.start_date,
                                        end_date=self.end_date)
        for category_url in category:
            if self.abbreviation and self.abbreviation not in category_url:
                # Skip data that was not specified (used when a specific lottery type is given)
                continue
            '''
            Crawling rules
            '''
            today_url = ''
            # Get the database (table) to save to
            result_key = category_url.split('-')[1]
            demo_test = config.PKS_KEY_DICT.get(result_key, '')

            for history_date in date_list:
                date_time = ''.join(history_date.split('-'))
                url = today_url.replace('today', date_time)
                yield scrapy.Request(url=url,
                                     headers=self.headers,
                                     callback=self.parse_product,
                                     meta={'item': item})
Example No. 14
def save_data(url, db_name, item):
    '''
    Save data
    '''
    info = None
    if not info:
        item['create_time'] = util.date()
        mysql.insert(db_name, data=item)
        _logger.info('INFO:  DB:%s data saved, issue number %s ; URL:%s' % (db_name, item['demo'], url))

    else:
        item['update_time'] = util.date()
        del item['open_time']
        del item['create_time']
        mysql.update(db_name, condition=[('demo', '=', item['demo'])], data=item)
        _logger.info('INFO:  DB:%s data already exists, updated, issue number: %s ; URL:%s' % (db_name, item['demo'], url))
Example No. 15
def handle_of_redirects(item=None):
    item = item if item else {}
    if not item:
        return -404
    search_url = 'http://www.linear.com.cn/search/index.php?q={search}'.format(
        search=item['goods_name'])
    _headers = copy.copy(default_headers)
    _headers.update({'Host': 'www.linear.com.cn'})
    resp = requests.get(url=search_url,
                        headers=_headers,
                        allow_redirects=False)
    location = util.urljoin(resp.url, resp.headers.get('Location'))
    if 'product/' in location or 'solutions/' in location:
        try:
            response = requests.get(url=location, headers=_headers)
        except:
            logger.error("获取目录和文档失败 URL{url}".format(url=location))
            return -404
        return parse_more(item, response)
    elif 'search.php' in location:
        try:
            response = requests.get(url=location, headers=_headers)
        except:
            logger.error("获取搜索列表 URL{url}".format(url=location))
            return -404
        return filter_search_result(item, response)
Example No. 16
 def parse_resp(self, resp):
     search_match = self.product_url_pattern_0.search(
         urllib.unquote(resp.url))
     detail_match = self.product_url_pattern_1.search(
         urllib.unquote(resp.url)) or self.product_url_pattern_2.search(
             urllib.unquote(resp.url))
     print "=" * 30
     print resp.url
     print urllib.unquote(resp.url)
     print detail_match
     print search_match
     if detail_match:
         yield self.parse_detail(resp)
     elif search_match:
         soup = BeautifulSoup(resp.text.encode('utf-8'), 'lxml')
         # Get the number of search results
         try:
             total = soup.find('h3', class_='results')
             total = util.intval(total.get_text(strip=True)) if total else 0
         except:
             total = 0
         pages = int(math.ceil(total / self.limit_num))
         if pages <= 1:
             return
         for x in range(1, pages + 1):
             page_url = "http://cn.futureelectronics.com/zh/search.aspx?dsNav=Ro:%d,Aro:%d" % (
                 x * 10, x * 10)
             search_id = search_match.group(1)
             page_url = page_url + ',N:{search_id}'.format(
                 search_id=search_id)
             yield Request(url=page_url,
                           headers=self.headers,
                           cookies=self.cookies)
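
One caveat with pages = int(math.ceil(total / self.limit_num)): if both operands are integers under Python 2 (this spider uses Python 2 style prints), the division truncates before ceil is applied. A small sketch of a ceiling division that behaves the same on Python 2 and 3:

import math

def page_count(total, per_page):
    # int(math.ceil(total / per_page)) silently floors first when both
    # operands are ints on Python 2; forcing a float avoids that.
    return int(math.ceil(float(total) / per_page)) if per_page else 0

print(page_count(95, 10))   # -> 10
print(page_count(100, 10))  # -> 10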
Example No. 17
 def process_request(self, request, spider):
     scheme, url, port = util.get_host(request.url)
     try:
         proxies = get_proxies(settings['USE_PROXY'])
     except:
         raise NotConfigured
     request.meta["proxy"] = proxies[scheme]
Example No. 18
    def write_update_info(self, num_list):
        '''Record update info

        @param num_list     number of records updated in each run
        @param name         record type; the default "count" means successful updates
        '''
        if not num_list:
            return None
        mq.put('crawler_update_stats', {'data': num_list, 'time': util.date()})
Example No. 19
    def _init_args(self, **kwargs):
        start_url = kwargs.get('START_URL', '')
        self.abbreviation = kwargs.get('ABBREVIATION', '')
        self.start_date = kwargs.get('START_DATE', '')
        self.end_date = kwargs.get('END_DATE', '')
        self.end_date = self.end_date if self.end_date else util.date()

        if start_url:
            self.start_urls = [start_url]
        self.rules = (Rule(LinkExtractor(allow=filter_rules),
                           callback='parse_resp',
                           follow=True), )
Example No. 20
 def _check_table_info(self, table_name):
     # Only recorded on the first run
     if table_name not in self.fields:
         # If the table fields are not defined, fetch them automatically
         if self.db_fields_cache:
             self.fields[table_name] = Util.file(
                 '.fields/%s/%s_%s_%s' %
                 (self.host, self.__class__.__name__, self.dbname,
                  table_name))
             if not self.fields[table_name]:
                 self.flush(table_name)
         else:
             # Read the table info every time
             self.flush(table_name)
Example No. 21
def get_proxies(proxies_type=1):
    '''
    Return a proxy | refresh 20 proxies on each update
    :param proxies_type: int proxy type
    :return: proxies_dict
    '''
    if queue.qsize() > 0:
        return queue.get()
    if proxies_type == 1:
        proxies = util.get_abuyun_proxies()
        for i in range(20):
            queue.put(proxies)
    else:
        get_web_proxy()
    return queue.get()
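
get_proxies leans on a module-level queue as a small proxy cache: serve from the queue when it is non-empty, otherwise refresh and enqueue 20 copies. A self-contained sketch of that pattern (the refresh callable and proxy address are illustrative):

try:
    import queue           # Python 3
except ImportError:
    import Queue as queue  # Python 2

proxy_queue = queue.Queue()

def get_cached_proxy(refresh):
    # Serve from the cache; when empty, fetch once and enqueue 20 copies
    # so the next callers reuse the same proxy, as get_proxies does.
    if proxy_queue.qsize() > 0:
        return proxy_queue.get()
    proxies = refresh()
    for _ in range(20):
        proxy_queue.put(proxies)
    return proxy_queue.get()

print(get_cached_proxy(lambda: {'http': 'http://127.0.0.1:8888'}))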
Example No. 22
def run(args):
    if not isinstance(args, argparse.Namespace):
        print('invalid arguments')
        return
    interval = args.interval
    while 1:
        try:
            PutQueue(**args.__dict__)
            if args.interval <= 0:
                break
            print('------------- sleep %s sec -------------' % interval)
            time.sleep(interval)
        except Exception as e:
            if 'params_error' in str(e):
                break
            print(util.traceback_info(e, return_all=True))
Example No. 23
def get_time_desc(t):
    """
    获取时间描述
    :param t:
    :return:
    """
    _time_desc = ''
    h = int(t / 3600)
    if h >= 1:
        _time_desc += '%s hr ' % h
    m = int((t - h * 3600) / 60)
    if m >= 1:
        _time_desc += '%s min ' % m
    s = util.number_format(t - h * 3600 - m * 60, 3)
    if s >= 0:
        _time_desc += '%s sec' % s
    return _time_desc
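
The same hour/minute/second decomposition can be written with divmod; a standalone sketch that skips the project's util.number_format helper:

def time_desc(seconds):
    # Same decomposition as get_time_desc, using divmod and plain round().
    h, rem = divmod(seconds, 3600)
    m, s = divmod(rem, 60)
    parts = []
    if h >= 1:
        parts.append('%d hr' % h)
    if m >= 1:
        parts.append('%d min' % m)
    parts.append('%s sec' % round(s, 3))
    return ' '.join(parts)

print(time_desc(3725.5))  # -> '1 hr 2 min 5.5 sec'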
Example No. 24
def fetch_data(url, proxy=None, headers=None, **kwargs):
    '''
    Fetch page data
    @description

    @param proxy    proxy IPs, as [proxy count, proxy list]
    @param headers  header fields, e.g. user_agent
    @param kwargs   extra parameters, e.g. fetch_update indicates whether this is an update fetch

    @return
        a negative value when fetching fails, a dict on success
    '''

    # Use the caller's headers when given, otherwise the module-level defaults
    # (rebinding default_headers locally would break the non-dict case).
    _headers = headers if isinstance(headers, dict) else default_headers
    try:
        proxies = None
        if proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {'http': 'http://' + proxy[1][i]}

        sess = requests.Session()
        rs = sess.get(url, headers=_headers, cookies=_cookies, timeout=30, proxies=proxies)
    except Exception as e:
        # Will be retried, safe to ignore
        _logger.info('STATUS:-400 ; INFO:data request exception, %s ; URL:%s' % (util.traceback_info(e), url))
        return -400

    if rs.status_code != 200:
        if rs.status_code == 500:
            _logger.debug('STATUS:-500 ; INFO:request blocked ; PROXY:%s ; URL:%s ; User-Agent:%s' % (
                proxies['http'] if proxy else '', url, _headers.get('user_agent', '')))
            return -500
        # Discontinued product (url no longer exists)
        elif rs.status_code == 404:
            _logger.debug('STATUS:404 ; INFO:request error ; URL:%s' % url)
            return 404
        _logger.debug('STATUS:-405 ; INFO:request error, response code %s ; PROXY:%s ; URL:%s' % (
            rs.status_code, proxies['http'] if proxy else '', url))
        return -405
    # Force utf-8
    rs.encoding = 'utf-8'

    return _parse_detail_data(rs.text, url=url, **kwargs)
Example No. 25
 def parse(self, resp):
     systems_catalog = 0
     try:
         product_dict = json.loads(resp.text.encode('utf-8'))
         systems_catalog = resp.meta.get('systemsCatalog')
         total_match_count_string = util.intval(
             product_dict.get('totalMatchCountString'))
         pages = int(math.ceil(total_match_count_string / self.limit_num))
         for pageNum in xrange(1, pages + 1):
             self.form_data['pageNum'] = str(pageNum)
             yield Request(url=self.processData_url,
                           method='POST',
                           headers=self.headers,
                           body=json.dumps(self.form_data),
                           meta={'systemsCatalog': systems_catalog},
                           callback=self.parse_detail)
     except:
         logger.exception('Parse error, systemsCatalog: %s',
                          systems_catalog)
Example No. 26
    def fetch_data(self):
        '''
        Fetch page data
        '''
        headers = self.headers if self.headers else DEFAULT_HEADER

        try:
            sess = requests.Session()
            print('fetching url: {0}'.format(self.url))

            if self.method == 'GET':
                rs = sess.get(self.url,
                              headers=headers,
                              cookies=None,
                              timeout=30,
                              proxies=None)
            elif self.method == 'POST':
                rs = sess.post(self.url,
                               data=self.form_data,
                               headers=headers,
                               cookies=None,
                               timeout=30,
                               proxies=None)
            else:
                _logger.info('INFO:request method not defined ; URL: {0}'.format(self.url))
                return -400
            print('rs', rs)
            print(rs.text)
        except Exception as e:
            # Will be retried, safe to ignore
            _logger.info('STATUS:-400 ; INFO:data request exception, %s ; URL:%s' %
                         (util.traceback_info(e), self.url))
            return -400

        if rs.status_code != 200:
            if rs.status_code == 404:
                _logger.debug('STATUS:404 ; INFO:request error ; URL:%s' % self.url)
                return 404

        # Force utf-8
        # rs.encoding = 'utf-8'
        rs.encoding = rs.apparent_encoding
        return self._parse_detail_data(rs.content)
Example No. 27
 def _fetch_data(self, fn, data_list=[], **kwargs):
     """获取数据"""
     try:
         data = fn(**kwargs)
         # When fn is fetch_data, a failure or exception returns a status value
         if isinstance(data, dict):
             data['id'] = kwargs['id']
             data['status'] = 200
             data_list.append(data)
         elif fn.func_name == 'fetch_data':
             del kwargs['headers']
             kwargs['status'] = data
             kwargs['count'] = kwargs.get('count', 1)
             if data in (404, 405):
                 kwargs['list'] = []
             data_list.append(kwargs)
         return data
     except Exception as e:
         print(util.binary_type(e))
         return None
Example No. 28
 def start_requests(self):
     match = []
     url = self.start_urls[0]
     rs = requests.get(url, headers=self.headers)
     js_cookies = {}
     for vo in rs.cookies:
         js_cookies[vo.name] = vo.value
     rs = requests.get(url, headers=self.headers, cookies=js_cookies)
     js_cookies = _parse_incapsula_page(rs.text,
                                        cookies=js_cookies,
                                        headers=self.headers)
     resp = requests.get(
         url='https://www.ttiinc.com/content/ttiinc/en/manufacturers.html',
         headers=self.headers,
         cookies=js_cookies)
     manufacturers = re.findall(
         r'(/content/ttiinc/en/manufacturers/.*/(.*).html)',
         resp.text.encode('utf-8'))
     for v, k in manufacturers:
         self.manufacturers[k] = util.urljoin(self.tti, v)
     rs = requests.get(url, headers=self.headers, cookies=js_cookies)
     match = re.findall(r'/.*/part-search.html.*systemsCatalog=(\d+)',
                        rs.text.encode('utf-8'))
     # if not match:
     #     with open(os.path.split(os.path.realpath(__file__))[0] + r'\tti_category_values.txt', 'r') as fp:
     #         for line in fp.readlines():
     #             match.append(line.strip())
     for systems_catalog in match:
         try:
             self.form_data['systemsCatalog'] = systems_catalog
             # print '*'*50
             # print self.form_data
             yield Request(url=self.processData_url,
                           method='POST',
                           headers=self.headers,
                           body=json.dumps(self.form_data),
                           meta={'systemsCatalog': systems_catalog})
         except:
             logger.exception('Request error, systemsCatalog: %s',
                              systems_catalog)
Example No. 29
    def update_data(self, queue_name=None):
        """更新指定队列数据"""
        if not queue_name:
            return 0
        qsize = mq.qsize(queue_name)
        self.limit = self.limit if qsize > self.limit else qsize  # number to update per batch
        queue_list = []

        for i in range(self.limit):
            queue_data = mq.get(queue_name)
            if queue_data and queue_data not in queue_list:
                queue_list.append(queue_data)

        if not queue_list:
            print('waiting, queue %s is empty' % queue_name)
            return 0
        proxy = None
        if not self.no_proxy:
            proxy = self.get_prolist()
        tlist = []
        data_list = []
        total_num = 0

        for data in queue_list:
            # invalid queue entry
            if 'id' not in data:
                continue
            if 'proxy' in data:
                del data['proxy']

            try:
                if len(tlist) > 30:
                    for t in tlist:
                        t.join(45)
            except (KeyboardInterrupt, SystemExit):
                mq.put(queue_name, queue_data)
                return 0

            # total number of valid queue entries (not the part-number count)
            total_num += 1
            t = threading.Thread(target=self.fetch_update_data,
                                 args=(data_list, proxy), kwargs=data)
            tlist.append(t)
            t.start()
            time.sleep(1)

        del data, queue_list
        valid_num = 0
        delete_list = []

        # Process the data only after all threads have finished
        for data in data_list:
            if not data:
                continue
            if data['status'] == 200:
                mq.put(config.WAIT_UPDATE_QUEUE, data['dlist'])  # data waiting to be submitted
                valid_num += 1
                id = data.get('dlist').get('id', )
                lottery_name = data.get('dlist').get('lottery_name', )
                status = data.get('status')
                config.LOG.info('ID:{0} ; product: {1} ; data fetched successfully: {2} ; submitted to storage queue: {3}!'.format(
                    id, lottery_name, status, config.WAIT_UPDATE_QUEUE))
                continue
            else:
                delete_list.append(data)

            count = data.get('count', '')
            if count and count < self.exception_threshold:  # retry count so far
                config.LOG.info('ID:%s, update status: %s, re-queueing!' % (data.get('id', ), data['status']))
                # update_list.append(data)
                mq.put(queue_name, data)
            else:
                config.LOG.error('ID:%s, update status: %s, retry count exceeded the threshold, saving to the log!' % (data.get('id', ), data['status']))
                if 'count' in data:
                    del data['count']
                if 'time' not in data:
                    data['time'] = util.date()
                # db.mongo['update_exception_logs'].insert(data)
                mq.put('update_exception_logs', data)

        self.write_update_info(valid_num)
        print('queue %s: %s records updated successfully this run, success rate: %s %%' %
              (queue_name, valid_num, valid_num * 1.0 / total_num * 100 if total_num > 0 else 0))
        print('done, waiting for the next queue!')
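
update_data fans each queue entry out to a worker thread that appends its result to a shared list, and only joins when more than 30 threads are pending. A minimal sketch of the fan-out/join pattern that joins every worker before reading the shared list (the worker body is a stand-in, not the real fetch_update_data):

import threading
import time

def fetch_one(results, item_id):
    # Stand-in for fetch_update_data: do some work, append one result.
    time.sleep(0.1)
    results.append({'id': item_id, 'status': 200})

results = []
threads = []
for item_id in range(5):
    t = threading.Thread(target=fetch_one, args=(results, item_id))
    threads.append(t)
    t.start()

for t in threads:
    t.join(45)  # wait (with a timeout) before touching the shared list

print(len(results))  # -> 5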
Example No. 30
    def fetch_search_data(self, data_list=[], err_list=[], proxy=None, supp=None, **kwargs):
        """
        根据搜索关键词获取产品产品数据(可能为url也可能为详细信息)

        """
        if not supp or 'keyword' not in kwargs:
            return None
        headers = {
            'user-agent': random.choice(config.USER_AGENT_LIST),
        }
        keyword = util.u2b(kwargs['keyword'])
        supplier_name = config.DB_KEY[supp]
        try:
            if not hasattr(supplier, supplier_name):
                module_name = 'supplier.{0}'.format(supplier_name)
                if module_name not in sys.modules:
                    __import__(module_name)
                obj = sys.modules[module_name]
            else:
                obj = getattr(supplier, supplier_name)
            if hasattr(obj, 'api_search_data'):
                _fetch_function = getattr(obj, 'api_search_data')
            else:
                _fetch_function = getattr(obj, 'fetch_search_data')
        except Exception as e:
            config.LOG.exception('STATUS: -401, Keyword: %(keyword)s', {'keyword': keyword})
            if kwargs.get('count', 1) < self.exception_threshold:
                kwargs['status'] = -401
                kwargs['count'] = kwargs.get('count', 1) + 1
                err_list.append(kwargs)
            return None
        data_dict = {
            'detail': [],
            'list': [],
            'url': []
        }
        if self.optype == 'hot' and self.use:
            kwargs['hot_search'] = True
        del kwargs['keyword']
        try:
            _fetch_function(keyword, supp, data_dict, headers, **kwargs)
        except Exception as e:
            config.LOG.exception('STATUS: -402, Keyword: %(keyword)s', {'keyword': keyword})
            if kwargs.get('count', 1) < self.exception_threshold:
                kwargs['status'] = -402
                kwargs['count'] = kwargs.get('count', 1) + 1
                kwargs['keyword'] = keyword
                err_list.append(kwargs)
            return None
        if data_dict['list']:
            try:
                _fetch_function = getattr(obj, 'fetch_search_list')
            except Exception as e:
                _fetch_function = None
                print(util.traceback_info(e, return_all=1))
            if _fetch_function:
                res = self._crawl(_fetch_function, data_dict['list'], headers, proxy)
                if 'url' in res:
                    for url in res['url']:
                        data_dict['url'].append(url)
                if 'detail' in res:
                    for data in res['detail']:
                        data_dict['detail'].append(data)
        if data_dict['url']:
            try:
                _fetch_function = getattr(obj, 'fetch_data')
            except Exception as e:
                _fetch_function = None
                print(util.traceback_info(e, return_all=1))
            if _fetch_function:
                res = self._crawl(_fetch_function, data_dict['url'], headers, proxy)
                if 'detail' in res:
                    for data in res['detail']:
                        data_dict['detail'].append(data)
        for data in data_dict['detail']:
            data_list.append(data)
            '''
            Clean and normalize each record here
            '''
        return data_list