Example #1
    def process_item(self, item, spider):  # this method's parameters must be written exactly like this (Scrapy pipeline signature)

        spider_name = spider.name

        msg = 'data update failed, isbn13:%s, spider_name:%s, spider_value:%s' % (
            item['isbn13'], spider_name, '')

        result = 'fail'
        if item._values:

            try:
                if spider_name == 'all_infos':
                    item._values.update(
                        {'last_update_time': get_current_timestamp_str('m')})
                    insert_bookbaseinfos(item._values)
                    msg = 'data or last_update_time update succeeded, isbn13:%s, spider_name:%s, spider_value:%s' % (
                        item['isbn13'], spider_name, str(item._values))
                    result = 'success'
                else:
                    value = item._values.get(spider_name)
                    if value:
                        update_bookbaseinfos(item._values)
                        msg = 'data or last_update_time update succeeded, isbn13:%s, spider_name:%s, spider_value:%s' % (
                            item['isbn13'], spider_name, value)
                        result = 'success'
            except Exception as e:
                msg = ('%s' % e) + ',' + msg

        msg = get_log_msg('process_item', msg)

        if result == 'success':
            logger().info(msg)
        else:
            logger('e').error(msg)
    def get_classfication(self, response):
        data2 = []  # initialised so the except handler below can always log it
        try:
            url1 = response.url

            xpath1 = ''
            if url1.find('e.dangdang.com') != -1:
                xpath1 = '//*[@id="productBookDetail"]/div[3]/p[5]/span/a/text()'
            else:
                xpath1 = '//*[@id="detail-category-path"]/span/a/text()'

            data2 = response.xpath(xpath1).extract()

            if not data2:
                data2 = get_data_by_selenium('taobao', self.search_text,
                                             'classfication')

            result = check_data_validity('classfication', '>'.join(data2))

            yield generate_item('classfication', self.isbn13, result)

        except Exception as e:
            logger('e').error(
                get_log_msg(
                    'get_classfication', 'isbn13=%s, spider_name=%s, data2=%s, e.msg=%s' %
                    (self.isbn13, 'classfication', data2, e)))
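
Several of these snippets build their output through generate_item(field_name, isbn13, value), which is not part of the listing. A minimal sketch of what it might look like, assuming a single Scrapy Item class whose fields match the spider names; the item class and its field list are assumptions, not the project's actual code:

import scrapy

class BookFieldItem(scrapy.Item):
    # hypothetical item class; field names mirror the spider names seen in these examples
    isbn13 = scrapy.Field()
    classfication = scrapy.Field()
    summary = scrapy.Field()
    price = scrapy.Field()
    trans_name = scrapy.Field()

def generate_item(field_name, isbn13, value):
    # build an item carrying the isbn13 plus the single crawled field
    item = BookFieldItem()
    item['isbn13'] = isbn13
    item[field_name] = value
    return item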
Example #3
def scrapy_all_infos(isbn13):
    spider_name = 'all_infos'
    t = SpiderStartThread('thread-%s-%s' % (spider_name, isbn13), isbn13, spider_name)
    logger().info('thread %s started crawling, isbn13=%s, spider_name=%s' % (t.name, isbn13, spider_name))
    t.start()
    t.join()
    logger().info('thread %s finished crawling, isbn13=%s, spider_name=%s' % (t.name, isbn13, spider_name))
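
SpiderStartThread is used here and in Example #7 but never defined. One plausible sketch, assuming the thread simply shells out to the scrapy command line with the ISBN passed as a spider argument; the command-line form is an assumption:

import subprocess
import threading

class SpiderStartThread(threading.Thread):
    # hypothetical reconstruction: name, isbn13 and spider_name mirror the call sites above
    def __init__(self, name, isbn13, spider_name):
        super().__init__(name=name)
        self.isbn13 = isbn13
        self.spider_name = spider_name

    def run(self):
        # launch "scrapy crawl <spider_name> -a isbn13=<isbn13>" and wait for it to finish
        subprocess.run(
            ['scrapy', 'crawl', self.spider_name, '-a', 'isbn13=%s' % self.isbn13],
            check=False)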
Example #4
def get_data_by_selenium(domain, search_txt, search_type):

    try:
        detail_href_url_list = get_detail_href_list(domain, search_txt)

        for i, detail_href_url in enumerate(detail_href_url_list):

            bs4html = get_bs4html_by_chromedriver(detail_href_url)

            detail_data = get_detail_data_from_bs4html(domain, bs4html,
                                                       search_type)

            if detail_data:
                return detail_data
            if i == 2:  # only try the first three candidate detail pages
                break

        return ''
    except Exception:
        logger('e').error(
            get_log_msg(
                'get_data_by_selenium',
                'domain=%s, search_txt=%s, search_type=%s' %
                (domain, search_txt, search_type)))
        return ''
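
logger() and get_log_msg() appear in every example but are not included in the listing. A minimal sketch, assuming standard-library logging with separate loggers selected by the 'e' and 'w' flags seen above; the logger names are assumptions:

import logging

def logger(kind=''):
    # 'e' selects the error logger, 'w' the warning logger; anything else the default info logger
    names = {'e': 'spider.error', 'w': 'spider.warn'}
    return logging.getLogger(names.get(kind, 'spider.info'))

def get_log_msg(func_name, msg):
    # prefix every message with the name of the reporting function
    return 'func:%s, %s' % (func_name, msg)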
Example #5
def get_detail_href_list(domain, search_txt):

    try:
        url = get_url(domain, search_txt)

        bs4html = get_bs4html_by_chromedriver(url)

        include_a_div_list = get_element_from_bs4html(domain, bs4html)

        title_list = []

        href_list = []

        for i, div in enumerate(include_a_div_list):
            if i <= 4:
                href_list.append(div.find('a').get('href'))
                title_list.append(div.find('a').get_text().strip())
            else:
                break

        max_sim_index_list = get_max_sim_index(title_list, search_txt)

        return [
            check_href_url(domain, href_list[max_sim_index])
            for max_sim_index in max_sim_index_list
        ]
    except Exception:
        logger('e').error(
            get_log_msg('get_detail_href_list',
                        'domain=%s, search_txt=%s' % (domain, search_txt)))
        return []  # keep the return type consistent with the success path
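
get_detail_href_list ranks the candidate links with get_max_sim_index, which is not defined in these examples. A plausible sketch using difflib to order titles by similarity to the search text; the actual similarity metric may differ:

import difflib

def get_max_sim_index(title_list, search_txt):
    # indexes of title_list sorted by descending similarity to search_txt
    scores = [difflib.SequenceMatcher(None, title, search_txt).ratio()
              for title in title_list]
    return sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)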
Example #6
    def parse(self, response):

        try:
            xpath = '//li[1]/p[1]/a/@href'

            data = response.xpath(xpath).extract()

            if len(data) > 0:
                data = data[0]
                yield scrapy.Request(url=data,
                                     callback=self.get_infos,
                                     dont_filter=True)
            else:
                logger('e').error(
                    get_log_msg(
                        'get_dangdang_contain_infos',
                        'isbn13=%s, could not get dangdang second level href' % self.isbn13))

                # other ways of getting the summary are skipped for now
                result = ''
                yield generate_item('summary', self.isbn13, result)

        except Exception as e:
            logger('e').error(
                get_log_msg(
                    'parse', 'isbn13=%s, spider_name=%s, e.msg=%s' %
                    (self.isbn13, 'summary', e)))
Example #7
def start_scrapy(spider_name, isbn13):
    # start the spider in a separate thread
    t = SpiderStartThread('thread-%s-%s' % (spider_name, isbn13), isbn13, spider_name)
    logger().info('thread %s started crawling, isbn13=%s, spider_name=%s' % (t.name, isbn13, spider_name))
    t.start()
    t.join()
    logger().info('thread %s finished crawling, isbn13=%s, spider_name=%s' % (t.name, isbn13, spider_name))
Example #8
    def get_dangdang_contain_infos(self, response):

        item = response.meta['item']

        try:
            xpath = '//li[1]/p[1]/a/@href'

            data = response.xpath(xpath).extract()

            if len(data) > 0:
                data = data[0]
                yield scrapy.Request(url=data,
                                     callback=self.get_infos,
                                     meta={'item': item},
                                     dont_filter=True)
            else:
                logger().info(
                    get_log_msg(
                        'get_dangdang_contain_infos',
                        'isbn13=%s, could not get dangdang second level href' % item['isbn13']))
                yield item
        except Exception as e:
            logger().info(
                get_log_msg('get_dangdang_contain_infos',
                            'isbn13=%s, got exception e.msg=%s' % (item['isbn13'], e)))
            yield item
Example #9
    def parse(self, response):
        result = ''  # initialised so the except handler can always log it
        try:
            xpath = '//*[@id="result_0"]/div/div/div/div[2]/div[2]/div[1]/div[2]/span[2]/text()'
            xpath1 = '//*[@id="result_0"]/div/div/div/div[2]/div[3]/div[1]/div[2]/span[2]/text()'

            result = get_xpath_result(response, 'price', [xpath, xpath1])

            yield generate_item('price', self.isbn13, result)
        except Exception as e:
            logger('e').error(get_log_msg('parse', 'isbn13=%s, spider_name=%s, data=%s, e.msg=%s'
                                          % (self.isbn13, 'price', result, e)))
    def parse(self, response):
        try:
            yield generate_item(
                'trans_name', self.isbn13,
                get_trans_name_by_google_translate(self.isbn13))
        except Exception as e:

            logger('e').error(
                get_log_msg(
                    'parse', 'isbn13=%s, spider_name=%s, e.msg=%s' %
                    (self.isbn13, 'trans_name', e)))
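
The price spider above relies on get_xpath_result to try several XPaths in turn; that helper is not shown in the listing. A minimal sketch, assuming it simply returns the first non-empty extraction (in the real helper, field_name may also drive validation):

def get_xpath_result(response, field_name, xpath_list):
    # try each XPath in order and return the first non-empty extraction
    for xpath in xpath_list:
        data = response.xpath(xpath).extract()
        if data:
            return data[0]
    return ''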
Example #11
    def get_infos(self, response):
        try:
            summary_xpath = '//*[@id="description"]/div[2]/div[1]/div[2]/text()'

            summary = response.xpath(summary_xpath).extract()

            result = check_sql_str(summary[0]) if len(summary) > 0 else ''

            yield generate_item('summary', self.isbn13, result)

        except Exception as e:
            logger('e').error(
                get_log_msg(
                    'get_infos', 'isbn13=%s, spider_name=%s, e.msg=%s' %
                    (self.isbn13, 'summary', e)))
Example #12
def main(list_isbn):
    # initialise logging
    myutils.init_logging()
    # decide whether the proxies.txt list needs updating
    # myutils.update_proxies_txt()
    isbn_list = list_isbn
    spider_thread_list = []
    count = 1
    for isbn in isbn_list:
        if count <= 1000:
            spider_thread_list.append(
                SpiderThread('thread-%d-%s' % (count, isbn), isbn))
            count += 1
        else:
            logger('w').warning(get_log_msg("fun:main", "query count > 1000"))
            break

    for thread in spider_thread_list:
        thread.start()

    for thread in spider_thread_list:
        thread.join()
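
main() above hands each ISBN to a SpiderThread, which is not shown in the listing. A minimal sketch, assuming each thread simply calls scrap_bookinfos (Example #16 below) for its ISBN:

import threading

class SpiderThread(threading.Thread):
    def __init__(self, name, isbn13):
        super().__init__(name=name)
        self.isbn13 = isbn13

    def run(self):
        # one ISBN per thread; scrap_bookinfos is shown in Example #16 below
        scrap_bookinfos(self.isbn13)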
    def parse(self, response):
        try:
            xpath = '//li[1]/p[1]/a/@href'

            data = response.xpath(xpath).extract()

            if len(data) > 0:
                data = data[0]
                yield scrapy.Request(data,
                                     callback=self.get_classfication,
                                     dont_filter=True)
            else:  # if Dangdang has no result or access is blocked, fall back to Taobao
                data2 = get_data_by_selenium('taobao', self.search_text,
                                             'classfication')

                result = check_data_validity('classfication', '>'.join(data2))

                yield generate_item('classfication', self.isbn13, result)
        except Exception as e:
            logger('e').error(
                get_log_msg(
                    'parse', 'isbn13=%s, spider_name=%s, e.msg=%s' %
                    (self.isbn13, 'classfication', e)))
Example #14
def get_bs4html_by_chromedriver(url):

    option = webdriver.ChromeOptions()

    option.add_argument('headless')

    driver = webdriver.Chrome(chrome_options=option)
    try:

        driver.get(url)  # this call can be slow

        page_source = driver.page_source

        if len(page_source) > 300:  # guard against an incomplete html body being returned

            return BeautifulSoup(page_source, 'html.parser')
        else:
            return ''
    except Exception as e:
        logger('e').error(
            get_log_msg('get_bs4html_by_chromedriver', 'url=%s' % url))
        return ''
    finally:
        driver.quit()
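
For reference, a call site could look like the following; the URL is purely illustrative. Note that recent Selenium releases have removed the chrome_options= keyword in favour of options=, so the constructor call above may need adjusting depending on the installed version.

# illustrative usage only; the URL is a placeholder
soup = get_bs4html_by_chromedriver('https://example.com/book/detail')
if soup:
    title_tag = soup.find('title')
    print(title_tag.get_text() if title_tag else '')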
Example #15
    def get_infos(self, response):
        item = response.meta['item']  # fetched before the try so the except handler can yield it
        try:
            url1 = response.url

            if url1.find('e.dangdang.com') != -1:
                classfication_xpath = '//*[@id="productBookDetail"]/div[3]/p[5]/span/a/text()'
            else:
                classfication_xpath = '//*[@id="detail-category-path"]/span/a/text()'

            classfication = response.xpath(classfication_xpath).extract()

            if not classfication:
                search_text = get_valid_search_text(item['title'])

                classfication = get_data_by_selenium('taobao', search_text,
                                                     'classfication')

            item['classfication'] = check_sql_str(
                check_data_validity('classfication', '>'.join(classfication)))

            summary_xpath = '//*[@id="description"]/div[2]/div[1]/div[2]/text()'
            summary = response.xpath(summary_xpath).extract()
            item['summary'] = check_sql_str(
                summary[0]) if len(summary) > 0 else ''

            yield item

        except Exception as e:
            logger('e').error(
                get_log_msg(
                    'get_infos',
                    'isbn13=%s, get exception in second level, e.msg=%s' %
                    (self.isbn13, e)))
            yield item
Example #16
def scrap_bookinfos(isbn13):
    try:
        if myutils.check_isbn(isbn13):
            result = sqlutils.query_list_isbn([isbn13])
            if len(result) > 0 and result[0] != '':
                # updating fields that may have changed is deferred for now
                # logger.info('func:scrap_bookinfos, isbn already in database:%s:' % isbn)
                # check whether any required field is empty; if so, fetch the remaining values
                book_base_infos = convert_db_data_to_bookbaseinfos(result[0])
                if not check_data_integrity(book_base_infos):
                    scrapy_api_unable_get_infos(book_base_infos)
            else:
                # start collecting data, trying the API first
                book_infos = myutils.query_book_infos(isbn13, company_code=1)

                if book_infos:

                    logger().info(get_log_msg('scrap_bookinfos', 'isbn13=%s,book_infos=%s' % (isbn13, book_infos)))
                    # extract the fields from the API response
                    book_base_infos = get_book_base_infos_from_api(book_infos)
                    sqlutils.insert_bookbaseinfos(myutils.obj2dict(book_base_infos))
                    # if found, remove the record from the earlier unfound list
                    # myutils.update_unfound_isbn13_to_txt(isbn13)
                    scrapy_api_unable_get_infos(book_base_infos)
                else:
                    # all the data needs to be crawled; not done for now
                    logger().info('no data for this ISBN in the API database: %s, trying to crawl it from web pages' % isbn13)
                    scrapy_all_infos(isbn13)
        else:
            logger().info(get_log_msg('scrap_bookinfos',
                                      'isbn13 len invalid or in unfound_isbn13.txt: isbn13=%s'
                                      % isbn13))
    except Exception as e:
        logger('e').error(get_log_msg('scrap_bookinfos',
                                      'isbn13 scrap_bookinfos exception, isbn13=%s, e.msg=%s'
                                      % (isbn13, e)))
Example #17
    def parse(self, response):
        try:
            if response.status == 200:
                item = AllInfosItem()

                item['isbn13'] = self.isbn13

                title_xpath = '//*[@id="result_0"]/div/div/div/div[2]/div[1]/div[1]/a/@title'
                title = response.xpath(title_xpath).extract()

                if title:

                    item['title'] = check_sql_str(
                        title[0]) if len(title) > 0 else ''

                    pic_xpath = '//*[@id="result_0"]/div/div/div/div[1]/div/div/a/img/@src'
                    pic = response.xpath(pic_xpath).extract()
                    item['pic'] = pic[0] if len(pic) > 0 else ''

                    pubdate_xpath = '//*[@id="result_0"]/div/div/div/div[2]/div[1]/div[1]/span[3]/text()'
                    pubdate = response.xpath(pubdate_xpath).extract()
                    item['pubdate'] = pubdate[0] if len(pubdate) > 0 else ''

                    author_xpath = '//*[@id="result_0"]/div/div/div/div[2]/div[1]/div[2]/span[position()>1]/text()'
                    author = response.xpath(author_xpath).extract()
                    item['author'] = check_sql_str(
                        ''.join(author)) if len(author) > 0 else ''

                    binding_xpath = '//*[@id="result_0"]/div/div/div/div[2]/div[2]/div[1]/div[1]/a/h3/text()'
                    binding = response.xpath(binding_xpath).extract()
                    item['binding'] = binding[0] if len(binding) > 0 else ''

                    price_xpath = '//*[@id="result_0"]/div/div/div/div[2]/div[2]/div[1]/div[2]/span[2]/text()'
                    price = response.xpath(price_xpath).extract()
                    item['price'] = check_data_validity(
                        'price', price[0]) if len(price) > 0 else ''

                    currency_xpath = '//*[@id="result_0"]/div/div/div/div[2]/div[2]/div[1]/div[2]/a/span[2]/span/sup[1]/text()'
                    currency_xpath1 = '//*[@id="result_0"]/div/div/div/div[2]/div[3]/div[1]/div[2]/a/span[2]/span/sup[1]/text()'
                    currency_xpath2 = '//*[@id="result_0"]/div/div/div/div[2]/div[3]/div[1]/div[1]/a/span[2]/span/sup[1]/text()'
                    item['currency'] = get_xpath_result(
                        response, 'currency',
                        [currency_xpath, currency_xpath1, currency_xpath2])

                    amazon_sec_href_xpath = '//*[@id="result_0"]/div/div/div/div[2]/div[1]/div[1]/a/@href'
                    amazon_sec_href = response.xpath(
                        amazon_sec_href_xpath).extract()
                    if len(amazon_sec_href) > 0:
                        bs4html = get_bs4html_by_chromedriver(
                            amazon_sec_href[0])
                        try:
                            li_list = bs4html.find('table', {
                                'id': "productDetailsTable"
                            }).findAll('li')
                            for li in li_list:
                                b = li.find('b')
                                if b and b.get_text().find('Paperback') != -1:
                                    item['page'] = li.get_text()
                                if b and b.get_text().find('Publisher') != -1:
                                    item['publisher'] = check_sql_str(
                                        str(li.contents[1]).strip())
                                if b and b.get_text().find('ISBN-10') != -1:
                                    item['isbn10'] = check_isbn10(
                                        str(li.contents[1]).strip())
                        except Exception as e:
                            logger('e').error(
                                get_log_msg(
                                    'parse',
                                    'get page, publisher, isbn10 failed, isbn13=%s, spider_name=%s, e.msg=%s'
                                    % (self.isbn13, 'all_infos', e)))

                    if len(title) > 0:
                        item['trans_name'] = check_sql_str(
                            google_translate(title[0]))
                        self.url_code = urllib.parse.quote(
                            get_valid_search_text(title[0]))
                        dangdang_urls = 'http://search.dangdang.com/?key=' + self.url_code + '&act=input'
                        yield scrapy.Request(
                            url=dangdang_urls,
                            callback=self.get_dangdang_contain_infos,
                            meta={'item': item},
                            dont_filter=True)
                    else:
                        yield item
                # else:
                # myutils.update_unfound_isbn13_to_txt(self.isbn13, 'i')
            # else:
            # myutils.update_unfound_isbn13_to_txt(self.isbn13, 'i')
        except Exception as e:
            logger('e').error(
                get_log_msg(
                    'parse', 'isbn13=%s, spider_name=%s, e.msg=%s' %
                    (self.isbn13, 'all_infos', e)))
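
check_sql_str is applied to most text fields before they reach the database, but it is not part of the listing. A minimal sketch, assuming it only escapes quotes and trims whitespace; the real sanitiser may do more, such as enforcing length limits:

def check_sql_str(value):
    # hypothetical reconstruction: escape quotes and trim surrounding whitespace
    if not value:
        return ''
    return value.replace("'", "''").replace('"', '""').strip()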