def parse(self, response):
    try:
        xpath = '//li[1]/p[1]/a/@href'
        data = response.xpath(xpath).extract()
        if len(data) > 0:
            data = data[0]
            yield scrapy.Request(url=data,
                                 callback=self.get_infos,
                                 dont_filter=True)
        else:
            logger('e').error(
                get_log_msg(
                    'parse',
                    'isbn13=%s, did not get dangdang second level href' %
                    self.isbn13))
            # Other ways of fetching the summary are omitted for now
            result = ''
            yield generate_item('summary', self.isbn13, result)
    except Exception as e:
        logger('e').error(
            get_log_msg(
                'parse', 'isbn13=%s, spider_name=%s, e.msg=%s' %
                (self.isbn13, 'summary', e)))
def get_dangdang_contain_infos(self, response):
    item = response.meta['item']
    try:
        xpath = '//li[1]/p[1]/a/@href'
        data = response.xpath(xpath).extract()
        if len(data) > 0:
            data = data[0]
            yield scrapy.Request(url=data,
                                 callback=self.get_infos,
                                 meta={'item': item},
                                 dont_filter=True)
        else:
            logger().info(
                get_log_msg(
                    'get_dangdang_contain_infos',
                    'isbn13=%s, did not get dangdang second level href' %
                    self.isbn13))
            yield item
    except Exception as e:
        logger().info(
            get_log_msg(
                'get_dangdang_contain_infos',
                'isbn13=%s, got exception e.msg=%s' % (self.isbn13, e)))
        yield item
def process_item(self, item, spider):  # Scrapy pipelines require this exact signature
    spider_name = spider.name
    msg = 'data update fail, isbn13:%s, spider_name:%s, spider_value:%s' % (
        item['isbn13'], spider_name, '')
    result = 'fail'
    if item._values:  # the item's underlying field dict (same as dict(item))
        try:
            if spider_name == 'all_infos':
                item._values.update(
                    {'last_update_time': get_current_timestamp_str('m')})
                insert_bookbaseinfos(item._values)
                msg = 'data or last_update_time update success, isbn13:%s, spider_name:%s, spider_value:%s' % (
                    item['isbn13'], spider_name, str(item._values))
                result = 'success'
            else:
                value = item._values.get(spider_name)
                if value:
                    update_bookbaseinfos(item._values)
                    msg = 'data or last_update_time update success, isbn13:%s, spider_name:%s, spider_value:%s' % (
                        item['isbn13'], spider_name, value)
                    result = 'success'
        except Exception as e:
            msg = ('%s' % e) + ',' + msg
    msg = get_log_msg('process_item', msg)
    if result == 'success':
        logger().info(msg)
    else:
        logger('e').error(msg)
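# The single-field spiders above all build their output through a
# generate_item() helper that is not shown in this section. A minimal sketch
# of what it might look like, assuming a scrapy.Item subclass with one field
# per spider name plus isbn13 (the class and field names here are
# assumptions, not the project's actual code):
import scrapy

class BookInfoItem(scrapy.Item):
    # hypothetical item: one field per single-field spider, keyed so that
    # process_item() can look the value up via item._values.get(spider_name)
    isbn13 = scrapy.Field()
    summary = scrapy.Field()
    price = scrapy.Field()
    classfication = scrapy.Field()
    trans_name = scrapy.Field()

def generate_item(field_name, isbn13, value):
    item = BookInfoItem()
    item['isbn13'] = isbn13
    item[field_name] = value  # e.g. 'summary', 'price', 'classfication'
    return item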
def get_classfication(self, response):
    data2 = []  # defined up front so the except block can log it safely
    try:
        url1 = response.url
        if url1.find('e.dangdang.com') != -1:
            xpath1 = '//*[@id="productBookDetail"]/div[3]/p[5]/span/a/text()'
        else:
            xpath1 = '//*[@id="detail-category-path"]/span/a/text()'
        data2 = response.xpath(xpath1).extract()
        if not data2:
            # fall back to Taobao via selenium; expected to return a list of
            # category segments
            data2 = get_data_by_selenium('taobao', self.search_text,
                                         'classfication')
        result = check_data_validity('classfication', '>'.join(data2))
        yield generate_item('classfication', self.isbn13, result)
    except Exception as e:
        logger('e').error(
            get_log_msg(
                'get_classfication',
                'isbn13=%s, spider_name=%s, data2=%s, e.msg=%s' %
                (self.isbn13, 'classfication', data2, e)))
def get_data_by_selenium(domain, search_txt, search_type):
    try:
        detail_href_url_list = get_detail_href_list(domain, search_txt)
        for i, detail_href_url in enumerate(detail_href_url_list):
            bs4html = get_bs4html_by_chromedriver(detail_href_url)
            detail_data = get_detail_data_from_bs4html(domain, bs4html,
                                                       search_type)
            if detail_data:
                return detail_data
            if i == 2:  # only try the first three detail links
                break
        return ''
    except Exception:
        logger('e').error(
            get_log_msg(
                'get_data_by_selenium',
                'domain=%s, search_txt=%s, search_type=%s' %
                (domain, search_txt, search_type)))
        return ''
def get_detail_href_list(domain, search_txt):
    try:
        url = get_url(domain, search_txt)
        bs4html = get_bs4html_by_chromedriver(url)
        include_a_div_list = get_element_from_bs4html(domain, bs4html)
        title_list = []
        href_list = []
        for i, div in enumerate(include_a_div_list):
            if i <= 4:  # only inspect the first five search results
                href_list.append(div.find('a').get('href'))
                title_list.append(div.find('a').get_text().strip())
            else:
                break
        max_sim_index_list = get_max_sim_index(title_list, search_txt)
        return [
            check_href_url(domain, href_list[max_sim_index])
            for max_sim_index in max_sim_index_list
        ]
    except Exception:
        logger('e').error(
            get_log_msg('get_detail_href_list',
                        'domain=%s, search_txt=%s' % (domain, search_txt)))
        return []  # an empty list keeps the caller's loop a no-op
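# get_max_sim_index() is not shown above. A minimal sketch, assuming it ranks
# the candidate titles by string similarity to the search text and returns
# their indices, best match first (difflib is just one possible similarity
# measure; the real implementation may differ):
import difflib

def get_max_sim_index(title_list, search_txt):
    scores = [
        difflib.SequenceMatcher(None, title, search_txt).ratio()
        for title in title_list
    ]
    # indices sorted by descending similarity to search_txt
    return sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)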
def query_trans_name(title):
    sql = "select trans_name from %s where title='%s'" % (
        TABLE_NAME_BASE_INFOS, title)
    try:
        return excute_query_sql(sql)
    except Exception as e:
        raise Exception(('%s' % e) +
                        get_log_msg('query_trans_name', 'sql=%s' % sql))
def insert_bookbaseinfos(dict_bookbaseinfos):
    table_name = 'book_base_info'
    sql = generate_insert_sql(table_name, dict_bookbaseinfos)
    try:
        excute_no_query_sql(sql)
    except Exception as e:
        raise Exception(('%s' % e) +
                        get_log_msg('insert_bookbaseinfos', 'sql=%s' % sql))
def query_title(isbn13):
    sql = "select title from %s where isbn13='%s'" % (TABLE_NAME_BASE_INFOS,
                                                      isbn13)
    try:
        return excute_query_sql(sql)
    except Exception as e:
        raise Exception(('%s' % e) +
                        get_log_msg('query_title', 'sql=%s' % sql))
def parse(self, response):
    result = ''  # defined up front so the except block can log it safely
    try:
        xpath = '//*[@id="result_0"]/div/div/div/div[2]/div[2]/div[1]/div[2]/span[2]/text()'
        xpath1 = '//*[@id="result_0"]/div/div/div/div[2]/div[3]/div[1]/div[2]/span[2]/text()'
        result = get_xpath_result(response, 'price', [xpath, xpath1])
        yield generate_item('price', self.isbn13, result)
    except Exception as e:
        logger('e').error(
            get_log_msg(
                'parse', 'isbn13=%s, spider_name=%s, data=%s, e.msg=%s' %
                (self.isbn13, 'price', result, e)))
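# get_xpath_result() is used here and in the all_infos spider but not shown.
# A minimal sketch, assuming it tries each XPath in order and returns the
# first non-empty match after passing it through the project's own
# check_data_validity() helper:
def get_xpath_result(response, data_type, xpath_list):
    for xpath in xpath_list:
        data = response.xpath(xpath).extract()
        if data:
            return check_data_validity(data_type, data[0])
    return ''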
def parse(self, response):
    try:
        yield generate_item(
            'trans_name', self.isbn13,
            get_trans_name_by_google_translate(self.isbn13))
    except Exception as e:
        logger('e').error(
            get_log_msg(
                'parse', 'isbn13=%s, spider_name=%s, e.msg=%s' %
                (self.isbn13, 'trans_name', e)))
def excute_no_query_sql(sql):
    db = connect_mysql()
    cursor = db.cursor()
    try:
        cursor.execute(sql)
        db.commit()  # commit the current transaction
    except pymysql.Error as e:
        raise Exception(('%s' % e) +
                        get_log_msg('excute_no_query_sql', 'sql=%s' % sql))
    finally:
        db.close()
def query_list_isbn(list_isbn):
    sql = "select * from %s where isbn13='%s'" % (TABLE_NAME_BASE_INFOS,
                                                  list_isbn[0])
    for isbn in list_isbn[1:]:
        sql += (" or isbn13='%s'" % isbn)
    try:
        return excute_query_sql(sql)
    except Exception as e:
        raise Exception(('%s' % e) +
                        get_log_msg('query_list_isbn', 'sql=%s' % sql))
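# The query helpers above build SQL by string interpolation, which is open to
# SQL injection if a title or ISBN ever contains a quote. A sketch of the
# same lookup using pymysql's parameterized queries; connect_mysql and
# TABLE_NAME_BASE_INFOS are the project's own, the function name here is
# hypothetical:
def query_list_isbn_safe(list_isbn):
    db = connect_mysql()
    try:
        with db.cursor() as cur:
            placeholders = ', '.join(['%s'] * len(list_isbn))
            sql = "select * from %s where isbn13 in (%s)" % (
                TABLE_NAME_BASE_INFOS, placeholders)
            cur.execute(sql, list_isbn)  # pymysql escapes the values itself
            return cur.fetchall()
    finally:
        db.close()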
def get_infos(self, response):
    try:
        summary_xpath = '//*[@id="description"]/div[2]/div[1]/div[2]/text()'
        summary = response.xpath(summary_xpath).extract()
        result = check_sql_str(summary[0]) if len(summary) > 0 else ''
        yield generate_item('summary', self.isbn13, result)
    except Exception as e:
        logger('e').error(
            get_log_msg(
                'get_infos', 'isbn13=%s, spider_name=%s, e.msg=%s' %
                (self.isbn13, 'summary', e)))
def scrap_bookinfos(isbn13):
    try:
        if myutils.check_isbn(isbn13):
            result = sqlutils.query_list_isbn([isbn13])
            if len(result) > 0 and result[0] != '':
                # Refresh fields that may have changed; skipped for now
                # logger.info('func:scrap_bookinfos, isbn already in database:%s:' % isbn)
                # Check for empty required fields; if any, fetch the missing values
                book_base_infos = convert_db_data_to_bookbaseinfos(result[0])
                if not check_data_integrity(book_base_infos):
                    scrapy_api_unable_get_infos(book_base_infos)
            else:
                # Start scraping; try the API first
                book_infos = myutils.query_book_infos(isbn13, company_code=1)
                if book_infos:
                    logger().info(
                        get_log_msg(
                            'scrap_bookinfos',
                            'isbn13=%s,book_infos=%s' % (isbn13, book_infos)))
                    # Extract the data returned by the API query
                    book_base_infos = get_book_base_infos_from_api(book_infos)
                    sqlutils.insert_bookbaseinfos(
                        myutils.obj2dict(book_base_infos))
                    # If found, drop the record from the earlier unfound list
                    # myutils.update_unfound_isbn13_to_txt(isbn13)
                    scrapy_api_unable_get_infos(book_base_infos)
                else:
                    # Nothing from the API, so everything must come from the web
                    logger().info(
                        'API has no data for this ISBN: %s, trying the web instead'
                        % isbn13)
                    scrapy_all_infos(isbn13)
        else:
            logger().info(
                get_log_msg(
                    'scrap_bookinfos',
                    'isbn13 len invalid or in unfound_isbn13.txt: isbn13=%s' %
                    isbn13))
    except Exception as e:
        logger('e').error(
            get_log_msg(
                'scrap_bookinfos',
                'isbn13 scrap_bookinfos exception, isbn13=%s, e.msg=%s' %
                (isbn13, e)))
def excute_query_sql(sql):
    db = connect_mysql()
    cur = db.cursor()
    try:
        cur.execute(sql)  # run the query
        results = cur.fetchall()  # fetch all matching rows
        if len(results) == 0:
            return ['']
        if results[0][0] == '':
            return ['']
        return results
    except Exception as e:
        raise Exception(('%s' % e) +
                        get_log_msg('excute_query_sql', 'sql=%s' % sql))
    finally:
        db.close()
def main(list_isbn):
    # Initialise logging
    myutils.init_logging()
    # Decide whether the proxies.txt list needs refreshing
    # myutils.update_proxies_txt()
    isbn_list = list_isbn
    spider_thread_list = []
    count = 1
    for isbn in isbn_list:
        if count <= 1000:
            spider_thread_list.append(
                SpiderThread('thread-%d-%s' % (count, isbn), isbn))
            count += 1
        else:
            logger('w').warning(get_log_msg('main', 'query count > 1000'))
            break
    for thread in spider_thread_list:
        thread.start()
    for thread in spider_thread_list:
        thread.join()
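# SpiderThread is not defined in this section. A minimal sketch, assuming it
# is a thin threading.Thread wrapper that runs the per-ISBN entry point
# scrap_bookinfos() shown above (the class body is an assumption, not the
# project's actual code):
import threading

class SpiderThread(threading.Thread):
    def __init__(self, name, isbn13):
        super().__init__(name=name)
        self.isbn13 = isbn13

    def run(self):
        scrap_bookinfos(self.isbn13)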
def parse(self, response):
    try:
        xpath = '//li[1]/p[1]/a/@href'
        data = response.xpath(xpath).extract()
        if len(data) > 0:
            data = data[0]
            yield scrapy.Request(data,
                                 callback=self.get_classfication,
                                 dont_filter=True)
        else:
            # Not found on Dangdang (or access restricted); fall back to Taobao
            data2 = get_data_by_selenium('taobao', self.search_text,
                                         'classfication')
            result = check_data_validity('classfication', '>'.join(data2))
            yield generate_item('classfication', self.isbn13, result)
    except Exception as e:
        logger('e').error(
            get_log_msg(
                'parse', 'isbn13=%s, spider_name=%s, e.msg=%s' %
                (self.isbn13, 'classfication', e)))
def get_bs4html_by_chromedriver(url):
    option = webdriver.ChromeOptions()
    option.add_argument('headless')
    driver = webdriver.Chrome(chrome_options=option)
    try:
        driver.get(url)  # this call takes a relatively long time
        page_source = driver.page_source
        if len(page_source) > 300:  # guard against incomplete html being returned
            return BeautifulSoup(page_source, 'html.parser')
        else:
            return ''
    except Exception as e:
        logger('e').error(
            get_log_msg('get_bs4html_by_chromedriver',
                        'url=%s, e.msg=%s' % (url, e)))
        return ''
    finally:
        driver.quit()
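# Note that chrome_options= and the bare 'headless' flag are the Selenium 3
# style; Selenium 4 removed the chrome_options keyword. A sketch of the same
# helper against the Selenium 4 API, in case the project upgrades (the logic
# is unchanged, only the setup calls differ):
from bs4 import BeautifulSoup
from selenium import webdriver

def get_bs4html_by_chromedriver_v4(url):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless=new')  # new headless mode, Chrome 109+
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        page_source = driver.page_source
        if len(page_source) > 300:
            return BeautifulSoup(page_source, 'html.parser')
        return ''
    finally:
        driver.quit()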
def get_infos(self, response):
    item = response.meta['item']  # fetched before try so the except can yield it
    try:
        url1 = response.url
        if url1.find('e.dangdang.com') != -1:
            classfication_xpath = '//*[@id="productBookDetail"]/div[3]/p[5]/span/a/text()'
        else:
            classfication_xpath = '//*[@id="detail-category-path"]/span/a/text()'
        classfication = response.xpath(classfication_xpath).extract()
        if not classfication:
            search_text = get_valid_search_text(item['title'])
            classfication = get_data_by_selenium('taobao', search_text,
                                                 'classfication')
        item['classfication'] = check_sql_str(
            check_data_validity('classfication', '>'.join(classfication)))
        summary_xpath = '//*[@id="description"]/div[2]/div[1]/div[2]/text()'
        summary = response.xpath(summary_xpath).extract()
        item['summary'] = check_sql_str(summary[0]) if len(summary) > 0 else ''
        yield item
    except Exception as e:
        logger('e').error(
            get_log_msg(
                'get_infos',
                'isbn13=%s, got exception in second level, e.msg=%s' %
                (self.isbn13, e)))
        yield item
def parse(self, response):
    try:
        if response.status == 200:
            item = AllInfosItem()
            item['isbn13'] = self.isbn13
            title_xpath = '//*[@id="result_0"]/div/div/div/div[2]/div[1]/div[1]/a/@title'
            title = response.xpath(title_xpath).extract()
            if title:
                item['title'] = check_sql_str(title[0])
            pic_xpath = '//*[@id="result_0"]/div/div/div/div[1]/div/div/a/img/@src'
            pic = response.xpath(pic_xpath).extract()
            item['pic'] = pic[0] if len(pic) > 0 else ''
            pubdate_xpath = '//*[@id="result_0"]/div/div/div/div[2]/div[1]/div[1]/span[3]/text()'
            pubdate = response.xpath(pubdate_xpath).extract()
            item['pubdate'] = pubdate[0] if len(pubdate) > 0 else ''
            author_xpath = '//*[@id="result_0"]/div/div/div/div[2]/div[1]/div[2]/span[position()>1]/text()'
            author = response.xpath(author_xpath).extract()
            item['author'] = check_sql_str(
                ''.join(author)) if len(author) > 0 else ''
            binding_xpath = '//*[@id="result_0"]/div/div/div/div[2]/div[2]/div[1]/div[1]/a/h3/text()'
            binding = response.xpath(binding_xpath).extract()
            item['binding'] = binding[0] if len(binding) > 0 else ''
            price_xpath = '//*[@id="result_0"]/div/div/div/div[2]/div[2]/div[1]/div[2]/span[2]/text()'
            price = response.xpath(price_xpath).extract()
            item['price'] = check_data_validity(
                'price', price[0]) if len(price) > 0 else ''
            currency_xpath = '//*[@id="result_0"]/div/div/div/div[2]/div[2]/div[1]/div[2]/a/span[2]/span/sup[1]/text()'
            currency_xpath1 = '//*[@id="result_0"]/div/div/div/div[2]/div[3]/div[1]/div[2]/a/span[2]/span/sup[1]/text()'
            currency_xpath2 = '//*[@id="result_0"]/div/div/div/div[2]/div[3]/div[1]/div[1]/a/span[2]/span/sup[1]/text()'
            item['currency'] = get_xpath_result(
                response, 'currency',
                [currency_xpath, currency_xpath1, currency_xpath2])
            amazon_sec_href_xpath = '//*[@id="result_0"]/div/div/div/div[2]/div[1]/div[1]/a/@href'
            amazon_sec_href = response.xpath(amazon_sec_href_xpath).extract()
            if len(amazon_sec_href) > 0:
                bs4html = get_bs4html_by_chromedriver(amazon_sec_href[0])
                try:
                    li_list = bs4html.find('table', {
                        'id': "productDetailsTable"
                    }).findAll('li')
                    for li in li_list:
                        b = li.find('b')
                        if b and b.get_text().find('Paperback') != -1:
                            item['page'] = li.get_text()
                        if b and b.get_text().find('Publisher') != -1:
                            item['publisher'] = check_sql_str(
                                str(li.contents[1]).strip())
                        if b and b.get_text().find('ISBN-10') != -1:
                            item['isbn10'] = check_isbn10(
                                str(li.contents[1]).strip())
                except Exception as e:
                    logger('e').error(
                        get_log_msg(
                            'parse',
                            'get page, publisher, isbn10 fail, isbn13=%s, spider_name=%s, e.msg=%s'
                            % (self.isbn13, 'all_infos', e)))
            if len(title) > 0:
                item['trans_name'] = check_sql_str(google_translate(title[0]))
                self.url_code = urllib.parse.quote(
                    get_valid_search_text(title[0]))
                dangdang_urls = 'http://search.dangdang.com/?key=' + self.url_code + '&act=input'
                yield scrapy.Request(url=dangdang_urls,
                                     callback=self.get_dangdang_contain_infos,
                                     meta={'item': item},
                                     dont_filter=True)
            else:
                yield item
                # myutils.update_unfound_isbn13_to_txt(self.isbn13, 'i')
        # else:
        #     myutils.update_unfound_isbn13_to_txt(self.isbn13, 'i')
    except Exception as e:
        logger('e').error(
            get_log_msg(
                'parse', 'isbn13=%s, spider_name=%s, e.msg=%s' %
                (self.isbn13, 'all_infos', e)))