def xt_pres_date(cls, raw_person):
    """Extract the administration start/end dates from a person's table row.

    The second <td> cell holds either a "DD.MM.YYYY - DD.MM.YYYY" range or a
    single "DD.MM.YYYY -" start date for an ongoing administration.

    Returns:
        (start_date, end_date) as ``datetime.date`` objects. ``end_date`` is
        None for an open-ended administration; both are None when the cell
        cannot be parsed.
    """
    admin_datestring = Selector(text=raw_person).xpath(
        '//td[2]/text()').extract()[0]
    try:
        if " - " in admin_datestring:
            start_raw, end_raw = admin_datestring.split(' - ', 1)
            start_date = datetime.datetime.strptime(
                _clean(start_raw), "%d.%m.%Y").date()
            end_date = datetime.datetime.strptime(
                _clean(end_raw), "%d.%m.%Y").date()
        else:
            start_date = datetime.datetime.strptime(
                _clean(admin_datestring.replace(' -', '')),
                "%d.%m.%Y").date()
            end_date = None
    except ValueError:
        # Malformed date cell: log and signal failure instead of dropping
        # into an interactive debugger — the original's bare `except` plus
        # ipdb.set_trace() hung unattended crawls, and the subsequent return
        # referenced possibly-unbound variables.
        logger.error(
            "Couldn't extract date from datestring {}".format(
                admin_datestring))
        return (None, None)
    return (start_date, end_date)
def parse_subPage(self, response):
    """Parse one gallery page: collect image URLs, follow the in-gallery
    pagination, then follow related-gallery links that are not on known
    ad/aggregator domains.
    """
    item = LuyiluImgItem()
    sel = Selector(response)
    item['img_url'] = sel.xpath(
        '//img[contains(@src,"images.") and not(@class="thumb")]/@src'
    ).extract()
    cur_title = sel.xpath('//h1/text()').extract_first()
    # Strip a trailing "(N)" page counter from the title, if present.
    counter = re.search(r'\(\d*\)', cur_title)
    item['title'] = cur_title.replace(counter.group(0), '') if counter else cur_title
    item['url'] = response.url
    next_suburl = sel.xpath(
        '//li[@class="next-page"]/a/@href').extract_first(default=None)
    if next_suburl is not None:
        # Swap the last path segment for the next-page href. The original
        # also re-requested response.url itself when there was no next page
        # (a no-op thanks to the dupe filter); skip that redundant request.
        next_pagurl = response.url.replace(
            response.url.split('/')[-1], next_suburl)
        yield Request(next_pagurl, callback=self.parse_subPage)
    yield item
    # Related galleries; one combined pattern instead of five regexes
    # recompiled on every loop iteration.
    blocked = re.compile(r'youfanhao|xiurenwang|youmihui|xiachedan|chuchu')
    for url in sel.xpath('//a[contains(@href,"/20")]/@href').extract():
        if not blocked.search(url):
            yield Request('http://yxpjwnet1.com' + url,
                          callback=self.parse_subPage)
def person(self, response):
    """当前公司所有人员url — collect detail-page URLs for every employee of
    the current company, following in-table pagination (25 rows per page).
    """
    mycontinue = True
    # 获取当前表里的所有数据
    rows = Selector(response=response).xpath('//tbody/tr')
    # Total staff count, rendered as "(N)" next to the staff-list link.
    all_date = Selector(response=response).xpath(
        '//div[@class="comp_regstaff_links"]/a[1]/span/text()'
    ).extract_first()
    all_date = int(all_date.replace(')', '').replace('(', ''))
    if all_date == 0:
        print('----公司无人员\n\n')
        return 'zz'
    if all_date < 26:
        # Fits on a single page, so skip the pagination requests below.
        mycontinue = False
    # 算出有能有多少页 (upper bound for range() below, hence the +2).
    self.page = all_date // 25 + 2
    # 拿出所有的人员的A标签属性
    for row in rows:
        onclick = row.xpath('./td/a/@onclick').extract_first()
        if onclick is not None:
            # onclick looks like: ...top.window.location.href='<path>'...
            person_url = onclick.split(
                'top.window.location.href=\'')[1].split('\'')[0]
            person_url = self.big_url + person_url
            # NOTE(review): time.sleep blocks the whole reactor; consider
            # DOWNLOAD_DELAY instead.
            time.sleep(0.5)
            yield Request(url=person_url, callback=self.person_detailed)
    # 查看是否有分页
    another_page = Selector(
        response=response).xpath('//div[@class="clearfix"]')
    # 如果不够分页或者,没有分页选择器这不执行
    if another_page != [] and mycontinue:
        for page_no in range(2, self.page):
            print(page_no)
            yield scrapy.FormRequest(response.url,
                                     formdata={'$pg': str(page_no)},
                                     callback=self.person)
        # 只循环一次
        mycontinue = False
def get_profile_links(page_content: str) -> List[tuple]:
    """Extract (company title, profile href) pairs from a directory page.

    Titles have non-breaking spaces normalized, are HTML-escaped and
    lowercased. Listing entries missing an href or title are skipped
    instead of raising IndexError (the original indexed extract()[0]
    unconditionally).
    """
    profile_selector = "#companies-column > ul > li a"
    profile_links = Selector(text=page_content).css(profile_selector).extract()
    results = []
    for link in profile_links:
        href = Selector(text=link).css("a::attr(href)").extract_first()
        company_title = Selector(text=link).css("h3::text").extract_first()
        if href is None or company_title is None:
            continue  # malformed listing entry — skip rather than crash
        # assumes the replaced character is U+00A0 (non-breaking space), as
        # commonly emitted in these listings — TODO confirm against source
        clean_company_title = escape_html(
            company_title.replace("\xa0", " ")).lower()
        results.append((clean_company_title, href))
    return results
def parse(self, response):
    """Parse a search-result page and yield one VideoItem per video entry."""
    video_list = response.xpath(
        "//div[@class='search-video-wrap']/div[@class='video-list clearfix add-quick-recommend']/ul/li"
    ).extract()
    for entry in video_list:
        sel = Selector(text=entry)
        video_url = sel.xpath(
            "//a/div[@class='video-box']/video/@data-original"
        ).extract_first()
        if video_url is None:
            # Entry without a preview video (e.g. an ad slot): the original
            # crashed on None.replace() here — skip it instead.
            continue
        # Fresh item per entry: the original reused one VideoItem instance,
        # so every yielded item aliased the same mutated dict.
        video_item = VideoItem()
        video_item['video_url'] = 'https:' + video_url.replace("_10s", "")
        video_item['video_title'] = sel.xpath(
            "//a[@class='video-name fl']/h3/text()").extract_first()
        video_item['video_time'] = sel.xpath(
            "//a[@class='video-name fl']/span[@class='video-time']/text()"
        ).extract_first()
        yield video_item
def google_selector(self, response):
    """Parse a Google results page: push every snippet with a non-empty cite
    into the item pipeline, then follow pagination links up to page 5."""
    base_url = "https://www.google.com.mx/"
    snippets = response.xpath("//div[@class='g']").extract()
    itemproc = self.crawler.engine.scraper.itemproc
    id_person = response.meta['id_person']
    base_attr = response.meta['attr']
    search = response.meta['search']
    num_snippet = response.meta['num_snip']
    for snippet in snippets:
        num_snippet = num_snippet + 1
        storage_item = UsmItem()
        title = Selector(text=snippet).xpath("//a/b/text() | //a/text()").extract()
        cite = Selector(text=snippet).xpath("//cite").extract()
        # cite = Selector(text=snippet).xpath("//h3/a/@href").extract()
        text = Selector(text=snippet).xpath("//span[@class='st']").extract()
        # Google splits the link title around <b> highlights; rejoin both halves.
        if len(title) >= 2:
            title = title[0] + title[1]
        else:
            title = ""
        if len(cite) > 0:
            # cite = cite[0].split("url?q=")[-1]
            cite = cite[0]
            for marker in ['<cite>', '</cite>', '<b>', '</b>']:
                cite = cite.replace(marker, '')
        else:
            cite = ""
        if len(text) > 0:
            text = text[0]
            for marker in ['<span class="st">', '</span>', '<br>', '</br>',
                           '<b>', '</b>']:
                text = text.replace(marker, '')
        else:
            text = ""
        if cite != "":
            self.log("---------------------------------")
            self.log("--------------TITLE--------------")
            self.log(title)
            self.log("-------------CITE----------------")
            self.log(cite)
            self.log("---------------TEXT--------------")
            self.log(text)
            self.log("------------ID PERSON------------")
            self.log(id_person)
            self.log("------------SEARCH---------------")
            self.log(search)
            self.log("--------------ATTR---------------")
            self.log(base_attr)
            self.log("-----------ENGINE SEARCH---------")
            self.log(self.browser)
            self.log("------------NUMBER SNIPPET-------")
            self.log(num_snippet)
            storage_item['title'] = title
            storage_item['cite'] = cite
            storage_item['text'] = text
            storage_item['id_person'] = id_person
            storage_item['search'] = search
            storage_item['attr'] = base_attr
            storage_item['engine_search'] = self.browser
            storage_item['number_snippet'] = num_snippet
            itemproc.process_item(storage_item, self)
    number = response.xpath("//td/b/text()").extract()
    self.log("-----------NUMBER OF PAGE-----")
    # BUG FIX: the original indexed number[0] unconditionally, raising
    # IndexError on pages with no page counter (e.g. captcha interstitials).
    if number:
        self.log(number[0] + "")
        if int(number[0]) < 6:
            res = response.xpath("//td[@class='b'][@style='text-align:left']/a[@class='fl']/@href").extract()
            for url in res:
                self.log("--URL TO FOLLOW--")
                self.log(base_url + url)
                request = Request(base_url + url, callback=self.google_selector)
                request.meta['id_person'] = id_person
                request.meta['search'] = search
                request.meta['attr'] = base_attr
                request.meta['num_snip'] = num_snippet
                yield request
def parse_detail_info(self, response):
    """Dispatch a news detail page to the site-specific parsing branch based
    on its URL and yield a loaded spider.news item.

    Handled sites: 新浪科技, 艾瑞研究报告, 腾讯科技, 今日头条, 36氪, 艾瑞网,
    搜狐科技, 钛媒体. The nested else/if pyramid of the original is flattened
    to an elif chain (report.iresearch.cn must stay before the generic
    iresearch.cn check), and the repeated <a>/<img> tag stripping is factored
    into _strip_tags().
    """
    detail_info = response.text
    if ('http://tech.sina.com.cn' in response.url) or ('https://tech.sina.com.cn' in response.url):
        # 新浪科技
        url = response.url.strip()
        title = response.meta['title']
        content = ''.join(
            Selector(text=detail_info).xpath(
                '//div[@id="artibody"]/p').extract())
        content = self._strip_tags(content)
        cover = response.meta['cover']
        pdf = ''
        # Keyword block moved between two layouts; try both.
        if len(Selector(text=detail_info).xpath(
                '//div[@id="keywords"]/a/text()').extract()) > 0:
            keywords = ','.join(
                Selector(text=detail_info).xpath(
                    '//div[@id="keywords"]/a/text()').extract())
        else:
            keywords = ','.join(
                Selector(text=detail_info).xpath(
                    '//p[@class="art_keywords"]/a/text()').extract())
        hot = response.meta['hot']
        type = response.meta['type']
        if type != '快讯':
            type = self.get_type(content)
        # Publication date also moved between two layouts.
        if len(Selector(text=detail_info).xpath(
                '//span[@class="date"]/text()').extract()) > 0:
            update = Selector(text=detail_info).xpath(
                '//span[@class="date"]/text()').extract()[0].strip()
        else:
            update = Selector(text=detail_info).xpath(
                '//span[@id="pub_date"]/text()').extract()[0].strip()
        yield self.save_result(self.batch, url, title, content, cover, pdf,
                               keywords, hot, type, update, 'spider.news',
                               response).load_item()
    elif 'http://report.iresearch.cn' in response.url:
        # 艾瑞研究报告
        url = response.url
        title = response.meta['title']
        content = self._strip_tags(response.meta['content'])
        cover = response.meta['cover']
        pdf = ''
        pdf_price = Selector(text=detail_info).xpath(
            '//li[@class="price"]/text()').extract()[0]
        pdf_url = ('http://report.iresearch.cn/include/ajax/user_ajax.ashx?reportid='
                   + str(url[url.rfind('/') + 1:-6]) + '&work=rdown&url=' + url)
        if '¥0' == pdf_price:
            # NOTE(review): `yield Request(...)` only yields a response value
            # under an inline-requests style middleware — confirm setup.
            pdf_content = yield Request(pdf_url)
            # self.save_pdf(pdf_content)
            pdf = base64.b64encode(pdf_content.body)
        keywords = response.meta['keywords']
        hot = response.meta['hot']
        type = response.meta['type']
        update = response.meta['update']
        yield self.save_result(self.batch, url, title, content, cover, pdf,
                               keywords, hot, type, update, 'spider.news',
                               response).load_item()
    elif 'https://new.qq.com' in response.url:
        # 解析腾讯科技详情
        url = response.url.strip()
        title = response.meta['title']
        content = ''.join(
            Selector(text=detail_info).xpath(
                '//div[@class="content-article"]/p').extract())
        if len(content) != 0:
            content = self._strip_tags(content)
            cover = response.meta['cover']
            pdf = ''
            keywords = Selector(text=detail_info).xpath(
                '//meta[@name="keywords"]/@content').extract()[0].strip()
            hot = response.meta['hot']
            type = response.meta['type']
            if type != '快讯':
                type = self.get_type(content)
            # Publication time lives in an inline JS blob, not the DOM.
            update = detail_info.split('pubtime": "')[1].split('",')[0]
            yield self.save_result(self.batch, url, title, content, cover,
                                   pdf, keywords, hot, type, update,
                                   'spider.news', response).load_item()
    elif "https://www.toutiao.com" in response.url:
        # 头条详情解析
        url = response.url.strip()
        title = Selector(text=detail_info).xpath(
            '//title/text()').extract()[0].strip()
        # Article body is embedded in inline JS, not the DOM.
        content = detail_info.split('content: \'')[1].split('groupId: \'')[0]
        content = content.replace(";',", "")
        # BUG FIX: the original re-encoded content to bytes here
        # (content.encode("utf-8")), which makes the str-pattern re.sub
        # calls in _strip_tags raise TypeError on Python 3 — keep it str.
        content = self._strip_tags(content)
        cover = response.meta['cover']
        pdf = ''
        keywords = Selector(text=detail_info).xpath(
            '//meta[@name="keywords"]/@content').extract()[0].strip()
        hot = response.meta['hot']
        type = response.meta['type']
        if type != '快讯':
            type = self.get_type(content)
        update = detail_info.split("time: '")[1].split("'")[0]
        yield self.save_result(self.batch, url, title, content, cover, pdf,
                               keywords, hot, type, update, 'spider.news',
                               response).load_item()
    elif "https://www.36kr.com" in response.url:
        # 36氪解析详情
        url = response.url.strip()
        title = Selector(text=detail_info).xpath(
            '//title/text()').extract()[0].strip()
        title = title.replace('_36氪', '')
        content = ''.join(
            Selector(text=detail_info).xpath(
                '//div[@class="common-width content articleDetailContent"]/p'
            ).extract())
        content = self._strip_tags(content)
        cover = response.meta['cover']
        pdf = ''
        keywords = Selector(text=detail_info).xpath(
            '//meta[@name="keywords"]/@content').extract()[0].strip()
        hot = response.meta['hot']
        type = response.meta['type']
        if type != '快讯':
            type = self.get_type(content)
        update = response.meta['update']
        yield self.save_result(self.batch, url, title, content, cover, pdf,
                               keywords, hot, type, update, 'spider.news',
                               response).load_item()
    elif 'iresearch.cn' in response.url:
        # 艾瑞网详情解析
        url = response.url.strip()
        title = Selector(text=detail_info).xpath(
            '//title/text()').extract()[0].strip()
        title = title.replace('_互联网_艾瑞网', '')
        content = ''.join(
            Selector(text=detail_info).xpath(
                '//div[@class="m-article"]/p').extract())
        content = self._strip_tags(content)
        cover = response.meta['cover']
        pdf = ''
        keywords = Selector(text=detail_info).xpath(
            '//meta[@name="keywords"]/@content').extract()[0].strip()
        hot = response.meta['hot']
        type = response.meta['type']
        if type != '快讯':
            type = self.get_type(content)
        update = Selector(text=detail_info).xpath(
            '//div[@class="box"]//div[@class="origin"]//em/text()'
        ).extract()[0].strip()
        yield self.save_result(self.batch, url, title, content, cover, pdf,
                               keywords, hot, type, update, 'spider.news',
                               response).load_item()
    elif 'http://www.sohu.com' in response.url:
        # 搜狐科技解析详情
        url = response.url.strip()
        title = Selector(text=detail_info).xpath(
            '//title/text()').extract()[0].strip()
        content = ''.join(
            Selector(text=detail_info).xpath(
                '//article[@class="article"]/p').extract())
        content = self._strip_tags(content)
        cover = response.meta['cover']
        pdf = ''
        keywords = Selector(text=detail_info).xpath(
            '//meta[@name="keywords"]/@content').extract()[0].strip()
        hot = response.meta['hot']
        type = response.meta['type']
        if type != '快讯':
            type = self.get_type(content)
        update = Selector(text=detail_info).xpath(
            '//div[@class="article-info"]//span[@class="time"]/text()'
        ).extract()[0].strip()
        yield self.save_result(self.batch, url, title, content, cover, pdf,
                               keywords, hot, type, update, 'spider.news',
                               response).load_item()
    elif 'http://www.tmtpost.com/' in response.url:
        # 钛媒体详情解析
        url = response.url.strip()
        title = Selector(text=detail_info).xpath(
            '//title/text()').extract()[0].strip()
        title = title.replace('-钛媒体官方网站', '')
        content = ''.join(
            Selector(text=detail_info).xpath(
                '//div[@class="inner"]/p').extract())
        content = self._strip_tags(content)
        cover = response.meta['cover']
        pdf = ''
        keywords = Selector(text=detail_info).xpath(
            '//meta[@name="keywords"]/@content').extract()[0].strip()
        hot = response.meta['hot']
        type = response.meta['type']
        if type != '快讯':
            type = self.get_type(content)
        update = Selector(text=detail_info).xpath(
            '//div[@class="post-info"]//span[@class="time "]/text()'
        ).extract()[0].strip()
        yield self.save_result(self.batch, url, title, content, cover, pdf,
                               keywords, hot, type, update, 'spider.news',
                               response).load_item()

def _strip_tags(self, content):
    """Remove <a>/</a> and <img> tags from an HTML fragment, keeping text."""
    content = re.sub(r'</?a[^>]*>', '', content)
    return re.sub(r'</?img[^>]*>', '', content)
def google_selector(self, response):
    """Parse a Google results page; store English, non-Facebook/YouTube
    snippets into the item pipeline and follow pagination while fewer than
    15 snippets have been collected (max page 5)."""
    if response.status != self.STATUS_OK:
        # Record blocked/failed fetches and bail out.
        with open("error.log", "a") as log_file:
            log_file.write(str(response.status) + " " +
                           str(self.browser) + " " +
                           datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")
        return
    base_url = "https://www.google.com/"
    snippets = response.xpath("//div[@class='g']").extract()
    itemproc = self.crawler.engine.scraper.itemproc
    id_person = response.meta['id_person']
    base_attr = response.meta['attr']
    search = response.meta['search']
    num_snippet = response.meta['num_snip']
    with open("system_google.log", "a") as log_file:
        log_file.write(str(response.status) + " " + str(self.browser) + " " +
                       str(search) + " " + str(num_snippet) + " " +
                       datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")
    for snippet in snippets:
        storage_item = UsmItem()
        title = Selector(text=snippet).xpath("//a/b/text() | //a/text()").extract()
        cite = Selector(text=snippet).xpath("//cite").extract()
        # cite = Selector(text=snippet).xpath("//h3/a/@href").extract()
        text = Selector(text=snippet).xpath("//span[@class='st']").extract()
        # Google splits the link title around <b> highlights; rejoin halves.
        if len(title) >= 2:
            title = title[0] + title[1]
        else:
            title = ""
        if len(cite) > 0:
            # cite = cite[0].split("url?q=")[-1]
            cite = cite[0]
            for marker in ['<cite>', '</cite>', '<b>', '</b>',
                           '<cite class="kv">', '</cite class="kv">']:
                cite = cite.replace(marker, '')
        else:
            cite = ""
        if len(text) > 0:
            text = text[0]
            for marker in ['<span class="st">', '</span>', '<br>', '</br>',
                           '<b>', '</b>', '<span class="f">',
                           '<span class="nobr">']:
                text = text.replace(marker, '')
        else:
            text = ""
        if cite != "":
            if "facebook" not in cite and "youtube" not in cite:
                text = Cleaner.clean_reserved_xml(Cleaner(), text)
                text = Cleaner.remove_accent(Cleaner(), text)
                title = Cleaner.clean_reserved_xml(Cleaner(), title)
                title = Cleaner.remove_accent(Cleaner(), title)
                # Only English snippets are stored/counted.
                if FeatureFilter.is_lang(text) == 'en':
                    num_snippet = num_snippet + 1
                    self.log("---------------------------------")
                    self.log("--------------TITLE--------------")
                    self.log(title)
                    self.log("-------------CITE----------------")
                    self.log(cite)
                    self.log("---------------TEXT--------------")
                    self.log(text)
                    self.log("------------ID PERSON------------")
                    self.log(id_person)
                    self.log("------------SEARCH---------------")
                    self.log(search)
                    self.log("--------------ATTR---------------")
                    self.log(base_attr)
                    self.log("-----------ENGINE SEARCH---------")
                    self.log(self.browser)
                    self.log("------------NUMBER SNIPPET-------")
                    self.log(num_snippet)
                    storage_item['title'] = title
                    storage_item['cite'] = cite
                    storage_item['text'] = text
                    storage_item['id_person'] = id_person
                    storage_item['search'] = search
                    storage_item['attr'] = base_attr
                    storage_item['engine_search'] = self.browser
                    storage_item['number_snippet'] = num_snippet
                    itemproc.process_item(storage_item, self)
    number = response.xpath("//td/b/text()").extract()
    self.log("-----------NUMBER OF PAGE-----")
    # BUG FIX: the original indexed number[0] unconditionally, raising
    # IndexError when the results page carried no pagination table.
    if number:
        self.log(number[0] + "")
        if int(number[0]) < 6 and num_snippet < 15:
            res = response.xpath("//td[@class='b'][@style='text-align:left']/a[@class='fl']/@href").extract()
            for url in res:
                self.log("--URL TO FOLLOW--")
                self.log(base_url + url)
                request = Request(base_url + url, callback=self.google_selector)
                request.meta['id_person'] = id_person
                request.meta['search'] = search
                request.meta['attr'] = base_attr
                request.meta['num_snip'] = num_snippet
                yield request
def duck_selector(self, response):
    """Parse a DuckDuckGo results page and push every snippet with a
    non-empty cite into the item pipeline."""
    base_url = "https://duckduckgo.com/"
    snippets = response\
        .xpath("//div[@class='result results_links results_links_deep web-result ']")\
        .extract()
    itemproc = self.crawler.engine.scraper.itemproc
    id_person = response.meta['id_person']
    base_attr = response.meta['attr']
    search = response.meta['search']
    num_snippet = response.meta['num_snip']
    for snippet in snippets:
        storage_item = UsmItem()
        num_snippet = num_snippet + 1
        title_parts = Selector(text=snippet).xpath("//div/h2/a/node()").extract()
        cite = Selector(text=snippet).xpath("//div/a/@href").extract()
        text_parts = Selector(text=snippet).xpath(
            "//div/a[@class='result__snippet']/node()").extract()
        # BUG FIX: the original title loop reused the name `text` (clobbering
        # the extracted snippet fragments) and the text loop then iterated
        # `title` — a plain string by that point — so the stored text was
        # rebuilt from the title character by character. Build each field
        # from its own fragment list.
        title = ""
        for part in title_parts:
            for marker in ["<b>", "</b>"]:
                part = part.replace(marker, '')
            title = title + part
        if len(cite) > 0:
            cite = cite[0]
        else:
            cite = ""
        text = ""
        for part in text_parts:
            for marker in ["<b>", "</b>"]:
                part = part.replace(marker, '')
            text = text + part
        if cite != "":
            self.log("---------------------------------")
            self.log("------------TITLE----------------")
            self.log(title)
            self.log("------------CITE-----------------")
            self.log(cite)
            self.log("------------TEXT-----------------")
            self.log(text)
            self.log("-----------ID PERSON-----------------")
            self.log(id_person)
            self.log("-----------SEARCH----------------")
            self.log(search)
            self.log("--------------ATTR---------------")
            self.log(base_attr)
            self.log("-----------ENGINE SEARCH---------")
            self.log(self.browser)
            self.log("------------NUMBER SNIPPET-------")
            self.log(num_snippet)
            storage_item['title'] = title
            storage_item['cite'] = cite
            storage_item['text'] = text
            storage_item['id_person'] = id_person
            storage_item['search'] = search
            storage_item['attr'] = base_attr
            storage_item['engine_search'] = self.browser
            storage_item['number_snippet'] = num_snippet
            itemproc.process_item(storage_item, self)
def bing_selector(self, response):
    """Scrape Bing result snippets into UsmItem records and follow pagination.

    Bails out (after appending to error.log) on a non-OK response. Only
    English snippets whose cite is not a Facebook/YouTube link are stored;
    pagination continues up to page 5 while fewer than 10 snippets have
    been collected.
    """
    if response.status != self.STATUS_OK:
        with open("error.log", "a") as log_file:
            log_file.write(
                str(response.status) + " " + str(self.browser) + " " +
                datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")
        return

    base_url = "https://www.bing.com/"
    snippets = response.xpath("//li[@class='b_algo']").extract()
    itemproc = self.crawler.engine.scraper.itemproc
    id_person = response.meta['id_person']
    base_attr = response.meta['attr']
    search = response.meta['search']
    num_snippet = response.meta['num_snip']

    with open("system_bing.log", "a") as log_file:
        log_file.write(
            str(response.status) + " " + str(self.browser) + " " +
            str(search) + " " + str(num_snippet) + " " +
            datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")

    for raw_snippet in snippets:
        storage_item = UsmItem()
        sel = Selector(text=raw_snippet)
        title_parts = sel.xpath("//h2/a/node()").extract()
        cite_list = sel.xpath("//h2/a/@href").extract()
        text_list = sel.xpath("//p").extract()

        # Rebuild the title from its fragments, dropping highlight markup.
        title = "".join(title_parts)
        for marker in ["<strong>", "</strong>"]:
            title = title.replace(marker, '')

        cite = cite_list[0] if len(cite_list) > 0 else ""

        if len(text_list) > 0:
            text = text_list[0]
            for marker in ["<p>", "</p>", "<strong>", "</strong>",
                           '<span class="news_dt">', '</span>']:
                text = text.replace(marker, '')
        else:
            text = ""

        # Guard clauses replace the original nested-if pyramid.
        if cite == "":
            continue
        if "facebook" in cite or "youtube" in cite:
            continue

        text = Cleaner.clean_reserved_xml(Cleaner(), text)
        text = Cleaner.remove_accent(Cleaner(), text)
        title = Cleaner.clean_reserved_xml(Cleaner(), title)
        title = Cleaner.remove_accent(Cleaner(), title)

        if FeatureFilter.is_lang(text) != 'en':
            continue

        num_snippet = num_snippet + 1
        self.log("------------TITLE----------------")
        self.log(title)
        self.log("------------CITE-----------------")
        self.log(cite)
        self.log("------------TEXT-----------------")
        self.log(text)
        self.log("----------ID PERSON------------------")
        self.log(id_person)
        self.log("-----------SEARCH----------------")
        self.log(search)
        self.log("--------------ATTR---------------")
        self.log(base_attr)
        self.log("-----------ENGINE SEARCH---------")
        self.log(self.browser)
        self.log("------------NUMBER SNIPPET-------")
        self.log(num_snippet)
        storage_item['title'] = title
        storage_item['cite'] = cite
        storage_item['text'] = text
        storage_item['id_person'] = id_person
        storage_item['search'] = search
        storage_item['attr'] = base_attr
        storage_item['engine_search'] = self.browser
        storage_item['number_snippet'] = num_snippet
        itemproc.process_item(storage_item, self)

    number = response.xpath("//li[@class='b_pag']/nav[@role='navigation']"
                            "//a[@class='sb_pagS']/text()").extract()
    self.log("-----------NUMBER OF PAGE-------")
    if len(number) > 0:
        self.log(number[0] + "")
        if int(number[0]) < 6 and num_snippet < 10:
            next_page = str(int(number[0]) + 1)
            res = response.xpath(
                "//li[@class='b_pag']/nav[@role='navigation']"
                "//a[@aria-label='Page " + next_page + "']/@href").extract()
            for url in res:
                self.log("--URL TO FOLLOW--")
                self.log(base_url + url)
                request = Request(base_url + url, callback=self.bing_selector)
                request.meta['id_person'] = id_person
                request.meta['attr'] = base_attr
                request.meta['search'] = search
                request.meta['num_snip'] = num_snippet
                yield request
def duck_selector(self, response):
    """Parse a DuckDuckGo results page; store English, non-Facebook/YouTube
    snippets (while fewer than 15 collected) into the item pipeline."""
    if response.status != self.STATUS_OK:
        # Record blocked/failed fetches and bail out.
        with open("error.log", "a") as log_file:
            log_file.write(
                str(response.status) + " " + str(self.browser) + " " +
                datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")
        return
    base_url = "https://duckduckgo.com/"
    snippets = response \
        .xpath("//div[@class='result results_links results_links_deep web-result ']") \
        .extract()
    itemproc = self.crawler.engine.scraper.itemproc
    id_person = response.meta['id_person']
    base_attr = response.meta['attr']
    search = response.meta['search']
    num_snippet = response.meta['num_snip']
    with open("system_duckduckgo.log", "a") as log_file:
        log_file.write(
            str(response.status) + " " + str(self.browser) + " " +
            str(search) + " " + str(num_snippet) + " " +
            datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")
    for snippet in snippets:
        storage_item = UsmItem()
        title_parts = Selector(text=snippet).xpath("//div/h2/a/node()").extract()
        cite = Selector(text=snippet).xpath("//div/a/@href").extract()
        text_parts = Selector(text=snippet).xpath(
            "//div/a[@class='result__snippet']/node()").extract()
        # BUG FIX: the original title loop reused the name `text` (clobbering
        # the extracted snippet fragments) and the text loop then iterated
        # `title` — a plain string by that point — so the stored text was
        # rebuilt from the title character by character. Build each field
        # from its own fragment list.
        title = ""
        for part in title_parts:
            for marker in ["<b>", "</b>"]:
                part = part.replace(marker, '')
            title = title + part
        if len(cite) > 0:
            cite = cite[0]
        else:
            cite = ""
        text = ""
        for part in text_parts:
            for marker in ["<b>", "</b>"]:
                part = part.replace(marker, '')
            text = text + part
        if cite != "" and num_snippet < 15:
            if "facebook" not in cite and "youtube" not in cite:
                text = Cleaner.clean_reserved_xml(Cleaner(), text)
                text = Cleaner.remove_accent(Cleaner(), text)
                title = Cleaner.clean_reserved_xml(Cleaner(), title)
                title = Cleaner.remove_accent(Cleaner(), title)
                # Only English snippets are stored/counted.
                if FeatureFilter.is_lang(text) == 'en':
                    num_snippet = num_snippet + 1
                    self.log("---------------------------------")
                    self.log("------------TITLE----------------")
                    self.log(title)
                    self.log("------------CITE-----------------")
                    self.log(cite)
                    self.log("------------TEXT-----------------")
                    self.log(text)
                    self.log("-----------ID PERSON-----------------")
                    self.log(id_person)
                    self.log("-----------SEARCH----------------")
                    self.log(search)
                    self.log("--------------ATTR---------------")
                    self.log(base_attr)
                    self.log("-----------ENGINE SEARCH---------")
                    self.log(self.browser)
                    self.log("------------NUMBER SNIPPET-------")
                    self.log(num_snippet)
                    storage_item['title'] = title
                    storage_item['cite'] = cite
                    storage_item['text'] = text
                    storage_item['id_person'] = id_person
                    storage_item['search'] = search
                    storage_item['attr'] = base_attr
                    storage_item['engine_search'] = self.browser
                    storage_item['number_snippet'] = num_snippet
                    itemproc.process_item(storage_item, self)
def bing_selector(self, response):
    """Parse one page of Bing results, pushing each snippet with a non-empty
    cite into the item pipeline, then follow pagination up to page 5."""
    base_url = "https://www.bing.com/"
    snippets = response.xpath("//li[@class='b_algo']").extract()
    itemproc = self.crawler.engine.scraper.itemproc
    id_person = response.meta['id_person']
    base_attr = response.meta['attr']
    search = response.meta['search']
    num_snippet = response.meta['num_snip']

    for raw_snippet in snippets:
        # Counted per result row, stored only when a cite is present.
        num_snippet = num_snippet + 1
        storage_item = UsmItem()
        sel = Selector(text=raw_snippet)
        title_parts = sel.xpath("//h2/a/node()").extract()
        cite_list = sel.xpath("//h2/a/@href").extract()
        text_list = sel.xpath("//p").extract()

        # Rebuild the title from its fragments, dropping highlight markup.
        title = "".join(title_parts)
        for marker in ["<strong>", "</strong>"]:
            title = title.replace(marker, '')

        cite = cite_list[0] if len(cite_list) > 0 else ""

        if len(text_list) > 0:
            text = text_list[0]
            for marker in ["<p>", "</p>", "<strong>", "</strong>"]:
                text = text.replace(marker, '')
        else:
            text = ""

        if cite != "":
            self.log("------------TITLE----------------")
            self.log(title)
            self.log("------------CITE-----------------")
            self.log(cite)
            self.log("------------TEXT-----------------")
            self.log(text)
            self.log("----------ID PERSON------------------")
            self.log(id_person)
            self.log("-----------SEARCH----------------")
            self.log(search)
            self.log("--------------ATTR---------------")
            self.log(base_attr)
            self.log("-----------ENGINE SEARCH---------")
            self.log(self.browser)
            self.log("------------NUMBER SNIPPET-------")
            self.log(num_snippet)
            storage_item['title'] = title
            storage_item['cite'] = cite
            storage_item['text'] = text
            storage_item['id_person'] = id_person
            storage_item['search'] = search
            storage_item['attr'] = base_attr
            storage_item['engine_search'] = self.browser
            storage_item['number_snippet'] = num_snippet
            itemproc.process_item(storage_item, self)

    number = response.xpath("//li[@class='b_pag']/nav[@role='navigation']"
                            "//a[@class='sb_pagS']/text()").extract()
    self.log("-----------NUMBER OF PAGE-------")
    if len(number) > 0:
        self.log(number[0] + "")
        if int(number[0]) < 5:
            next_page = str(int(number[0]) + 1)
            res = response.xpath(
                "//li[@class='b_pag']/nav[@role='navigation']"
                "//a[@aria-label='Page " + next_page + "']/@href").extract()
            for url in res:
                self.log("--URL TO FOLLOW--")
                self.log(base_url + url)
                request = Request(base_url + url, callback=self.bing_selector)
                request.meta['id_person'] = id_person
                request.meta['attr'] = base_attr
                request.meta['search'] = search
                request.meta['num_snip'] = num_snippet
                yield request