def google_selector(self, response): base_url = "https://www.google.com.mx/" snippets = response.xpath("//div[@class='g']").extract() itemproc = self.crawler.engine.scraper.itemproc id_person = response.meta['id_person'] base_attr = response.meta['attr'] search = response.meta['search'] num_snippet = response.meta['num_snip'] for snippet in snippets: num_snippet = num_snippet + 1 storage_item = UsmItem() title = Selector(text=snippet).xpath("//a/b/text() | //a/text()").extract() cite = Selector(text=snippet).xpath("//cite").extract() # cite = Selector(text=snippet).xpath("//h3/a/@href").extract() text = Selector(text=snippet).xpath("//span[@class='st']").extract() if title.__len__() >= 2: title = title[0]+title[1] else: title="" if cite.__len__() > 0: # cite = cite[0].split("url?q=")[-1] cite = cite[0] for r in ['<cite>', '</cite>', '<b>', '</b>']: cite = cite.replace(r, '') else: cite="" if text.__len__() > 0: text = text[0] for r in ['<span class="st">', '</span>', '<br>', '</br>', '<b>', '</b>']: text = text.replace(r, '') else: text = "" if cite != "": self.log("---------------------------------") self.log("--------------TITLE--------------") self.log(title) self.log("-------------CITE----------------") self.log(cite) self.log("---------------TEXT--------------") self.log(text) self.log("------------ID PERSON------------") self.log(id_person) self.log("------------SEARCH---------------") self.log(search) self.log("--------------ATTR---------------") self.log(base_attr) self.log("-----------ENGINE SEARCH---------") self.log(self.browser) self.log("------------NUMBER SNIPPET-------") self.log(num_snippet) storage_item['title'] = title storage_item['cite'] = cite storage_item['text'] = text storage_item['id_person'] = id_person storage_item['search'] = search storage_item['attr'] = base_attr storage_item['engine_search'] = self.browser storage_item['number_snippet'] = num_snippet itemproc.process_item(storage_item, self) number = response.xpath("//td/b/text()").extract() self.log("-----------NUMBER OF PAGE-----") self.log(number[0] + "") if int(number[0]) < 6: res = response.xpath("//td[@class='b'][@style='text-align:left']/a[@class='fl']/@href").extract() for url in res: self.log("--URL TO FOLLOW--") self.log(base_url + url) request = Request(base_url + url, callback=self.google_selector) request.meta['id_person'] = id_person request.meta['search'] = search request.meta['attr'] = base_attr request.meta['num_snip'] = num_snippet yield request
def cite_selector(self, response):
    # Utils.create_page(Utils(), response.body, "-citeseerx")
    base_url = "http://citeseerx.ist.psu.edu/"
    snippets = response.xpath("//div[@class='result']").extract()
    itemproc = self.crawler.engine.scraper.itemproc
    id_person = response.meta['id_person']
    base_attr = response.meta['attr']
    search = response.meta['search']
    num_snippet = response.meta['num_snip']

    for snippet in snippets:
        storage_item = UsmItem()
        num_snippet += 1

        title = Selector(text=snippet).xpath("//h3/a/node()").extract()
        # tmpTitle = Selector(text=snippet).xpath("//div[@class='pubinfo']")
        cite = Selector(text=snippet).xpath("//h3/a/@href").extract()
        text = Selector(text=snippet).xpath(
            "//div[@class='snippet']/text()").extract()

        # Join the title fragments, dropping <em> markup and newlines.
        if len(title) > 0:
            tmp = ""
            for txt in title:
                for r in ['<em>', '</em>', '\n']:
                    txt = txt.replace(r, '')
                tmp = tmp + txt
            title = tmp.strip()
        else:
            title = ""

        if len(cite) > 0:
            cite = base_url + cite[0]
        else:
            cite = ""

        if len(text) > 0:
            text = text[0]
        else:
            text = ""

        if cite != "":
            self.log("---------------------------------")
            self.log("------------TITLE----------------")
            self.log(title)
            self.log("------------CITE-----------------")
            self.log(cite)
            self.log("------------TEXT-----------------")
            self.log(text)
            self.log("------------ID PERSON------------")
            self.log(id_person)
            self.log("------------SEARCH---------------")
            self.log(search)
            self.log("--------------ATTR---------------")
            self.log(base_attr)
            self.log("-----------ENGINE SEARCH---------")
            self.log(self.browser)
            self.log("------------NUMBER SNIPPET-------")
            self.log(num_snippet)

            storage_item['title'] = title
            storage_item['cite'] = cite
            storage_item['text'] = text
            storage_item['id_person'] = id_person
            storage_item['search'] = search
            storage_item['attr'] = base_attr
            storage_item['engine_search'] = self.browser
            storage_item['number_snippet'] = num_snippet
            itemproc.process_item(storage_item, self)

    num = response.xpath(
        "//div[@id='result_info']/strong/text()").extract()
    self.log("----------NUM OF ELEMENTS---------")
    if num:  # guard: the result-count marker is absent on empty result pages
        self.log(num[0].split(' ')[2])
        num = num[0].split(' ')[2]
        if int(num) < 60:
            url = response.xpath("//div[@id='result_info']"
                                 "/div[@id='pager']/a/@href").extract()
            self.log("------------URL TO FOLLOW ------------")
            if len(url) > 0:
                self.log(base_url + url[0])
                request = Request(base_url + url[0],
                                  callback=self.cite_selector)
                request.meta['id_person'] = id_person
                request.meta['search'] = search
                request.meta['attr'] = base_attr
                request.meta['num_snip'] = num_snippet
                yield request

def bing_selector(self, response):
    if response.status != self.STATUS_OK:
        with open("error.log", "a") as log_file:
            log_file.write(
                str(response.status) + " " + str(self.browser) + " " +
                datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")
        return

    base_url = "https://www.bing.com/"
    snippets = response.xpath("//li[@class='b_algo']").extract()
    itemproc = self.crawler.engine.scraper.itemproc
    id_person = response.meta['id_person']
    base_attr = response.meta['attr']
    search = response.meta['search']
    num_snippet = response.meta['num_snip']

    with open("system_bing.log", "a") as log_file:
        log_file.write(
            str(response.status) + " " + str(self.browser) + " " +
            str(search) + " " + str(num_snippet) + " " +
            datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")

    for snippet in snippets:
        storage_item = UsmItem()

        title = Selector(text=snippet).xpath("//h2/a/node()").extract()
        cite = Selector(text=snippet).xpath("//h2/a/@href").extract()
        text = Selector(text=snippet).xpath("//p").extract()

        # Join the title fragments, dropping the <strong> highlight markup.
        tmp_title = ""
        for cad in title:
            tmp_title = tmp_title + cad
        for r in ["<strong>", "</strong>"]:
            tmp_title = tmp_title.replace(r, '')
        title = tmp_title

        if len(cite) > 0:
            cite = cite[0]
        else:
            cite = ""

        if len(text) > 0:
            text = text[0]
            for r in ["<p>", "</p>", "<strong>", "</strong>",
                      '<span class="news_dt">', '</span>']:
                text = text.replace(r, '')
        else:
            text = ""

        if cite != "":
            # Skip social-network hits and keep only English snippets.
            if "facebook" not in cite and "youtube" not in cite:
                text = Cleaner().clean_reserved_xml(text)
                text = Cleaner().remove_accent(text)
                title = Cleaner().clean_reserved_xml(title)
                title = Cleaner().remove_accent(title)
                if FeatureFilter.is_lang(text) == 'en':
                    num_snippet += 1
                    self.log("------------TITLE----------------")
                    self.log(title)
                    self.log("------------CITE-----------------")
                    self.log(cite)
                    self.log("------------TEXT-----------------")
                    self.log(text)
                    self.log("----------ID PERSON--------------")
                    self.log(id_person)
                    self.log("-----------SEARCH----------------")
                    self.log(search)
                    self.log("--------------ATTR---------------")
                    self.log(base_attr)
                    self.log("-----------ENGINE SEARCH---------")
                    self.log(self.browser)
                    self.log("------------NUMBER SNIPPET-------")
                    self.log(num_snippet)

                    storage_item['title'] = title
                    storage_item['cite'] = cite
                    storage_item['text'] = text
                    storage_item['id_person'] = id_person
                    storage_item['search'] = search
                    storage_item['attr'] = base_attr
                    storage_item['engine_search'] = self.browser
                    storage_item['number_snippet'] = num_snippet
                    itemproc.process_item(storage_item, self)

    # Follow the next page while under the page and snippet caps.
    number = response.xpath("//li[@class='b_pag']/nav[@role='navigation']"
                            "//a[@class='sb_pagS']/text()").extract()
    self.log("-----------NUMBER OF PAGE-------")
    if len(number) > 0:
        self.log(number[0])
        if int(number[0]) < 6 and num_snippet < 10:
            num = str(int(number[0]) + 1)
            res = response.xpath(
                "//li[@class='b_pag']/nav[@role='navigation']"
                "//a[@aria-label='Page " + num + "']/@href").extract()
            for url in res:
                self.log("--URL TO FOLLOW--")
                self.log(base_url + url)
                request = Request(base_url + url, callback=self.bing_selector)
                request.meta['id_person'] = id_person
                request.meta['attr'] = base_attr
                request.meta['search'] = search
                request.meta['num_snip'] = num_snippet
                yield request

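# The inline status checks above duplicate the same error.log write in every
# selector. Scrapy can route download failures to a Request errback instead.
# A hedged sketch reusing the error.log convention from the code above;
# `on_request_error` is an assumed name, and non-2xx responses only reach the
# errback when the HttpError middleware is left at its defaults.
def on_request_error(self, failure):
    from datetime import datetime
    with open("error.log", "a") as log_file:
        log_file.write(repr(failure.value) + " " + str(self.browser) + " " +
                       datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")

# Wiring sketch:
# Request(url, callback=self.bing_selector, errback=self.on_request_error)
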
def google_selector(self, response):
    if response.status != self.STATUS_OK:
        with open("error.log", "a") as log_file:
            log_file.write(str(response.status) + " " + str(self.browser) + " " +
                           datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")
        return

    base_url = "https://www.google.com/"
    snippets = response.xpath("//div[@class='g']").extract()
    itemproc = self.crawler.engine.scraper.itemproc
    id_person = response.meta['id_person']
    base_attr = response.meta['attr']
    search = response.meta['search']
    num_snippet = response.meta['num_snip']

    with open("system_google.log", "a") as log_file:
        log_file.write(str(response.status) + " " + str(self.browser) + " " +
                       str(search) + " " + str(num_snippet) + " " +
                       datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")

    for snippet in snippets:
        storage_item = UsmItem()

        title = Selector(text=snippet).xpath("//a/b/text() | //a/text()").extract()
        cite = Selector(text=snippet).xpath("//cite").extract()
        # cite = Selector(text=snippet).xpath("//h3/a/@href").extract()
        text = Selector(text=snippet).xpath("//span[@class='st']").extract()

        # Google splits the title across two adjacent text nodes; join them.
        if len(title) >= 2:
            title = title[0] + title[1]
        else:
            title = ""

        if len(cite) > 0:
            # cite = cite[0].split("url?q=")[-1]
            cite = cite[0]
            for r in ['<cite>', '</cite>', '<b>', '</b>',
                      '<cite class="kv">', '</cite class="kv">']:
                cite = cite.replace(r, '')
        else:
            cite = ""

        if len(text) > 0:
            text = text[0]
            for r in ['<span class="st">', '</span>', '<br>', '</br>',
                      '<b>', '</b>', '<span class="f">', '<span class="nobr">']:
                text = text.replace(r, '')
        else:
            text = ""

        if cite != "":
            # Skip social-network hits and keep only English snippets.
            if "facebook" not in cite and "youtube" not in cite:
                text = Cleaner().clean_reserved_xml(text)
                text = Cleaner().remove_accent(text)
                title = Cleaner().clean_reserved_xml(title)
                title = Cleaner().remove_accent(title)
                if FeatureFilter.is_lang(text) == 'en':
                    num_snippet += 1
                    self.log("---------------------------------")
                    self.log("--------------TITLE--------------")
                    self.log(title)
                    self.log("-------------CITE----------------")
                    self.log(cite)
                    self.log("---------------TEXT--------------")
                    self.log(text)
                    self.log("------------ID PERSON------------")
                    self.log(id_person)
                    self.log("------------SEARCH---------------")
                    self.log(search)
                    self.log("--------------ATTR---------------")
                    self.log(base_attr)
                    self.log("-----------ENGINE SEARCH---------")
                    self.log(self.browser)
                    self.log("------------NUMBER SNIPPET-------")
                    self.log(num_snippet)

                    storage_item['title'] = title
                    storage_item['cite'] = cite
                    storage_item['text'] = text
                    storage_item['id_person'] = id_person
                    storage_item['search'] = search
                    storage_item['attr'] = base_attr
                    storage_item['engine_search'] = self.browser
                    storage_item['number_snippet'] = num_snippet
                    itemproc.process_item(storage_item, self)

    # Follow the next results pages while under the page and snippet caps.
    number = response.xpath("//td/b/text()").extract()
    self.log("-----------NUMBER OF PAGE-----")
    if number:  # some result layouts ship no page marker at all
        self.log(number[0])
        if int(number[0]) < 6 and num_snippet < 15:
            res = response.xpath("//td[@class='b'][@style='text-align:left']"
                                 "/a[@class='fl']/@href").extract()
            for url in res:
                self.log("--URL TO FOLLOW--")
                self.log(base_url + url)
                request = Request(base_url + url, callback=self.google_selector)
                request.meta['id_person'] = id_person
                request.meta['search'] = search
                request.meta['attr'] = base_attr
                request.meta['num_snip'] = num_snippet
                yield request

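# For reference, a minimal sketch of how the meta keys these selectors consume
# (id_person, attr, search, num_snip) might be seeded when the crawl starts.
# The query URL and the field values are illustrative assumptions, not the
# project's actual bootstrap code.
def start_requests(self):
    from scrapy.http import Request  # same Request the selectors above use
    search = "john smith"  # assumed example query
    url = "https://www.google.com/search?q=" + search.replace(" ", "+")
    request = Request(url, callback=self.google_selector)
    request.meta['id_person'] = 1   # assumed identifier
    request.meta['attr'] = ""       # assumed attribute filter
    request.meta['search'] = search
    request.meta['num_snip'] = 0    # snippet counter starts at zero
    yield request
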
def cite_selector(self, response):
    # Utils.create_page(Utils(), response.body, "-citeseerx")
    if response.status != self.STATUS_OK:
        with open("error.log", "a") as log_file:
            log_file.write(
                str(response.status) + " " + str(self.browser) + " " +
                datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")
        # with open("count_citeseerx.txt", 'r') as file:
        #     num = file.readline()
        # with open("count_citeseerx_error" + str(num) + ".txt", 'w') as file2:
        #     file2.write(num)
        return

    base_url = "http://citeseerx.ist.psu.edu/"
    snippets = response.xpath("//div[@class='result']").extract()
    itemproc = self.crawler.engine.scraper.itemproc
    id_person = response.meta['id_person']
    base_attr = response.meta['attr']
    search = response.meta['search']
    num_snippet = response.meta['num_snip']

    with open("system_citeseer.log", "a") as log_file:
        log_file.write(
            str(response.status) + " " + str(self.browser) + " " +
            str(search) + " " + str(num_snippet) + " " +
            datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")

    # Persist a page counter across responses: "0" on the first page,
    # then incremented once per page.
    if os.path.isfile('count_citeseerx.txt'):
        with open('count_citeseerx.txt', 'r') as file:
            num = file.readline()
        with open('count_citeseerx.txt', 'w') as file:
            file.write(str(int(num) + 1))
    else:
        with open('count_citeseerx.txt', 'w') as file:
            file.write("0")

    for snippet in snippets:
        storage_item = UsmItem()

        title = Selector(text=snippet).xpath("//h3/a/node()").extract()
        # tmpTitle = Selector(text=snippet).xpath("//div[@class='pubinfo']")
        cite = Selector(text=snippet).xpath("//h3/a/@href").extract()
        text = Selector(text=snippet).xpath(
            "//div[@class='snippet']/text()").extract()

        # Join the title fragments, dropping <em> markup and newlines.
        if len(title) > 0:
            tmp = ""
            for txt in title:
                for r in ['<em>', '</em>', '\n']:
                    txt = txt.replace(r, '')
                tmp = tmp + txt
            title = tmp.strip()
        else:
            title = ""

        if len(cite) > 0:
            cite = base_url + cite[0]
        else:
            cite = ""

        if len(text) > 0:
            text = text[0]
        else:
            text = ""

        if cite != "":
            # Skip social-network hits and keep only English snippets.
            if "facebook" not in cite and "youtube" not in cite:
                text = Cleaner().clean_reserved_xml(text)
                text = Cleaner().remove_accent(text)
                title = Cleaner().clean_reserved_xml(title)
                title = Cleaner().remove_accent(title)
                if FeatureFilter.is_lang(text) == 'en':
                    num_snippet += 1
                    self.log("---------------------------------")
                    self.log("------------TITLE----------------")
                    self.log(title)
                    self.log("------------CITE-----------------")
                    self.log(cite)
                    self.log("------------TEXT-----------------")
                    self.log(text)
                    self.log("------------ID PERSON------------")
                    self.log(id_person)
                    self.log("------------SEARCH---------------")
                    self.log(search)
                    self.log("--------------ATTR---------------")
                    self.log(base_attr)
                    self.log("-----------ENGINE SEARCH---------")
                    self.log(self.browser)
                    self.log("------------NUMBER SNIPPET-------")
                    self.log(num_snippet)

                    storage_item['title'] = title
                    storage_item['cite'] = cite
                    storage_item['text'] = text
                    storage_item['id_person'] = id_person
                    storage_item['search'] = search
                    storage_item['attr'] = base_attr
                    storage_item['engine_search'] = self.browser
                    storage_item['number_snippet'] = num_snippet
                    itemproc.process_item(storage_item, self)

    num = response.xpath(
        "//div[@id='result_info']/strong/text()").extract()
    if num == [] or num == ['No results found']:
        return
    try:
        self.log("----------NUM OF ELEMENTS---------")
        self.log(num[0].split(' ')[2])
        num = num[0].split(' ')[2]
        # ToDo Add constant names
        if int(num) < 60 and num_snippet < 15:
            url = response.xpath("//div[@id='result_info']"
                                 "/div[@id='pager']/a/@href").extract()
            self.log("------------URL TO FOLLOW ------------")
            if len(url) > 0:
                self.log(base_url + url[0])
                request = Request(base_url + url[0],
                                  callback=self.cite_selector)
                request.meta['id_person'] = id_person
                request.meta['search'] = search
                request.meta['attr'] = base_attr
                request.meta['num_snip'] = num_snippet
                yield request
    except Exception:
        # Keep the raw page for post-mortem when the result count is unparsable.
        with open("error_num_citeseer.html", "w") as log_file:
            log_file.write(str(response.body))

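# The read-increment-write sequence on count_citeseerx.txt above can live in
# one helper. A sketch under the same file convention (first run writes "0",
# later runs increment); `bump_counter` is an assumed name.
def bump_counter(path):
    import os
    count = 0
    if os.path.isfile(path):
        with open(path, 'r') as counter_file:
            count = int(counter_file.readline()) + 1
    with open(path, 'w') as counter_file:
        counter_file.write(str(count))
    return count

# Usage sketch: pages_crawled = bump_counter('count_citeseerx.txt')
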
def duck_selector(self, response):
    if response.status != self.STATUS_OK:
        with open("error.log", "a") as log_file:
            log_file.write(
                str(response.status) + " " + str(self.browser) + " " +
                datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")
        return

    base_url = "https://duckduckgo.com/"
    snippets = response \
        .xpath("//div[@class='result results_links results_links_deep web-result ']") \
        .extract()
    itemproc = self.crawler.engine.scraper.itemproc
    id_person = response.meta['id_person']
    base_attr = response.meta['attr']
    search = response.meta['search']
    num_snippet = response.meta['num_snip']

    with open("system_duckduckgo.log", "a") as log_file:
        log_file.write(
            str(response.status) + " " + str(self.browser) + " " +
            str(search) + " " + str(num_snippet) + " " +
            datetime.today().strftime("%y-%m-%d-%H-%M") + "\n")

    for snippet in snippets:
        storage_item = UsmItem()

        title = Selector(text=snippet).xpath("//div/h2/a/node()").extract()
        cite = Selector(text=snippet).xpath("//div/a/@href").extract()
        text = Selector(text=snippet).xpath(
            "//div/a[@class='result__snippet']/node()").extract()

        # Join the title fragments, dropping the <b> highlight markup.
        if len(title) > 0:
            tmp = ""
            for frag in title:
                for r in ["<b>", "</b>"]:
                    frag = frag.replace(r, '')
                tmp = tmp + frag
            title = tmp
        else:
            title = ""

        if len(cite) > 0:
            cite = cite[0]
        else:
            cite = ""

        # Join the snippet fragments the same way.
        if len(text) > 0:
            tmp = ""
            for frag in text:
                for r in ["<b>", "</b>"]:
                    frag = frag.replace(r, '')
                tmp = tmp + frag
            text = tmp
        else:
            text = ""

        if cite != "" and num_snippet < 15:
            # Skip social-network hits and keep only English snippets.
            if "facebook" not in cite and "youtube" not in cite:
                text = Cleaner().clean_reserved_xml(text)
                text = Cleaner().remove_accent(text)
                title = Cleaner().clean_reserved_xml(title)
                title = Cleaner().remove_accent(title)
                if FeatureFilter.is_lang(text) == 'en':
                    num_snippet += 1
                    self.log("---------------------------------")
                    self.log("------------TITLE----------------")
                    self.log(title)
                    self.log("------------CITE-----------------")
                    self.log(cite)
                    self.log("------------TEXT-----------------")
                    self.log(text)
                    self.log("-----------ID PERSON-------------")
                    self.log(id_person)
                    self.log("-----------SEARCH----------------")
                    self.log(search)
                    self.log("--------------ATTR---------------")
                    self.log(base_attr)
                    self.log("-----------ENGINE SEARCH---------")
                    self.log(self.browser)
                    self.log("------------NUMBER SNIPPET-------")
                    self.log(num_snippet)

                    storage_item['title'] = title
                    storage_item['cite'] = cite
                    storage_item['text'] = text
                    storage_item['id_person'] = id_person
                    storage_item['search'] = search
                    storage_item['attr'] = base_attr
                    storage_item['engine_search'] = self.browser
                    storage_item['number_snippet'] = num_snippet
                    itemproc.process_item(storage_item, self)

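# The "ToDo Add constant names" note in cite_selector applies to every
# selector here: the page and snippet caps are repeated magic numbers.
# Illustrative names only, not taken from the source:
MAX_RESULT_PAGES = 6        # stop paginating past page 5
MAX_SNIPPETS = 15           # stop collecting once 15 snippets are stored
MAX_CITESEERX_RESULTS = 60  # CiteSeerX result-count cutoff
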
def parse_myrg(self, response):
    # Labels and fillers that surround the real snippet text in each result.
    black_list = ['1', ' ', '[...]', 'Recommendation', 'Download',
                  'Recommend', 'Follow', 'Share', 'Request full-text']
    black_list2 = ['Referenced in the project:', 'Research from: ', ' ']
    itemproc = self.crawler.engine.scraper.itemproc
    id_person = response.meta['id_person']
    base_attr = response.meta['attr']
    search = response.meta['search']
    num_snippet = response.meta['num_snip']

    for search_box in response.xpath(
            '//div[contains(@class, "search-box__result-item")]'):
        storage_item = UsmItem()
        num_snippet += 1

        # Keep only the fragments that are not boilerplate labels.
        text = ' '.join(
            list(set(search_box.css('span::text').extract()) - set(black_list)) +
            list(set(search_box.css('div::text').extract()) - set(black_list2)))
        url = response.url

        title_list = search_box.css('a.nova-e-link--theme-bare ::text').extract()
        try:
            # The first fragment is sometimes the literal label 'Source'.
            if title_list[0] == 'Source':
                title = title_list[1]
            else:
                title = title_list[0]
        except IndexError:
            title = "NO TITLE FOUND"

        self.log("---------------------------------")
        self.log("--------------TITLE--------------")
        self.log(title)
        self.log("-------------CITE----------------")
        self.log(url)
        self.log("---------------TEXT--------------")
        self.log(text)
        self.log("------------ID PERSON------------")
        self.log(id_person)
        self.log("------------SEARCH---------------")
        self.log(search)
        self.log("--------------ATTR---------------")
        self.log(base_attr)
        self.log("-----------ENGINE SEARCH---------")
        self.log(self.browser)
        self.log("------------NUMBER SNIPPET-------")
        self.log(num_snippet)

        storage_item['title'] = title
        storage_item['cite'] = url
        storage_item['text'] = text
        storage_item['engine_search'] = self.browser
        storage_item['id_person'] = id_person
        storage_item['search'] = search
        storage_item['attr'] = base_attr
        storage_item['number_snippet'] = num_snippet
        itemproc.process_item(storage_item, self)

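# One caveat with the set differences above: set() drops duplicate fragments
# and loses the order they appeared in on the page. If order matters, a list
# comprehension keeps it. A sketch; `keep_fragments` is an assumed name.
def keep_fragments(fragments, banned):
    """Filter out blacklisted fragments while preserving page order."""
    banned = set(banned)
    return [frag for frag in fragments if frag not in banned]

# Usage sketch:
# text = ' '.join(keep_fragments(search_box.css('span::text').extract(), black_list) +
#                 keep_fragments(search_box.css('div::text').extract(), black_list2))
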
def duck_selector(self, response):
    base_url = "https://duckduckgo.com/"
    snippets = response \
        .xpath("//div[@class='result results_links results_links_deep web-result ']") \
        .extract()
    itemproc = self.crawler.engine.scraper.itemproc
    id_person = response.meta['id_person']
    base_attr = response.meta['attr']
    search = response.meta['search']
    num_snippet = response.meta['num_snip']

    for snippet in snippets:
        storage_item = UsmItem()
        num_snippet += 1

        title = Selector(text=snippet).xpath("//div/h2/a/node()").extract()
        cite = Selector(text=snippet).xpath("//div/a/@href").extract()
        text = Selector(text=snippet).xpath(
            "//div/a[@class='result__snippet']/node()").extract()

        # Join the title fragments, dropping the <b> highlight markup.
        if len(title) > 0:
            tmp = ""
            for frag in title:
                for r in ["<b>", "</b>"]:
                    frag = frag.replace(r, '')
                tmp = tmp + frag
            title = tmp
        else:
            title = ""

        if len(cite) > 0:
            cite = cite[0]
        else:
            cite = ""

        # Join the snippet fragments the same way.
        if len(text) > 0:
            tmp = ""
            for frag in text:
                for r in ["<b>", "</b>"]:
                    frag = frag.replace(r, '')
                tmp = tmp + frag
            text = tmp
        else:
            text = ""

        if cite != "":
            self.log("---------------------------------")
            self.log("------------TITLE----------------")
            self.log(title)
            self.log("------------CITE-----------------")
            self.log(cite)
            self.log("------------TEXT-----------------")
            self.log(text)
            self.log("-----------ID PERSON-------------")
            self.log(id_person)
            self.log("-----------SEARCH----------------")
            self.log(search)
            self.log("--------------ATTR---------------")
            self.log(base_attr)
            self.log("-----------ENGINE SEARCH---------")
            self.log(self.browser)
            self.log("------------NUMBER SNIPPET-------")
            self.log(num_snippet)

            storage_item['title'] = title
            storage_item['cite'] = cite
            storage_item['text'] = text
            storage_item['id_person'] = id_person
            storage_item['search'] = search
            storage_item['attr'] = base_attr
            storage_item['engine_search'] = self.browser
            storage_item['number_snippet'] = num_snippet
            itemproc.process_item(storage_item, self)

def bing_selector(self, response):
    base_url = "https://www.bing.com/"
    snippets = response.xpath("//li[@class='b_algo']").extract()
    itemproc = self.crawler.engine.scraper.itemproc
    id_person = response.meta['id_person']
    base_attr = response.meta['attr']
    search = response.meta['search']
    num_snippet = response.meta['num_snip']

    for snippet in snippets:
        num_snippet += 1
        storage_item = UsmItem()

        title = Selector(text=snippet).xpath("//h2/a/node()").extract()
        cite = Selector(text=snippet).xpath("//h2/a/@href").extract()
        text = Selector(text=snippet).xpath("//p").extract()

        # Join the title fragments, dropping the <strong> highlight markup.
        tmp_title = ""
        for cad in title:
            tmp_title = tmp_title + cad
        for r in ["<strong>", "</strong>"]:
            tmp_title = tmp_title.replace(r, '')
        title = tmp_title

        if len(cite) > 0:
            cite = cite[0]
        else:
            cite = ""

        if len(text) > 0:
            text = text[0]
            for r in ["<p>", "</p>", "<strong>", "</strong>",
                      '<span class="news_dt">', '</span>']:
                text = text.replace(r, '')
        else:
            text = ""

        if cite != "":
            self.log("------------TITLE----------------")
            self.log(title)
            self.log("------------CITE-----------------")
            self.log(cite)
            self.log("------------TEXT-----------------")
            self.log(text)
            self.log("----------ID PERSON--------------")
            self.log(id_person)
            self.log("-----------SEARCH----------------")
            self.log(search)
            self.log("--------------ATTR---------------")
            self.log(base_attr)
            self.log("-----------ENGINE SEARCH---------")
            self.log(self.browser)
            self.log("------------NUMBER SNIPPET-------")
            self.log(num_snippet)

            storage_item['title'] = title
            storage_item['cite'] = cite
            storage_item['text'] = text
            storage_item['id_person'] = id_person
            storage_item['search'] = search
            storage_item['attr'] = base_attr
            storage_item['engine_search'] = self.browser
            storage_item['number_snippet'] = num_snippet
            itemproc.process_item(storage_item, self)

    # Follow the next page, up to page 4.
    number = response.xpath("//li[@class='b_pag']/nav[@role='navigation']"
                            "//a[@class='sb_pagS']/text()").extract()
    self.log("-----------NUMBER OF PAGE-------")
    if len(number) > 0:
        self.log(number[0])
        if int(number[0]) < 5:
            num = str(int(number[0]) + 1)
            res = response.xpath(
                "//li[@class='b_pag']/nav[@role='navigation']"
                "//a[@aria-label='Page " + num + "']/@href").extract()
            for url in res:
                self.log("--URL TO FOLLOW--")
                self.log(base_url + url)
                request = Request(base_url + url, callback=self.bing_selector)
                request.meta['id_person'] = id_person
                request.meta['attr'] = base_attr
                request.meta['search'] = search
                request.meta['num_snip'] = num_snippet
                yield request

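# Every selector repeats the same banner-logging block before storing an item.
# A sketch of a shared helper that could replace those blocks; `log_snippet`
# is an assumed name, not a method that exists in the source.
def log_snippet(self, title, cite, text, id_person, search, base_attr, num_snippet):
    fields = [("TITLE", title), ("CITE", cite), ("TEXT", text),
              ("ID PERSON", id_person), ("SEARCH", search), ("ATTR", base_attr),
              ("ENGINE SEARCH", self.browser), ("NUMBER SNIPPET", num_snippet)]
    for label, value in fields:
        self.log("------------" + label + "------------")
        self.log(value)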