def _retrieve_phrase_data_subprocess(self, queue: Queue) -> None:
    """Retrieve a list of phrases from an HTML document.

    Puts the result on ``queue``: a list of strings containing the
    searched phrase, the raised exception on failure, or ``None`` if
    the parameters are invalid.
    """
    # Set up a crawler runner to drive the spider
    runner = CrawlerRunner(self._crawler_meta)
    # Middleware between downloader and spider
    dispatcher.connect(self.crawler_results, signal=signals.item_passed)
    dispatcher.connect(reactor.stop, signal=signals.spider_closed)
    # Apply requests from the spider - add arguments to initialize it
    if self.check_parameters():
        try:
            deferred = runner.crawl(self._spider_bot, self._parameter_dict)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            queue.put(self._crawler_results)
        except Exception as e:
            queue.put(e)
    else:
        queue.put(None)
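# Hedged usage sketch for the subprocess pattern above: Twisted's reactor
# cannot be restarted within one interpreter, so each crawl runs in a fresh
# multiprocessing.Process and reports back through the queue. The driver
# function name is an assumption; the class and method mirror the snippet.
from multiprocessing import Process, Queue

def retrieve_phrase_data(scraper):
    queue = Queue()
    proc = Process(target=scraper._retrieve_phrase_data_subprocess, args=(queue,))
    proc.start()
    result = queue.get()  # list of phrases, an Exception, or None
    proc.join()
    if isinstance(result, Exception):
        raise result
    return result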
def spider_results(spidername, keywords, pagenum, sorttype):
    spider_classes = {
        'bing': BingSpider,
        'weixin': SogouWxSpider,
        'weibo': WeiboSpider,
        'baidu': BaiduSpider,
        'baidunews': BaidunewsSpider,
        'ss_360': Ss360Spider,
        'ss_360_zx': Ss360ZZSpider,
        'chinaso': ChinaSoSpider,
        'chinaso_news': ChinaSoNewsSpider,
    }
    spider_class = spider_classes.get(spidername)
    if spider_class is None:
        return []

    results = []

    def crawler_results(signal, sender, item, response, spider):
        results.append(dict(item))

    dispatcher.connect(crawler_results, signal=signals.item_passed)

    process = CrawlerProcess(get_project_settings())
    process.crawl(spider_class, keywords=keywords, pagenum=pagenum,
                  sorttype=sorttype)
    process.start()  # the script will block here until the crawling is finished
    # Drop characters that cannot be represented in GBK before returning
    return json.dumps(results, ensure_ascii=False).encode('gbk', 'ignore').decode('gbk')
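# CrawlerProcess starts and stops the Twisted reactor, which cannot be
# restarted, so spider_results() can only run once per interpreter. A
# hedged sketch of one common workaround - run each call in a worker
# process (the wrapper name is an assumption):
from multiprocessing import Pool

def spider_results_in_subprocess(spidername, keywords, pagenum, sorttype):
    with Pool(processes=1) as pool:
        # apply() blocks until the child finishes and returns the JSON string
        return pool.apply(spider_results, (spidername, keywords, pagenum, sorttype))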
def test_parser_true(start_time, end_time, resolution, start_url, correct_res):
    res = []

    def crawler_results(signal, sender, item, response, spider):
        """Helper for collecting results each time a page is scraped."""
        for x in item['urls']:
            res.append(x)

    dispatcher.connect(crawler_results, signal=signals.item_passed)

    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    process.crawl(WallpapersSpider, start_time=start_time, end_time=end_time,
                  resolution=resolution, start_url=start_url)
    process.start()
    assert sorted(correct_res) == sorted(res)
def scrape_with_crochet(self, domain):
    """Run the spider matching ``domain`` on the Twisted reactor thread.

    The signal fires when a single item is processed and calls
    _crawler_result to save that item.

    Consider synchronous, do-one-thing-after-the-other application code
    that wants to use event-driven, Twisted-based code. There are at
    least two threads: the application thread(s) and the reactor thread,
    and multiple layers of code involved in this interaction:

    - Twisted code: should only be called in the reactor thread. This may
      be code from the Twisted package itself, or more likely code you
      have written on top of Twisted.
    - @wait_for/@run_in_reactor wrappers: the body of the wrapped function
      runs in the reactor thread, but the caller should be in the
      application thread.
    - Application code: runs in the application thread(s) and expects
      synchronous/blocking calls.

    dispatcher.connect registers _crawler_result with the dispatcher so
    it is invoked for each scraped item. crawl_runner.crawl selects the
    spider matching the domain name from our scrapy file and, after each
    yield, passes the item to _crawler_result. The settings module is
    applied to the crawl runner.

    :param domain: the domain to crawl
    :return: a twisted.internet.defer.Deferred
    """
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    crawler_settings = Settings()
    crawler_settings.setmodule(sets)
    self.crawl_runner.settings = crawler_settings
    dispatcher.connect(self._crawler_result, signal=signals.item_scraped)
    for i in self.dict_of_spiders:
        if i in domain:
            eventual = self.crawl_runner.crawl(self.dict_of_spiders[i],
                                               category=domain)
            return eventual
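# A minimal sketch of the crochet wrappers the docstring describes,
# assuming crochet is installed and set up at import time. @wait_for runs
# the wrapped body on the reactor thread and blocks the calling
# (application) thread until the returned Deferred fires; the timeout
# value and function name are assumptions.
from crochet import setup, wait_for
setup()  # install the Twisted reactor in a background thread

@wait_for(timeout=600.0)
def scrape_blocking(scraper, domain):
    # Runs in the reactor thread; @wait_for waits on the returned
    # Deferred from the application thread.
    return scraper.scrape_with_crochet(domain)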
def start_crawler(start_url, max_parsed_pages, num_processes, db):
    queue = multiprocessing.Queue()
    pool = [
        multiprocessing.Process(target=queue_worker, args=(queue, db))
        for _ in range(num_processes)
    ]
    for process in pool:
        process.start()

    def crawler_results(signal, sender, item, response, spider):
        """Helper for collecting results each time a page is scraped."""
        queue.put(item)

    dispatcher.connect(crawler_results, signal=signals.item_passed)

    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    process.crawl(AdvertisementScrapper, start_url=start_url,
                  max_parsed_pages=max_parsed_pages)
    process.start()
def handle(self, *args, **options):
    if not options.get("period"):
        target_date = date.today() + relativedelta(months=-2)
        target_date = target_date.strftime("%m/%Y")
    else:
        target_date = options.get("period")

    dispatcher.connect(self.save, signal=signals.item_passed)

    os.environ["SCRAPY_SETTINGS_MODULE"] = "scraper.settings"
    settings = get_project_settings()
    settings["COOKIES_ENABLED"] = True
    if options.get("scrapy_args"):
        scrapy_args = json.loads(options.get("scrapy_args"))
        settings.update(scrapy_args)

    process = CrawlerProcess(settings=settings)
    args = {
        "unidade": options.get("unit"),
        "competencia": target_date,
        "cidade": "feira de santana",
        "periodicidade": options.get("period_type"),
    }
    self.warn(str(args))
    process.crawl(ConsultaPublicaSpider, **args)
    self.warn("Starting collection of TCM-BA documents...")
    process.start()
    self.success("Done!")
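# A hedged sketch of the save() receiver connected above: Scrapy's
# item_passed signal calls receivers with the scraped item plus signal
# metadata, and pydispatcher only passes the arguments a receiver
# declares. The model import and field mapping below are assumptions for
# illustration, not the project's actual code.
from django.core.management.base import BaseCommand

class Command(BaseCommand):
    def save(self, signal, sender, item, response, spider):
        from datasets.models import TCMBADocument  # hypothetical model
        # item behaves like a dict; persist it with the Django ORM
        TCMBADocument.objects.update_or_create(**dict(item))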
def spider_results():
    results = []
    settings = Settings()
    os.environ['SCRAPY_SETTINGS_MODULE'] = 'LyricsFinder.LyricsFinder.settings'
    settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
    settings.setmodule(settings_module_path, priority='project')
    process = CrawlerProcess(settings)

    def crawler_results(signal, sender, item, response, spider):
        results.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_passed)

    # Build the search query: "<song name> [by <singer>] lyrics -site:youtube.com"
    query = ''
    name_split = args['song_name'].split()
    for name in name_split[:-1]:
        query += name + '+'
    if args['singer']:
        query += name_split[-1] + '+by' + '+'
        singer_split = args['singer'].split()
        for name in singer_split[:-1]:
            query += name + '+'
        query += singer_split[-1] + '+lyrics' + '+-site:youtube.com'
    else:
        query += name_split[-1] + '+lyrics' + '+-site:youtube.com'

    process.crawl(LyricsFinderSpider,
                  start_urls=["https://www.google.com/search?q=" + query])
    process.start()
    return results
def handle(self, *args, **options):
    if options.get("drop_all"):
        self.warn("Dropping existing records...")
        CityCouncilAgenda.objects.all().delete()
        CityCouncilAttendanceList.objects.all().delete()
        if os.getenv("FEATURE_FLAG__SAVE_GAZETTE", False):
            Gazette.objects.all().delete()
            GazetteEvent.objects.all().delete()

    dispatcher.connect(self.save, signal=signals.item_passed)

    os.environ["SCRAPY_SETTINGS_MODULE"] = "scraper.settings"
    process = CrawlerProcess(settings=get_project_settings())
    process.crawl(
        AgendaSpider,
        start_from_date=CityCouncilAgenda.last_collected_item_date(),
    )
    process.crawl(AttendanceListSpider)

    if os.getenv("FEATURE_FLAG__SAVE_GAZETTE", False):
        last_collected_gazette = Gazette.last_collected_item_date()
        if last_collected_gazette is None:
            process.crawl(LegacyGazetteSpider)
        process.crawl(
            ExecutiveAndLegislativeGazetteSpider,
            start_from_date=last_collected_gazette,
        )

    process.start()
    self.success("Done!")
def scrape_with_crochet(baseURL):
    # Register _crawler_result with the dispatcher so it is called for
    # every item the spider scrapes.
    dispatcher.connect(_crawler_result, signal=signals.item_scraped)
    # Start ReviewspiderSpider from our scrapy file; after each yield the
    # scraped item is passed to _crawler_result.
    eventual = crawl_runner.crawl(ReviewspiderSpider, category=baseURL)
    return eventual
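# The snippet above relies on module-level state it does not show. A
# minimal sketch of that setup, assuming the usual crochet + CrawlerRunner
# pattern (the output_data list and the decorator placement are
# assumptions):
from crochet import setup, run_in_reactor
from scrapy.crawler import CrawlerRunner
setup()  # start the Twisted reactor in a background thread

crawl_runner = CrawlerRunner()
output_data = []

def _crawler_result(item, response, spider):
    # Receives each scraped item via the item_scraped signal; unknown
    # signal kwargs (signal, sender) are dropped by the dispatcher.
    output_data.append(dict(item))

# scrape_with_crochet would typically be decorated with @run_in_reactor
# so the crawl is scheduled on the reactor thread.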
def run_crawler2(q):
    print('run_crawler')

    def close():
        q.put('close')
        print('CLOSE')

    def scraped():
        q.put('scraped')
        print('SCRAPED')

    try:
        crawler_settings = get_project_settings()
        runner = CrawlerRunner(crawler_settings)
        dispatcher.connect(close, signal=signals.spider_closed)
        dispatcher.connect(scraped, signal=signals.item_scraped)
        deferred = runner.crawl(InfoempleoSpider)
        deferred.addBoth(lambda _: reactor.stop())
        print('reactor...')
        q.put('reactor...')
        reactor.run()
        print('run!!!!!')
        q.put('run')
    except Exception as e:
        print(e)
        q.put(e)
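# Hedged sketch of how run_crawler2 is presumably driven: because the
# Twisted reactor cannot be restarted, each crawl gets its own process and
# the queue carries the progress markers put above. The driver name and
# loop count are assumptions.
import multiprocessing

def crawl_repeatedly(times=3):
    for _ in range(times):
        q = multiprocessing.Queue()
        p = multiprocessing.Process(target=run_crawler2, args=(q,))
        p.start()
        p.join()
        while not q.empty():
            print('status:', q.get())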
def scrape_paytm_with_crochet(retailer_id, search_string, category_name):
    dispatcher.connect(_crawler_result, signal=signals.item_scraped)
    eventual = crawl_runner.crawl(PaytmscraperSpider, retailer_id=retailer_id,
                                  search_string=search_string,
                                  category_name=category_name)
    return eventual
def scrape_with_crochet(post_form, post_head):
    dispatcher.connect(_crawler_result, signal=signals.item_scraped)
    eventual = crawler_runner.crawl(OjkCFS_Spider, req_head=post_head,
                                    req_form=post_form)
    # dispatcher.connect(_crawler_stop, signals.engine_stopped)
    return eventual
def scrape_croma_with_crochet(retailer_id, search_string, category_name):
    dispatcher.connect(_crawler_result, signal=signals.item_scraped)
    print(f"Croma retailer ID {retailer_id}")
    eventual = crawl_runner.crawl(CromascraperSpider, retailer_id=retailer_id,
                                  search_string=search_string,
                                  category_name=category_name)
    return eventual
def run(self):
    """Start the client and scrape jobs.

    Collects the scraping results (a URL list of images) and starts
    worker processes in a pool to download and store non-duplicate
    images.

    :return: self
    """
    if self.hashes is None:
        logging.error('prepare() function was not called before')
        return None

    queue = multiprocessing.Queue()
    pool = [
        multiprocessing.Process(target=self._queue_worker, args=(queue, ))
        for _ in range(self.num_processes)
    ]
    for process in pool:
        process.start()

    def crawler_results(signal, sender, item, response, spider):
        """Helper for collecting results each time a page is scraped."""
        for x in item['urls']:
            queue.put(x)

    dispatcher.connect(crawler_results, signal=signals.item_passed)

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(WallpapersSpider, start_time=self.start_time,
                  end_time=self.end_time, resolution=self.resolution,
                  start_url=self.BASE_URL)
    logging.getLogger('scrapy').setLevel(logging.ERROR)
    process.start()

    # Tell every worker to stop, then wait for them to drain the queue
    for _ in range(self.num_processes):
        queue.put('STOP')
    for process in pool:
        process.join()
    return self
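# A hedged sketch of the _queue_worker method referenced above (a method
# of the same class as run()): it consumes URLs until the 'STOP' sentinel,
# downloads each image, and uses self.hashes to skip duplicates.
# Everything here - the requests call, the hashing scheme, the save
# path - is an assumption for illustration, not the project's actual code.
import hashlib
import requests

def _queue_worker(self, queue):
    while True:
        url = queue.get()
        if url == 'STOP':  # sentinel pushed once per worker in run()
            break
        data = requests.get(url, timeout=30).content
        digest = hashlib.md5(data).hexdigest()
        if digest in self.hashes:
            continue  # duplicate image, skip it
        self.hashes.add(digest)
        with open(self.out_dir / url.rsplit('/', 1)[-1], 'wb') as fh:
            fh.write(data)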
def __init__(self):
    self.logger.info('Lagou webdriver start')
    super(LagouSpider, self).__init__()
    chrome_opt = webdriver.ChromeOptions()
    # Disable image loading to speed up page rendering
    pref = {"profile.managed_default_content_settings.images": 2}
    chrome_opt.add_experimental_option("prefs", pref)
    self.browser = webdriver.Chrome(executable_path=CHROME_PATH,
                                    chrome_options=chrome_opt)
    dispatcher.connect(self.spider_close, signals.spider_closed)
def f(return_list):
    def collect_items(signal, sender, item, response, spider):
        return_list.append(item)

    dispatcher.connect(collect_items, signal=signals.item_passed)
    runner = crawler.CrawlerRunner()
    deferred = runner.crawl(PlantInfoSpider, url=returned_url)
    deferred.addBoth(lambda _: reactor.stop())
    reactor.run()
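# Hedged sketch of how f() is presumably invoked: a Manager().list() proxy
# crosses the process boundary, so items appended inside the child process
# are visible to the parent after join(). The driver function name is an
# assumption.
import multiprocessing

def collect_plant_info():
    manager = multiprocessing.Manager()
    items = manager.list()
    p = multiprocessing.Process(target=f, args=(items,))
    p.start()
    p.join()
    return list(items)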
def scrape_amazon_with_crochet(retailer_id, search_string, category_name):
    # Register _crawler_result with the dispatcher so it is called for
    # every item the spider scrapes.
    dispatcher.connect(_crawler_result, signal=signals.item_scraped)
    print(f"Amazon retailer ID {retailer_id}")
    # Start AmazonscraperSpider from our scrapy file; after each yield the
    # scraped item is passed to _crawler_result.
    eventual = crawl_runner.crawl(AmazonscraperSpider, retailer_id=retailer_id,
                                  search_string=search_string,
                                  category_name=category_name)
    return eventual
def scrape_with_crochet():
    """Deferred function that launches the crawler to fetch the articles.

    The crawl runs as an asynchronous process.
    """
    # The signal fires when a single item is processed
    # and calls _crawler_result to append that item.
    dispatcher.connect(_crawler_result, signal=signals.item_scraped)
    eventual = crawl_runner.crawl(ArticlesSpider)
    return eventual  # returns a twisted.internet.defer.Deferred
def handle(self, *args, **options):
    if options.get("drop_all"):
        self.warn("Dropping existing records...")
        Kid.objects.all().delete()

    dispatcher.connect(self.save, signal=signals.item_passed)
    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(ParanaSpider)
    process.start()
    self.success("Done!")
def spider_results(site, project='renault', out_file='out.json'):
    """Wrapper for launching Scrapy.

    Parameters:
        site : str
            Name of the site we are scraping from.
        project : str
            Name of the project we are working on. The default is 'renault'.
        out_file : str
            Name of the file where we want to save the result.
            The default is 'out.json'.

    Returns:
        List of items (dictionaries) processed by the scraper.
    """
    if project == 'renault':
        from broad_crawl_spider import MySpider
    elif project == 'iterative':
        from iterative_spider import MySpider
    else:
        print('No spider for project:', project)
        return

    results = []

    def crawler_results(signal, sender, item, response, spider):
        results.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_passed)

    # Scrapy default settings are overridden by the rules below
    settings = get_project_settings()
    settings['ROBOTSTXT_OBEY'] = True
    settings['LOG_LEVEL'] = 'CRITICAL'
    settings['FEED_FORMAT'] = 'json'
    settings['FEED_URI'] = 'file:../output/%s/%s' % (site, out_file)
    # 250000 for production use; 1000 to 5000 for testing
    settings['CLOSESPIDER_ITEMCOUNT'] = 2000
    settings['HTTPERROR_ALLOWED_CODES'] = [301]
    # If you get HTTP error 403, change USER_AGENT.
    # To activate Selenium use the setting below:
    # DOWNLOADER_MIDDLEWARES = {
    #     'mobility.mobility.scraper.code.selenium_mid.SeleniumMiddleware': 500
    # }

    process = CrawlerProcess(settings)
    process.crawl(MySpider)
    process.start()  # the script will block here until the crawling is finished
    return results
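# Hypothetical invocation of the wrapper above; the site name is made up.
items = spider_results('example_site', project='iterative',
                       out_file='iterative.json')
print(len(items), 'items scraped')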
def run_proc(name, q):
    results = []

    def crawler_results(signal, sender, item, response, spider):
        results.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_scraped)

    process = CrawlerProcess(get_project_settings())
    process.crawl(ArticleSpider, start_urls=[name])
    process.start()
    q.put(results[0] if results else None)
def scrape_with_crochet(self, domain):
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    crawler_settings = Settings()
    crawler_settings.setmodule(sets)
    self.crawl_runner.settings = crawler_settings
    dispatcher.connect(self._crawler_result, signal=signals.item_scraped)
    for i in self.dict_of_spiders:
        if i in domain:
            eventual = self.crawl_runner.crawl(self.dict_of_spiders[i],
                                               category=domain)
            return eventual
def spider_results():
    results = []

    def crawler_results(signal, sender, item, response, spider):
        results.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_passed)
    process = CrawlerProcess(get_project_settings())
    process.crawl(CLSpider)
    process.start()
    return results
def spider_results():
    results = []

    def crawler_results(signal, sender, item, response, spider):
        results.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_passed)
    process = CrawlerProcess(get_project_settings())
    process.crawl(GIINSpider)
    process.start()  # the script will block here until the crawling is finished
    return results
def __init__(self, spider):
    def increment_count():
        print('incrementing count')
        type(self).count += 1

    # Receivers must not require positional arguments the dispatcher does
    # not supply; zero-argument callables work because unknown signal
    # kwargs are dropped before the call.
    dispatcher.connect(lambda: print('finished'), signal=signals.spider_closed)
    dispatcher.connect(increment_count, signal=signals.item_passed)
    settings = get_project_settings()
    self.process = CrawlerProcess(settings)
    self.spider = spider
def scrape(urls):
    mapping_ShareX = ["pixl.is", "putme.ga", "putmega.com"]
    mapping_Chibisafe = [
        "cyberdrop.me", "cyberdrop.cc", "cyberdrop.to", "bunkr.is", "bunkr.to"
    ]
    mapping_GoFile = ["gofile.io"]
    # Regex patterns for stripping subdomain prefixes from the netloc
    replacements = [('fs-...', ''), ('img-...', ''), (r'i\.', ''),
                    ('stream.', ''), ('www.', '')]
    ShareX_urls = []
    Chibisafe_urls = []
    GoFile_urls = []
    unsupported_urls = []
    cookies = []
    result_links = OrderedDict()

    # Sort each URL into the bucket for the hosting software it uses
    for url in urls:
        base_domain = urlparse(url).netloc
        for old, new in replacements:
            base_domain = re.sub(old, new, base_domain)
        if base_domain in mapping_ShareX:
            ShareX_urls.append(url)
        elif base_domain in mapping_Chibisafe:
            Chibisafe_urls.append(url)
        elif base_domain in mapping_GoFile:
            GoFile_urls.append(url)
        else:
            unsupported_urls.append(url)

    def crawler_results(signal, sender, item, response, spider):
        domain = sanitize_key(item['netloc'])
        title = re.sub(r'[\\/*?:"<>|.]', "-", item['title'])
        referal = item['referal']
        url = item['url']
        cookies.extend(x for x in item['cookies'] if x not in cookies)
        result_links.setdefault(domain, OrderedDict()).setdefault(
            title, []).append([url, referal])

    dispatcher.connect(crawler_results, signal=signals.item_scraped)

    settings = get_project_settings()
    settings.set('LOG_LEVEL', logging.CRITICAL)
    process = CrawlerProcess(settings)
    if ShareX_urls:
        process.crawl(ShareX_Spider, myurls=ShareX_urls)
    if Chibisafe_urls:
        process.crawl(ChibisafeSpider, myurls=Chibisafe_urls)
    if GoFile_urls:
        process.crawl(GoFileSpider, myurls=GoFile_urls)
    process.start()
    return cookies, result_links
def f(q):
    try:
        crawler_settings = get_project_settings()
        runner = CrawlerRunner(crawler_settings)
        # Receivers take no required positional arguments; the dispatcher
        # drops signal kwargs a receiver cannot accept.
        dispatcher.connect(lambda: print('finish'),
                           signal=signals.spider_closed)
        dispatcher.connect(lambda: print('item scraped'),
                           signal=signals.item_scraped)
        deferred = runner.crawl(InfoempleoSpider)
        deferred.addBoth(lambda _: reactor.stop())
        print('reactor...')
        reactor.run()
        print('run!!!!!')
        q.put(None)
    except Exception as e:
        q.put(e)
def get_data():
    results = []

    def crawler_results(signal, sender, item, response, spider):
        results.append(item)

    dispatcher.connect(crawler_results, signal=signals.item_passed)
    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    process.crawl(ElectronicsSpider)
    process.start()
    return results
def spider_handler(latitude, longitude, max_number, q):
    link = get_link_for_tripadvisor(latitude, longitude)
    output = []
    _exporter = PythonItemExporter(binary=False)

    def get_crawler_output(signal, sender, item, response, spider):
        output.append(_exporter.export_item(item))

    dispatcher.connect(get_crawler_output, signal=signals.item_scraped)

    process = CrawlerProcess({
        "USER_AGENT": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"
    })
    process.crawl(RestaurantSpider, start_url=link, max_restaurants=max_number)
    process.start()
    q.put(output)
def index(self):
    try:
        self.debounce()
    except DebounceError as e:
        log.error("Debounced indexing task: %s", e)
        return

    docs_to_process = Queue()
    Spider = type(
        'Spider', (DocumentationSpiderBase,),
        {"url": self.url, "validators": self.site.validators,
         "allow": self.site.allow, "deny": self.site.deny})

    def enqueue_document(signal, sender, item: SearchDocument, response, spider):
        """Queue a SearchDocument for indexing."""
        docs_to_process.put(item)

    def index_documents():
        while True:
            doc: SearchDocument = docs_to_process.get()
            try:
                self.index_document(doc)
            except Exception as e:
                log.error("Unexpected error while indexing doc %s, error: %s",
                          doc.doc_id, e)
            docs_to_process.task_done()

    def start_indexing():
        if docs_to_process.empty():
            return
        self.search_client.redis.set(
            keys.last_index(self.site.url),
            datetime.datetime.now().timestamp())
        docs_to_process.join()

    for _ in range(MAX_THREADS):
        Thread(target=index_documents, daemon=True).start()

    dispatcher.connect(enqueue_document, signal=signals.item_scraped)
    dispatcher.connect(start_indexing, signal=signals.engine_stopped)

    process = CrawlerProcess(settings={
        'CONCURRENT_ITEMS': 200,
        'CONCURRENT_REQUESTS': 100,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 100,
        'HTTPCACHE_ENABLED': True,  # Scrapy's HTTP cache setting name
        'REACTOR_THREADPOOL_MAXSIZE': 30,
        'LOG_LEVEL': 'ERROR'
    })
    process.crawl(Spider)
    process.start()