async def recrawl():
    """Gets the latest matches and inserts them into the database."""
    print("getting recent matches")
    api = crawler.Crawler()
    # TODO: insert API version (force update if changed)
    # TODO: create database indices
    # get or put when the last crawl was executed
    # crawl and upsert
    for region in ["na", "eu"]:
        try:
            last_match_update = (await db.select(
                """
                SELECT data->'attributes'->>'createdAt' AS created
                FROM match
                WHERE data->'attributes'->>'shardId'='""" + region + """'
                ORDER BY data->'attributes'->>'createdAt' DESC
                LIMIT 1
                """))[0]["created"]
        except IndexError:
            # no match stored for this region yet; start from a fixed date
            last_match_update = "2017-02-05T01:01:01Z"
        matches = await api.matches_since(last_match_update, region=region)
        if len(matches) > 0:
            print(region + " got new data items: " + str(len(matches)))
        else:
            print(region + " got no new matches.")
        await db.upsert(matches, True)
    asyncio.ensure_future(recrawl_soon())

def downloadFile(self):
    crawl = crawler.Crawler(self.fileNameUrls)
    lista = crawl.crawlFile()
    for video in lista:
        aux = crawler.Crawler.downloadItem(video)
        if aux is not None:
            lsedatasetBuild.buildPoseFile(aux)

def dispatcher_q():
    _crawler = None
    try:
        q = Queue()
        _crawler = crawler.Crawler(q,
                                   callback=emit_flight_info,
                                   driver_path=config["driver_path"],
                                   driver_type=config["driver_type"],
                                   page_wait_interval=int(config["page_wait_interval"]))
        _crawler.daemon = True
        _crawler.start()
        while True:
            if len(flight_qs) == 0:
                time.sleep(1)
                continue
            cnt = 0
            ids = [x for x in flight_qs.keys()]
            for i in ids:
                f = flight_qs[i]
                if f["in_progress"] is False \
                        and f["deleted"] is False \
                        and f["updated_at"] + int(config["refresh_interval"]) <= int(time.time() * 1000):
                    f["in_progress"] = True
                    q.put(f)
                    cnt += 1
            deleted = [f["id"] for f in flight_qs.values() if f["deleted"]]
            for d in deleted:
                flight_qs.pop(d, None)
            if cnt == 0:
                time.sleep(1)
    finally:
        # guard against the case where Crawler construction itself failed
        if _crawler is not None:
            _crawler.stop()

def test_exclude(self):
    crawler = C.Crawler(['http://example.com'],
                        exclude=r'.*pattern', loop=self.loop)
    self.addCleanup(crawler.close)
    self.assertTrue(crawler.url_allowed("http://example.com"))
    self.assertFalse(crawler.url_allowed("http://example.com/pattern"))

def test_lenient_host_checking(self):
    crawler = C.Crawler(['http://example.com'], strict=False, loop=self.loop)
    self.addCleanup(crawler.close)
    self.assertTrue(crawler.url_allowed("http://www.example.com"))
    self.assertTrue(crawler.url_allowed("http://foo.example.com"))

async def crawl_region(region):
    """Gets some matches from a region and inserts them until the DB is up to date."""
    api = crawler.Crawler()
    while True:
        try:
            last_match_update = (await db.select("""
                SELECT data->'attributes'->>'createdAt' AS created
                FROM match
                WHERE data->'attributes'->>'shardId'='""" + region + """'
                ORDER BY data->'attributes'->>'createdAt' DESC
                LIMIT 1
                """))[0]["created"]
        except IndexError:
            # no match stored for this region yet; start from a fixed date
            last_match_update = "2017-02-05T01:01:01Z"
        print(region + " fetching matches after " + last_match_update)
        # wait for http requests
        matches = await api.matches_since(last_match_update, region=region,
                                          params={"page[limit]": 50})
        if len(matches) > 0:
            print(region + " got new data items: " + str(len(matches)))
        else:
            print(region + " got no new matches.")
            return
        # insert asynchronously in the background
        await db.upsert(matches, True)

def GetFeedInfo(url):
    c = crawler.Crawler('')
    rss = c.download(url)
    ret = []
    if len(rss) < 20:
        return ret
    try:
        dom = xml.dom.minidom.parseString(str.strip(rss))
        items = dom.getElementsByTagName('item')
        title = ''
        link = ''
        pub_date = ''
        for item in items:
            title_node = item.getElementsByTagName('title')
            if len(title_node) > 0:
                title = title_node[0].firstChild.data
            link_node = item.getElementsByTagName('link')
            if len(link_node) > 0:
                link = link_node[0].firstChild.data
            date_node = item.getElementsByTagName('pubDate')
            if len(date_node) > 0:
                pub_date = date_node[0].firstChild.data
            pdate = GetDate(pub_date)
            itemxml = item.toxml()
            if pdate > 0:
                ret.append([title, link, pdate, itemxml])
        return ret
    except xml.parsers.expat.ExpatError:
        return ret

def search_leboncoin(rabbit_channel):
    leboncoin = crawler.Crawler()
    for offer in leboncoin.offers():
        logging.info('Found offer "%s" -- %s', offer['title'], offer['identifier'])
        rabbit_channel.basic_publish(
            exchange='',
            routing_key=OFFERS_QUEUE,
            body=json.dumps(offer),
            properties=pika.BasicProperties(
                delivery_mode=2,  # make message persistent
            ))

def batch_test():
    import crawler
    logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                        level=logging.DEBUG)
    cc = crawler.Crawler()
    cc.add_headers({
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'})
    hc, url, header, page = cc.get_page('http://my.tv.sohu.com/us/241816781/80597444.shtml')
    logging.info('%s %s %s' % (hc, url, header))

def get_listings():
    keywords = request.args.get('keywords')
    category = request.args.get('category')
    days = request.args.get('days')
    crawl = c.Crawler(category, keywords, days)
    listings = vars(crawl)['complete_list']
    listings = json.dumps(listings)
    return listings

def crawl(self, urls=None, *args, **kwargs):
    if self.crawler:
        self.crawler.close()
    if urls is None:
        urls = [self.app_url]
    self.crawler = C.Crawler(urls, *args, loop=self.loop, **kwargs)
    self.addCleanup(self.crawler.close)
    self.loop.run_until_complete(self.crawler.crawl())

def test_roots(self):
    crawler = C.Crawler(['http://a', 'http://b', 'not-a-host'], loop=self.loop)
    self.addCleanup(crawler.close)
    self.assertTrue(crawler.url_allowed("http://a/a"))
    self.assertTrue(crawler.url_allowed("http://b/b"))
    self.assertFalse(crawler.url_allowed("http://c/c"))
    self.assertFalse(crawler.url_allowed("http://127.0.0.1"))

def index(self, *args, **kwargs):
    data = {}
    if kwargs.get('search'):
        c = crawler.Crawler()
        r = c.fetch(kwargs['search'])
        data['recipes'] = r
        data['search'] = kwargs['search']
    mytemplate = Template(filename='search.html')
    return mytemplate.render(**data)

def __init__(self):
    self.taobao_crawler = crawler.Crawler()
    cur_path = os.path.split(os.path.realpath(__file__))[0]
    config_path = cur_path + os.path.sep + "config.json"
    json_file = open(config_path)
    self.taobao_config = json.load(json_file)['taobao']
    # Log in to Taobao via a Weibo account. The Weibo account must first be
    # linked on the Taobao website; remove your own credentials after use.
    weibo_username = ""  # default: username
    weibo_password = ""  # default: password
    self.taobao_crawler.login_taobao(weibo_username, weibo_password)

def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    domain = "https://www.epocacosmeticos.com.br"
    self.crawler = cw.Crawler(
        domain,
        req_limit=10,
        greedy=False,
        indentify_target=lambda page: page.valid_target)
    self.crawler.run(10)

def do_crawl(crawler_params):
    """Starts and runs a crawler"""
    user_id = g.user.user_id

    def crawler_callback(recipe, app):
        with app.app_context():
            add_recipe_to_db(full_content=recipe,
                             uploader_id=user_id,
                             src_url=crawler_params["base_url"])

    crawler_params["recipe_callback"] = crawler_callback
    crawler_params["recipe_callback_args"] = (current_app._get_current_object(),)
    crawler_params["recipe_callback_kwargs"] = {}
    crawler.Crawler(**crawler_params)

def test_crawler_recurses_into_discovered_links(getter):
    spider = crawler.Crawler("https://www.example.com", getter)
    spider.start(iterations=2)
    assert spider.visited_links == [
        "https://www.example.com",
        "https://www.touchsurgery.com/1",
        "https://www.touchsurgery.com/2",
    ]

def test_crawler_handles_discovering_relative_urls(getter, request):
    spider = crawler.Crawler("https://www.example.com", getter)
    spider.start(iterations=2)
    assert spider.visited_links == [
        "https://www.example.com",
        "https://www.example.com/1",
        "https://www.example.com/1/2",
    ]

def DoCrawler(message):
    print("DO CRAWLER MESSAGE : " + message)
    import crawler
    Jconf = json.loads(message)
    RunnerID = Jconf["RunnerID"]
    RunnerList = Jconf["RunnerList"]
    JobID = Jconf["JobID"]
    JobOwner = Jconf["JobOwner"]
    client.JobDict[JobID] = Jconf
    Cclass = crawler.Crawler(JobID, RunnerID, RunnerList, JobOwner)
    Cclass.Run()

def hello_world():
    url = request.form['url']
    depth = request.form['depth']
    logger.info("Received request for url: " + url + " depth: " + depth)
    c = crawler.Crawler(url, int(depth))
    try:
        site_map = c.crawl()
    except Exception:
        logger.error("Error while crawling", exc_info=True)
        return json.dumps({"message": "Server error"})
    return jsonify(dict(site_map))

def get_big_boy_from_session(intent, session):
    session_attributes = ses_att
    reprompt_text = ""
    speech_output = ""
    try:
        crawl = crawler.Crawler()
        results = crawl.getLaundryData(session_attributes['laundryData'], "")
        for result in results:
            speech_output = (speech_output + result[0] + " has " + result[1] +
                             " washers " + result[2] + " dryers available. ")
    except Exception as e:
        speech_output = str(e)
    should_end_session = False
    return build_response(session_attributes, build_speechlet_response(
        intent['name'], speech_output, reprompt_text, should_end_session))

def test_full_fld_crawl2():
    url = 'google.com'
    fld = utility.get_fld(url)
    setup_db = p.database.setup_database()
    spider = crawler.Crawler(fld, p)
    spider.extractor.robots.rules["Disallow"].append(r"\S+/Partier/\S+")
    spider.extractor.robots.rules["Disallow"].append(r"/\S+.html")
    spider.extractor.robots.rules["Disallow"].append(r"/javascript")
    spider.extractor.robots.rules["Disallow"].append(r"\S+.cbv")
    spider.extractor.robots.rules["Disallow"].append(r"\S+2014")
    spider.extractor.robots.rules["Disallow"].append(r"\S+beta")
    print('Starting crawling')
    spider.start_crawling()

def compile_active_list(file):
    active = {}
    rosie = crawler.Crawler()
    rosie.crawl_nodes_api()
    list_of_active_nodes = [x[0].split('/')[4] for x in rosie.node_url_tuples]
    active['list_of_active_nodes'] = list_of_active_nodes
    rosie.crawl_users_api()
    list_of_active_users = [x.split('/')[4] for x in rosie.user_urls]
    active['list_of_active_users'] = list_of_active_users
    rosie.crawl_registrations_api()
    list_of_active_registrations = [
        x[0].split('/')[3] for x in rosie.registration_url_tuples
    ]
    active['list_of_active_registrations'] = list_of_active_registrations
    json.dump(active, file, indent=4)

def test_full_fld_crawl():
    """
    This is a small website that my bot can crawl in its entirety quickly,
    so it can be used as a test.
    """
    url = 'vg.no'
    fld = utility.get_fld(url)
    setup_db = p.database.setup_database()
    spider = crawler.Crawler(fld, p)
    spider.extractor.robots.rules["Disallow"].append(r"\S+/Partier/\S+")
    spider.extractor.robots.rules["Disallow"].append(r"/\S+.html")
    spider.extractor.robots.rules["Disallow"].append(r"/javascript")
    spider.extractor.robots.rules["Disallow"].append(r"\S+.cbv")
    spider.extractor.robots.rules["Disallow"].append(r"\S+2014")
    spider.extractor.robots.rules["Disallow"].append(r"\S+beta")
    print('Starting crawling')
    spider.start_crawling()

def main():
    """Main program.

    Parse arguments, set up event loop, run crawler, print report.
    """
    args = ARGS.parse_args()
    if not args.roots:
        print('Use --help for command line help')
        return

    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    logging.basicConfig(level=levels[min(args.level, len(levels) - 1)])

    if args.iocp:
        from asyncio.windows_events import ProactorEventLoop
        loop = ProactorEventLoop()
        asyncio.set_event_loop(loop)
    elif args.select:
        loop = asyncio.SelectorEventLoop()
        asyncio.set_event_loop(loop)
    else:
        loop = asyncio.get_event_loop()

    roots = {fix_url(root) for root in args.roots}

    c = crawler.Crawler(
        roots,
        exclude=args.exclude,
        strict=args.strict,
        max_redirect=args.max_redirect,
        max_tries=args.max_tries,
        max_tasks=args.max_tasks,
    )
    try:
        loop.run_until_complete(c.crawl())  # Crawler gonna crawl.
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupted\n')
    finally:
        reporter.report(c)
        c.close()
        # next two lines are required for actual aiohttp resource cleanup
        loop.stop()
        loop.run_forever()
        loop.close()

def main(self):
    dict_arg = {}
    dict_arg['skipext'] = []
    dict_arg['num_workers'] = 1
    dict_arg['parserobots'] = False
    dict_arg['debug'] = False
    dict_arg['verbose'] = False
    dict_arg['exclude'] = []
    dict_arg['drop'] = []
    dict_arg['report'] = False
    dict_arg['images'] = False
    dict_arg['domain'] = self.domain
    dict_arg['output'] = self.output
    crawl = crawler.Crawler(**dict_arg)
    crawl.run()

def rack_url(model_url, shop_No, model_name):
    # Collect every machine URL for a specific model at the shop.
    # Returns a list of (machine_url, machine_no, machine_name) tuples.
    print("Starting machine-data collection..... {0}".format(model_name))
    scraping = crawler.Crawler()
    bs = scraping.scraping(model_url)
    root_url = sd.Shop_data(shop_No).root_url
    url_data = bs.table.find_all("a", class_="btn-base")
    all_url = []
    for i in url_data:
        No = i.string
        url = root_url + i.get("href")
        all_url.append((url, No, model_name))
    return all_url

def lineReceived(self, line):
    self.sendLine('Echo: ' + line)
    if line == 'test':
        self.massTest()
    if line == 'check':
        self.massCheck()
    if line == 'index':
        cr = crawler.Crawler()
        freq = cr.grabFromPage('http://habrahabr.ru', 1)
        dg.shareIndex2(self, freq)
    if 'search' in line:
        q = line[len('search'):]
        dg.activate2(self, q)
    # self.getValue('123')
    # self.getValue('key')
    # self.getValue('123')
    self.transport.write('>>> ')

def ch_update():
    update_file = 'log_file/update.txt'
    crawer = crawler.Crawler('http://www.wandoujia.com/category/app', update_file)
    crawer.crow()
    past = open('log_file/urls.txt', 'r')
    now = open('log_file/update.txt', 'r')
    past_list = past.readlines()
    now_list = now.readlines()
    if len(now_list) > len(past_list):
        past.close()
        now.close()
        print('update available')
        return True
    else:
        past.close()
        now.close()
        print('no update')
        return False

def get_welcome_response():
    """ If we wanted to initialize the session to have some attributes
    we could add those here
    """
    web_crawler = crawler.Crawler()
    balances = web_crawler.checkCash()  # fetch once instead of three separate calls
    expBal, flexBal, swipesBal = balances[0], balances[1], balances[2]
    session_attributes = {"expBal": expBal,        # Exp
                          "flexBal": flexBal,      # Flex
                          "swipesBal": swipesBal}  # Swipes
    card_title = "Welcome"
    speech_output = "Welcome to Aflexa, your personal William and Mary services assistant. " \
                    "Would you like to check your flex or express balance, meal swipes, or laundry machines?"
    # If the user either does not reply to the welcome message or says something
    # that is not understood, they will be prompted again with this text.
    reprompt_text = "Would you like to check your flex or express balance, meal swipes, or laundry machines?"
    should_end_session = False
    return build_response(session_attributes, build_speechlet_response(
        card_title, speech_output, reprompt_text, should_end_session))