def run(self):
    start_mark = False
    crawler = Crawler()
    while self.flag:
        try:
            url = 'http://hw.venndata.cn/proxy?num={num}'.format(num=self.num)
            response, _ = crawler.crawl(url=url)
            html = response.text
            if html:
                data = json.loads(html)['data']
                proxy_list = data.split('|')
                if len(proxy_list) > 500:
                    old_len = len(PROXY_LIST)
                    PROXY_LIST.extend(proxy_list)
                    PROXY_LIST[0:old_len] = []
                if not start_mark:
                    log.critical("Proxy fetcher started successfully! Got %s proxies" % len(proxy_list))
                    start_mark = True
        except Exception as e:
            log.error('Fetching proxies failed: ' + str(e))
        time.sleep(self.interval)
    log.info('Proxy fetcher stopped')
    return
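# Hedged, self-contained sketch of the start/stop pattern the run() above
# assumes: a daemon thread loops while `flag` is set and sleeps `interval`
# seconds per pass. The class and attribute names below are illustrative,
# not from the original source.
import threading
import time


class FetcherSketch(threading.Thread):
    def __init__(self, interval=1.0):
        super().__init__(daemon=True)
        self.flag = True          # cleared to ask the loop to stop
        self.interval = interval  # seconds between passes

    def run(self):
        while self.flag:
            # a real implementation would fetch and store proxies here
            time.sleep(self.interval)


if __name__ == '__main__':
    fetcher = FetcherSketch()
    fetcher.start()
    time.sleep(3)
    fetcher.flag = False   # the loop exits after its current sleep
    fetcher.join()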
def fill_empty_restaurant(self, item):
    # If there is a dict, add its restaurants
    if item:
        print("Attempting to add empty restaurants to DB")
        p = ThreadPool(20)
        try:
            # for key, value in self.empty_restaurant.iteritems():
            for key, value in item.iteritems():
                p.apply_async(self.upload_restaurant_item, [value])
        except AttributeError as ae:
            # remove key if the restaurant already exists
            print(ae.args[0])
            item.pop(ae.args[0], None)
            print(item)
        except Exception as e:
            # Sometimes this will throw, but it will execute anyway
            print(e)
        p.close()
        p.join()
        # self.upload_restaurant_list(self.empty_restaurant)
        print("Attempting to add food through YelpApi")
        # p.apply_async(self.upload_food_list(Crawler(self.uninserted_restaurant).query(10)))
        # Pass the callable and its argument separately so the upload runs in
        # the worker thread instead of being executed while the Thread is built.
        t = threading.Thread(
            name="Crawler",
            target=self.upload_food_list,
            args=(Crawler(self.uninserted_restaurant).query(10),))
        t.start()
        t.join()
async def valid_url():
    url = 'http://yoyowallet.com'
    urls = [
        'http://yoyowallet.com',
        'http://yoyowallet.com/',
        'http://yoyowallet.com/about.html',
        'http://yoyowallet.com/assets.html',
        'http://yoyowallet.com/banks/index.html',
        'http://yoyowallet.com/basket-data.html',
        'http://yoyowallet.com/careers.html',
        'http://yoyowallet.com/case-studies/caffe-nero-case-study.html',
        'http://yoyowallet.com/caterers/index.html',
        'http://yoyowallet.com/cookies.html',
        'http://yoyowallet.com/epos.html',
        'http://yoyowallet.com/get-in-touch.html',
        'http://yoyowallet.com/retailers/index.html']
    all_urls = {
        'http://yoyowallet.com/about.html',
        'http://yoyowallet.com/case-studies/caffe-nero-case-study.html',
        'http://yoyowallet.com/careers.html',
        'http://yoyowallet.com/banks/index.html',
        'http://yoyowallet.com',
        'http://yoyowallet.com/assets.html',
        'http://yoyowallet.com/caterers/index.html',
        'http://yoyowallet.com/cookies.html',
        'http://yoyowallet.com/',
        'http://yoyowallet.com/basket-data.html',
        'http://yoyowallet.com/epos.html',
        'http://yoyowallet.com/retailers/index.html',
        'http://yoyowallet.com/get-in-touch.html'}
    crawler = Crawler(url)
    _url, data, _urls, _all_urls = await crawler.extract(url)
    self.assertEqual(_url, url)
    self.assertListEqual(_urls, urls)
    self.assertSetEqual(_all_urls, all_urls)
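# Hedged sketch (not from the original tests): nested coroutines such as
# valid_url() above are usually defined inside a synchronous unittest method
# and driven with an event loop, roughly as below. The test-class and method
# names here are illustrative only.
import asyncio
import unittest


class CrawlerExtractTestSketch(unittest.TestCase):
    def test_valid_url(self):
        async def valid_url():
            # ... the assertions shown in the coroutine above ...
            self.assertTrue(True)

        loop = asyncio.new_event_loop()
        try:
            loop.run_until_complete(valid_url())
        finally:
            loop.close()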
def run(self):
    print("Start")
    self.int()
    list_csv = self.get_parameter()
    wether_url_list = []
    for city in list_csv:
        cityid = self.city_search(city)
        wether_url = self.wether_data(cityid)
        wether_url_list.append(wether_url)
    for li in wether_url_list:
        self.url_queue.put(li)
    # print("Item taken from the queue", self.url_queue.get())
    crawlers = [
        Crawler(self.url_queue, self.page_parse, self.save_data,
                self.generate_page)
        for _ in range(0, self.coro_num)
    ]
    loop = asyncio.get_event_loop()
    to_do = [
        crawlers[coro_id].asyn_crawl(coro_id)
        for coro_id in range(0, self.coro_num)
    ]
    wait_coro = asyncio.wait(to_do)
    loop.run_until_complete(wait_coro)
    loop.run_until_complete(asyncio.sleep(5.25))
    loop.close()
def crawl(base: str):
    clr = Crawler(base)
    pages = clr.crawl(depth=0)
    pages = {
        'urls': pages,
    }
    # jsonify serializes the dict itself; wrapping it in json.dumps first
    # would double-encode the response body.
    return jsonify(pages)
def main(argv):
    '''Demonstrate it working by printing out results'''
    a = Crawler()
    a.init_browser()
    html = a.getContent(argv[0])
    d = PageParse(html, argv[0])
    data = d.socialmedia()
    a.exit()
    print(data)
def main(spider_class, settings_module):
    crawler = Crawler(spider_class, settings_module)
    try:
        joinall([spawn(crawler.process_url) for i in xrange(5)])
    except KeyboardInterrupt:
        crawler.clear(False)
    except:
        logger.exception("Unable to complete")
    else:
        crawler.clear(True)
        logger.info("Crawling completed")
def media_changed(self, media):
    """
    Media mountpoint changed or added.
    """
    for id, client, monitors in self.clients:
        client.rpc('device.changed', media.id, media.prop)
    if not media.crawler:
        if not media.get('block.device'):
            log.info('start crawler for /')
            media.crawler = Crawler(self._db, use_inotify=True)
    self._db.signals['changed'].emit([media._beacon_id])
def get(self, cycle, *a):
    """
    Fetch proxies on a fixed schedule.
    """
    getter = Crawler()
    while True:
        info('[Scheduler] starting proxy crawl')
        getter.run()
        if config["crawl"]["checkmax"]:
            if len(DB().getall()) >= config["crawl"]["maxvalue"]:
                info("[Scheduler] proxy pool has reached its limit, pausing the crawl")
                # Busy-wait until the pool drops below the limit again.
                while len(DB().getall()) >= config["crawl"]["maxvalue"]:
                    pass
        time.sleep(int(cycle))
def run(self):
    print("Start")
    self.int()
    data = self.parameter_json(self.payload)
    all_type_list = list()
    for li in data:
        id = li['id']
        name = li['name']
        print("Level-1 category", id, name)
        self.payload['id'] = id
        data_2 = self.parameter_json(self.payload)
        for li_2 in data_2:
            id_2 = li_2['id']
            name_2 = name + '&' + li_2['name']
            print("Level-2 category", id_2, name_2)
            self.payload['id'] = id_2
            data_3 = self.parameter_json(self.payload)
            if len(data_3) == 0:
                all_type_list.append([id_2, name_2])
            else:
                for li_3 in data_3:
                    id_3 = li_3['id']
                    name_3 = name_2 + '&' + li_3['name']
                    print("Level-3 category", id_3, name_3)
                    all_type_list.append([id_3, name_3])
    print("Total number of monthly data entries", len(all_type_list))
    for li in all_type_list:
        self.url_queue.put(li)
    # print("Item taken from the queue", self.url_queue.get())
    crawlers = [
        Crawler(self.url_queue, self.page_parse, self.save_data,
                self.generate_page)
        for _ in range(0, self.coro_num)
    ]
    loop = asyncio.get_event_loop()
    to_do = [
        crawlers[coro_id].asyn_crawl(coro_id)
        for coro_id in range(0, self.coro_num)
    ]
    wait_coro = asyncio.wait(to_do)
    loop.run_until_complete(wait_coro)
    loop.run_until_complete(asyncio.sleep(3.25))
    loop.close()
def test_fetch_urls(self):
    html = """
    <!DOCTYPE html>
    <html>
    <title>Test</title>
    <body>
    <a href='link1'/>
    <a href='link2'/>
    </body>
    </html>
    """
    crawler = Crawler('http://test.com')
    fetch_urls, all_urls = crawler.fetch_urls(html)
    self.assertListEqual(
        fetch_urls, ['http://test.com/link1', 'http://test.com/link2'])
    self.assertSetEqual(all_urls,
                        {'http://test.com/link2', 'http://test.com/link1'})
def run(self):
    print("Start")
    self.int()
    all_type_list = self.area_code()
    print("Number of cities", len(all_type_list))
    """{'130400': '河北省/邯郸市', '152200': '内蒙古自治区/兴安盟', ...}"""
    for li in all_type_list:
        self.url_queue.put([li, all_type_list[li]])
    # print("Item taken from the queue", self.url_queue.get())
    crawlers = [Crawler(self.url_queue, self.page_parse, self.save_data,
                        self.generate_page)
                for _ in range(0, self.coro_num)]
    loop = asyncio.get_event_loop()
    to_do = [crawlers[coro_id].asyn_crawl(coro_id)
             for coro_id in range(0, self.coro_num)]
    wait_coro = asyncio.wait(to_do)
    loop.run_until_complete(wait_coro)
    loop.run_until_complete(asyncio.sleep(5.25))
    loop.close()
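# Hedged, generic sketch of the queue-plus-worker pattern the run() methods
# above rely on: N crawler coroutines drain a shared queue until it is empty.
# This is an illustration only, not the project's actual asyn_crawl code.
import asyncio
import queue


async def worker_sketch(coro_id, url_queue):
    """Drain the shared queue; a real worker would fetch and parse each item."""
    while True:
        try:
            item = url_queue.get_nowait()
        except queue.Empty:
            break
        print(f"worker {coro_id} handling {item}")
        await asyncio.sleep(0)  # yield control so the other workers can run


async def run_sketch(items, coro_num=3):
    url_queue = queue.Queue()
    for item in items:
        url_queue.put(item)
    await asyncio.gather(*(worker_sketch(i, url_queue)
                           for i in range(coro_num)))


# Example:
# asyncio.run(run_sketch([["130400", "Hebei/Handan"], ["152200", "Hinggan"]]))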
def test_parse_html_content(self):
    html = """
    <!DOCTYPE html>
    <html>
    <title>Test</title>
    <link rel="stylesheet" href="/assets/v2/css/app.css">
    <link rel="stylesheet" href="/assets/v2/css/app2.css">
    <link rel="apple-touch-icon" sizes="152x152" href="/assets/v2/apple-icon-152x152.png">
    <link rel="icon" type="image/png" sizes="192x192" href="/assets/v2/android-icon-192x192.png">
    <link href="/assets/v2/favicon.ico" rel="shortcut icon">
    <link href="/assets/v2/apple-touch-icon.png" rel="apple-touch-icon">
    <body>
    <a href='test1'/>
    <a href='test2'/>
    <div class="col-lg-6">
    <img src="/assets/v2/images/screen-yoyo-apps-2x.png" class="hero-img"/>
    </div>
    <script charset="utf-8" type="text/javascript" src="//js.hsforms.net/forms/v2.js"></script>
    </body>
    </html>
    """
    data = {
        'css_links': {
            'http://test.com/assets/v2/css/app2.css',
            'http://test.com/assets/v2/css/app.css'
        },
        'js_links': {'http://js.hsforms.net/forms/v2.js'},
        'img_links': {'http://test.com/assets/v2/images/screen-yoyo-apps-2x.png'},
        'icon_links': {
            'http://test.com/assets/v2/apple-touch-icon.png',
            'http://test.com/assets/v2/android-icon-192x192.png',
            'http://test.com/assets/v2/favicon.ico',
            'http://test.com/assets/v2/apple-icon-152x152.png'
        }
    }
    crawler = Crawler('http://test.com')
    result = crawler.parse_html_content(html)
    self.assertSetEqual(result['css_links'], data['css_links'])
    self.assertSetEqual(result['js_links'], data['js_links'])
    self.assertSetEqual(result['img_links'], data['img_links'])
    self.assertSetEqual(result['icon_links'], data['icon_links'])
def call_API(self):
    # return self.client.search('SF', self.data)
    # return SearchResponse(
    #     self.client._make_request(SEARCH_PATH, self.data))
    response = SearchResponse(
        self.client._make_request(SEARCH_PATH, self.data))
    list_to_be_returned = []
    # for bus in response.businesses:
    #     list_to_be_returned += Crawler.limit("http://www.yelp.com/biz_photos/" + bus.id + "?tab=food&start=0", self.food_per_business)
    dict_of_urls = {}
    for bus in response.businesses:
        # pprint(bus.categories[0].name)
        # pprint(vars(bus.location.coordinate))
        url = "http://www.yelp.com/biz_photos/" + bus.id + "?tab=food&start=0"
        # list_of_urls.append({url: [bus.location, bus.name]})
        category_list = []
        for category in bus.categories:
            category_list.append(category.name)
        dict_of_urls[url] = dict(
            address=bus.location.address,
            name=bus.name,
            city=bus.location.city,
            state=bus.location.state_code,
            postal_code=bus.location.postal_code,
            display_address=bus.location.display_address,
            latitude=bus.location.coordinate.latitude,
            longitude=bus.location.coordinate.longitude,
            category=category_list)
    # print dict_of_urls
    # pprint(list_of_urls)
    # Crawler.limit(list_of_urls, 1)
    return Crawler(dict_of_urls).limit(self.food_per_business)
def call_API(self):
    response = SearchResponse(
        self.client._make_request(SEARCH_PATH, self.data))
    dict_of_urls = {}
    for bus in response.businesses:
        # url = "http://www.yelp.com/biz_photos/" + bus.id + "?tab=food&start=0"
        category_list = []
        if bus.categories:
            for category in bus.categories:
                category_list.append(category.name)
        dict_of_urls[bus.id] = dict(
            address=bus.location.address,
            city=bus.location.city,
            state=bus.location.state_code,
            postal_code=bus.location.postal_code,
            display_address=bus.location.display_address,
            restaurant_name=bus.name,
            restaurantId=bus.id,
            latitude=bus.location.coordinate.latitude,
            longitude=bus.location.coordinate.longitude,
            category=category_list)
    print(vars(response))
    if response.total == 0:
        raise RuntimeError("Yelp returns no businesses")
    if 'query_method' in self.data and self.data['query_method'] == 1:
        print("DB")
        food_list = DB(dict_of_urls).query(self.food_per_business)
    else:
        print("Yelp")
        food_list = Crawler(dict_of_urls).query(self.food_per_business)
    random.shuffle(food_list)
    return food_list
# -*- coding: utf-8 -*-
"""main module"""
__author__ = 'starstar'

import traceback

from crawl import Crawler
from house import House
from connect2db import DbConnector
from config import HEADERS
import preprocess
from connect2db import action_type

if __name__ == "__main__":
    """main"""
    mysql_connector = DbConnector()
    shcrawler = Crawler(HEADERS, 'sh')
    rg_urls = shcrawler.composeurl(1, 20)
    for region, urls in rg_urls.iteritems():
        for url in urls:
            res = shcrawler.parse(shcrawler.crawl(url))
            for i in res:
                i.region = region
                preprocess.main(i)
                try:
                    action, update_fields = mysql_connector.search(i)
                    if action == action_type.insert:
                        mysql_connector.insert(i)
                    elif action == action_type.update:
                        mysql_connector.update(i, update_fields)
                    elif action == action_type.none:
                        print "already inserted and up-to-date"
async def invalid_url():
    url = 'http://yoyowalletxxxx.com'
    crawler = Crawler('')
    result = await crawler.get_body(url)
    self.assertEqual(result, '')
class Server(object):
    """
    Server for the virtual filesystem to handle write access to the db
    and scanning / monitoring of queries.
    """

    def __init__(self, dbdir, scheduler=None):
        log.info('start beacon')
        try:
            self.ipc = kaa.rpc.Server('beacon')
        except IOError, e:
            kaa.beacon.thumbnail.thumbnail.disconnect()
            log.error('beacon: %s' % e)
            time.sleep(0.1)
            sys.exit(0)
        self.ipc.signals['client-connected'].connect(self.client_connect)
        self.ipc.register(self)
        self._dbdir = dbdir
        self._db = Database(dbdir)
        self._next_client = 0
        self._db.register_inverted_index('keywords', min=2, max=30)
        self._db.register_object_type_attrs(
            "dir",
            image_from_parser=(bool, ATTR_SIMPLE),
            last_crawl=(int, ATTR_SIMPLE),
            title=(unicode, ATTR_SIMPLE),
            series=(unicode, ATTR_SIMPLE),
            season=(int, ATTR_SIMPLE),
            artist=(unicode, ATTR_SIMPLE),
            album=(unicode, ATTR_SIMPLE),
            length=(float, ATTR_SIMPLE))
        # files
        self.register_file_type_attrs(
            "video",
            title=(unicode, ATTR_SEARCHABLE | ATTR_IGNORE_CASE | ATTR_INVERTED_INDEX, 'keywords'),
            poster=(str, kaa.beacon.ATTR_SIMPLE),
            width=(int, ATTR_SIMPLE),
            height=(int, ATTR_SIMPLE),
            length=(float, ATTR_SIMPLE),
            scheme=(str, ATTR_SIMPLE),
            description=(unicode, ATTR_SIMPLE),
            series=(unicode, ATTR_SEARCHABLE),
            season=(int, ATTR_SEARCHABLE),
            episode=(int, ATTR_SEARCHABLE),
            hash=(str, ATTR_SIMPLE),
            stereo=(str, ATTR_SIMPLE),
            timestamp=(int, ATTR_SEARCHABLE))
        self.register_file_type_attrs(
            "audio",
            title=(unicode, ATTR_SEARCHABLE | ATTR_IGNORE_CASE | ATTR_INVERTED_INDEX, 'keywords'),
            artist=(unicode, ATTR_SEARCHABLE | ATTR_INDEXED | ATTR_IGNORE_CASE | ATTR_INVERTED_INDEX, 'keywords'),
            album=(unicode, ATTR_SEARCHABLE | ATTR_IGNORE_CASE | ATTR_INVERTED_INDEX, 'keywords'),
            genre=(unicode, ATTR_SEARCHABLE | ATTR_INDEXED | ATTR_IGNORE_CASE),
            samplerate=(int, ATTR_SIMPLE),
            length=(float, ATTR_SIMPLE),
            bitrate=(int, ATTR_SIMPLE),
            trackno=(int, ATTR_SIMPLE),
            userdate=(unicode, ATTR_SIMPLE),
            description=(unicode, ATTR_SIMPLE),
            hash=(str, ATTR_SIMPLE),
            timestamp=(int, ATTR_SEARCHABLE))
        self.register_file_type_attrs(
            "image",
            width=(int, ATTR_SEARCHABLE),
            height=(int, ATTR_SEARCHABLE),
            comment=(unicode, ATTR_SEARCHABLE | ATTR_IGNORE_CASE | ATTR_INVERTED_INDEX, 'keywords'),
            rotation=(int, ATTR_SIMPLE),
            author=(unicode, ATTR_SIMPLE),
            hash=(str, ATTR_SIMPLE),
            timestamp=(int, ATTR_SEARCHABLE))
        # tracks for rom discs or iso files
        self.register_track_type_attrs(
            "dvd",
            length=(float, ATTR_SIMPLE),
            audio=(list, ATTR_SIMPLE),
            chapters=(int, ATTR_SIMPLE),
            subtitles=(list, ATTR_SIMPLE))
        self.register_track_type_attrs(
            "vcd",
            audio=(list, ATTR_SIMPLE))
        self.register_track_type_attrs(
            "cdda",
            title=(unicode, ATTR_SEARCHABLE | ATTR_INVERTED_INDEX, 'keywords'),
            artist=(unicode, ATTR_SEARCHABLE | ATTR_INDEXED | ATTR_INVERTED_INDEX, 'keywords'))
        # list of current clients
        self.clients = []
        # Config file is synced in Thumbnailer. See its constructor for
        # rationale.
        config.load(os.path.join(dbdir, "config"))
        config.watch()
        if scheduler:
            config.scheduler.policy = scheduler
        else:
            config.autosave = True
        # commit and wait for the results (there are no results, this code
        # is only used to force waiting until the db is set up)
        self._db.commit()
        # give database to controller / hardware monitor
        rootfs = {
            'beacon.id': 'root-' + get_machine_uuid(),
            'block.device': '',
            'volume.mount_point': '/'
        }
        self.item_controller = Controller(self, self._db, rootfs)
        self._db.commit()
        # load plugins
        plugins.load(self, self._db)
        for dir in config.monitors:
            self.monitor_directory(os.path.expandvars(os.path.expanduser(dir)))
        # scanner
        self.scanner = Crawler(self._db, monitor=False)
def __init__(self):
    self.redis = RedisClient()
    self.crawler = Crawler()
single_row = RowParser(rows[0])
single_row.extract_fields()
print(single_row.extracted_content)

print("-------- Extracting data from all results --------------")
results = []
for i in rows:
    single_row = RowParser(i)
    single_row.extract_fields()
    results.append(single_row.extracted_content)
print(results)
print(f"length of results: {len(results)}")

print("testing the new class")
parser_ = Parser(a.page_content)
parser_.extract_fields()
print(parser_.results)
print(parser_._log)

print("testing the master object")
crawler = Crawler(search_params)
crawler.get_all()
print(crawler.results)
print(crawler.log)
def main():
    crawler = Crawler("http://start.bg/")
    database = Database()
    crawler.start()
        f.write(self.output_string)
        f.close()
    else:
        print self.output_string

def print_tree(self, tree, level):
    self.output('<li><a href="' + tree.url + '">' + tree.url + '</a></li>', level)
    if tree.statics:
        self.output('<b>Static resources:</b>', level)
        self.output('<ul>', level)
        for s in tree.statics:
            self.output(
                '<li>' + s[0] + ': <a href="' + s[1] + '">' + s[1] + '</a></li>',
                level)
        self.output('</ul>', level)
    if tree.children:
        self.output('<b>Children:</b>', level)
        self.output('<ul>', level)
        for c in tree.children:
            self.print_tree(c, level + 1)
        self.output('</ul>', level)


starttime = time.time()
crawler = Crawler(args.domain)
c = crawler.crawl_domain()
endtime = time.time()

p = Parser(c, args.file)
p.render_html(endtime - starttime)
def __init__(self):
    self.db = RedisClient()
    self.crawl = Crawler()
from dynamodb import DB
import json
from crawl import Crawler
import threading

# Test parameters
with open('params.json', 'r') as file:
    json_sns = file.read()

# DB().fill_empty_restaurant(json.loads(json_sns))
# Pass the callable and its argument separately so the upload runs in the
# worker thread instead of being executed while the Thread is constructed.
t = threading.Thread(name="Crawler",
                     target=DB().upload_food_list,
                     args=(Crawler(json.loads(json_sns)).query(10),))
t.start()
t.join()
print("NEXT")
def test_parse_empty_html_content(self):
    html = ""
    data = {}
    crawler = Crawler('http://test.com')
    result = crawler.parse_html_content(html)
    self.assertDictEqual(result, data)
db = MySqlOperator(server='127.0.0.1', user_name='root', password='',
                   dbname='taobao_sf')
rows = db.execute(
    'SELECT distinct(itemId) FROM taobao_sf.sf_list_itemid').fetchall()
for row in rows:
    item_id = row[0]
    print(item_id)
    url = 'https://h5api.m.taobao.com/h5/mtop.taobao.govauctionmtopcommonservice.getfrontcategory/1.0/?jsv=2.4.5&appKey=12574478&t=1570096614606&api=mtop.taobao.govauctionmtopcommonservice.getfrontcategory'
    headers = {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Mobile Safari/537.36'
    }
    crawler = Crawler()
    res, session = crawler.crawl(url=url, headers=headers)
    cookies = res.cookies.get_dict()
    m_h5_tk = cookies['_m_h5_tk']
    app_key = '12574478'
    data = '{"itemId":"%s"}' % item_id
    sign, t = get_sign(m_h5_tk, app_key, data)
    params = {
        'jsv': '2.4.2',
        'appKey': app_key,
        't': t,
        'sign': sign,
        'api': 'mtop.taobao.GovauctionMTopDetailService.queryHttpsItemDetail',
        'v': '2.0',
        'ecode': '0',
        'type': 'jsonp',
async def valid_url():
    url = 'http://yoyowallet.com'
    crawler = Crawler('')
    result = await crawler.get_body(url)
    self.assertTrue(result)
def __init__(self):
    self.crawler = Crawler()
    self.redis = RedisClient()
    self.proxy_list = []
parser.add_argument(
    '-u', '--url', required=True, type=str,
    help='For example => http://yoyowallet.com/')
parser.add_argument(
    '-o', '--out', required=True, type=str,
    help='You have to enter a valid file address')
parser.add_argument(
    '-t', '--type', required=True, type=FileType.from_string,
    choices=list(FileType),
    help='You have to choose one of them => csv or xml')
args = parser.parse_args()

crawler = Crawler(str(args.url))
task = asyncio.Task(crawler.crawl())
loop = asyncio.get_event_loop()
print(f'\n{30*"*"} crawler is working {30*"*"}\n\n')
loop.run_until_complete(task)
loop.close()
result = task.result()
print(f'\n\n{30*"*"} crawling was done {30*"*"}\n\n')
Export().print(str(args.type), result, str(args.out))
print(f'{30*"*"} output saved to {args.out} {30*"*"}\n')
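# Hedged usage note: the script's filename is not given in the source, so
# "main.py" below is only an assumption. With the arguments defined above,
# an invocation could look like
#
#     python main.py -u http://yoyowallet.com/ -o result.csv -t csv
#
# which crawls the site and then exports the collected links to result.csv.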
def __init__(self):
    self.crawler = Crawler()
    self.mysql_obj = dbMysql.DbMysql(env.DB_HOST, env.DB_PORT,
                                     env.DB_USERNAME, env.DB_PASSWORD,
                                     env.DB_DATABASE)
    self.db_obj = dbClass.DbWrapper(self.mysql_obj)