def main():
    # logdir and port are read from the command line.
    global num_values
    db_name = "tensorview.db"
    parser = argparse.ArgumentParser(
        description="Parses relevant parameters for tensorview")
    parser.add_argument('--port', dest='port', type=int, default=6886,
                        help="Port number to open server in")
    parser.add_argument(
        '--logdir', dest='dir',
        help="Log directory to obtain tensorflow event files from")
    parser.add_argument(
        '-n', dest='num', type=int, default=100,
        help="Metric parameters are aggregated from N most recent iterations")
    args = parser.parse_args()
    port = args.port
    logdir = args.dir
    num_values = args.num

    # Rebuild the database if a log directory was provided.
    rebuild = (logdir is not None)
    experiments = None
    if rebuild:
        experiments = Crawler().crawl(logdir)
    Database.initialize_database(db_name, experiments, rebuild)
    app.run(debug=True, port=port, use_reloader=False)
def test_crawler(self):
    # Start from a clean slate: remove artifacts left by previous runs.
    self.config.crawler.linksA.unlink(missing_ok=True)
    self.config.crawler.linksB.unlink(missing_ok=True)
    self.config.crawler.doc_id_mapping.unlink(missing_ok=True)
    self.config.crawler.doc_raw.unlink(missing_ok=True)
    self.config.crawler.bloom_filter.unlink(missing_ok=True)
    self.config.counter.unlink(missing_ok=True)

    crawler = Crawler(self.config)
    crawler.fetch()

    with open(self.config.crawler.doc_id_mapping, "r") as f:
        lines = f.read().splitlines()
        [id, u] = lines[0].split("\t")
        self.assertEqual(id, "1")
        self.assertEqual(u, self.config.crawler.seed_url)

    with open(self.config.crawler.linksB, "r") as f:
        lines = f.read().splitlines()
        self.assertTrue(len(lines) > 10)

    with open(self.config.crawler.linksA, "r") as f:
        lines = f.read().splitlines()
        self.assertTrue(len(lines) == 0)

    with open(self.config.crawler.doc_raw, "r") as f:
        lines = f.read().splitlines()
        [id, size, content] = lines[0].split("\t")
        self.assertEqual("1", id)
        self.assertTrue(len(content) > 0)

    crawler.fetch()

    with open(self.config.crawler.linksA, "r") as f:
        lines = f.read().splitlines()
        self.assertTrue(len(lines) > 10)

    crawler.bloom_filter.dump()
def test_file_links(self):
    self.start_server(TestCrawler.FILE_LINKS_HTML)
    crawler = Crawler(TestCrawler.SERVER)
    crawler.crawl()
    expected = '''[
  {
    "assets": [
      "http://127.0.0.1:5000/source.pdf",
      "http://127.0.0.1:5000/source.txt"
    ],
    "url": "http://127.0.0.1:5000/"
  },
  {
    "assets": [],
    "url": "http://127.0.0.1:5000/source.pdf"
  },
  {
    "assets": [],
    "url": "http://127.0.0.1:5000/source.txt"
  }
]'''
    self.assertMultiLineEqual(expected, crawler.assets_json())
def test_crawl_max_depth(self):
    dbschema = "test_crawl_max_depth"
    self.setup_db(dbschema)
    session = self.Session()

    url = 'http://zero.webappsecurity.com/'
    url_list = ['http://zero.webappsecurity.com/']
    link = Link(url)
    session.add(link)
    session.commit()

    max_depth = 1
    c = Crawler(0, 'crawler', 'abc123', 'localhost', 0, dbschema, max_depth)
    result = c.crawl()

    link_list = session.query(Link)
    session.close()

    self.assertEqual(result, 0)
    self.assertEqual(len([l for l in link_list]), len(url_list))
    for link in link_list:
        self.assertTrue(link.url in url_list)
        self.assertEqual(link.status, Status.visited)
def run(nombre_producto, bool_teleg, modo_headless, seg_dormidos,
        prec_min=0, prec_max=20000, num_max_productos=50):
    # Seconds to sleep between searches (3600 s = 1 hour).
    segundos_dormidos = seg_dormidos
    options = Options()
    # Run headless unless the user passed 'n'.
    options.headless = modo_headless != 'n'
    crawl = Crawler(options)
    crawl.run(nombre_producto, bool_teleg, prec_min, prec_max,
              num_max_productos, sleep_time=segundos_dormidos)
def test_crawl_product_found(self):
    dbschema = "test_crawl_product_found"
    self.setup_db(dbschema)
    session = self.Session()

    url = 'http://www.epocacosmeticos.com.br/212-vip-rose-eau-de-parfum-carolina-herrera-perfume-feminino/p'
    title = 'Perfume 212 VIP Rosé Carolina Herrera Feminino - Época Cosméticos'
    name = '212 VIP Rosé Carolina Herrera - Perfume Feminino - Eau de Parfum'
    link = Link(url)
    session.add(link)
    session.commit()

    c = Crawler(0, 'crawler', 'abc123', 'localhost', 0, dbschema, 1)
    result = c.crawl()

    product_list = session.query(Product)
    session.close()

    self.assertEqual(result, 0)
    for product in product_list:
        self.assertEqual(product.url, url)
        self.assertEqual(product.title, title)
        self.assertEqual(product.name, name)
def bdworker(words, fromdate):
    crawler = Crawler()
    bd_index = 0
    length = 0
    filename = "output/baidu_" + fromdate + "_" + words + "_" + time.strftime(
        '%Y年%m月%d日%H时%M分%S秒', time.localtime(time.time())) + '.xls'
    # Page through Baidu results, 20 entries at a time, until no new results arrive.
    while True:
        length = len(crawler.realResults)
        crawler.bdrun(words, fromdate, bd_index)
        if crawler.bd_result == False:
            break
        if len(crawler.realResults) == length:
            break
        bd_index += 20
        print len(crawler.realResults)
    saveToFile(crawler, filename)
def Foo(page_list):
    url = 'https://piyao.kepuchina.cn/rumor/rumorlist'
    parser = KepuzhongguoParser()
    title_parser = TitleParser()
    ua = UserAgent()
    headers = {
        "Origin": url,
        'User-Agent': ua.random,
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        # "Accept-Encoding": "gzip, deflate, br",
        "Content-Type": "text/html; charset=utf-8",
    }
    crawler = Crawler()
    title_list = []
    for page in page_list:
        # Fetch the list page, then fetch the title of every article it links to.
        data = {'type': 0, 'keyword': 0, 'page': page}
        url_list = crawler.crawling_ke(url, rule_model=parser, data=data, headers=headers)
        for url_1 in url_list:
            title = crawler.crawling_ke(url_1, rule_model=title_parser, headers=headers)
            title_list.append(title)
    return title_list
def add_doc(request):
    if request.method == "POST":
        urls_for_indexing = []
        uv = URLValidator(schemes=['http', 'https'])

        # URLs pasted into the form, separated by ", ".
        urls_from_form = request.POST.get('url')
        if urls_from_form:
            list_urls = urls_from_form.split(", ")
            for url in list_urls:
                try:
                    uv(url)
                except ValidationError:
                    continue
                urls_for_indexing.append(url)

        # URLs uploaded as a file, one per line.
        file_with_urls = request.FILES.get('file_url')
        if file_with_urls:
            for url in file_with_urls:
                url = url.strip()
                try:
                    uv(url)
                except ValidationError:
                    continue
                urls_for_indexing.append(url)

        if len(urls_for_indexing):
            crawler = Crawler(urls_for_indexing, width=20, deep=3)
            crawler.craaawl()
            text = 'Finished'
        else:
            text = 'Invalid URL'
    else:
        text = ''
    return render(request, 'search/add_doc.html', {'text': text})
def run(self):
    # Generate crawler processes.
    print 'Start crawlers processes...'
    self.__crawlers = [
        Crawler(i, self.GPR, self.__crawlers_queue)
        for i in range(NUMBER_OF_CRAWLERS)
    ]
    print 'Done.'

    self.__crawlers_record = []
    for i in range(NUMBER_OF_CRAWLERS):
        # Put crawlers into the available queue.
        self.__crawlers_queue.put(i)
        # Start crawler job.
        self.__crawlers[i].start()
        self.__crawlers_record.append(datetime.datetime.now())
    print 'Ready to accept request.'

    # Wait for requests and dispatch each one to a crawler process.
    while True:
        req = self.__request_queue.get()
        done = False
        while not done:
            # Get an available crawler process id.
            id = None
            try:
                id = self.__crawlers_queue.get(timeout=MAX_WAITING_TIME)
                # Reset this crawler's last crawl time.
                self.__crawlers_record[id] = datetime.datetime.now()
            except Q.Empty:
                # No crawler is available; check on the crawler processes.
                self._check_crawlers()
            if id is not None:
                self.__crawlers[id].add_request(req)
                done = True
def angiecr(url, fixed, dictionary, custom_set, minus, mayus, numbers, special,
            size, nthreads):
    cr = Crawler()
    if url is not None:
        cr.set_url(url)
        cr.set_size(size)
        cr.set_isFixed(fixed == 1)
        if nthreads != 1:
            cr.set_nthreads(nthreads)
        if dictionary is not None:
            # Dictionary attack.
            cr.set_custom_dict_path(dictionary)
            cr.read_custom_dict()
            cr.begin_crawl(1)
        elif custom_set is not None:
            # Brute force with a user-provided character set.
            cr.set_custom_set_path(custom_set)
            cr.read_custom_set()
            cr.begin_crawl(0)
        elif minus != 0 or mayus != 0 or numbers != 0 or special != 0:
            # Brute force with the selected built-in character sets.
            cr.select_sets(minus == 1, mayus == 1, numbers == 1, special == 1)
            cr.begin_crawl(0)
        else:
            print(
                "Error: Un ataque de fuerza bruta precisa de un set de caracteres. "
                "Selecciona minus, mayus, numbers o especial. También puedes construir "
                "un custom set o realizar un ataque de diccionario (--help)."
            )
    else:
        print(
            'Error: falta la url introducida. Para mas información ejecutar con --help'
        )
def getCommunityInfo():
    locationMapping = LocationMapping()
    locationMappingToInt = LocationMappingToInt()
    crawler = Crawler(locationMapping, locationMappingToInt)
    houseInfoParser = HouseInfoParser()
    dao = DAO()

    lianjiaSiteName = 'http://sh.lianjia.com'
    lianjiaXiaoqu = 'xiaoqu'
    count = 0
    # Walk the first 100 listing pages.
    for i in xrange(100):
        crawUrl = lianjiaSiteName + '/' + lianjiaXiaoqu + '/d' + repr(i + 1) + 'rs'
        communityResp = crawler.crawCommunity(crawUrl)
        if len(communityResp):
            communityInfo = houseInfoParser.parseCommunityHttpResponse(communityResp)
            if not len(communityInfo):
                break
            for j in xrange(len(communityInfo)):
                # Skip communities that are already stored.
                querySQL = "SELECT * FROM community where community_name='%s'" % communityInfo[j].strip()
                if dao.queryForExistence(querySQL):
                    continue
                count = count + 1
                insertSQL = "INSERT INTO community(community_name, community_code) VALUES ('%s', %d)" % (
                    communityInfo[j].strip(), count)
                dao.insert(insertSQL)
    dao.close()
async def echo(message: types.Message):
    # old style:
    # await bot.send_message(message.chat.id, message.text)
    crawler = Crawler()
    soup = crawler.get_soup(WORLDOMETER)

    # Scrape the per-country table into a list of rows.
    data = []
    table = soup.select_one('table#main_table_countries_today')
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for table_row in rows:
        columns = table_row.findAll('td')
        output_row = []
        for column in columns:
            output_row.append(column.text.strip().lower())
        data.append(output_row)

    print(message.from_user.full_name)
    print(message.text)
    try:
        result = data[index_2d(data, message.text)[0]]
        await message.answer(
            'Latest Stat from WORLDOMETER for country %s \n'
            'Total Cases : %s \n'
            'New Cases : %s \n'
            'Total Deaths : %s \n'
            'New Deaths : %s \n'
            'Total Recovered : %s \n'
            'Active Cases : %s \n'
            'Serious : %s \n'
            'Total Case per 1M Population : %s \n' %
            (result[0], result[1], result[2], result[3], result[4],
             result[5], result[6], result[7], result[8]))
        # await message.answer('\n'.join(data[index_2d(data, message.text)[0]]))
    except Exception:
        await message.answer(
            'please try other command or type country you want to get info')
def atualizar(bot, update, args):
    chat_id = update.message.chat_id
    if len(args) == 0:
        msg = "Matte kudasai!\nVocê não digitou sua senha.\nDigite /atualizar [Senha BU]."
        bot.send_photo(chat_id=chat_id, photo='http://static.tumblr.com/mxarhwc/kjylpbc32/yui_k-on.png.jpg')
        bot.send_message(chat_id=chat_id, text=msg)
        return
    elif len(args[0]) < 4 or len(args[0]) > 6:
        msg = "A senha precisa ter de 4 a 6 digitos.\nDigite /atualizar [Senha BU]."
        bot.send_photo(chat_id=chat_id, photo='https://i.kym-cdn.com/photos/images/newsfeed/000/189/032/1319151441001.png')
        bot.send_message(chat_id=chat_id, text=msg)
        return
    elif not verificarNumeros(args[0]):
        msg = "Subaru-kun, a senha deve possuir apenas números."
        bot.send_photo(chat_id=chat_id, photo='https://img.fireden.net/v/image/1484/98/1484987967315.jpg')
        bot.send_message(chat_id=chat_id, text=msg)
        return

    dataBase = Db()
    if dataBase.haveSelectUser(chat_id):
        matricula = dataBase.getMatricula(chat_id)
        crawler = Crawler()
        if crawler.crawler(matricula, args[0]):
            msg = "Atualizado!"
            main_menu_keyboard = [[telegram.KeyboardButton('/livros')]]
            reply_kb_markup = telegram.ReplyKeyboardMarkup(main_menu_keyboard,
                                                           resize_keyboard=True,
                                                           one_time_keyboard=True)
            bot.send_photo(chat_id=chat_id, photo='http://orig02.deviantart.net/cfa7/f/2012/259/9/e/mami_tomoe_render_by_moeblueberry1771-d5evnl7.png')
            bot.send_message(chat_id=chat_id, text=msg, reply_markup=reply_kb_markup)
        else:
            msg = "What isn't remembered never happened.\nMemory is merely a record.\nYou just need to re-write that record.\nMatricula ou senha inválida"
            bot.send_photo(chat_id=chat_id, photo='https://wired-7.org/lain/src/1558910195719.jpg')
            bot.send_message(chat_id=chat_id, text=msg)
    else:
        msg = "Is this a usuário não cadastrado?\nDigite /cadastrar [Matricula]."
        bot.send_photo(chat_id=chat_id, photo='https://assets3.thrillist.com/v1/image/2762016/size/tmg-article_default_mobile.jpg')
        bot.send_message(chat_id=chat_id, text=msg)
def eventFC(crawlParams):
    seedURLs = crawlParams["seedURLs"]
    # Seed the frontier; scores are stored negated, so -1 sorts first.
    t = [(-1, p, -1, "") for p in seedURLs]
    priorityQueue = PriorityQueue(t)
    crawlParams["priorityQueue"] = priorityQueue

    eventModel = EventModel(crawlParams['No_Keywords'])
    eventModel.buildEventModel(crawlParams['model'], crawlParams['eventType'],
                               minTopicTermFreq=20,
                               minLocTermFreq=crawlParams['minLocTermFreq'],
                               minDateTermFreq=20)
    crawlParams['scorer'] = eventModel

    crawler = Crawler(crawlParams)
    qu = crawler.crawl()

    # Persist the returned queue as "score,url" lines.
    quS = '\n'.join([str(-1 * s[0]) + "," + s[1] for s in qu])
    with open('queueEvent.txt', 'w') as fw:
        fw.write(quS.encode('utf8'))
    return crawler.relevantPages
def __init__(self, job_params):
    self.task = job_params
    try:
        if self.task['crawl'] is True:
            c = Crawler(self.task)
            c.crawl()
        elif self.task['report'] is True:
            # crawtext.py report <project> [((--email=<email>| -e <email>) -u <user> -p <passwd>)| (-o <outfile> |--o=<outfile>)]
            Report(self.task)
        elif self.task['export'] is True:
            # crawtext.py export [results|sources|logs|queue] <project> [(-o <outfile> |--o=<outfile>)] [-t <type> | --type=<type>]
            Export(self.task)
        # elif self.task['extract'] is True:
        #     new method for extract every url
    except KeyError:
        print self.task["project"]
        print "Project %s not configured properly" % str(self.task["project"])
        s = Scheduler(self.task)
        s.delete()
        print "deleting project"
def main():
    crawler = Crawler()

    def get_photos_from_url(gender, type, url):
        crawler.open(url)
        # Scroll until the lazy-loaded product grid is fully rendered.
        crawler.infinite_scroll()
        images = crawler.find_elements('img.grid-product__image')
        srcs = img2src(images)
        print '{} images got'.format(len(images))
        if gender == 'women':
            # The women's grid links model shots; switch to the product-only images.
            srcs = map(lambda src: src.replace('model', 'prod'), srcs)
        download_images(srcs, os.path.join('hollister', gender, type))

    # Men
    # get_photos_from_url('men', 'top', 'https://www.hollisterco.com/shop/wd/guys-tops/?search-field=&sort=newest&start=0&rows=90&filtered=true')
    # get_photos_from_url('men', 'top-1', 'https://www.hollisterco.com/shop/wd/guys-tops/?search-field=&sort=newest&start=90&rows=90&filtered=true')
    # get_photos_from_url('men', 'top-2', 'https://www.hollisterco.com/shop/wd/guys-tops/?search-field=&sort=newest&start=180&rows=90&filtered=true')

    # Women
    # get_photos_from_url('women', 'top', 'https://www.hollisterco.com/shop/wd/girls-tops/?search-field=&sort=newest&start=0&rows=90&filtered=true')
    get_photos_from_url(
        'women', 'top-1',
        'https://www.hollisterco.com/shop/wd/girls-tops/?search-field=&sort=newest&start=90&rows=90&filtered=true'
    )
def main():
    crawler = Crawler()

    def get_photos_from_url(gender, type, url):
        crawler.open(url)
        images = crawler.find_elements('span.back > img')
        srcs = img2src(images)
        print '{} images got'.format(len(images))
        download_images(srcs, os.path.join('lebeige', gender, type))

    # Women
    get_photos_from_url(
        'women', 'shirts',
        'http://www.ssfshop.com/LEBEIGE/ssfshop/list?dspCtgryNo=SFMA41A02&brandShopNo=BDMA07A06&brndShopId=ECBVF&etcCtgryNo=&ctgrySectCd=&keyword=&leftBrandNM=LEBEIGE_ECBVF'
    )
    get_photos_from_url(
        'women', 'tshirts',
        'http://www.ssfshop.com/LEBEIGE/T-shirts/list?dspCtgryNo=SFMA41A01&brandShopNo=BDMA07A06&brndShopId=ECBVF&etcCtgryNo=&ctgrySectCd=&keyword=&leftBrandNM=LEBEIGE_ECBVF'
    )
    get_photos_from_url(
        'women', 'knitwear',
        'http://www.ssfshop.com/LEBEIGE/Knitwear/list?dspCtgryNo=SFMA41A03&brandShopNo=BDMA07A06&brndShopId=ECBVF&etcCtgryNo=&ctgrySectCd=&keyword=&leftBrandNM=LEBEIGE_ECBVF'
    )
def main():
    crawler = Crawler()

    def get_photos_from_url(gender, type, url):
        crawler.open(url)
        images = crawler.find_elements(
            'a.product-tiles-grid-item-link '
            '> div.product-tiles-grid-item-image-wrapper '
            '> div.product-tiles-grid-item-image '
            '> img')
        srcs = img2src(images)
        print '{} images got'.format(len(images))
        download_images(srcs, os.path.join('gucci', gender, type))

    # Men
    # get_photos_from_url('men', 'shirts', 'https://www.gucci.com/kr/ko/ca/men/mens-ready-to-wear/mens-shirts-c-men-readytowear-shirts')

    # Women
    # get_photos_from_url('women', 'shirts', 'https://www.gucci.com/kr/ko/ca/women/womens-ready-to-wear/womens-tops-shirts-c-women-readytowear-tops-and-shirts')
    get_photos_from_url(
        'women', 'sweatshirts',
        'https://www.gucci.com/kr/ko/ca/women/womens-ready-to-wear/womens-sweatshirts-t-shirts-c-women-ready-to-wear-new-sweatshirts'
    )
def main(token_id, screen_name_path, wpath):
    start_time = datetime.now()
    print 'Started at:', start_time

    crawler = Crawler(token_id=token_id)
    screen_names = get_screen_names(screen_name_path)
    count = 0
    with open(wpath, 'a') as wf:
        for sn in screen_names:
            try:
                user = crawler.get_user(sn)
                wf.write(json.dumps(user) + '\n')
                count += 1
                print count, 'users got.'
            # except tweepy.RateLimitError:
            #     print 'Exceeds rate limit, waiting...'
            #     time.sleep(120)
            except KeyboardInterrupt:
                end_time = datetime.now()
                print 'Ended at:', end_time
                print 'Duration:', (end_time - start_time).total_seconds() / 3600., 'hours'
                exit()
            except Exception, ex:
                print ex
from multiprocessing import Pool
from pathlib import Path

import click
import pandas as pd
from tqdm import tqdm

from crawler import Crawler

crawler = Crawler(image_dir='./images',
                  data_dir='./data',
                  list_dir='./lists',
                  page_dir='./pages')


def crawl_page(thread, test):
    urls_pending, saved_pages = crawler.check_pending_pages(test)
    print("CRAWLING PAGE")
    print("=" * 60)
    print(f"downloaded {len(saved_pages)} pages")
    print(f"downloading remaining {len(urls_pending)} pages")
    print("=" * 60)

    # crawling
    if thread > 1:
        with Pool(thread) as p:
            _ = list(
                tqdm(p.imap_unordered(crawler.crawl_page, urls_pending),
                     total=len(urls_pending)))
    else:
        for url in tqdm(urls_pending):
            _ = crawler.crawl_page(url)
def test_issue(self):
    crawler = Crawler(self.example_issue)
    self.assertTrue(crawler.items)
    for item in crawler.items:
        self.assertEqual(type(item), Issue)
def __init__(self):
    self.redis = redisclient()
    # The other methods depend on these, so initialize them here as well.
    self.crawler = Crawler()
def test_repo(self):
    crawler = Crawler(self.example_repo)
    self.assertTrue(crawler.items)
    for item in crawler.items:
        self.assertEqual(type(item), Repository)
def test_wiki(self):
    crawler = Crawler(self.example_wiki)
    self.assertTrue(crawler.items)
    for item in crawler.items:
        self.assertEqual(type(item), Wiki)
def add_to_queue(i, q):
    # Worker: pull URLs off the queue and warm them.
    while True:
        url = q.get()
        crawler.warm_url(url)
        q.task_done()


# create threads
threads = int(os.environ.get('THREADS', 5))
threads = threads if (threads > 0 and threads < 10) else 5
for i in range(threads):
    worker = threading.Thread(target=add_to_queue, args=(i, QUEUE))
    worker.daemon = True
    worker.start()

# start crawling
started = time.time()
crawler = Crawler()
if args.sitemap is not None:
    offset = args.offset if (args.offset is not None) else 0
    crawler.sitemap_crawler(args.sitemap, args.count, offset)
else:
    crawler.google_crawler('ga:%i' % args.id, args.count)

# multithreaded cache warmer
delay = float(os.environ.get('DELAY', 500))
for url in crawler.urls:
    QUEUE.put(url)
    time.sleep(delay / 1000.0)

# finish the queue/threads
try:
    term = threading.Thread(target=QUEUE.join)
import threading
from queue import Queue

from util import getDomainName
from util import initFolder
from crawler import Crawler

NUM_SPIDERS = 12
DEPTH = 7
HOMEPAGE = 'https://vnexpress.net/'
DOMAIN_NAME = getDomainName(HOMEPAGE)

initFolder(".")
Crawler(DOMAIN_NAME, HOMEPAGE, DEPTH)

q = Queue()


def work():
    while True:
        url = q.get()
        Crawler.crawlPage(threading.currentThread().name, url)
        q.task_done()


# Create spider threads (will be terminated when main exits)
def createCrawlers():
    for spider in range(NUM_SPIDERS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Each queued link is a new job
def __init__(self):
    self.redis = RedisClient()
    self.crawler = Crawler()
def __init__(self, url, targetPatterns):
    self.url = url
    self.targetPatterns = targetPatterns
    self.crawler = Crawler(self.url)
    self.results = []
import asyncio

from crawler import Crawler
from settings import RPS, START_URL

if __name__ == '__main__':
    # with pool ~14s (best score was 11.5)
    # without pool ~50s
    # sync 78s
    asyncio.run(Crawler(start_url=START_URL, rps=RPS).main())