def crawling(agent, proxy, redirect, timeout, url, cookie):
    return crawler.Crawler(agent=agent, proxy=proxy, redirect=redirect,
                           timeout=timeout, url=url, cookie=cookie).process()
def main():
    site_crawler = crawler.Crawler(DOMAIN)
    collection = site_crawler.crawl()
    print(collection.get_len())
    data_saver = saver.DatabaseWorker()
    for url, content in collection.pages_content():
        nodes = parser.get_elements(content, REGULARS)
        data_saver.save_item(url, nodes)
        save_to_db(url, nodes)
def index(request):
    try:
        indexed_url = request.GET['request']
        if indexed_url:
            c = crawler.Crawler(indexed_url, 0)  # Add setting to change depth
            c.crawl()
    except Exception:
        # Ignore a missing 'request' parameter or crawl failures and render the page anyway.
        pass
    return render(request, 'index.html', {})
def index(request):
    t = None
    indexed_url = request.GET.get('request', None)
    if indexed_url:
        ts = time.time()
        with open('search/static/settings/settings.json', 'r') as f:
            depth = int(json.load(f).get("depth"))
        c = crawler.Crawler(indexed_url, depth)
        c.crawl()
        t = time.time() - ts
    return render(request, 'index.html', {'time': t})
from crawler import crawler
from crawler.naver_news_parser import NaverNewsParser
from crawler.naver_blog_parser import NaverBlogParser
from crawler.naver_cafe_parser import NaverCafeParser
from crawler.naver_realtime_parser import NaverRealtimeParser
from crawler.aagag_mirror_parser import AagagMirrorParser

# One Crawler per source, each paired with its dedicated parser.
naver_news_crawler = crawler.Crawler('네이버뉴스', NaverNewsParser(), '네이버', '뉴스')  # Naver News
naver_blog_crawler = crawler.Crawler('네이버블로그', NaverBlogParser(), '네이버', '블로그')  # Naver Blog
naver_cafe_crawler = crawler.Crawler('네이버카페', NaverCafeParser(), '네이버', '카페')  # Naver Cafe
naver_realtime_crawler = crawler.Crawler('네이버실시간검색', NaverRealtimeParser(), '네이버', '실시간검색')  # Naver realtime search
aagag_mirror_parser = crawler.Crawler('커뮤니티AAGAG', AagagMirrorParser(), '커뮤니티', 'AAGAG')  # AAGAG community mirror
def crawler(self, a, p, r, t, u, c):
    # Thin wrapper: a=agent, p=proxy, r=redirect, t=timeout, u=url, c=cookie.
    return crawler.Crawler(agent=a, proxy=p, redirect=r,
                           timeout=t, url=u, cookie=c).process()
from utils import cons
from crawler import crawler
import threading
from datetime import datetime

cr = crawler.Crawler()


def main_crawler(url):
    cr.get_restaurant_content(url)


def main():
    now = datetime.now()  # start timing
    print(now)
    thread = []
    url_list = []
    dic_cat = cr.get_all_cat_url_from_db(cons.CITIES['shenzhen'])
    for item in dic_cat:
        url = cons.DIAN_PING_URL + str(item['url'])
        main_crawler(url)
        # next = False
        # for url_str in url_list:
        #     if url_str == url:
        #         next = True
        #         break
        # if next == False:
        #     print('Now to get -------- ' + url)
        #     t = threading.Thread(target=main_crawler,
        #                          args=(url,))
        #     thread.append(t)
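# A minimal sketch (not part of the original) of how the commented-out threading path
# above might be re-enabled: skip URLs already seen, start one worker thread per new URL,
# and join them before returning. It assumes cr.get_restaurant_content is safe to call
# concurrently; the function name main_threaded is illustrative only.
def main_threaded():
    threads = []
    seen_urls = set()
    dic_cat = cr.get_all_cat_url_from_db(cons.CITIES['shenzhen'])
    for item in dic_cat:
        url = cons.DIAN_PING_URL + str(item['url'])
        if url in seen_urls:
            continue
        seen_urls.add(url)
        print('Now to get -------- ' + url)
        t = threading.Thread(target=main_crawler, args=(url,))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()  # wait for all crawler threads to finish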
from datetime import datetime
from socketIO_client_nexus import SocketIO, LoggingNamespace
import crawler.crawler as crawler
import database.database as database


def onEvent(event, *args):
    # Forward crawler events to the Node side over Socket.IO.
    if event == 'finish':
        socketIO.emit('finish', args)
    elif event == 'progress':
        socketIO.emit('progress', args)


def onNodeCommand(_parameters):
    # Dispatch commands received from the Node side.
    print(_parameters)
    if _parameters['type'] == "database":
        m_crawler.output()
    elif _parameters['type'] == "start_crawler":
        m_crawler.Start(_parameters['args'])
    elif _parameters['type'] == "cancel_crawler":
        m_crawler.Stop()


m_crawler = crawler.Crawler()
m_crawler.setEventDelegate(onEvent)
m_crawler.setDatabase(database.Database())

socketIO = SocketIO('127.0.0.1', 3000, LoggingNamespace)
socketIO.on('command', onNodeCommand)
socketIO.emit("whoamI", "crawler")
socketIO.wait()
def __init__(self):
    self.redis = config.redis_server
    self.crawler = crawler.Crawler()
    self.database = database.Database()
    self.logger = logger.create_logger('worker')