def do(self):
    crawler = Crawler(self.url)
    crawler.run()
    links = crawler.get_all_links()
    git_urls = {}

    # Try to get the git clone URL from the web viewer
    git_clone_domain = re.match('.*?(.*?)gitweb.*?', self.url).group(1)
    git_clone_url = '{0}{1}/'.format(git_clone_domain, 'git')

    for link in links:
        # URLs have the format http://git.lxde.org/gitweb/?p=lxde/lxqt-config-randr.git;a=tree
        match = re.match('.*?p=(.*?);.*?', link)
        if match:
            name = match.group(1)
            git_urls[name] = git_clone_url + name

    # The Gerrit project has a single fileset assigned (this one).
    # We empty the fileset list and dynamically add the ones referenced by Gerrit.
    self.project.filesets = []

    for project in git_urls.keys():
        fileset_name = project
        url = git_urls[project]
        fileset = GitFileSet(self.project_name, fileset_name, url, '')
        fileset.set_pattern('.*?ca.po')
        logging.debug("Gitweb adding {0}-{1}".format(self.project_name, fileset_name))
        self.project.add(fileset)

    self.project.do()
def main(self):
    try:
        with open('linkdb.pickle'):
            return pickle.load(open('linkdb.pickle'))
    except IOError:
        sys.setrecursionlimit(10000)
        cw = Crawler()
        data = cw.main()
        soup = BeautifulSoup(data)
        dtable = soup.findAll('table')[1]
        drows = dtable.findAll('tr')
        j = 1
        pdata = []
        while j < len(drows):
            drele = dtable.findAll('td')
            k = 1
            while k < len(drele):
                flag = 0
                pdict = dict()
                try:
                    pdict['name'] = drele[k].find('a').contents[0]
                    pdict['link'] = 'http://en.wikipedia.org' + drele[k].find('a')['href']
                except:
                    flag = 1
                if flag == 1:
                    k += 1
                    continue
                #print pdict
                pdata.append(pdict)
                k += 1
            j += 1
        pickle.dump(pdata, open('linkdb.pickle', 'wb'))
        return pdata
def baseFC(crawlParams):
    seedURLs = crawlParams['seedURLs']
    t = [(-1, p, -1, "") for p in seedURLs]
    priorityQueue = PriorityQueue(t)
    crawlParams["priorityQueue"] = priorityQueue

    mytfidf = TFIDF()
    mytfidf.buildModel(crawlParams['model'], crawlParams['No_Keywords'])
    #mytfidf.buildModel(crawlParams['seedURLs'],crawlParams['No_Keywords'])
    crawlParams['scorer'] = mytfidf

    #crawler = Crawler(priorityQueue,scorer,options)
    crawler = Crawler(crawlParams)
    crawler.crawl()
    '''
    f = open("base-logData.txt","w")
    furl = open("base-Output-URLs.txt","w")
    for p in crawler.relevantPages:
        f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
        furl.write(p.pageUrl[1].encode("utf-8")+","+str(p.estimatedScore)+"\n")
        ftext = open("base-webpages/"+str(p.pageId) + ".txt", "w")
        ftext.write(p.text.encode("utf-8"))
        ftext.close()
    f.close()
    furl.close()
    bres = evaluator.evaluateFC(crawler.relevantPages)
    writeEvaluation(bres,"base-evaluateData.txt")
    print sum(bres)
    print len(bres)
    '''
    return crawler.relevantPages
class TestCrawling(unittest.TestCase):

    def setUp(self):
        self.crawler = Crawler('crawlerIndex.db')
        #self.crawler.dropIndexTables()
        #self.crawler.createIndexTables()

    def test_crawling(self):
        categories = list()
        #categories.append(Category('Administracion/Oficina', ['http://www.computrabajo.com.ar/bt-ofr-SC000-1.htm']))
        categories.append(Category('Arte/Diseno/Medios', ['http://www.computrabajo.com.ar/bt-ofr-SC001-1.htm']))
        '''
        categories.append(Category('Cientifico/Investigacion', ['http://www.computrabajo.com.ar/bt-ofr-SC002-1.htm']))
        categories.append(Category('Informatica/Telecom', ['http://www.computrabajo.com.ar/bt-ofr-SC003-1.htm']))
        categories.append(Category('Direccion/Gerencia', ['http://www.computrabajo.com.ar/bt-ofr-SC004-1.htm']))
        categories.append(Category('Economia/Contabilidad', ['http://www.computrabajo.com.ar/bt-ofr-SC005-1.htm']))
        categories.append(Category('Educacion/Universidad', ['http://www.computrabajo.com.ar/bt-ofr-SC006-1.htm']))
        categories.append(Category('Hosteleria/Turismo', ['http://www.computrabajo.com.ar/bt-ofr-SC007-1.htm']))
        categories.append(Category('Ingenieria/Tecnico', ['http://www.computrabajo.com.ar/bt-ofr-SC008-1.htm']))
        categories.append(Category('Legal/Asesoria', ['http://www.computrabajo.com.ar/bt-ofr-SC009-1.htm']))
        categories.append(Category('Medicina/Salud', ['http://www.computrabajo.com.ar/bt-ofr-SC010-1.htm']))
        categories.append(Category('Recursos Humanos', ['http://www.computrabajo.com.ar/bt-ofr-SC011-1.htm']))
        categories.append(Category('Otros', ['http://www.computrabajo.com.ar/bt-ofr-SC012-1.htm']))
        '''
        pagelist = set()
        for category in categories:
            print category
            for url in category.getUrls():
                pagelist.add(Page(url, '/bt-ofr-', '/bt-ofrd-', category))
        self.crawler.crawl(pagelist, 5)
def run(self):
    self.logger.info("the spider has been running!")
    # create a global thread num
    for num in range(len(self.spiders)):
        self.queue.put(num)
    try:
        for spider in self.spiders:
            crawler = Crawler(spider, self.queue)
            crawler.start()
        self.queue.join()
    except:
        self.logger.error("spider cannot run.")
    finally:
        seed_num = self.database.db['seed'].count()
        textfile = PROJECT_ROOT + '/log/spider.log'
        self.logger.info("now your seeds num is %s." % seed_num)
        try:
            fp = open(textfile, 'rb')
            content = util.tail(fp)
            fp.close()
            sub = 'bt-share-log-%s' % datetime.now()
            send_mail(['*****@*****.**', ], sub, content)
        except:
            self.logger.error(traceback.format_exc())
def eventFC(scorer, url_scorer, options):
    # seedUrls = ["http://www.cnn.com/2013/09/27/world/africa/kenya-mall-attack/index.html",
    #             "http://www.youtube.com/watch?v=oU9Oop892BQ",
    #             "http://ifrc.org/en/news-and-media/press-releases/africa/kenya/kenya-red-cross-society-continues-to-provide-vital-support-to-victims-and-families-of-the-westgate-shopping-mall-attack/"
    #             ]
    #keywords = ['demonstrations','protest','elections','egypt','revolution','uprising','arab','spring','tunisia','libya','military']
    t = [(-1, p, -1) for p in options['seeds']]
    #t = [(-1,Url(p)) for p in seedUrls]
    priorityQueue = PriorityQueue(t)
    crawler = Crawler(priorityQueue, scorer, options)
    crawler.set_url_scorer(url_scorer)
    crawler.enhanced_crawl()
    print crawler.relevantPagesCount
    print crawler.pagesCount

    f = open("harverstRatioData.txt", "w")
    for r, p in crawler.harvestRatioData:
        f.write(str(r) + "," + str(p) + "\n")
    f.close()

    f = open("logData.txt", "w")
    furl = open("Output-URLs.txt", "w")
    for p in crawler.relevantPages:
        f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
        furl.write(p.pageUrl[1] + "\n")
    f.close()
    furl.close()
def post(self):
    seedURL = cgi.escape(self.request.get('seed'))
    depth = cgi.escape(self.request.get('depth'))
    keyword = cgi.escape(self.request.get('keyword'))

    if seedURL == None or seedURL.strip() == "":
        self.response.out.write("Fill in the seed URL, fool!")
        return
    if depth == None or depth.strip() == "":
        self.response.out.write("Fill in the search depth, fool!")
        return
    if keyword == None or keyword.strip() == "":
        self.response.out.write("Fill in the search keyword, fool!")
        return
    try:
        depth = int(depth)
    except:
        self.response.out.write("Depth should be a number, fool!")
        return

    crawler = Crawler(seedURL, depth, keyword)
    urls = crawler.crawl()

    self.response.out.write('<table class="results">')
    self.response.out.write('<tr><th>URL</th><th>Level</th></tr>')
    for (url, level) in urls:
        self.response.out.write('<tr>')
        self.response.out.write('<td><a class="link" href="%s">%s</a></td>' % (url, url))
        self.response.out.write('<td><span class="level">%s</span></td>' % level)
        self.response.out.write('</tr>')
    self.response.out.write('</table>')
def do(self):
    crawler = Crawler(self.url)
    crawler.run()
    links = crawler.get_all_links()
    self._download_links(links, self.temp_dir)
    self.build()
def crawl(c=None, seed=[]):
    if c == None:
        c = Crawler(
            seed=seed,  # your seed urls here
            default_crawl_delay=20,
            obey_robots_txt=True,
            document_fetchers=15,
            robots_txt_fetchers=5)  # start at least this many celery workers
    try:
        # start crawling, with this task's specific termination criteria and
        # a save period of 20 seconds
        c.crawl(
            termination_checker=example_task_termination_checker,
            save_frequency=timedelta(seconds=20))
    finally:
        # if we were killed or finished, suspend crawl state to file.
        # revive the crawl with resume from crawler.py to explore results
        print "\nSuspended crawl to " + c.suspend()
        # print some statistics
        print "Downloaded bytes: " + str(cstats.downloaded_bytes(c))
        print "Discovered links: " + str(cstats.discovered_links(c))
        print "Discovered domains: " + str(cstats.discovered_domains(c))
        print "Runtime: " + str(cstats.runtime(c)) + " seconds"
        maxref = cstats.most_prolific_referer(c)
        # utf-8 printing problem in domain?
        print "Most prolific referrer was " + maxref["name"] + " with an average of " + \
            str(maxref["avg_links_per_page"]) + " outgoing links per page." + "\n"
def get_docs(self):
    crwl = Crawler()
    for page in self.pagelist:
        if page != '#' and page != 'mailto:[email protected]' and page != None:
            if crwl.get_page(page) != True:
                continue
            soup = crwl.return_soup()
            content = soup.find("div", {"class": "article-text"})
            if content != None:
                div = content.find('div', id='articleKeywords')
                if div != None:
                    div.decompose()
                div = content.find('div', id='addshare')
                if div != None:
                    div.decompose()
                div = content.find('div', {'class': 'rel-block-sec'})
                if div != None:
                    div.decompose()
                div = content.find('div', {'class': 'photo-caption'})
                if div != None:
                    div.decompose()
                div = content.find('div', {'class': 'related-column'})
                if div != None:
                    div.decompose()
                x = [s.extract() for s in content('script')]
                text = content.text
                text = re.sub('[\n]+', ' ', text)
                text = re.sub('[ ]+', ' ', text)
                text = text.strip()
                if len(text) <= 10:
                    self.error_pagelist.append(page)
                else:
                    self.final_docs.append(text)
def main():
    # You can provide whatever query you like, e.g. 'Barack Obama', 'isis', 'mongodb'
    # example_query = 'isis'
    # Searching for the 'isis' keyword and getting relevant snippets related to it.
    # search_results_snippets = Quora.get_snippets_by_query(example_query)
    # Saving the list of snippets obtained by the query 'isis' under the 'snippets' collection in the 'quora' db
    # db.snippets.insert({example_query: search_results_snippets})

    connection_str = 'mongodb://localhost:27017/'
    quora_db = 'quora'

    # Creating a crawler object with limited crawling depth
    crawler = Crawler(connection_str, quora_db, maxdepth=2)
    seed = 'What-is-terrorism'

    # Starting crawling
    # crawler.crawl_by_question(seed)

    # The user who originally asked the seed question
    user = '******'

    # Crawling by user
    # crawler.crawl_by_user(user)

    crawler.crawl_questions_and_answers()
class CrawlerTestCase(unittest.TestCase):
    '''Testing the functionality of the crawler.'''

    def setUp(self):
        '''Define a data dir and create the crawler.'''
        self.crawler = Crawler('data')

    def test_download_content(self):
        '''Download content from TEST_URL and check it against known content for said URL.'''
        content = self.crawler.download_content(TEST_URL)
        assert 'projects' in content
        assert 'resume' in content

    def test_crawl(self):
        '''Download the content from a number of URLs and save it to the data directory.'''
        self.crawler.crawl(TECH_URLS, 'technology')
        self.crawler.crawl(COOKING_URLS, 'cooking')
        assert 'technology' in os.listdir(self.crawler.data_dir)
        assert 'cooking' in os.listdir(self.crawler.data_dir)

    def tearDown(self):
        pass
def test_generate_node_urls(self):
    c = Crawler(d)
    c.crawl_nodes_api(page_limit=1)
    try:
        c.generate_node_urls()
    except:
        self.fail("crawler.generate_node_urls() failed")
def reset():
    global call_reset_last
    time_since_last_call = time.time() - call_reset_last
    if time_since_last_call >= call_reset_timeout:
        Crawler.reset()
        call_reset_last = time.time()
        time_since_last_call = 0
    return "%i000" % (call_reset_timeout - time_since_last_call)
def flush():
    global call_flush_last
    time_since_last_call = time.time() - call_flush_last
    if time_since_last_call >= call_flush_timeout:
        Crawler.flush()
        call_flush_last = time.time()
        time_since_last_call = 0
    return "%i000" % (call_flush_timeout - time_since_last_call)
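# Both reset() and flush() above follow the same throttling pattern: the call is only
# forwarded to the Crawler if enough time has passed since the last accepted call, and
# the handler returns the remaining wait, roughly in milliseconds ("%i" seconds plus a
# literal "000"). A minimal sketch of the module-level state they assume -- the names
# mirror the globals referenced above, but the timeout values are assumptions, not
# taken from the original module:
import time

call_reset_timeout = 60    # assumed: seconds required between Crawler.reset() calls
call_flush_timeout = 60    # assumed: seconds required between Crawler.flush() calls
call_reset_last = 0.0      # timestamp of the last accepted reset()
call_flush_last = 0.0      # timestamp of the last accepted flush()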
def test_scrape_url(self):
    c = Crawler(d)
    try:
        c._scrape_pages(['http://google.com', 'http://google.com/'])
        f = open('google.com/index.html')
        f.close()
    except:
        self.fail("page didn't save / get scraped at all")
def test_profile_urls_updated_by_crawl(self):
    c = Crawler(d)
    l1 = c.user_urls.copy()
    c.crawl_users_api(page_limit=1)
    l2 = c.user_urls.copy()
    self.assertEqual(len(l1), 0)
    self.assertGreater(len(l2), len(l1))
    self.assertNotEqual(l1, l2)
def test_institutions_urls_updated_by_crawl(self):
    c = Crawler(d)
    l1 = c.institution_urls.copy()
    c.crawl_institutions_api(page_limit=1)
    l2 = c.institution_urls.copy()
    self.assertEqual(len(l1), 1)
    self.assertGreater(len(l2), len(l1))
    self.assertNotEqual(l1, l2)
def test_registration_urls_updated_by_crawl(self):
    c = Crawler(d)
    l1 = c.registration_url_tuples.copy()
    c.crawl_registrations_api(page_limit=1)
    l2 = c.registration_url_tuples.copy()
    self.assertEqual(len(l1), 0)
    self.assertGreater(len(l2), len(l1))
    self.assertNotEqual(l1, l2)
def test_http_counts_as_internal_link(self):
    self.svc.get('requests')._expect(
        "https://example.com", 200,
        '<a href="http://example.com/insecure">click here</a>')
    self.svc.get('requests')._expect(
        "http://example.com/insecure", 200, '<different><stuff>')
    crawler = Crawler(self.svc, "https://example.com")
    siteMap = crawler.map()
    self.assertEqual(
        {"https://example.com": {"assets": [], "links": ["http://example.com/insecure"]},
         "http://example.com/insecure": {"assets": [], "links": []}},
        siteMap)
def post(self):
    seed = self.request.get('seed')
    maxpages = int(self.request.get('maxpages'))
    maxdepth = int(self.request.get('maxdepth'))
    rest = int(self.request.get('rest'))
    my_crawler = Crawler(seed, maxpages, maxdepth, rest)
    my_crawler.crawl_web()
    my_crawler.compute_ranks()
def test_relative_links_are_captured(self):
    self.svc.get('requests')._expect(
        "http://example.com", 200, '<a href="foobar/">click here</a>')
    self.svc.get('requests')._expect("http://example.com/foobar/", 200, '')
    crawler = Crawler(self.svc, "http://example.com")
    siteMap = crawler.map()
    self.assertEqual(
        {"http://example.com": {"assets": [], "links": ["foobar/"]},
         "http://example.com/foobar/": {"assets": [], "links": []}},
        siteMap)
def test_disallowed_urls_are_not_fetched(self):
    self.svc.get('requests')._expect(
        "http://example.com", 200,
        '<a href="http://example.com/admin">click here</a>')
    self.svc.get('RobotFileParser')._disallowed_urls['http://example.com/admin'] = True
    crawler = Crawler(self.svc, "http://example.com")
    siteMap = crawler.map()
    self.assertEqual(
        {"http://example.com": {"assets": [], "links": ["http://example.com/admin"]},
         "http://example.com/admin": {"error": "Disallowed by robots.txt"}},
        siteMap)
def test_query_params_are_captured(self):
    self.svc.get('requests')._expect(
        "http://example.com", 200, '<a href="/?foo=bar">click here</a>')
    self.svc.get('requests')._expect(
        "http://example.com/?foo=bar", 200, '<different><stuff>')
    crawler = Crawler(self.svc, "http://example.com")
    siteMap = crawler.map()
    self.assertEqual(
        {"http://example.com": {"assets": [], "links": ["/?foo=bar"]},
         "http://example.com/?foo=bar": {"assets": [], "links": []}},
        siteMap)
class GetMookClass:

    def __init__(self):
        self.c = Crawler()
        self.db = DBHelper('localhost', 'root', '', 'test', 3306)
        self.lorder = int(time.time())

    def usage(self):
        print '''
        -h  print this message
        -e  everyday run, just check first page and find new classes
        -a  all refresh, check all the pages and add new classes
        '''

    def run(self):
        if len(sys.argv) == 1:
            self.usage()
        else:
            try:
                opts, args = getopt.getopt(sys.argv[1:], "hea")
                for op, value in opts:
                    if op == "-h":
                        self.usage()
                    elif op == "-e":
                        self.startCrawl()
                    elif op == "-a":
                        self.startCrawl(1)
                    else:
                        self.usage()
            except:
                self.usage()

    def startCrawl(self, all=0):
        self.c.login("http://www.imooc.com/course/list", "http://www.imooc.com/user/login")
        if all:
            index = 1
            while self.crawlSinglePage(index):
                index = index + 1
        else:
            self.crawlSinglePage(1)

    def crawlSinglePage(self, pageId):
        url = 'http://www.imooc.com/course/list?page=%d' % pageId
        classes = self.c.getClasses(url)
        if len(classes) == 0:
            return 0
        else:
            for cls in classes:
                dbcls = self.db.selectClassByMid(cls.mid)
                if not dbcls:
                    cls.lorder = self.lorder
                    cid = self.db.insertClass(cls)
                    self.refreshTitles(cid, cls.mid)
            return 1

    def refreshTitles(self, cid, mid):
        titles = self.c.getTitles(cid, mid)
        pid = 0
        for title in titles:
            if title.mid == 0:
                pid = self.db.insertTitle(title)
            else:
                title.pid = pid
                self.db.insertTitle(title)
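# A minimal sketch of how GetMookClass appears to be driven from the command line;
# this entry point is an illustrative assumption, not part of the original file:
if __name__ == '__main__':
    GetMookClass().run()   # dispatches on -h / -e / -a flags taken from sys.argv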
def do(self):
    self.create_tmp_directory()
    crawler = Crawler(self.url)
    links = crawler.run()
    self._download_links(links, self.temp_dir)
    self.build()
    self.remove_tmp_directory()
def test_main_function():
    url = "http://7gogo.jp/talks/YtykfykuJfMT"
    my_tester = Crawler()
    talk_id, username = get_talk_id(url)
    loop = asyncio.get_event_loop()
    task = asyncio.async(my_tester.run(talk_id, username, 1417268169))
    loop.run_until_complete(task)
def get_events():
    from crawler import Crawler
    crawler = Crawler()
    print "[main] getting news"
    crawler.get_top_news()
    print "[main] getting festivals"
    crawler.get_festivals()
def cache_fill_loop():
    global sources
    while True:
        # fill cache up to min_cache_imgs
        if Crawler.info()["images"] < min_cache_imgs_before_refill:
            while Crawler.info()["images"] < min_cache_imgs:
                random.choice(sources).crawl()
        # sleep for non-invasive threading ;)
        time.sleep(1.337)
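# cache_fill_loop() relies on module-level configuration that sits outside this snippet.
# A sketch of the assumed shape -- the names mirror the globals referenced above, while
# the concrete values and the source objects themselves are assumptions:
import random
import time

min_cache_imgs = 50                 # assumed target cache size
min_cache_imgs_before_refill = 20   # assumed threshold that triggers a refill
sources = []                        # crawl sources; each is expected to expose .crawl()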
def __init__(self):
    crwl = Crawler()
    crwl.get_pagelist()
    self.pagelist = crwl.return_pagelist()
    self.soup = crwl.return_soup()
    self.articledb = ArticleDb('localhost', 27017)
    self.articledb.init_backend('testdb', 'testcol')
    self.final_docs = []
    self.error_pagelist = []
def run(self, ent_number=0): """爬取的主函数 """ return Crawler.run(self, ent_number)
#!/usr/bin/env python
from flask import Flask, jsonify, request

from crawler import Crawler

app = Flask(__name__)
crawler = Crawler()
app.debug = True


@app.route('/')
def index():
    return 'Hello, World!'


@app.route('/search')
def search():
    keyword = request.args.get('w')
    if keyword:
        res = crawler.search(keyword)
    else:
        res = {'message': 'No keyword sent'}
    return jsonify(res)


if __name__ == '__main__':
    app.run(port=80)
from crawler import Crawler

crawler = Crawler()
flipkart_url = "https://www.flipkart.com/search?q=iphone%207&as=on&as-show=on&otracker=start&as-pos=2_q_iph"
amazon_url = "http://www.amazon.in/s/ref=nb_sb_ss_i_4_6?url=search-alias%3Daps&field-keywords=iphone+7&sprefix=iphone%2Caps%2C284&crid=ZNKHKONNIHBA"
crawler.auto_crawl(flipkart_url)
if minID[0][0] is not None:
    minID = minID[0][0]
start = minID
end = start + 5
while True:
    websiteQuery = ("SELECT websiteURL, websiteID FROM website "
                    "WHERE websiteID >= '{0}' AND websiteID < '{1}'").format(start, end)
    websites = d.executeSelectQuery(websiteQuery)
    if websites:
        for website in websites:
            # print(website[1])
            # print(website[0])
            c = Crawler(d)
            c.crawl(website[0])
            del c
    start = start + 5
    end = start + 5
    maxID = d.executeSelectQuery(maxIdQuery)
    if maxID[0][0] is not None:
        maxID = maxID[0][0]
    if start > maxID:
        start = minID
        end = start + 5
if __name__ == '__main__':
    from crawler import Crawler

    crawler = Crawler()
    crawler.to_csv(
        crawler.extract_product_info(
            crawler.filter_urls(
                crawler.crawl("https://www.epocacosmeticos.com.br/"))))
def setup_crawler():
    global crawler
    # create the crawler from loaded constants
    print('setting up crawler ...')
    crawler = Crawler(session, api_id, api_hash, rabbitmq_channel)
"Login do studenckiego maila(wpisz razem z @stud...): ") login_data["password_m"] = input("haslo do mail: ") login_data["notification_address"] = input( "Na jaki mail wysylac powiadomienia?: ") if input("Czy dane sa poprawne[T/N]: ") == "T": accept = True with open(filename, 'wb') as f: pickle.dump(login_data, f, protocol=pickle.HIGHEST_PROTOCOL) return login_data if __name__ == "__main__": login_data = load_configuration() c = Crawler(login_data.get("login_d"), login_data.get("password_d")) local_path = pth.dirname(pth.abspath(__file__)) old_marks_file_name = local_path + "/old_marks.html" tmp_marks = c.getMarksInHtmlTable() # TODO: utf-8 should be default marks = str(tmp_marks.encode('utf-8')) try: f = open(old_marks_file_name, 'r') old_marks = f.read() if old_marks != marks: send(login_data.get("login_m"), login_data.get("notification_address"), login_data.get("password_m"), marks) f.close()
# -*- coding: utf-8 -*-
import json
import time
from crawler import Crawler
from teleBot import TeleBot
from datetime import date

# Modules setting
crawler = Crawler()
teleBot = TeleBot()

# Start!
while True:
    # Record the time of this run
    today, now = date.today(), time.strftime('%H%M%S')
    print("*" * 15, "{} {}시 {}분에 시작~".format(today, now[:2], now[2:4]), "*" * 15)  # "started at HH:MM"

    # Crawl and send Telegram message
    past_data = crawler.load_past_data('crawled_data.json')
    new_data = crawler.crawl_data()
    teleBot.send_message(past_data, new_data)

    # Save new info
    teleBot.update_and_save_data(past_data, new_data, 'crawled_data.json')
    print("Finished")
    time.sleep(600)
def index():
    crawler_ = Crawler(database_url)
    print('iniciado')
    crawler_.indexar()
def read_excel_file(excel_file: str) -> int:
    new_excel_file = "new_" + excel_file
    isbn: str = ""
    isbn_code: str = ""
    description: str = ""
    method = Method.GET
    headers = {
        "Accept-Encoding": "gzip, deflate",
        "User-Agent": "Mozillla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Accept": "*/*",
        "Connection": "Keep-Alive"
    }
    timeout = 10
    encoding: Optional[str] = None
    description_col_num = 28

    config = Config()
    if not config:
        logger.error("can't read configuration")
        sys.exit(-1)
    collection_conf = config.get_collection_configs()
    url_prefix = collection_conf["url_prefix"]
    encoding = collection_conf["encoding"]
    logger.debug("url_prefix=%s" % url_prefix)

    workbook = xlrd.open_workbook(excel_file)
    worksheet1 = workbook.sheet_by_index(0)
    num_rows = worksheet1.nrows
    new_workbook = xlwt.Workbook()
    new_worksheet = new_workbook.add_sheet("Sheet1", cell_overwrite_ok=True)
    crawler = Crawler(method, headers, timeout, encoding)

    for row_num in range(num_rows):
        do_crawl = True
        do_extract = False
        row = worksheet1.row_values(row_num)
        isbn = str(row[0])
        try:
            isbn_code = convert_isbn(isbn)
        except ValueError as e:
            do_crawl = False
        logger.debug("isbn=%s" % isbn_code)

        if do_crawl:
            url = url_prefix + isbn_code
            logger.debug("url=%s" % url)
            html = crawler.run(url)
            #logger.debug("html=%s" % html)

            # ISBN -> bid
            state = 0
            for line in html.split('\n'):
                if state == 0:
                    m = re.search(r'<ul class="basic" id="searchBiblioList"', line)
                    if m:
                        state = 1
                elif state == 1:
                    m = re.search(r'<a href="(?P<url>http://book.naver.com/[^"]+)"', line)
                    if m:
                        url = m.group("url")
                        logger.debug(url)
                        html = crawler.run(url)
                        do_extract = True
                        if not html:
                            logger.warning("can't get response from '%s'" % url)
                            sys.exit(-1)
                        break

        if do_extract:
            row[description_col_num] = extract_element(html)
            logger.debug("len=%d" % len(row[description_col_num]))
            #logger.debug("row[description_col_num]=%s" % row[description_col_num])
            with open("test.%d.html" % row_num, "w") as outfile:
                outfile.write(row[description_col_num])
                outfile.write("\n")

        for col_num in range(len(row)):
            new_worksheet.write(row_num, col_num, row[col_num])

        # For testing: stop after processing the first record
        #if do_crawl:
            #print(row[description_col_num])
            #break

    new_workbook.save(new_excel_file)
    return 0
import multiprocessing as mp

from crawler import Crawler

if __name__ == "__main__":
    Crawler().crawl()
def run(self, ent_number=0):
    Crawler.run(self, ent_number)
help="print verbose output") parser.add_argument( "--output", action="store", default="sitemap.xml", help="File path for output, if file exists it will be overwritten", ) # parsing parameters args = parser.parse_args() url = args.url.rstrip("/") found_links = [] # initializeing crawler crawler = Crawler(url, exclude=args.exclude, no_verbose=args.no_verbose) # fetch links links = crawler.start() # write into file with open(args.output, "w") as file: file.write( '<?xml version="1.0" encoding="UTF-8"?>\n\t<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' ) priority = 1.0 date = datetime.now().strftime("%Y-%m-%d") for link in links: #if "infinitysports.ai" not in link: file.write( "\n\t\t<url>\n\t\t\t<loc>{0}{1}</loc>\n\t\t\t<lastmod>{2}</lastmod>\n\t\t\t<priority>{3}</priority>\n\t\t</url>"
class WebDeface(object):
    """Class for WebDeface."""

    def __init__(self, args):
        """Initialize WebDeface."""
        if int(platform.sys.version_info[0]) < 3:  # if Python 2.X.X
            self.url = raw_input(">> Enter the URL of the website: ")
            self.thread = int(raw_input(">> Enter the number of threads: "))
        else:
            self.url = input(">> Enter the URL of the website: ")
            self.thread = int(input(">> Enter the number of threads: "))

        if (self.url is not None and deface_utils.verify_url(self.url)):
            # Create crawler object
            self.crawler_obj = Crawler(url=self.url, threads=self.thread)
            self.crawler_obj.threading_crawl()

            # Create a cache object
            self.cache_obj = Cache()
            self.cache_obj.generate_cache()

        # Arguments
        self.args = args

        # Initialize empty objects
        self.twitter_obj = None
        self.slack_obj = None
        self.telegram_obj = None
        self.twilio_sms_obj = None

    def create_notifier_objs(self):
        """
        Create notification medium objects.

        Args:
            None

        Raises:
            None

        Returns:
            None
        """
        # Parse all the arguments
        if (self.args.twitter_api_key and self.args.twitter_access_token and
                self.args.twitter_api_secret_key and self.args.twitter_access_token_secret):
            cred = {}
            cred["api_key"] = self.args.twitter_api_key
            cred["access_token"] = self.args.twitter_access_token
            cred["api_secret_key"] = self.args.twitter_api_secret_key
            cred["access_token_secret"] = self.args.twitter_access_token_secret
            self.twitter_obj = twitter.Twitter(cred)

        if (self.args.twilio_to and self.args.twilio_from and
                self.args.twilio_token and self.args.twilio_sid):
            cred = {}
            cred["twilio_to"] = self.args.twilio_to
            cred["twilio_sid"] = self.args.twilio_sid
            cred["twilio_token"] = self.args.twilio_token
            cred["twilio_from"] = self.args.twilio_from
            self.twilio_sms_obj = twilio_sms.Twilio(cred)

        if (self.args.slack_token and self.args.slack_user_id):
            cred = {}
            cred["token"] = self.args.slack_token
            cred["user_id"] = self.args.slack_user_id
            self.slack_obj = slack.Slack(cred)

        if (self.args.telegram_user_id and self.args.telegram_bot_token):
            cred = {}
            cred["user_id"] = self.args.telegram_user_id
            cred["token"] = self.args.telegram_bot_token
            self.telegram_obj = telegram.Telegram(cred)

    def start(self):
        """
        Start Web Deface Detection.

        Args:
            None

        Returns:
            None

        Raises:
            None
        """
        print("[!] Remote Web Deface Detection started")
        # Create a monitor object
        self.monitor_obj = Monitor(twitter=self.twitter_obj,
                                   slack=self.slack_obj,
                                   twilio_sms=self.twilio_sms_obj,
                                   telegram=self.telegram_obj)
        # Start the monitor loop
        self.monitor_obj.monitor()
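# A minimal sketch of how WebDeface appears to be driven: collect the notifier
# credentials into an argparse namespace, build the notifier objects, then start
# monitoring. The argument names mirror the attributes read in create_notifier_objs();
# the parser itself is an assumption, not the original CLI definition.
import argparse


def run_web_deface():
    parser = argparse.ArgumentParser(description="Remote web deface detection")
    for opt in ("twitter_api_key", "twitter_access_token", "twitter_api_secret_key",
                "twitter_access_token_secret", "twilio_to", "twilio_from",
                "twilio_token", "twilio_sid", "slack_token", "slack_user_id",
                "telegram_user_id", "telegram_bot_token"):
        parser.add_argument("--" + opt.replace("_", "-"), dest=opt, default=None)
    args = parser.parse_args()

    deface = WebDeface(args)        # prompts for URL and thread count, crawls, builds the cache
    deface.create_notifier_objs()   # only notifiers with complete credentials are created
    deface.start()                  # enters the monitor loop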
def __init__(self):
    self.redis = RedisClient()
    self.crawler = Crawler()
parser.add_argument(
    '--body',
    help='The application/x-www-form-urlencoded body content to send (\'param1=value1&param2=value2\')')
parser.add_argument(
    '--method',
    help='request method \'GET\' or \'POST\' (defaults to "GET")',
    default='GET')
parser.add_argument(
    '--threads',
    help='Number of threads to use while crawling the site',
    default=5)
parser.add_argument(
    '--db',
    help='sqlite3 database name (defaults to \'xss.db\')',
    default="xss.db")
args = parser.parse_args()

# Initialize thread pool
thread_pool = ThreadPoolExecutor(max_workers=int(args.threads))

# Start crawling and scanning the other found URLs
crawler = Crawler(
    URLRequest(url=args.url, method=args.method, body=args.body),
    search_for_xss, thread_pool)
crawler.start()

for r in crawler.func_result:
    store_xss_result(r, args.db)
def main():
    logger.info('Starting...')
    c = Crawler()
    c.start()
    logger.info('Finished.')
def produtos(index=True):
    crawler_ = Crawler(database_url, index=index)
    crawler_.search_produtos()
from crawler import Crawler
from indexer import Indexer
from query_processor import QuerryProcessor
from document import Document
from time import sleep

if __name__ == '__main__':
    # sleep(5.0)
    # print("THREAD-TIME!")
    crawler = Crawler('https://www.in.gr', 20, 5, True, 'BFS')
    crawler.initializeCrawl()

    ind = Indexer(Crawler.documents)
    query = input("Enter your search query:")
    ind.add_document(Document('search_query', query))

    print('Building Indexer...')
    ind.create_indexer()

    print('Calculating TF-IDFs. May take a while.')
    ind.calculate_scores()

    qp = QuerryProcessor(ind.inverted_index, len(ind.documents))
    docs_with_cos_ = qp.compare_documents()
    # sort based on cosine similarity scores
    docs_with_cos_ = sorted(docs_with_cos_, key=lambda x: x[1], reverse=True)

    print(f'Showing top results based on your query "{query}":')
    for doc in docs_with_cos_:
        print(doc[0].link)
'''
Created on 2014. 8. 27.

@author: lsh
'''
import logging

# meerkat modules
from crawler import Crawler

if __name__ == '__main__':
    crawler = Crawler()
    print 'crawler activated'
    logging.info("Server Start..")
    try:
        crawler.collect_document()
    except:
        logging.exception('')
def main():
    proxyips = Crawler.run()
    logger.info('Crawler finish, total ip: %s', len(proxyips))
    sniffer = Sniffer()
    sniffer.run(proxyips)
from website import Website
from crawler import Crawler

crawler = Crawler()

# site_data = [
#     ['O\'Reilly Media', 'http://oreilly.com',
#      'https://ssearch.oreilly.com/?q=', 'article.product-result',
#      'p.title a', True, 'h1', 'section#product-description'],
#     ['Reuters', 'http://reuters.com', ''],
#     ['Brookings', 'http://www.brookings.edu', 'h1', 'div.post-body'],
#     ['New York Times', 'http://nytimes.com', 'h1', 'p.story-content'],
# ]
site_data = [[
    'O\'Reilly Media', 'http://oreilly.com',
    'https://ssearch.oreilly.com/?q=', 'article.product-result',
    'p.title a', True, 'h1', 'section#product-description'
]]

sites = []
for row in site_data:
    sites.append(
        Website(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]))

topics = ['python']
for topic in topics:
def test_get_url_contents_checks_cache(self, mock_get_key):
    mock_get_key.return_value = "abc"
    crawler = Crawler(self.cache, Mock())
    with patch('crawler.requests') as mock_requests:
        crawler._get_url_contents("myurl")
    self.cache.exists.assert_called_once_with('abc')
def setUp(self): self.test_crawler = Crawler("aladinfoods.bg")
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
from crawler import Crawler
import os

url = "https://ucr.fbi.gov/crime-in-the-u.s/2019/crime-in-the-u.s.-2019/topic-pages/tables/table-43"

options = webdriver.ChromeOptions()
options.binary_location = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
driver = webdriver.Chrome(options=options)

spider = Crawler(url, options, driver)
spider.inspect()
tables = spider.read_tables()

i = 0
if os.path.isdir("FBI_Data"):
    pass
else:
    os.mkdir("FBI_Data")

for table in tables:
    i += 1
    table.to_excel(f"FBI_Data/Table{i}.xlsx")
def test_init_sets_user_agent(self):
    crawler = Crawler(self.cache, Mock())
    self.assertEqual(USER_AGENT, crawler._headers['User-agent'])
#!/usr/bin/python
# -*- coding: utf-8 -*-
# filename: run.py
import re

from crawler import Crawler, CrawlerCache

if __name__ == '__main__':
    # Using SQLite as a cache to avoid pulling twice
    crawler = Crawler(CrawlerCache('crawler.db'))
    root_re = re.compile('^/$').match
    crawler.crawl('http://techcrunch.com/', no_cache=root_re)
    #crawler.crawl('http://www.engadget.com/', no_cache=root_re)
    #crawler.crawl('http://gizmodo.com/', no_cache=root_re)
    #crawler.crawl('http://www.zdnet.com/', no_cache=root_re)
    #crawler.crawl('http://www.wired.com/', no_cache=root_re)
from multiprocessing import Lock

from crawler import Crawler, ThreadScheduler
from constants import SITE_URL, MAX_PAGES_TO_CRAWL, MAX_CONCURRENT_REQUESTS, DOWNLOAD_DELAY

if __name__ == "__main__":
    lock = Lock()
    crawler = Crawler(base_url=SITE_URL,
                      max_pages_to_crawl=MAX_PAGES_TO_CRAWL,
                      lock=lock)
    thread_scheduler = ThreadScheduler(
        crawler=crawler,
        max_concurrent_requests=MAX_CONCURRENT_REQUESTS,
        download_delay=DOWNLOAD_DELAY)
    thread_scheduler.run()
    print("Page visited: %s, Bytes downloaded: %s"
          % (crawler.total_page_visited, crawler.bytes_downloaded))
class CSSParser(object):
    """
    This class is a CSS parser of css **font** declarations.

    CSSParser instantiates its own Crawler in order to download external
    cascade style sheet files.

    Supported are:

    DECLARATIONS:
        External css declarations <link> and @import
        Internal css declarations in <style type='text/css'>

    PARSING PRIORITY (where 1 = lowest and 3 = highest priority):
        1. TAG
        2. CLASS
        3. ID

    PARSED SELECTORS:
        tag                  a {}
        class                .myClass {}
        id                   .myNewId {}
        tag.class            a.myClass {}
        grouped selectors:   a, b, .myClass {}

    Not supported:
        Specificity (parsing is only on the basis of the priority shown above)
        Contextual selectors (like .myclass span a)
        Pseudo-classes and pseudo-elements (a:hover)
        Directive #style>body (what's the name of this??)
        Inline css declarations - THIS IS IN TODO!
    """

    def __init__(self):
        self._crawler = Crawler()
        self._crawler.set_handler(FileDownloader)
        self.cleaner = _MyCSSCleaner()
        self._last_url = None
        self._url = None
        self._rules = []
        self.cssfiles = []
        # init tokenizer (scanner)
        self.tokenizer = CSSTokenizer()
        # css style parser converts css rules to css styles
        self.cssstyleparser = _CSSStyleParser()
        # element -> font style mapper maps lxml elements to CSSStyle instances
        self._elem2style_map = Element2CSSStyleMapper()

    def _identical_domain(self, url1, url2):
        if url1 == None or url2 == None:
            return False
        p1 = urlparse(url1)
        p2 = urlparse(url2)
        return p1.netloc == p2.netloc

    def _get_onpage_styles(self):
        stylefields = self.elemtree.findall(".//style")
        _css = ''
        for style in stylefields:
            if style.get('type') != None and style.get('type') == 'text/css':
                _css += style.text
        self.tokenizer.parse_source(_css)
        self._rules.extend(self.tokenizer.get_rules())

    def _get_css_files(self):
        # Method returns True if some css files are to be downloaded, False otherwise.
        self.last_cssfiles = self.cssfiles
        if self.ident_last_domain:
            # If the last URL's domain is identical to this url, we probably have
            # the same css files. So check it!
            if self.cssfiles and set(self.cssfiles) == set(self.last_cssfiles):
                return False
        else:
            # delete css file list
            self.cssfiles = []

        # handle css 2.0 imports of extern files
        styles = self.elemtree.findall(".//style")
        for style in styles:
            if style.get('type') != None and style.get('type') == 'text/css':
                if style.text is not None and re.search("@import", style.text, re.I):
                    urlre = re.search('^(http|https|ftp)\://[a-z0-9\-\.]+\.[a-z]{2,3}(:[a-z0-9]*)?/?' + \
                                      '([a-z0-9\-\._\?\,\'/\\\+&%\$#\=~])*$', style.text, re.I)
                    urlre != None and self.cssfiles.append(urlre.group(0))

        # handle usual <link> declarations of extern css files
        links = self.elemtree.findall(".//link")
        for link in links:
            if link.get('type') != None and link.get('type') == 'text/css' \
               and link.get('href') != None:
                link.make_links_absolute(self._url)
                self.cssfiles.append(link.get('href'))
        return len(self.cssfiles) != 0

    def parse(self, elemtree, url):
        """
        Main method for parsing.

        @param elemtree - lxml.etree._ElementTree of the page which is to be parsed
        @param url - URL identifier of the page
        @return CSSStyleContainer object with parsed css declarations.
        """
        # css parsing order
        # 1. Browser default
        # 2. External style sheet
        # 3. Internal style sheet
        # 4. Inline style FIXME not supported yet!
        self.elemtree = elemtree
        self._url = url
        # make all links absolute
        root = self.elemtree.getroot()
        root.make_links_absolute(self._url)

        # If the last URL's domain is identical to this url, we are probably
        # on the same site but a different web page. There is very high probability
        # that we will have identical css files, so there's no need to download
        # and parse them again.
        if not self._identical_domain(self._url, self._last_url):
            self._styles = []
            self.ident_last_domain = False
        else:
            self.ident_last_domain = True

        # get css files if needed
        if self._get_css_files():
            # download css sheets
            files = self._crawler.start(self.cssfiles)
            for f in self.cssfiles:
                try:
                    # and parse them
                    self.tokenizer.parse_source(files[f])
                    self._rules.extend(self.tokenizer.get_rules())
                except TypeError:
                    pass
        self._last_url = self._url

        # parse on-page definitions
        self._get_onpage_styles()

        # create cascade style sheet
        self._sheet = CascadeStyleSheet(self._rules)

        # stylesheet is instance of CSSSelector2CSSStyleMapper
        self._selector2style_map = self.cssstyleparser.get_style_mapper(self._sheet)

        # parse font styles
        for elem in root.iterdescendants():
            style = CSSStyle()
            style.parse_element(elem, self._selector2style_map, self._elem2style_map)
            elem.style = style

    def get_sheet(self):
        return self._sheet
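# A minimal usage sketch for CSSParser, assuming an lxml element tree of the page is
# already available; the helper function below (and its names) is illustrative, only
# CSSParser.parse() and get_sheet() come from the class above:
from lxml import html


def collect_font_styles(page_source, page_url):
    elemtree = html.fromstring(page_source).getroottree()
    parser = CSSParser()
    parser.parse(elemtree, page_url)   # downloads/parses external and <style> css, annotates elements
    return parser.get_sheet()          # CascadeStyleSheet built from the parsed rules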
def __init__(self, user):
    super().__init__()
    self.crawler = Crawler(user)
    f1.write(user + '\n')