def test_crawler(self):
    crawler.crawler(['-b', 'PublicServan', '-i', '1', '2'])
    filename = 'PublicServan-1-2.json'
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # M.1127808641.A.C03.html is empty, so the count drops from 40 to 39 articles
    self.assertEqual(len(data['articles']), 39)
    os.remove(filename)
def crawler_task(*args, **kwargs):
    # Runs the word search asynchronously
    url = kwargs.get('url')
    max_visited = kwargs.get('max_visited')
    session_number = kwargs.get('session_number')
    website_id = kwargs.get('website_id')
    logger.debug('Using {} {} for crawler_task'.format(url, max_visited))
    urls = [url]
    visited = [url]
    crawler(url=url, urls=urls, visited=visited, max_visited=max_visited,
            session_number=session_number, website_id=website_id)
def test_resolvedIndex2(self):
    with open('test_urls.txt', 'w') as f:
        f.write("http://individual.utoronto.ca/peixizhao/")
    self._crawler = crawler(None, 'test_urls.txt', 'test.db')
    self._crawler.crawl(depth=1)
    resolved_inverted_index_dict = self._crawler.get_resolved_inverted_index()
    self.assertEqual(resolved_inverted_index_dict['page'],
                     set(['http://individual.utoronto.ca/peixizhao/',
                          u'http://individual.utoronto.ca/peixizhao/branch1.html']))
def job():
    logging.info("start cron")
    res = crawler()
    try:
        with open("/app/application/grade.json") as f:
            last = json.load(f)
    except IOError:
        with open("/app/application/grade.json", "w") as f:
            json.dump(res, f)
        return
    if last != res:
        with open("/app/application/grade.json", "w") as f:
            json.dump(res, f)
        diff = dict(set(res.items()) ^ set(last.items()))
        t = ""
        for k, v in diff.iteritems():
            k = k.encode("utf-8")
            v = v.encode("utf-8")
            t += k + ":" + v + "\n"
        if notification == "mail":
            sendmail(t)
        elif notification == "twitter":
            sendtwit(t)
def copymagnet2clipboard(self, event):  # wxGlade: MainFrame.<event_handler>
    page = self.SearchNotebook.GetCurrentPage()
    if page:
        selected = page.torrList.GetSelectedObject()
        if selected and selected.magneturl:
            plugin_file = open('plugins/' + selected.plugin + '.json', 'r')
            plugin = json.loads(plugin_file.read())
            plugin_file.close()
            if 'crawler' in plugin['magnet_url_filter']:
                config = self.ReadConfig()
                try:
                    magneturl = crawler(selected.magneturl,
                                        plugin['magnet_url_filter']['crawler'],
                                        plugin['headers'], config['timeout'])
                except StandardError, msg:
                    self.ReportError("Can't establish a connection. Reason:" + str(msg))
                    return
            else:
                magneturl = selected.magneturl
            dataObj = wx.TextDataObject()
            dataObj.SetText(magneturl)
            if wx.TheClipboard.Open():
                wx.TheClipboard.SetData(dataObj)
                wx.TheClipboard.Close()
                self.ReportInfo("Torrent's Magnet Url has been copied to the clipboard.")
            else:
                self.ReportError("Unable to open the clipboard")
        else:
            self.ReportInfo("Plugin didn't fetch any magnet URL")
def test_crawler(self):
    """
    Tests the crawler by comparing its results to manually verified results.
    """
    global test_case_result
    # Run the crawler and store the results.
    bot = crawler(None, "test.txt")
    bot.crawl(depth=1)
    inverted_index = bot.get_inverted_index()
    resolved_inverted_index = bot.get_resolved_inverted_index()
    # Check that the result contains the correct number of words.
    self.assertTrue(len(resolved_inverted_index) == len(test_case_result),
                    "incorrect number of words found.")
    for key in resolved_inverted_index:
        # Check that each word is in the precomputed results.
        self.assertTrue(key in test_case_result,
                        "unexpected word: {key}.".format(key=key))
        # Check that each word maps to the correct number of urls.
        self.assertTrue(
            len(resolved_inverted_index[key]) == len(test_case_result[key]),
            "incorrect number of urls for word: {key}.".format(key=key))
        for url in resolved_inverted_index[key]:
            # Check that each url is correct.
            self.assertTrue(url in test_case_result[key],
                            "unexpected url: <{url}>.".format(url=url))
def setUp(self):
    mock_doc_index = {
        1: (1, 2, 3),
        2: (2, 3, 4, 5),
        3: (3, 4, 5, 1)
    }
    mock_word_cache = {
        'hello': 1,
        'world': 2,
        'jelly': 3,
        'beans': 4,
        'green': 5
    }
    mock_doc_cache = {
        'http://example.com': 1,
        'http://example.com/123': 2,
        'http://someotherexample.com': 3
    }
    self.bot = crawler(None, '')
    self.bot._doc_id_cache = mock_doc_cache
    self.bot._word_id_cache = mock_word_cache
    self.bot._doc_index = mock_doc_index
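# --- Added sketch (hedged): the setUp above mocks _doc_index, _word_id_cache and
# _doc_id_cache; a resolved inverted index maps each word to the set of URLs it
# appears on. This standalone helper only illustrates that mapping and is not the
# project's actual crawler.get_resolved_inverted_index() implementation.
def resolve_inverted_index(doc_index, word_cache, doc_cache):
    # Invert the id caches so ids can be translated back to words and URLs.
    id_to_word = {wid: word for word, wid in word_cache.items()}
    id_to_url = {did: url for url, did in doc_cache.items()}
    resolved = {}
    for doc_id, word_ids in doc_index.items():
        for wid in word_ids:
            resolved.setdefault(id_to_word[wid], set()).add(id_to_url[doc_id])
    return resolved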
def deploy():
    # run crawler
    print "Please wait while we are recreating the database"
    os.system("rm -f %s" % DB_FILE)
    db_conn = lite.connect(DB_FILE)
    bot = crawler.crawler(db_conn, URLS_TXT_FILE)
    bot.crawl(depth=2)
    print "Crawler Finished"  # change to decorator

    # aws setup
    print "Please wait while we are creating the instance"
    public_ip, instance_id, key_pair_path = aws_setup.setup()
    print "AWS Setup Finished"

    # scp
    print "Please wait while we setup the app in AWS"
    os.system("rm -rf ./bottle-0.12.7/data/")  # delete cache for faster scp
    os.system("scp -r -o StrictHostKeyChecking=no -i %s ../csc326/ ubuntu@%s:~/" % (key_pair_path, public_ip))
    os.system("ssh -o StrictHostKeyChecking=no -i %s ubuntu@%s nohup python csc326/runner.py" % (key_pair_path, public_ip))
    print "App Launched"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Public IP Address: %s" % public_ip
    print "Instance ID: %s" % instance_id
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    return public_ip
def update_crawl_data(battle):
    '''
    Updates the battle record with crawl stats and winners.
    Params:
        battle: battle row object
    '''
    # do the actual crawling and return a dict of data
    res = crawler(battle.hashtag1, battle.hashtag2)
    # sample res = {'num_tweet_winner': 'PURPOSETOUR',
    #               'tag2_num_tweets': '21', 'tag2_num_spell_errors': '68',
    #               'tag1_num_spell_errors': '82', 'tag1_num_tweets': '19',
    #               'num_spell_winner': 'PURPOSETOUR'}

    # update battle status to done if its end date has passed
    crawl_status = 'R'
    tz_info = battle.battle_end.tzinfo
    if battle.battle_end <= datetime.now(tz_info):
        crawl_status = 'D'

    # update the battle record with the crawled data
    Battle.objects.filter(battle_id=battle.battle_id).update(
        crawl_status=crawl_status,
        tag1_num_tweets=res['tag1_num_tweets'],
        tag2_num_tweets=res['tag2_num_tweets'],
        tag1_num_spell_errors=res['tag1_num_spell_errors'],
        tag2_num_spell_errors=res['tag2_num_spell_errors'],
        num_tweet_winner=res['num_tweet_winner'],
        num_spell_winner=res['num_spell_winner'],
    )
    return True
def Toolbar_TransmMagneturl(self, event):  # wxGlade: MainFrame.<event_handler>
    page = self.SearchNotebook.GetCurrentPage()
    if page:
        selected = page.torrList.GetSelectedObject()
        if selected and selected.magneturl:
            config = self.ReadConfig()
            data = config['transmission']
            plugin_file = open('plugins/' + selected.plugin + '.json', 'r')
            plugin = json.loads(plugin_file.read())
            plugin_file.close()
            if 'crawler' in plugin['magnet_url_filter']:
                config = self.ReadConfig()
                try:
                    magneturl = crawler(selected.magneturl,
                                        plugin['magnet_url_filter']['crawler'],
                                        plugin['headers'], config['timeout'])
                except StandardError, msg:
                    self.ReportError("Can't establish a connection. Reason:" + str(msg))
                    return
            else:
                magneturl = selected.magneturl
            if not self.tc:
                try:
                    self.tc = transmissionrpc.Client(address=data['host'], port=data['port'],
                                                     user=data['user'], password=data['pass'])
                except transmissionrpc.TransmissionError, original:
                    msg = "Can't connect to Transmission client:" + str(original)
                    self.ReportError(msg)
                    return -1
            try:
                self.tc.add_uri(magneturl)
                self.ReportInfo(str('Magnet URL successfully sent to Transmission!'))
            except transmissionrpc.TransmissionError, original:
                msg = "Can't add magnet to Transmission:" + str(original)
                self.ReportError(msg)
def test_crawl_depth_0_invertedIndex(self):
    """If the depth is 0 then only the words from the main page should be crawled"""
    with open('test_urls.txt', 'w') as f:
        f.write("http://individual.utoronto.ca/peixizhao/")
    self._crawler = crawler(None, 'test_urls.txt')
    self._crawler.crawl(depth=0)
    self.assertEqual(self._crawler.get_inverted_index(),
                     {1: set([1]), 2: set([1]), 3: set([1]), 4: set([1]),
                      5: set([1]), 6: set([1]), 7: set([1]), 8: set([1]),
                      9: set([1]), 10: set([1])})
def test_invertedIndex2(self):
    """test an individual element in the returned result"""
    with open('test_urls.txt', 'w') as f:
        f.write("http://individual.utoronto.ca/peixizhao/")
    self._crawler = crawler(None, 'test_urls.txt', 'test.db')
    self._crawler.crawl(depth=1)
    inverted_index_dict = self._crawler.get_inverted_index()
    self.assertEqual(inverted_index_dict[3], set([1, 2]))
def main(urls_file, _depth):
    drop_db()
    bot = crawler(None, urls_file)
    bot.crawl(depth=_depth)
    doc_id_index(bot.get_links(), bot.get_inverted_doc_id_cache(),
                 bot.get_url_description())
    word_id_index(bot.get_word_id(), bot.get_inverted_index())
def test_invertedIndex(self):
    with open('test_urls.txt', 'w') as f:
        f.write("http://individual.utoronto.ca/peixizhao/")
    self._crawler = crawler(None, 'test_urls.txt', 'test.db')
    self._crawler.crawl(depth=1)
    self.assertEqual(self._crawler.get_inverted_index(),
                     {1: set([1, 2]), 2: set([1]), 3: set([1, 2]), 4: set([1, 2]),
                      5: set([1, 2]), 6: set([1, 2]), 7: set([1]), 8: set([1]),
                      9: set([1]), 10: set([1]), 11: set([2]), 12: set([2]),
                      13: set([2]), 14: set([2]), 15: set([2]), 16: set([2])})
def test_crawler(self):
    crawl_data = crawler(self.tag1, self.tag2)
    self.assertTrue('tag2_num_tweets' in crawl_data)
    self.assertTrue('tag1_num_tweets' in crawl_data)
    self.assertTrue('tag1_num_spell_errors' in crawl_data)
    self.assertTrue('tag2_num_spell_errors' in crawl_data)
    self.assertTrue('num_spell_winner' in crawl_data)
    self.assertTrue('num_tweet_winner' in crawl_data)
def start_crawling(self):
    # method to start crawling
    seed = self.seed.get("1.0", "end-1c")
    # tkMessageBox.showinfo("Title", seed)
    craw = crawler(seed)
    count = 0
    message = ""
    # crawling algorithm here
    while not craw.is_empty() and count < 5:  # configure count's value according to your choice
        count = count + 1
        url = craw.remove_url()
        if not craw.is_visited(url):
            craw.add_visited(url)
            pattern = re.compile('http://*')
            pattern1 = re.compile('.*pdf')    # ignoring pdf
            pattern2 = re.compile('.*gif')    # ignoring gif
            pattern3 = re.compile('.*jpg')    # ignoring jpg
            pattern4 = re.compile('.*jpeg')   # ignoring jpeg
            pattern5 = re.compile('.*link')   # ignoring forwarding error 503 to some extent
            matcher = pattern.match(url)
            matcher1 = pattern1.match(url)
            matcher2 = pattern2.match(url)
            matcher3 = pattern3.match(url)
            matcher4 = pattern4.match(url)
            matcher5 = pattern5.match(url)
            if matcher and not (matcher1 or matcher2 or matcher3 or matcher4 or matcher5):
                print url + "\nThis is a valid url\nGoing to crawl it NOW!"
                # start crawling here
                request = urllib2.Request(url)
                response = urllib2.urlopen(request)
                # content = response.decode('utf-8')  # could handle exception here
                # getting the content from the current uniform resource locator
                content = response.read()
                # parsing the html content to get all the href links from the current page
                soup = BeautifulSoup(content)  # creating soup of content
                # write code to save this content in some file on the disk
                filename = "/home/bhaskar/Documents/programming/python/pythonGuiTkinter/crawlerProject/crawledFiles/" + soup.title.string + ".txt"
                fileObject = open(filename, "wb")
                fileObject.write(soup.prettify())
                print "URL's found in this Page are :"
                # enqueuing all the href links from the current page/resource
                for link in soup.findAll('a'):
                    craw.add_url(link.get('href'))
                    print link.get('href')
                # terminating for testing
            else:
                # message = message + "\n\n" + url + "\n\nThis is an invalid url\nNot going to crawl it NOW!"
                print url + "\nThis is an invalid url\nNot going to crawl it!"
    print "I have reached the end"
    self.show.config(state="normal")  # enabling the show downloaded files button
def test_Lexicon(self):
    with open('test_urls.txt', 'w') as f:
        f.write("http://individual.utoronto.ca/peixizhao/")
    self._crawler = crawler(None, 'test_urls.txt', 'test.db')
    self._crawler.crawl(depth=1)
    with self._crawler._db_conn:
        c = self._crawler._db_conn.cursor()
        c.execute('SELECT * FROM Lexicon WHERE words=?', ('facebook',))
        result = c.fetchone()[1]
        self.assertEqual(result, 'facebook')
def test_Document(self):
    with open('test_urls.txt', 'w') as f:
        f.write("http://individual.utoronto.ca/peixizhao/")
    self._crawler = crawler(None, 'test_urls.txt', 'test.db')
    self._crawler.crawl(depth=1)
    with self._crawler._db_conn:
        c = self._crawler._db_conn.cursor()
        c.execute('SELECT * FROM Document WHERE doc_url=?',
                  ('http://individual.utoronto.ca/peixizhao/',))
        result = c.fetchone()[1]
        self.assertEqual(result, 'http://individual.utoronto.ca/peixizhao/')
def do_GET(self):
    # gets the target URL from the query parameters
    parse_object = query_parser(parser(self.path).query)['url'][0]
    requests_response = crawler(parse_object)
    if requests_response != 'ERR':
        self.send_response(200)
        self.send_header("Content-type", requests_response[-1])
        self.send_header("Access-Control-Allow-Origin", '*')
        self.end_headers()
        # sends the actual content to be displayed
        self.wfile.write(requests_response[0])
    else:
        self.send_error(400)
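# --- Added usage sketch (hedged): the handler above expects the crawl target in
# the `url` query parameter. A client call might look like the following; the
# host and port are illustrative assumptions, not part of the original snippet.
import requests

resp = requests.get("http://localhost:8000/",
                    params={"url": "https://example.com"})
print(resp.status_code, resp.headers.get("Content-Type"))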
def battle_result(request, id=None):
    '''
    Method for showing the battle result for the selected battle id.
    Params:
        request: <django request object>
        id: <integer> battle id
    '''
    instance = get_object_or_404(Battle, battle_id=id)
    res = crawler(instance.hashtag1, instance.hashtag2)
    messages.success(request, res, extra_tags='html_safe')
    return redirect('battles:list')
def test_resolved_inverted_index_(self):
    # Create a crawler object with an empty text file
    con = lite.connect("dbFile.db")
    c = crawler(None, "urls_test.txt")
    c.crawl()
    # Check the inverted index
    self.assertEqual(c._inverted_index_str[u'languages'],
                     set(['http://www.eecg.toronto.edu/~jzhu/csc326/csc326.html']))
    self.assertEqual(c._inverted_index_str[u'csc326'],
                     set(['http://www.eecg.toronto.edu/~jzhu/csc326/csc326.html']))
    self.assertEqual(c._inverted_index_str[u'programming'],
                     set(['http://www.eecg.toronto.edu/~jzhu/csc326/csc326.html']))
def testOne(self):
    bot = crawler(None, "urls.txt")
    bot.crawl(depth=2)
    inverted_index = bot.inverted_index()
    resolved_inverted_index = bot.resolved_inverted_index()
    expected_inverted_index = {1: set([1, 2]), 2: set([1, 2])}
    expected_resolved_inverted_index = {
        u'index2': set(['http://hhaider.github.io/mygithubpage/index.html',
                        u'http://hhaider.github.io/mygithubpage/index2.html']),
        u'index': set(['http://hhaider.github.io/mygithubpage/index.html',
                       u'http://hhaider.github.io/mygithubpage/index2.html'])
    }
    self.failUnless(inverted_index == expected_inverted_index)
    self.failUnless(resolved_inverted_index == expected_resolved_inverted_index)
def test_multi_thread_crawler():
    """
    1) test the multithreaded crawler on test.html
    2) a) single-thread crawl http://www.eecg.toronto.edu
       b) multi-thread crawl http://www.eecg.toronto.edu
       c) compare the results
    NOTE: single-thread and multi-thread results may differ due to different
    timeouts, so please try multiple runs
    """
    try:
        # 1) test the local testcase
        print " test multi thread on test.html"
        bot = cmt.crawler(None, "test_url.txt")
        bot.crawl(depth=0)
        assert len(bot.get_links()) == 2
        assert len(bot.get_word_id()) == 15
        assert len(bot.get_inverted_index()) == 15

        # 2) compare against the single thread result
        # a) single thread crawl http://www.eecg.toronto.edu
        print " compare multi thread result with single thread"
        start_time = time.time()
        single = cs.crawler(None, "urls.txt")
        single.crawl(depth=1)
        single_time = time.time() - start_time

        # b) multi thread crawl http://www.eecg.toronto.edu
        start_time = time.time()
        multi = cmt.crawler(None, "urls.txt")
        multi.crawl(depth=1)
        multi_time = time.time() - start_time

        delta = single_time - multi_time
        print "/////IMPROVE//////////"
        print "//////%d secs/////////" % delta
        print "////////////////////"

        # c) compare results
        # print "####Compare num of links"
        print "links"
        assert abs(len(single.get_links()) - len(multi.get_links())) < ALLOWRANCE
        # print "####Compare num of word id"
        print "word_id"
        assert abs(len(single.get_word_id()) - len(multi.get_word_id())) < ALLOWRANCE
        # print "####Compare num of inverted index"
        print "inverted"
        assert abs(len(single.get_inverted_index()) - len(multi.get_inverted_index())) < ALLOWRANCE
    except:
        logging.exception("")
        return False
    return True
def test_inverted_index():
    print "Test Inverted Index"
    delete_db_file("dbFile_tester.db")
    db_conn = lite.connect("dbFile_tester.db")
    c = crawler(db_conn, 'urls3.txt')
    c.crawl()
    expected_inverted_index = {1: set([1]), 2: set([1]), 3: set([1])}
    if c.get_inverted_index() == expected_inverted_index:
        print "Success!"
    else:
        print "Fail! Wrong inverted_index"
def test_resolved_inverted_index():
    print "Test Resolved Inverted Index"
    delete_db_file("dbFile_tester.db")
    db_conn = lite.connect("dbFile_tester.db")
    c = crawler(db_conn, 'urls3.txt')
    c.crawl(depth=1)
    expected_resolved_inverted_index = {
        u'languages': set(['http://www.eecg.toronto.edu/~jzhu/csc326/csc326.html']),
        u'csc326': set(['http://www.eecg.toronto.edu/~jzhu/csc326/csc326.html']),
        u'programming': set(['http://www.eecg.toronto.edu/~jzhu/csc326/csc326.html'])
    }
    if c.get_resolved_inverted_index() == expected_resolved_inverted_index:
        print "Success!"
    else:
        print "Fail! Wrong resolved_inverted_index"
def test_empty_resolved_inverted_index():
    print "Test Empty Resolved Inverted Index"
    delete_db_file("dbFile_tester.db")
    db_conn = lite.connect("dbFile_tester.db")
    c = crawler(db_conn, 'invalid.txt')
    c.crawl()
    expected_resolved_inverted_index = {}
    if c.get_resolved_inverted_index() == expected_resolved_inverted_index:
        print "Success!"
    else:
        print "Fail! With an invalid *.txt file, the crawler must have an empty resolved_inverted_index"
def test_crawler_db_results():
    print "Test Crawler Database"
    delete_db_file("dbFile_tester.db")
    db_conn = lite.connect("dbFile_tester.db")
    c = crawler(db_conn, 'urls2.txt')
    c.crawl()
    expected_urls = [u'http://help.websiteos.com/websiteos/example_of_a_simple_html_page.htm']
    if crawler_db.get_sorted_urls("head", "dbFile_tester.db") == expected_urls:
        print "Success!"
    else:
        print "Fail! Wrong crawler_db results"
def test_inverted_index_with_two_urls():
    print "Test Inverted Index with Two URLs"
    delete_db_file("dbFile_tester.db")
    db_conn = lite.connect("dbFile_tester.db")
    c = crawler(db_conn, 'urls2.txt')
    c.crawl()
    expected_inverted_index = {
        1: set([1]), 2: set([1]), 3: set([1]), 4: set([1]), 5: set([1]),
        6: set([1]), 7: set([1]), 8: set([1]), 9: set([1]), 10: set([1]),
        11: set([1]), 12: set([1]), 13: set([1]), 14: set([1]), 15: set([1]),
        16: set([1]), 17: set([1]), 18: set([1]), 19: set([1]), 20: set([1]),
        21: set([1]), 22: set([1]), 23: set([1]), 24: set([1]), 25: set([1]),
        26: set([1]), 27: set([1]), 28: set([1]), 29: set([1]), 30: set([1]),
        31: set([1]), 32: set([1]), 33: set([1]), 34: set([1]), 35: set([1]),
        36: set([1]), 37: set([1]), 38: set([1]), 39: set([1]), 40: set([1]),
        41: set([1]), 42: set([1]), 43: set([1]), 44: set([1]), 45: set([1]),
        46: set([1]), 47: set([1]), 48: set([1]), 49: set([1]), 50: set([1]),
        51: set([1]), 52: set([1]), 53: set([1]), 54: set([1]), 55: set([1]),
        56: set([1]), 57: set([1]), 58: set([1]), 59: set([1]), 60: set([1]),
        61: set([1]), 62: set([1]), 63: set([1]), 64: set([1]), 65: set([1]),
        66: set([1]), 67: set([1]), 68: set([1]), 69: set([1]), 70: set([1]),
        71: set([1]), 72: set([1]), 73: set([1]), 74: set([1]), 75: set([1]),
        76: set([1]), 77: set([1]), 78: set([1]), 79: set([1]), 80: set([1]),
        81: set([1]), 82: set([1]), 83: set([1]), 84: set([1]),
        85: set([2]), 86: set([2]), 87: set([2])
    }
    if c.get_inverted_index() == expected_inverted_index:
        print "Success!"
    else:
        print "Fail! Wrong inverted_index"
def test_PageRank_branch(self):
    with open('test_urls.txt', 'w') as f:
        f.write("http://individual.utoronto.ca/peixizhao/")
    self._crawler = crawler(None, 'test_urls.txt', 'test.db')
    self._crawler.crawl(depth=1)
    with self._crawler._db_conn:
        c = self._crawler._db_conn.cursor()
        c.execute('SELECT * FROM Document WHERE doc_url=?',
                  ('http://individual.utoronto.ca/peixizhao/branch1.html',))
        branch_doc_id = c.fetchone()[0]
        c.execute('SELECT rank FROM PageRank WHERE DocId=?', (branch_doc_id,))
        result = c.fetchone()[0]
        self.assertEqual(result, 0.0)
def testOne(self):
    bot = crawler(None, "urls.txt")
    bot.crawl(depth=2)
    inverted_index = bot.inverted_index()
    print inverted_index
    expected_inverted_index = {1: set([1, 3]), 2: set([1]), 3: set([2])}
    got_page_rank = page_rank(bot.links())
    expected_page_rank = {1: 0.05000000000000001,
                          2: 0.092500000000000027,
                          3: 0.12862500000000002}
    self.failUnless(inverted_index == expected_inverted_index)
    self.failUnless(got_page_rank == expected_page_rank)
def crawler(url):
    from crawler import crawler
    f13_data = crawler(url)
    return f13_data
from flask import Flask, render_template, request, redirect
import searchengine, neuralnet, crawler

searcher = searchengine.searcher('searchengine.db')
crawler = crawler.crawler('searchengine.db')
nnet = neuralnet.searchnet('nn.db')

app = Flask(__name__)


@app.route("/")
def search():
    if request.args:
        queryText = request.args.get('q')
        (wordids, scores, urlIdsList, urlsList) = searcher.query(queryText)
        if len(urlIdsList) != 0:
            listOfItems = [{'id': urlIdsList[i], 'url': urlsList[i], 'score': scores[i]}
                           for i in range(len(urlIdsList))]
        else:
            listOfItems = []
        return render_template('index.html', list=listOfItems, q=queryText)
    return render_template('index.html', list=None)


@app.route('/train', methods=['POST', 'GET'])
def train():
    if request.method == 'POST':
        queryPhrase = request.json['q']
        selectedURLId = int(request.json['clicked'])
        app.logger.debug('queryPhrase: %s => selectedURLId: %s' % (queryPhrase, selectedURLId))
        (wordids, scores, urlIdsList, urlsList) = searcher.query(queryPhrase)
        nnet.trainquery(wordids, urlIdsList, selectedURLId)
import time
import datetime
from crawler import crawler
from parser import parser
from html_generator import gen_html

todays_date = datetime.datetime.now().date()
#~ name = "{}_{}_{}".format(todays_date.month, todays_date.day, todays_date.year)
name = "new"

print "Crawling started:"
crawler(name)
print "Parsing Started:"
parser(name)
print "Generating HTML:"
gen_html(name)
print "Done!"
import crawler

ptt = crawler.crawler()
ptt.setup_keyword('ptt')
# batch_id = ptt.get_link_session()
# ptt.crawl_ptt_link(batch_id)
# ptt.close_conn()
import crawler
import houses_parse
import pymysql
import getregions

if __name__ == '__main__':
    response = crawler.crawler(
        'https://bj.lianjia.com/ershoufang/city?city_id=110000')
    regions_info = getregions.getinfos()
    base_url = 'https://m.lianjia.com/bj/ershoufang/'
    for city_region, city_region_info in regions_info.items():
        for region, url in city_region_info.items():
            i = 1
            while True:
                houses_url = base_url + url + '/pg' + str(i)
                houses_info_response = crawler.crawler(houses_url)
                result = houses_parse.parse_ishashouse(houses_info_response)
                if len(result) != 0:
                    break
                houses_parse.parse_houseinfo(city_region, region, houses_info_response)
                i = i + 1
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 4 07:35:36 2019

@author: pathouli
"""
from crawler import crawler

my_path = 'C:/Users/Timothy/Google Drive/TC Stuff/Analytics/GR 5067 - Natural Language Processing in Social Sciences/HW2/files_q1'
the_query = 'qmss columbia'
num_docs = 50

my_func = crawler()
my_func.write_crawl_results(my_path, the_query, num_docs)
from crawler import crawler
import pprint
import sqlite3 as sql

if __name__ == "__main__":
    dbFile = 'dbFile1.db'
    crawler(dbFile, "urls.txt")
    con = sql.connect(dbFile)
    cur = con.cursor()
    query = """
        SELECT docIndex.url, pageRank.score
        FROM pageRank, docIndex
        WHERE pageRank.docid = docIndex.docid
        ORDER BY pageRank.score DESC"""
    cur.execute(query)
    ranks = cur.fetchall()
    con.close()
    print "Page Rank Scores per URL:"
    pprint.pprint(ranks)
def main():
    crawler.crawler("/wiki/PageRank")
    pagerank(crawler.graph, 0.3)
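# --- Added sketch (hedged): pagerank() above is called with crawler.graph and
# 0.3. Assuming graph maps each page to the set of pages it links to and the
# second argument is the damping factor, a plain damped PageRank iteration could
# look like this; it is an illustration, not the module's actual pagerank().
def pagerank(graph, damping, iterations=20):
    n = len(graph) or 1
    ranks = {page: 1.0 / n for page in graph}
    for _ in range(iterations):
        new_ranks = {}
        for page in graph:
            # Sum the rank contributed by every page that links to this one.
            incoming = sum(ranks[src] / len(graph[src])
                           for src in graph if page in graph[src])
            new_ranks[page] = (1 - damping) / n + damping * incoming
        ranks = new_ranks
    return ranks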
def __init__(self):
    self.redis = redisclient()
    self.crawler = crawler()
def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    crawler(scrape_callback.seed_url,
            scrape_callback=scrape_callback,
            cache=cache,
            timeout=10,
            ignore_robots=True)
def setUp(self):
    self.bot = crawler(None, '')

    # def test_word_id(self):
    # insert a few words into the lexicon,
    # and check that _word_id_cache and _revert_word_id
    # map each word to its word_id correctly
    self.assertEqual(self.bot.word_id('apple'), 1)
    self.assertEqual(self.bot.word_id('lemon'), 2)
    self.assertEqual(self.bot.word_id('mango'), 3)
    self.assertEqual(self.bot.word_id('melon'), 4)
    self.assertEqual(self.bot.word_id('peach'), 5)
    self.assertEqual(self.bot._word_id_cache['apple'], 1)
    self.assertEqual(self.bot._word_id_cache['lemon'], 2)
    self.assertEqual(self.bot._word_id_cache['mango'], 3)
    self.assertEqual(self.bot._word_id_cache['melon'], 4)
    self.assertEqual(self.bot._word_id_cache['peach'], 5)
    self.assertEqual(self.bot._revert_word_id[1], 'apple')
    self.assertEqual(self.bot._revert_word_id[2], 'lemon')
    self.assertEqual(self.bot._revert_word_id[3], 'mango')
    self.assertEqual(self.bot._revert_word_id[4], 'melon')
    self.assertEqual(self.bot._revert_word_id[5], 'peach')

    # def test_doc_id(self):
    # insert a few URLs into the document index,
    # and check that _doc_id_cache and _revert_doc_id
    # map each URL to its doc_id correctly
    self.assertEqual(self.bot.document_id('google.com'), 1)
    self.assertEqual(self.bot.document_id('facebook.com'), 2)
    self.assertEqual(self.bot.document_id('instagram.com'), 3)
    self.assertEqual(self.bot._doc_id_cache['google.com'], 1)
    self.assertEqual(self.bot._doc_id_cache['facebook.com'], 2)
    self.assertEqual(self.bot._doc_id_cache['instagram.com'], 3)
    self.assertEqual(self.bot._revert_doc_id[1], 'google.com')
    self.assertEqual(self.bot._revert_doc_id[2], 'facebook.com')
    self.assertEqual(self.bot._revert_doc_id[3], 'instagram.com')

    # def test_add_words_to_document(self):
    # pretend that crawl() has just visited the web page,
    # and now insert the words that were found into the document
    self.bot._curr_doc_id = 1
    self.bot._curr_words = [(1, 1), (2, 1), (3, 1)]
    self.bot._add_words_to_document()
    self.bot._curr_doc_id = 2
    self.bot._curr_words = [(2, 1), (3, 1), (4, 1)]
    self.bot._add_words_to_document()
    self.bot._curr_doc_id = 3
    self.bot._curr_words = [(3, 1), (4, 1), (5, 1)]
    self.bot._add_words_to_document()

    # def test_doc_index(self):
    expected_doc_index = {
        1: set([1, 2, 3]),
        2: set([2, 3, 4]),
        3: set([3, 4, 5]),
    }
    self.assertEqual(expected_doc_index, self.bot.get_doc_index())

    # def test_inverted_index(self):
    expected_inverted_index = {
        1: set([1]),
        2: set([1, 2]),
        3: set([1, 2, 3]),
        4: set([2, 3]),
        5: set([3]),
    }
    self.assertEqual(expected_inverted_index, self.bot.get_inverted_index())

    # def test_resolved_inverted_index(self):
    expected_resolved_inverted_index = {
        'apple': set(['google.com']),
        'lemon': set(['google.com', 'facebook.com']),
        'mango': set(['google.com', 'facebook.com', 'instagram.com']),
        'melon': set(['facebook.com', 'instagram.com']),
        'peach': set(['instagram.com'])
    }
    self.assertEqual(expected_resolved_inverted_index,
                     self.bot.get_resolved_inverted_index())
from crawler import crawler
import sys
import random

crawler = crawler(None, "url-for-test.txt")
crawler.crawl(depth=1)

URL_1 = "https://marksachinperera.github.io/"
URL_2 = "https://marksachinperera.github.io/ContactMe.html"
URL_3 = "https://marksachinperera.github.io/AboutMe.html"

###****************####
ID_1 = -1
ID_2 = -1
ID_3 = -1
###****************####

print "getting inverted index"
inverted_index = crawler.get_inverted_index()

print "getting resolved index"
resolved_index = crawler.get_resolved_inverted_index()

print "Setting up"
word_list_1 = {
    "jpg", "height", "done", "have", "home", "portfolio", "alt", "web",
    "le", "img", "personal", "mark", "width", "1500", "styles", "picture",
    "resume", "showing", "welcome", "hi", "img_2315", "perera", "projects",
    "me", "src", "about", "name", "1800", "this", "contact", "my", "page"
}
import os
import sys

from crawler import crawler
from quiz.models import Player_info
from csvwriter import csvwriter


def main():
    """Run administrative tasks."""
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'myapi.settings')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        raise ImportError(
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?") from exc
    execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()
    data = crawler()
    for item in data:
        Player_info(number=item['number'], name=item['name'],
                    position=item['position'], age=item['age'],
                    nation=item['nation'], team=item['team'],
                    value=item['value'], photo=item['photo']).save()
    # csvwriter()
import crawler

crawler.crawler(
    crawler.get_content('https://www.ptt.cc/bbs/WomenTalk/index.html'))
        'unit': get_unit(food_info[0])
    }
    res = requests.post(url, json=food_json)
    if DEBUG_MOD:
        print('Post food ' + json.dumps(food_json))
    # return {'result': 'fail', 'description': ''}


if __name__ == '__main__':
    with open(CATEGORY_BRANDS_LIST_FILE, 'r', encoding='utf8') as f:
        done = False
        while not done:
            data = f.readline().split()
            if data == []:
                done = True
                break
            category, brands = data[0], data[1:]
            category_id = get_category_id(category)
            for brand in brands:
                brand_id = get_brand_id(brand, category_id)
                brand_foods = crawler.crawler(brand)
                for food in brand_foods:
                    food_name, details = food_refine(food)
                    post_food(food_name, details, category_id, brand_id)
    # TODO: update post_food to handle failure exceptions -> logging
import pprint
from crawler import crawler
import urllib2
import urlparse
from bs4 import BeautifulSoup
from bs4 import Tag
from collections import defaultdict
import redis
from pagerank import page_rank
import re

# Testing File for Lab 3
# What I did in crawler was that I saved the page ranks by descending order,
# using the direct urls of the pages instead of the doc id
# -Marinette

if __name__ == "__main__":
    redisConnection = redis.Redis()
    bot = crawler(redisConnection, "urls.txt")
    bot.crawl(depth=1)
    print "Printing Page Ranks:"
    pprint.pprint(bot.crawler_page_ranks())
from crawler import crawler
from pagerank import page_rank

# Get crawler object and crawl on urls found in urls.txt
crawler = crawler(None, 'urls.txt')
crawler.crawl()
document_index = crawler.get_document_index()

# Run pagerank on the links generated by the crawler
pagerank = page_rank(crawler._links)
for doc_id, rank in sorted(pagerank.iteritems(), key=lambda (k, v): (v, k), reverse=True):
    document = crawler._document_index[doc_id]
    print str(rank) + " : " + str(document[0]) + "\n"
import os
import sys
from crawler import crawler

print "Now starting the test...."
print

crawler = crawler(None, "urls.txt")

inverted_index = crawler.get_inverted_index()
print
print "inverted_index is......"
print
print inverted_index
print
print

resolved_inverted_index = crawler.get_resolved_inverted_index()
print "resolved_inverted_index is......"
print
print resolved_inverted_index
import requests, pprint, json, urllib, os, functools, ssl, time
import pandas as pd
# from geopy import geocoders
from bs4 import BeautifulSoup
from copy import deepcopy
import ssl
import crawler

key = "AIzaSyASmGMElEZthlsMGEN-p3Nw1NInctWoXTk"
types = "restaurant"
radius = "1000"
fields = "name,formatted_address,rating"
inputString = "pizzeria"

crawl = crawler.crawler(types=types, inputString=inputString, radius=radius, key=key)

dataOfInterest = {
    "name", "geometry", "place_id", "rating", "types", "vicinity", "reviews"
}

manual = """
Hello!
Press 1 for crawling cities.   [There are 144 cities in Italy]
Press 2 for crawling communes. [TIME CONSUMING!!! There are 8100 communes in Italy]
Default: Exit!
"""

choices = [None, 'Cities', 'Communes']
option = input(manual)


def initiate(wid):
import crawler

if __name__ == "__main__":
    Clips = []  # clips
    search_word = '방법 1화'  # e.g. '방법 1화', '관찰카메라24 119화', '반의반 1화'
    vod_no = 1
    crawler.crawler(Clips, search_word, vod_no)
    # //*[@id="player"]/div/div[1]/div[11]/div[13]/video/source
    # handle blob
archives_url = "http://localhost"
if option_url:
    archives_url = option_url
root = archives_url
createStructure()

depth = 1
try:
    depth = int(option_crawler.strip().split()[0])
except (ValueError, IndexError, AttributeError):
    depth = 0

try:
    try:
        crawler(archives_url, depth)
    except IOError as e:
        print("Cannot open the url = %s" % archives_url)
        print(e.strerror)
        sys.exit(1)
    if len(database.keys()) < 1:
        print("No information found!")
        sys.exit(1)
    else:
        print("Starting investigation for the given URL...Please wait...")
        if option_xss:
            investigate(archives_url)
def __init__(self):
    self.crawler = crawler()
    self.first_run = True
    self.date = datetime.datetime.now().date()
def scan(self):
    print "Requesting '%s'..." % (self.URL)
    extHeader = ""
    code, headers = self.doRequest(self.URL, self.config["p_useragent"],
                                   self.config["p_post"], self.config["header"],
                                   self.config["p_ttl"])
    if (headers != None):
        for head in headers:
            if head[0] in ("set-cookie", "set-cookie2"):
                cookie = head[1]
                c = Cookie.SimpleCookie()
                c.load(cookie)
                for k, v in c.items():
                    extHeader += "%s=%s; " % (k, c[k].value)

    if (code == None):
        print "Code == None!"
        print "Does the target exist?!"
        print "AutoAwesome mode failed. -> Aborting."
        sys.exit(1)

    if (extHeader != ""):
        print "Cookies retrieved. Using them for further requests."
        extHeader = extHeader.strip()[:-1]

    if (self.config["header"].has_key("Cookie") and extHeader != ""):
        print "WARNING: AutoAwesome mode got some cookies from the server."
        print "Your defined cookies will be overwritten!"

    if (extHeader != ""):
        print "Testing file inclusion against given cookies..."
        self.config["header"]["Cookie"] = extHeader
        single = singleScan(self.config)
        single.setURL(self.URL)
        single.setQuite(True)
        single.scan()

    soup = BeautifulSoup.BeautifulSoup(''.join(code))
    idx = 0
    for form in soup.findAll("form"):
        idx += 1
        caption = None
        desturl = None
        method = None
        if (soup.has_key("action")):
            desturl = soup["action"]
        else:
            desturl = self.URL

        if (form.has_key("name")):
            caption = form["name"]
        else:
            caption = "Unnamed Form #%d" % (idx)

        if (form.has_key("method")):
            if (form["method"].lower() == "get"):
                method = 0
            else:
                method = 1
        else:
            method = 1  # If no method is defined assume it's POST.

        params = ""
        for input in form.findAll("input"):
            if (input.has_key("name")):
                input_name = input["name"]
                input_val = None
                if (input.has_key("value")):
                    input_val = input["value"]
                if (input_val == None):
                    params += "%s=&" % (input_name)
                else:
                    params += "%s=%s&" % (input_name, input_val)
            else:
                print "An input field doesn't have a 'name' attribute! Skipping it."

        if ("&" in params):
            params = params[:-1]

        print "Analyzing form '%s' for file inclusion bugs." % (caption)
        modConfig = deepcopy(self.config)
        if (method == 0):
            # Append the current get params to the current URL.
            if ("?" in desturl):
                # There are already params in the URL.
                desturl = "%s&%s" % (desturl, params)
            else:
                # There are no other params.
                desturl = "%s&?%s" % (desturl, params)
        else:
            currentPost = modConfig["p_post"]
            if (currentPost == None or currentPost == ""):
                currentPost = params
            else:
                currentPost = currentPost + "&" + params
            modConfig["p_post"] = currentPost

        single = singleScan(modConfig)
        single.setURL(desturl)
        single.setQuite(True)
        single.scan()

    print "Starting harvester engine to get links (Depth: 0)..."
    crawl = crawler(self.config)
    crawl.crawl_url(self.URL, 0)
    if (len(crawl.urlpool) == 0):
        print "No links found."
    else:
        print "Harvesting done. %d links found. Analyzing links now..." % (len(crawl.urlpool))
        for url in crawl.urlpool:
            try:
                single = singleScan(self.config)
                single.setURL(str(url[0]))
                single.setQuite(True)
                single.scan()
            except:
                print "Caught an exception. Continuing..."

    print "AutoAwesome is done."
    m = massScan(config)
    m.startMassScan()
    show_report()

elif (config["p_mode"] == 2):
    print("GoogleScanner is searching for Query: '%s'" % config["p_query"])
    g = googleScan(config)
    g.startGoogleScan()
    show_report()

elif (config["p_mode"] == 3):
    print("Crawler is harvesting URLs from start URL: '%s' with depth: %d and writing results to: '%s'"
          % (config["p_url"], config["p_depth"], config["p_write"]))
    c = crawler(config)
    c.crawl()

elif (config["p_mode"] == 4):
    print("AutoAwesome mode engaging URL '%s'..." % (config["p_url"]))
    awe = autoawesome.autoawesome(config)
    awe.setURL(config["p_url"])
    awe.scan()

elif (config["p_mode"] == 5):
    print("BingScanner is searching for Query: '%s'" % config["p_query"])
    b = bingScan(config)
    b.startGoogleScan()
    show_report()
def main(title: str):
    title = str(title)
    fps = config['fps']
    result, audio_url = crawler.crawler(title)
    width = config['width']
    height = config['height']

    # Download every image referenced by the crawl result.
    for key in result.keys():
        image_name = str(key)
        image_url = result[key]['image_url']
        image_dir = os.sep.join([".", "resource", title])
        crawler.save_image(image_url, image_dir, image_name)

    fourcc = VideoWriter_fourcc(*'mp4v')
    output_dir = os.sep.join(['.', 'output'])
    if not os.path.exists(output_dir):
        print("Folder", output_dir, 'does not exist. Creating...')
        os.makedirs(output_dir)

    video = VideoWriter(os.sep.join([output_dir, str(title) + '.mp4']), fourcc,
                        float(config['fps']), (config['width'], config['height']))
    font = ImageFont.truetype(config['font'], config['title_font_size'], encoding="utf-8")
    font2 = ImageFont.truetype(config['font'], config['content_font_size'], encoding="utf-8")
    title_wrapper = text_processing.Wrapper(font)
    content_wrapper = text_processing.Wrapper(font2)

    keys = list(result.keys())
    keys.append(0)
    keys.sort()
    keys.append(keys[len(keys) - 1] + 10)
    print(keys)

    frame = image_processing.create_blank_frame("", "", (width, height),
                                                title_wrapper, content_wrapper, font, font2)
    total_length = keys[len(keys) - 1] * fps
    index = 0
    for i in range(total_length):
        if (index + 1 > len(keys) - 1):
            frame = image_processing.create_blank_frame(
                "", "", (width, height), title_wrapper, content_wrapper, font, font2)
        elif (i / fps) > keys[index + 1]:
            index += 1
            print(index, "out of", len(keys))
            key = keys[index]
            image = os.sep.join([
                '.', 'resource', title,
                str(key) + text_processing.find_image_suffix(result[key]['image_url'])
            ])
            header = result[key]['header']
            content = result[key]['content']
            print("Title:", header)
            if (result[key]['image_suffix'] in ['.gif', '.GIF']):
                frame = image_processing.create_blank_frame(
                    header, content, (width, height), title_wrapper, content_wrapper, font, font2)
            else:
                frame = image_processing.create_frame(image, header, content, (width, height),
                                                      title_wrapper, content_wrapper, font, font2)
            os.remove(image)
        else:
            pass
        video.write(frame)
    print(title, "finished!")
# -*- coding: utf-8 -*-
import os
import sys

from config import init_file
from crawler import crawler
from initdb import initdb

if __name__ == '__main__':
    if not os.path.exists(init_file):
        print('Initializing database...')
        initdb()
    if len(sys.argv) > 1 and sys.argv[1] == '--clean':
        print('Initializing database...')
        initdb()
    while True:
        # try:
        crawler()
        # except Exception as e:
        #     print(e)
import crawler
import searcher

pages = ['https://www.codechef.com/']

C = crawler.crawler('codechef.db')
C.createindextables()
print "Crawling :: \n"
C.crawl(pages)
print "Ranking Pages :: \n"
C.calculatepagerank()

S = searcher.searcher('codechef.db')
searchQuery = 'Saturday'
S.query(searchQuery)
#! -*- coding: utf-8 -*-
# Author: 泽同学 (Ze)
# blog: www.orze.top
from proxy.text import *
import threading
from crawler.crawler import *

sys.path.append("/Users/wangzeqing/Desktop/python/玩玩/bilibili")
from db.dborder import *

lock = threading.Lock()
dbcon = mysqlconnect(lock)
p = threading.Thread(target=dbcon.print_list)
p.start()

print("+++++++++++++++++++++++++++++++++++\nStart validating the data in the proxy table")
# kong('proxy', dbcon)
print("+++++++++++++++++++++++++++++++++++\nStart the crawler process")
crawler('proxy', dbcon)
print("+++++++++++++++++++++++++++++++++++\nStart testing the data in the temporary table")
kong('temporary', dbcon)
print("+++++++++++++++++++++++++++++++++++\nStart validating the data in the recycle table")
kong('recycle', dbcon)
def search():
    # Check cookies to see if we have previously saved searches
    url_cookie = request.cookies.get('urls')
    # Check cookies for errors
    url_error = request.cookies.get('url_error')
    keyword_error = request.cookies.get('keyword_error')
    # Use this delimiter for urls when they're saved as a string
    delimiter = ", "

    # Post handler - if the user has posted data from the form to this url:
    if request.method == 'POST':
        # Get variables from the form
        url = request.form['starting_url']
        method = request.form['method']
        depth = request.form['depth']
        keyword = request.form['keyword']

        # FIXME Make form object to send to crawler??
        form_data = {
            'starting_url': url,
            'method': method,
            'depth': depth,
            'keyword': keyword
        }

        # Validate url
        if url_validator(url):
            # Validate keyword
            if keyword_validator(keyword):
                # # FIXME Trace statements (DELETE)
                # print("Starting url: %s" % url)
                # print("Method: %s" % method)
                # print("Depth: %s" % depth)
                # print("Keyword: %s" % keyword)

                # Call crawler
                # crawler_thread = threading.Thread(target=crawl.crawler, args=form.data)
                # crawler_thread.start()
                # app.logger.info(form.data)
                # crawl.crawler(form.data)

                # Call function to perform the crawl using the form submissions on the search route
                crawl.crawler(url, method, depth, keyword)

                # Use make_response to create a response object so we can set cookies
                # Create response object that redirects to 'results' url
                response = make_response(redirect(url_for('results', code=307)))

                # If url history cookie is already set, append the new url to the cookie string
                if url_cookie:
                    if url not in url_cookie:
                        # FIXME append url to cookie string with ", " delimiter
                        url_cookie += ", " + url
                        response.set_cookie('urls', url_cookie)
                # Else, if no 'urls' cookie yet, create 'urls' cookie and add new url
                else:
                    response.set_cookie('urls', url)

                # Set the cookie and redirect to the results page
                return response

            # Else if keyword is invalid, redirect back to search page and display keyword warning
            else:
                # Set error message to be displayed on search form
                keyword_error = "Invalid keyword submitted. Please enter a single word, letters only"
                # Flash the error message to session cookie and redirect back to page
                flash(keyword_error)
                return redirect(url_for('search'))

        # Else if url is not valid, redirect back to search page and display url error
        else:
            # Set error message to be displayed on search form
            url_error = "Invalid URL submitted. Please enter a valid URL"
            # Flash the error message to session cookie and redirect back to page
            flash(url_error)
            return redirect(url_for('search'))

    # Else if the user arrived via GET request from homepage, render the search form
    else:
        # Instantiate url_list to None
        url_list = None
        # Check for previously saved searches to save as a list in url_list to be
        # used in the dropdown input form
        if url_cookie:
            # Split into a list to send to the template
            url_list = url_cookie.split(delimiter)
        # Render the search form template with either a list of urls or nothing
        return render_template('search.html', url_list=url_list,
                               url_error=url_error, keyword_error=keyword_error)