Example #1
 def test_crawler(self):
     crawler.crawler(['-b', 'PublicServan', '-i', '1', '2'])
     filename = 'PublicServan-1-2.json'
     with codecs.open(filename, 'r', encoding='utf-8') as f:
         data = json.load(f)
         # M.1127808641.A.C03.html is empty, so decrease 1 from 40 articles
         self.assertEqual(len(data['articles']), 39)
     os.remove(filename)
Example #2
def crawler_task(*args, **kwargs):
    # Allows the word search to be run asynchronously
    url = kwargs.get('url')
    max_visited = kwargs.get('max_visited')
    session_number = kwargs.get('session_number')
    website_id = kwargs.get('website_id')

    logger.debug('Using {} {} for crawler_task'.format(url, max_visited))
    urls = [url]
    visited = [url]

    crawler(url=url, urls=urls, visited=visited, max_visited=max_visited, session_number=session_number, website_id=website_id)
Example #3
 def test_resolvedIndex2(self):
     with open('test_urls.txt', 'w') as f:
         f.write("http://individual.utoronto.ca/peixizhao/")
     self._crawler = crawler(None, 'test_urls.txt', 'test.db')
     self._crawler.crawl(depth = 1)
     resolved_inverted_index_dict = self._crawler.get_resolved_inverted_index()
     self.assertEqual(resolved_inverted_index_dict['page'], set(['http://individual.utoronto.ca/peixizhao/', u'http://individual.utoronto.ca/peixizhao/branch1.html']))
Example #4
def job():
    logging.info("start cron")
    res = crawler()
    try:
        with open("/app/application/grade.json") as f:
            last = json.load(f)
    except IOError:
        with open("/app/application/grade.json", "w") as f:
            json.dump(res, f)
            return

    if last != res:
        with open("/app/application/grade.json", "w") as f:
            json.dump(res, f)

        diff = dict(set(res.items()) ^ set(last.items()))
        t = ""
        for k, v in diff.iteritems():
            k = k.encode("utf-8")
            v = v.encode("utf-8")
            t += k + ":" + v + "\n"
        if notification == "mail" :
            sendmail(t)
        elif notification == "twitter" :
            sendtwit(t)
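One subtlety in Example #4: building a dict from the symmetric difference of the two item sets keeps only one value per changed key, so the notification text carries either the old or the new grade for a changed entry, never both. A small illustration of that behaviour (the grade values are made up):

old = {"math": "A", "physics": "B"}
new = {"math": "A", "physics": "C"}
changed = set(new.items()) ^ set(old.items())  # {('physics', 'C'), ('physics', 'B')}
print(dict(changed))  # only one of {'physics': 'B'} / {'physics': 'C'} survives; set order decides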
Example #5
    def copymagnet2clipboard(self, event):  # wxGlade: MainFrame.<event_handler>
        page = self.SearchNotebook.GetCurrentPage()
        if page:
            selected = page.torrList.GetSelectedObject() 
            if selected and selected.magneturl:
                plugin_file = open('plugins/'+ selected.plugin + '.json','r')
                plugin = json.loads(plugin_file.read())
                plugin_file.close()
                    
                if 'crawler' in plugin['magnet_url_filter']:
                    config = self.ReadConfig()
                    try:
                        magneturl = crawler(selected.magneturl,plugin['magnet_url_filter']['crawler'],plugin['headers'],config['timeout'])
                    except StandardError,msg:
                        self.ReportError("Can\'t establish a connection. Reason:"+str(msg))
                        return
                else:
                    magneturl = selected.magneturl

                dataObj = wx.TextDataObject()
                dataObj.SetText(magneturl)
                if wx.TheClipboard.Open():
                    wx.TheClipboard.SetData(dataObj)
                    wx.TheClipboard.Close()
                    self.ReportInfo("Torrent's Magnet Url has been copied to the clipboard.")
                else:
                    self.ReportError("Unable to open the clipboard")
            else:
                self.ReportInfo("Plugin didn't fetch any magnet URL")
Example #6
    def test_crawler(self):
        """
        Tests the crawler by comparing its results to manually verified results.
        """
        global test_case_result

        # Run the crawler and store the results.
        bot = crawler(None, "test.txt")
        bot.crawl(depth=1)
        inverted_index = bot.get_inverted_index()
        resolved_inverted_index = bot.get_resolved_inverted_index()
        
        # Check that the result contains the correct number of words.
        self.assertTrue(len(resolved_inverted_index) == len(test_case_result), "incorrect number of words found.")

        for key in resolved_inverted_index:
            # Check that each word is in the precomputed results.
            self.assertTrue(key in test_case_result, "unexpected word: {key}.".format(key = key))
            
            # Check that each word maps to the correct number of urls.
            self.assertTrue(
                len(resolved_inverted_index[key]) == len(test_case_result[key]),
                "incorrect number of urls for word: {key}.".format(key = key))

               
            for url in resolved_inverted_index[key]:
                # Check that each url is correct.
                self.assertTrue(url in test_case_result[key], "unexpected url: <{url}>.".format(url = url))
Example #7
    def setUp(self):
        mock_doc_index = {
            1: (1, 2, 3),
            2: (2, 3, 4, 5),
            3: (3, 4, 5, 1)
        }

        mock_word_cache = {
            'hello': 1,
            'world': 2,
            'jelly': 3,
            'beans': 4,
            'green': 5
        }

        mock_doc_cache = {
            'http://example.com': 1,
            'http://example.com/123': 2,
            'http://someotherexample.com': 3
        }

        self.bot = crawler(None, '')
        self.bot._doc_id_cache = mock_doc_cache
        self.bot._word_id_cache = mock_word_cache
        self.bot._doc_index = mock_doc_index
Example #8
def deploy():

    # run crawler
    print "Please wait while we are recreating the database"
    os.system("rm -f %s" % DB_FILE)
    db_conn = lite.connect(DB_FILE)
    bot = crawler.crawler(db_conn, URLS_TXT_FILE)
    bot.crawl(depth=2)
    print "Crawler Finished"  #change to decorator

    # aws setup
    print "Please wait while we are creating the instance"
    public_ip, instance_id, key_pair_path = aws_setup.setup()
    print "AWS Setup Finished"

    # scp
    print "Please wait while we setup the app in AWS"
    os.system("rm -rf ./bottle-0.12.7/data/") # delete cache for faster scp
    os.system("scp -r -o StrictHostKeyChecking=no -i %s ../csc326/ ubuntu@%s:~/" % (key_pair_path, public_ip))
    os.system("ssh -o StrictHostKeyChecking=no -i %s ubuntu@%s nohup python csc326/runner.py" % (key_pair_path, public_ip))
    print "App Launched"

    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Public IP Address: %s" % public_ip
    print "Instance ID: %s" % instance_id
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"

    return public_ip
Example #9
def update_crawl_data(battle):
    ''' Method used to update battle record with crawl stats and winner.
    Params:
        battle row object
    '''

    # do the actual crawling; returns a dict of data, for example:
    # res = {'num_tweet_winner': 'PURPOSETOUR',
    #        'tag2_num_tweets': '21', 'tag2_num_spell_errors': '68',
    #        'tag1_num_spell_errors': '82', 'tag1_num_tweets': '19',
    #        'num_spell_winner': 'PURPOSETOUR'}
    res = crawler(battle.hashtag1, battle.hashtag2)

    # update battle status to done if its end date has passed
    crawl_status = 'R'
    tz_info = battle.battle_end.tzinfo
    if battle.battle_end <= datetime.now(tz_info):
        crawl_status = 'D'
    
    # update the battle record with the crawled data
    Battle.objects.filter(battle_id=battle.battle_id).update(
            crawl_status=crawl_status,
            tag1_num_tweets=res['tag1_num_tweets'],
            tag2_num_tweets=res['tag2_num_tweets'],
            tag1_num_spell_errors=res['tag1_num_spell_errors'],
            tag2_num_spell_errors=res['tag2_num_spell_errors'],
            num_tweet_winner=res['num_tweet_winner'],
            num_spell_winner=res['num_spell_winner'],
    )
    return True
Example #10
    def Toolbar_TransmMagneturl(self, event): # wxGlade: MainFrame.<event_handler>
        page = self.SearchNotebook.GetCurrentPage()
        if page:
            selected = page.torrList.GetSelectedObject() 
            if selected and selected.magneturl:
                config = self.ReadConfig()
                data = config['transmission']

                plugin_file = open('plugins/'+ selected.plugin + '.json','r')
                plugin = json.loads(plugin_file.read())
                plugin_file.close()

                if 'crawler' in plugin['magnet_url_filter']:
                    config = self.ReadConfig()
                    try:
                        magneturl = crawler(selected.magneturl,plugin['magnet_url_filter']['crawler'],plugin['headers'],config['timeout'])
                    except StandardError,msg:
                        self.ReportError("Can\'t establish a connection. Reason:"+str(msg))
                        return
                else:
                    magneturl = selected.magneturl

                if not self.tc:
                    try:
                        self.tc = transmissionrpc.Client(address=data['host'],port=data['port'],user=data['user'],password=data['pass'])
                    except transmissionrpc.TransmissionError,original:
                        msg = "Can\'t connect to Transmission client:" + str(original)
                        self.ReportError(msg)
                        return -1
                try:
                    self.tc.add_uri(magneturl)
                    self.ReportInfo(str('Magnet URL successfully sent to Transmission!'))
                except transmissionrpc.TransmissionError,original:
                    msg = "Can\'t add magnet to Transmission:" + str(original)
                    self.ReportError(msg)
Example #11
 def test_crawl_depth_0_invertedIndex(self):
     """If the depth is 0 then only the words from main page should be crawled"""
     with open('test_urls.txt', 'w') as f:
         f.write("http://individual.utoronto.ca/peixizhao/")
     self._crawler = crawler(None, 'test_urls.txt')
     self._crawler.crawl(depth = 0)
     self.assertEqual(self._crawler.get_inverted_index(), {1: set([1]), 2: set([1]), 3: set([1]), 4: set([1])
         , 5: set([1]), 6: set([1]), 7: set([1]), 8: set([1]), 9: set([1]), 10: set([1])})
Example #12
 def test_invertedIndex2(self):
     """test individual element in the returned result"""
     with open('test_urls.txt', 'w') as f:
         f.write("http://individual.utoronto.ca/peixizhao/")
     self._crawler = crawler(None, 'test_urls.txt', 'test.db')
     self._crawler.crawl(depth = 1)
     inverted_index_dict = self._crawler.get_inverted_index()
     self.assertEqual(inverted_index_dict[3], set([1,2]))
Example #13
def main(urls_file, _depth):
    drop_db()
    bot = crawler(None, urls_file)
    bot.crawl(depth=_depth)

    doc_id_index(bot.get_links(), bot.get_inverted_doc_id_cache(), bot.get_url_description())

    word_id_index(bot.get_word_id(), bot.get_inverted_index())
Example #14
 def test_invertedIndex(self):
     with open('test_urls.txt', 'w') as f:
         f.write("http://individual.utoronto.ca/peixizhao/")
     self._crawler = crawler(None, 'test_urls.txt', 'test.db')
     self._crawler.crawl(depth = 1)
     self.assertEqual(self._crawler.get_inverted_index(), {1: set([1, 2]), 2: set([1]), 3: set([1, 2]), 4: set([1, 2])
         , 5: set([1, 2]), 6: set([1, 2]), 7: set([1]), 8: set([1]), 9: set([1]), 10: set([1]), 11: set([2]),
                                                           12: set([2]), 13: set([2]), 14: set([2]), 15: set([2]), 16: set([2])})
Example #15
 def test_crawler(self):
     crawl_data = crawler(self.tag1, self.tag2)
     self.assertTrue('tag2_num_tweets' in crawl_data)
     self.assertTrue('tag1_num_tweets' in crawl_data)
     self.assertTrue('tag1_num_spell_errors' in crawl_data)
     self.assertTrue('tag2_num_spell_errors' in crawl_data)
     self.assertTrue('num_spell_winner' in crawl_data)
     self.assertTrue('num_tweet_winner' in crawl_data)
Example #16
	def start_crawling(self): #method to start crawling
		seed=self.seed.get("1.0","end-1c")
		#tkMessageBox.showinfo("Title",seed)
		craw=crawler(seed)
		count=0
		message=""

		#crawling algorithm here :) :) 
		while not craw.is_empty() and count<5 : #configure count's value according to your choice
			count=count+1
			url=craw.remove_url()
			if not craw.is_visited(url):
				craw.add_visited(url)
				pattern=re.compile('http://.*') #only crawl http urls
				
				pattern1=re.compile('.*pdf') #ignoring pdf 
				pattern2=re.compile('.*gif')  #ignoring gif
				pattern3=re.compile('.*jpg')  #ignoring jpg
				pattern4=re.compile('.*jpeg') #ignoring jpeg
				pattern5=re.compile('.*link') #ignoring  forwarding error 503 to some extent
				
				matcher=pattern.match(url)
				matcher1=pattern1.match(url)
				matcher2=pattern2.match(url)
				matcher3=pattern3.match(url)
				matcher4=pattern4.match(url)
				matcher5=pattern5.match(url)

				if matcher and not (matcher1 or matcher2 or matcher3 or matcher4 or matcher5):
					print url + "\nThis is a valid url \nGoing to crawl it NOW !"
					
					#start crawling here
					request=urllib2.Request(url)
					response=urllib2.urlopen(request)
					#content=response.decode('utf-8')
					#could handle exception here
					content=response.read()  #getting the content from the current uniform resource locator :) cool stuff
					#parsing the html content to get all the href links from current page
					soup=BeautifulSoup(content) #creating soup of content
					#write code to save this content in some file on the disk 
					filename="/home/bhaskar/Documents/programming/python/pythonGuiTkinter/crawlerProject/crawledFiles/"+soup.title.string+".txt"
					fileObject=open(filename,"wb")
					fileObject.write(soup.prettify())
					fileObject.close() #close the file so the crawled page is flushed to disk
					print "URL's found in this Page are :"
					#enqueuing all the href links from the current page/resource
					for link in soup.findAll('a'):
						craw.add_url(link.get('href'))
						print link.get('href')
				
				#terminating for testing 
				else:
					#message=message+"\n\n"+url + "\n\nThis is a invalid url \nNot going to crawl it NOW !"
					print url+"\nThis is an invalid URL\nNot going to crawl it !"
				
		print "I have reached the end"
		self.show.config(state="normal") #enabling the show downloaded files button
Example #17
 def test_Lexicon(self):
     with open('test_urls.txt', 'w') as f:
         f.write("http://individual.utoronto.ca/peixizhao/")
     self._crawler = crawler(None, 'test_urls.txt', 'test.db')
     self._crawler.crawl(depth = 1)
     with self._crawler._db_conn:
         c = self._crawler._db_conn.cursor()
         c.execute('SELECT * FROM Lexicon WHERE words=?', ('facebook',))
         result = c.fetchone()[1]
         self.assertEqual(result, 'facebook')
Example #18
 def test_Document(self):
     with open('test_urls.txt', 'w') as f:
         f.write("http://individual.utoronto.ca/peixizhao/")
     self._crawler = crawler(None, 'test_urls.txt', 'test.db')
     self._crawler.crawl(depth = 1)
     with self._crawler._db_conn:
         c = self._crawler._db_conn.cursor()
         c.execute('SELECT * FROM Document WHERE doc_url=?', ('http://individual.utoronto.ca/peixizhao/',))
         result = c.fetchone()[1]
         self.assertEqual(result, 'http://individual.utoronto.ca/peixizhao/')
Example #19
 def do_GET(self):
     parse_object = query_parser(parser(self.path).query)['url'][0] #gets URLs from query parameters
     requests_response = crawler(parse_object)
     if requests_response != 'ERR':
         self.send_response(200)
         self.send_header("Content-type", requests_response[-1])
         self.send_header("Access-Control-Allow-Origin", '*')
         self.end_headers()
         self.wfile.write(requests_response[0]) # sends actual content to be displayed
     else:
         self.send_error(400)
Example #20
def battle_result(request, id=None):
    ''' Method for showing battle result for selected battle id.
    Params:
        request <django request object>
        id <integer> battle id
    '''

    instance = get_object_or_404(Battle, battle_id=id)
    res = crawler(instance.hashtag1, instance.hashtag2)
    messages.success(request, res, extra_tags='html_safe')
    return redirect('battles:list')
Example #21
    def test_resolved_inverted_index_(self):
        #Create object crawler with an empty text file
        con = lite.connect("dbFile.db")
        
        c = crawler(None, "urls_test.txt")
		 
        c.crawl()
        #Check inverted index
        self.assertEqual(c._inverted_index_str[u'languages'], set(['http://www.eecg.toronto.edu/~jzhu/csc326/csc326.html']))
        self.assertEqual(c._inverted_index_str[u'csc326'], set(['http://www.eecg.toronto.edu/~jzhu/csc326/csc326.html']))
        self.assertEqual(c._inverted_index_str[u'programming'], set(['http://www.eecg.toronto.edu/~jzhu/csc326/csc326.html']))
Example #22
	def testOne(self):
		bot = crawler(None, "urls.txt")
		bot.crawl(depth=2)
		inverted_index = bot.inverted_index()
		resolved_inverted_index = bot.resolved_inverted_index()

		expected_inverted_index = {1: set([1, 2]), 2: set([1, 2])}
		expected_resolved_inverted_index = {u'index2': set(['http://hhaider.github.io/mygithubpage/index.html', u'http://hhaider.github.io/mygithubpage/index2.html']), u'index': set(['http://hhaider.github.io/mygithubpage/index.html', u'http://hhaider.github.io/mygithubpage/index2.html'])}

		self.failUnless(inverted_index == expected_inverted_index)
		self.failUnless(resolved_inverted_index == expected_resolved_inverted_index)
Example #23
def test_multi_thread_crawler():
    """
    1) test multithread crawler on test.html
    2)
        a) single thread crawl http://www.eecg.toronto.edu
        b) multi thread crawl http://www.eecg.toronto.edu
        c) compare result
        NOTE: single thread and multi thread result may be different
        due to different timeout, so please try multiple runs
    """
    try:
        #1) test local testcase
        print "    test multi thread on test.html"
        bot = cmt.crawler(None, "test_url.txt")
        bot.crawl(depth=0)
        assert len(bot.get_links()) == 2
        assert len(bot.get_word_id()) == 15
        assert len(bot.get_inverted_index()) == 15

        # 2) compare against single thread result

        # a)single thread crawl http://www.eecg.toronto.edu
        print "    compare multi thread result with single thread"
        start_time = time.time()
        single = cs.crawler(None, "urls.txt")
        single.crawl(depth=1)
        single_time = time.time() - start_time

        # b)multi thread crawl http://www.eecg.toronto.edu
        start_time = time.time()
        multi = cmt.crawler(None, "urls.txt")
        multi.crawl(depth=1)
        multi_time = time.time() - start_time

        delta = single_time - multi_time
        print "/////IMPROVE//////////"
        print "//////%d secs/////////" % delta
        print "////////////////////"

        # c)compare result
#        print "####Compare num of links"
        print "links"
        assert abs(len(single.get_links()) - len(multi.get_links())) < ALLOWRANCE
#        print "####Compare num of word id"

        print "word_id"
        assert abs(len(single.get_word_id()) - len(multi.get_word_id())) < ALLOWRANCE
#        print "####Compare num of inverted index"
        print "inverted"
        assert abs(len(single.get_inverted_index()) - len(multi.get_inverted_index())) < ALLOWRANCE
    except:
        logging.exception("")
        return False
    return True
Example #24
def test_inverted_index():
    print "Test Inverted Index"
    delete_db_file("dbFile_tester.db")
    db_conn = lite.connect("dbFile_tester.db")
    c = crawler(db_conn, 'urls3.txt')
    c.crawl()
    expected_inverted_index = {1: set([1]), 2: set([1]), 3: set([1])}
    if c.get_inverted_index() == expected_inverted_index:
        print "Success!"
    else:
        print "Fail! Wrong inverted_index"
Example #25
def test_resolved_inverted_index():
    print "Test Resolved Inverted Index"
    delete_db_file("dbFile_tester.db")
    db_conn = lite.connect("dbFile_tester.db")
    c = crawler(db_conn, 'urls3.txt')
    c.crawl(depth=1)
    expected_resolved_inverted_index = {u'languages': set(['http://www.eecg.toronto.edu/~jzhu/csc326/csc326.html']), u'csc326': set(['http://www.eecg.toronto.edu/~jzhu/csc326/csc326.html']), u'programming': set(['http://www.eecg.toronto.edu/~jzhu/csc326/csc326.html'])}
    if c.get_resolved_inverted_index() == expected_resolved_inverted_index:
        print "Success!"
    else:
        print "Fail! Wrong resolved_inverted_index"
Example #26
def test_empty_resolved_inverted_index():
    print "Test Empty Resolved Inverted Index"
    delete_db_file("dbFile_tester.db")
    db_conn = lite.connect("dbFile_tester.db")
    c = crawler(db_conn, 'invalid.txt')
    c.crawl()
    expected_resolved_inverted_index = {}
    if c.get_resolved_inverted_index() == expected_resolved_inverted_index:
        print "Success!"
    else:
        print "Fail! With invalid *.txt file, crawler must have empty resolved_inverted_index"
Example #27
def test_crawler_db_results():
    print "Test Crawler Database"
    delete_db_file("dbFile_tester.db")
    db_conn = lite.connect("dbFile_tester.db")
    c = crawler(db_conn, 'urls2.txt')
    c.crawl()
    expected_urls = [u'http://help.websiteos.com/websiteos/example_of_a_simple_html_page.htm']
    if crawler_db.get_sorted_urls("head", "dbFile_tester.db") == expected_urls:
        print "Success!"
    else:
        print "Fail! Wrong crawler_db results"
Example #28
def test_inverted_index_with_two_urls():
    print "Test Inverted Index with Two URLs"
    delete_db_file("dbFile_tester.db")
    db_conn = lite.connect("dbFile_tester.db")
    c = crawler(db_conn, 'urls2.txt')
    c.crawl()
    expected_inverted_index = {1: set([1]), 2: set([1]), 3: set([1]), 4: set([1]), 5: set([1]), 6: set([1]), 7: set([1]), 8: set([1]), 9: set([1]), 10: set([1]), 11: set([1]), 12: set([1]), 13: set([1]), 14: set([1]), 15: set([1]), 16: set([1]), 17: set([1]), 18: set([1]), 19: set([1]), 20: set([1]), 21: set([1]), 22: set([1]), 23: set([1]), 24: set([1]), 25: set([1]), 26: set([1]), 27: set([1]), 28: set([1]), 29: set([1]), 30: set([1]), 31: set([1]), 32: set([1]), 33: set([1]), 34: set([1]), 35: set([1]), 36: set([1]), 37: set([1]), 38: set([1]), 39: set([1]), 40: set([1]), 41: set([1]), 42: set([1]), 43: set([1]), 44: set([1]), 45: set([1]), 46: set([1]), 47: set([1]), 48: set([1]), 49: set([1]), 50: set([1]), 51: set([1]), 52: set([1]), 53: set([1]), 54: set([1]), 55: set([1]), 56: set([1]), 57: set([1]), 58: set([1]), 59: set([1]), 60: set([1]), 61: set([1]), 62: set([1]), 63: set([1]), 64: set([1]), 65: set([1]), 66: set([1]), 67: set([1]), 68: set([1]), 69: set([1]), 70: set([1]), 71: set([1]), 72: set([1]), 73: set([1]), 74: set([1]), 75: set([1]), 76: set([1]), 77: set([1]), 78: set([1]), 79: set([1]), 80: set([1]), 81: set([1]), 82: set([1]), 83: set([1]), 84: set([1]), 85: set([2]), 86: set([2]), 87: set([2])}
    if c.get_inverted_index() == expected_inverted_index:
        print "Success!"
    else:
        print "Fail! Wrong inverted_index"
Example #29
 def test_PageRank_branch(self):
     with open('test_urls.txt', 'w') as f:
         f.write("http://individual.utoronto.ca/peixizhao/")
     self._crawler = crawler(None, 'test_urls.txt', 'test.db')
     self._crawler.crawl(depth = 1)
     with self._crawler._db_conn:
         c = self._crawler._db_conn.cursor()
         c.execute('SELECT * FROM Document WHERE doc_url=?', ('http://individual.utoronto.ca/peixizhao/branch1.html',))
         branch_doc_id = c.fetchone()[0]
         c.execute('SELECT rank FROM PageRank WHERE DocId=?', (branch_doc_id,))
         result = c.fetchone()[0]
         self.assertEqual(result, 0.0)
Example #30
	def testOne(self):
		bot = crawler(None, "urls.txt")
		bot.crawl(depth=2)
		inverted_index = bot.inverted_index()
		print inverted_index
		expected_inverted_index = {1: set([1, 3]), 2: set([1]), 3: set([2])}

		
		got_page_rank = page_rank(bot.links())
		expected_page_rank = {1: 0.05000000000000001, 2: 0.092500000000000027, 3: 0.12862500000000002}
		

		self.failUnless(inverted_index == expected_inverted_index)
		self.failUnless(got_page_rank == expected_page_rank)
Example #31
 def crawler(url):
     from crawler import crawler
     f13_data = crawler(url)
     return f13_data
Example #32
from flask import Flask, render_template, request, redirect
import searchengine, neuralnet, crawler
searcher = searchengine.searcher('searchengine.db')
crawler = crawler.crawler('searchengine.db')
nnet = neuralnet.searchnet('nn.db')


app = Flask(__name__)


@app.route("/")
def search():
	if request.args:
		queryText = request.args.get('q')
		(wordids, scores, urlIdsList, urlsList) = searcher.query(queryText)
		if len(urlIdsList) != 0:
			listOfItems = [{'id': urlIdsList[i], 'url': urlsList[i], 'score': scores[i]} for i in range(len(urlIdsList))]
		else:
			listOfItems = []
		return render_template('index.html', list=listOfItems, q=queryText)
	return render_template('index.html', list=None)


@app.route('/train', methods=['POST', 'GET'])
def train():		
	if request.method == 'POST':
		queryPhrase = request.json['q']
		selectedURLId = int(request.json['clicked'])
		app.logger.debug('queryPhrase: %s => selectedURLId: %s' %(queryPhrase, selectedURLId))
		(wordids, scores, urlIdsList, urlsList) = searcher.query(queryPhrase)
		nnet.trainquery(wordids, urlIdsList, selectedURLId)
Example #33
import time
import datetime
from crawler import crawler
from parser import parser
from html_generator import gen_html

todays_date = datetime.datetime.now().date()
#~ name = "{}_{}_{}".format(todays_date.month,todays_date.day,todays_date.year)
name = "new"
print "Crawling started:"
crawler(name)
print "Parsing Started:"
parser(name)
print "Generating HTML:"
gen_html(name)
print "Done!"
Example #34
import crawler

ptt = crawler.crawler()
ptt.setup_keyword('ptt')
#batch_id = ptt.get_link_session()
#ptt.crawl_ptt_link(batch_id)
#ptt.close_conn()
Example #35
import crawler
import houses_parse
import pymysql
import getregions

if __name__ == '__main__':
    response = crawler.crawler(
        'https://bj.lianjia.com/ershoufang/city?city_id=110000')
    regions_info = getregions.getinfos()
    base_url = 'https://m.lianjia.com/bj/ershoufang/'

    for city_region, city_region_info in regions_info.items():
        for region, url in city_region_info.items():
            i = 1
            while True:
                houses_url = base_url + url + '/pg' + str(i)
                houses_info_response = crawler.crawler(houses_url)
                result = houses_parse.parse_ishashouse(houses_info_response)
                if len(result) != 0:
                    break
                houses_parse.parse_houseinfo(city_region, region,
                                             houses_info_response)
                i = i + 1
Example #36
# -*- coding: utf-8 -*-
"""
Created on Fri Oct  4 07:35:36 2019

@author: pathouli
"""

from crawler import crawler

my_path = 'C:/Users/Timothy/Google Drive/TC Stuff/Analytics/GR 5067 - Natural Language Processing in Social Sciences/HW2/files_q1'
the_query = 'qmss columbia'
num_docs = 50

my_func = crawler()

my_func.write_crawl_results(my_path, the_query, num_docs)
Example #37
from crawler import crawler
import pprint
import sqlite3 as sql

if __name__ == "__main__":
    dbFile = 'dbFile1.db'
    crawler(dbFile, "urls.txt")
    con = sql.connect(dbFile)
    cur = con.cursor()
    query = """
        SELECT docIndex.url, pageRank.score
        FROM pageRank, docIndex
        WHERE pageRank.docid = docIndex.docid
        ORDER BY pageRank.score DESC"""
    cur.execute(query)
    ranks = cur.fetchall()
    con.close()
    print "Page Rank Scores per URL:"
    pprint.pprint(ranks)
Example #38
def main():
    crawler.crawler("/wiki/PageRank")
    pagerank(crawler.graph, 0.3)
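For context, the pagerank call in Example #38 presumably runs the damped PageRank iteration over the crawled link graph. Below is a minimal sketch of that computation, under the assumption that the graph is a dict mapping each page to the pages it links to; the function name, defaults, and sample graph are illustrative, not the module's actual code.

def pagerank_sketch(graph, damping=0.3, iterations=20):
    # graph: {page: [pages it links to]}; every page starts with an equal share of rank
    pages = set(graph) | {p for links in graph.values() for p in links}
    rank = dict((p, 1.0 / len(pages)) for p in pages)
    for _ in range(iterations):
        new_rank = {}
        for page in pages:
            # rank flowing in from every page that links to this one
            incoming = sum(rank[src] / len(links)
                           for src, links in graph.items() if page in links)
            new_rank[page] = (1.0 - damping) / len(pages) + damping * incoming
        rank = new_rank
    return rank

print(pagerank_sketch({'A': ['B'], 'B': ['A'], 'C': ['A', 'B']}))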
Example #39
 def __init__(self):
     self.redis = redisclient()
     self.crawler = crawler()
Example #40
def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    #cache.clear()
    crawler(scrape_callback.seed_url, scrape_callback=scrape_callback, cache=cache, timeout=10, ignore_robots=True)
Example #41
    def setUp(self):
        self.bot = crawler(None, '')

    # def test_word_id(self):

        # insert a few words into the lexicon,
        # and check that _word_id_cache and _revert_word_id
        # map each word to its word_id correctly

        self.assertEqual(self.bot.word_id('apple'), 1)
        self.assertEqual(self.bot.word_id('lemon'), 2)
        self.assertEqual(self.bot.word_id('mango'), 3)
        self.assertEqual(self.bot.word_id('melon'), 4)
        self.assertEqual(self.bot.word_id('peach'), 5)

        self.assertEqual(self.bot._word_id_cache['apple'], 1)
        self.assertEqual(self.bot._word_id_cache['lemon'], 2)
        self.assertEqual(self.bot._word_id_cache['mango'], 3)
        self.assertEqual(self.bot._word_id_cache['melon'], 4)
        self.assertEqual(self.bot._word_id_cache['peach'], 5)

        self.assertEqual(self.bot._revert_word_id[1], 'apple')
        self.assertEqual(self.bot._revert_word_id[2], 'lemon')
        self.assertEqual(self.bot._revert_word_id[3], 'mango')
        self.assertEqual(self.bot._revert_word_id[4], 'melon')
        self.assertEqual(self.bot._revert_word_id[5], 'peach')

    # def test_doc_id(self):

        # insert a few URLs into the document index,
        # and check that _doc_id_cache and _revert_doc_id
        # map each URL to its doc_id correctly

        self.assertEqual(self.bot.document_id('google.com'), 1)
        self.assertEqual(self.bot.document_id('facebook.com'), 2)
        self.assertEqual(self.bot.document_id('instagram.com'), 3)

        self.assertEqual(self.bot._doc_id_cache['google.com'], 1)
        self.assertEqual(self.bot._doc_id_cache['facebook.com'], 2)
        self.assertEqual(self.bot._doc_id_cache['instagram.com'], 3)

        self.assertEqual(self.bot._revert_doc_id[1], 'google.com')
        self.assertEqual(self.bot._revert_doc_id[2], 'facebook.com')
        self.assertEqual(self.bot._revert_doc_id[3], 'instagram.com')

    # def test_add_words_to_document(self):

        # pretend that crawl() has just visited the web page,
        # and now insert words that are found to the document

        self.bot._curr_doc_id = 1
        self.bot._curr_words = [(1, 1), (2, 1), (3, 1)]
        self.bot._add_words_to_document()

        self.bot._curr_doc_id = 2
        self.bot._curr_words = [(2, 1), (3, 1), (4, 1)]
        self.bot._add_words_to_document()

        self.bot._curr_doc_id = 3
        self.bot._curr_words = [(3, 1), (4, 1), (5, 1)]
        self.bot._add_words_to_document()

    # def test_doc_index(self):

        expected_doc_index = {
            1: set([1, 2, 3]),
            2: set([2, 3, 4]),
            3: set([3, 4, 5]),
        }
        self.assertEqual(expected_doc_index, self.bot.get_doc_index())

    # def test_inverted_index(self):

        expected_inverted_index = {
            1: set([1]),
            2: set([1, 2]),
            3: set([1, 2, 3]),
            4: set([2, 3]),
            5: set([3]),
        }
        self.assertEqual(expected_inverted_index, self.bot.get_inverted_index())

    # def test_resolved_inverted_index(self):

        expected_resolved_inverted_index = {
            'apple': set(['google.com']),
            'lemon': set(['google.com', 'facebook.com']),
            'mango': set(['google.com', 'facebook.com', 'instagram.com']),
            'melon': set(['facebook.com', 'instagram.com']),
            'peach': set(['instagram.com'])
        }
        self.assertEqual(expected_resolved_inverted_index, self.bot.get_resolved_inverted_index())
Example #42
def test_inverted_index_with_two_urls():
    print "Test Inverted Index with Two URLs"
    delete_db_file("dbFile_tester.db")
    db_conn = lite.connect("dbFile_tester.db")
    c = crawler(db_conn, 'urls2.txt')
    c.crawl()
    expected_inverted_index = {
        1: set([1]),
        2: set([1]),
        3: set([1]),
        4: set([1]),
        5: set([1]),
        6: set([1]),
        7: set([1]),
        8: set([1]),
        9: set([1]),
        10: set([1]),
        11: set([1]),
        12: set([1]),
        13: set([1]),
        14: set([1]),
        15: set([1]),
        16: set([1]),
        17: set([1]),
        18: set([1]),
        19: set([1]),
        20: set([1]),
        21: set([1]),
        22: set([1]),
        23: set([1]),
        24: set([1]),
        25: set([1]),
        26: set([1]),
        27: set([1]),
        28: set([1]),
        29: set([1]),
        30: set([1]),
        31: set([1]),
        32: set([1]),
        33: set([1]),
        34: set([1]),
        35: set([1]),
        36: set([1]),
        37: set([1]),
        38: set([1]),
        39: set([1]),
        40: set([1]),
        41: set([1]),
        42: set([1]),
        43: set([1]),
        44: set([1]),
        45: set([1]),
        46: set([1]),
        47: set([1]),
        48: set([1]),
        49: set([1]),
        50: set([1]),
        51: set([1]),
        52: set([1]),
        53: set([1]),
        54: set([1]),
        55: set([1]),
        56: set([1]),
        57: set([1]),
        58: set([1]),
        59: set([1]),
        60: set([1]),
        61: set([1]),
        62: set([1]),
        63: set([1]),
        64: set([1]),
        65: set([1]),
        66: set([1]),
        67: set([1]),
        68: set([1]),
        69: set([1]),
        70: set([1]),
        71: set([1]),
        72: set([1]),
        73: set([1]),
        74: set([1]),
        75: set([1]),
        76: set([1]),
        77: set([1]),
        78: set([1]),
        79: set([1]),
        80: set([1]),
        81: set([1]),
        82: set([1]),
        83: set([1]),
        84: set([1]),
        85: set([2]),
        86: set([2]),
        87: set([2])
    }
    if c.get_inverted_index() == expected_inverted_index:
        print "Success!"
    else:
        print "Fail! Wrong inverted_index"
Example #43
from crawler import crawler
import sys
import random

crawler = crawler(None, "url-for-test.txt")
crawler.crawl(depth=1)

URL_1 = "https://marksachinperera.github.io/"
URL_2 = "https://marksachinperera.github.io/ContactMe.html"
URL_3 = "https://marksachinperera.github.io/AboutMe.html"

###****************####
ID_1 = -1
ID_2 = -1
ID_3 = -1
###****************####

print "getting inverted index"
inverted_index = crawler.get_inverted_index()

print "getting resolved index"
resolved_index = crawler.get_resolved_inverted_index()

print "Setting up"

word_list_1 = {
    "jpg", "height", "done", "have", "home", "portfolio", "alt", "web", "le",
    "img", "personal", "mark", "width", "1500", "styles", "picture", "resume",
    "showing", "welcome", "hi", "img_2315", "perera", "projects", "me", "src",
    "about", "name", "1800", "this", "contact", "my", "page"
}
Example #44
import os
import sys

from crawler import crawler
from quiz.models import Player_info
from csvwriter import csvwriter


def main():
    """Run administrative tasks."""
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'myapi.settings')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        raise ImportError(
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?") from exc
    execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()
    data = crawler()
    for item in data:
        Player_info(number=item['number'],
                    name=item['name'],
                    position=item['position'],
                    age=item['age'],
                    nation=item['nation'],
                    team=item['team'],
                    value=item['value'],
                    photo=item['photo']).save()
    #csvwriter()
Example #45
import crawler

crawler.crawler(
    crawler.get_content('https://www.ptt.cc/bbs/WomenTalk/index.html'))
Example #46
        'unit': get_unit(food_info[0])
    }

    res = requests.post(url, json=food_json)

    if DEBUG_MOD:
        print('Post food ' + json.dumps(food_json))
    # return { 'result': 'fail', 'description': ''}


if __name__ == '__main__':
    with open(CATEGORY_BRANDS_LIST_FILE, 'r', encoding='utf8') as f:
        done = False
        while not done:
            data = f.readline().split()
            if data == []:
                done = True
                break

            category, brands = data[0], data[1:]

            category_id = get_category_id(category)
            for brand in brands:
                brand_id = get_brand_id(brand, category_id)
                brand_foods = crawler.crawler(brand)

                for food in brand_foods:
                    food_name, details = food_refine(food)
                    post_food(food_name, details, category_id, brand_id)
                    # To be updated: log post_food failures instead of ignoring them
Example #47
import pprint
from crawler import crawler
import urllib2
import urlparse
from bs4 import BeautifulSoup
from bs4 import Tag
from collections import defaultdict
import redis
from pagerank import page_rank
import re

# Testing File for Lab 3
# What I did in crawler was that I saved the page ranks by descending order,
# using the direct urls of the pages instead of the doc id
# -Marinette

if __name__ == "__main__":
    redisConnection = redis.Redis()
    bot = crawler(redisConnection, "urls.txt")
    bot.crawl(depth=1)
    print "Printing Page Ranks:"
    pprint.pprint(bot.crawler_page_ranks())
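As an aside to Example #47's note about saving page ranks in descending order: one common way to keep ranks sorted in Redis is a sorted set rather than a plain key per URL. A minimal sketch assuming redis-py 3.x; the key name and scores are illustrative, not what this crawler actually stores.

import redis

r = redis.Redis()
ranks = {"http://example.com/a": 0.42, "http://example.com/b": 0.17}  # url -> score
r.zadd("page_ranks", ranks)                                           # store in a sorted set
print(r.zrevrange("page_ranks", 0, 9, withscores=True))               # highest-ranked urls first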
Example #48
from crawler import crawler
from pagerank import page_rank

# Get crawler object and crawl on urls found in urls.txt
crawler = crawler(None, 'urls.txt')
crawler.crawl()

document_index = crawler.get_document_index()

# Run pagerank on the links generated by the crawler
pagerank = page_rank(crawler._links)

for doc_id, rank in sorted(pagerank.iteritems(), key=lambda (k,v): (v,k), reverse=True):
    document = crawler._document_index[doc_id]
    print str(rank) + " : " + str(document[0]) + "\n"
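Note that the tuple-parameter lambda and iteritems in Example #48 only work under Python 2. If this were ported to Python 3, an equivalent sort would look like the sketch below (the pagerank values are illustrative):

pagerank = {1: 0.05, 2: 0.0925, 3: 0.1286}  # doc_id -> rank, made-up values
ordered = sorted(pagerank.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
print(ordered)  # [(3, 0.1286), (2, 0.0925), (1, 0.05)]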
Example #49
import os
import sys
from crawler import crawler

print "Now starting the test...."
print
crawler = crawler(None, "urls.txt")
inverted_index = crawler.get_inverted_index()
print
print "inverted_index is......"
print
print inverted_index
print
print
resolved_inverted_index = crawler.get_resolved_inverted_index()
print "resolved_inverted_index is......"
print
print resolved_inverted_index
Example #50
import requests, pprint, json, urllib, os, functools, ssl, time
import pandas as pd
#from geopy import geocoders
from bs4 import BeautifulSoup
from copy import deepcopy
import ssl
import crawler

key = "AIzaSyASmGMElEZthlsMGEN-p3Nw1NInctWoXTk"
types = "restaurant"
radius = "1000"
fields = "name,formatted_address,rating"
inputString = "pizzeria"

crawl = crawler.crawler(types=types,
                        inputString=inputString,
                        radius=radius,
                        key=key)
dataOfInterest = {
    "name", "geometry", "place_id", "rating", "types", "vicinity", "reviews"
}
manual = """
Hello!
Press 1 for crawling cities. [There are 144 cities in Italy]
Press 2 for crawling communes. [TIME CONSUMING!!! There are 8100 communes in Italy]
Default: Exit!
"""
choices = [None, 'Cities', 'Communes']
option = input(manual)


def initiate(wid):
Example #51
import crawler

if __name__ == "__main__" :
    Clips = [] # clips
    search_word = '방법 1화' # '방법 1화', '관찰카메라24 119화', '반의반 1화'
    vod_no = 1
    crawler.crawler(Clips, search_word, vod_no)

# //*[@id="player"]/div/div[1]/div[11]/div[13]/video/source
# blob handling
Example #52
        archives_url = "http://localhost"
        if option_url:
            archives_url = option_url
        root = archives_url

        createStructure()
        depth = 1
        try:
            depth = int(option_crawler.strip().split()[0])
        except (ValueError, IndexError, AttributeError):
            depth = 0

        try:
            try:
                crawler(archives_url, depth)
            except IOError as e:
                print("Cannot open the url = %s" % archives_url)
                print(e.strerror)
                sys.exit(1)
            if len(database.keys()) < 1:
                print("No information found!")
                sys.exit(1)
            else:
                print(
                    "Starting investigation for the given URL...Please wait..."
                )

            if option_xss:
                investigate(archives_url)
Example #53
 def __init__(self):
     self.crawler = crawler()
     self.first_run = True
     self.date = datetime.datetime.now().date()
Example #54
    def scan(self):
        print "Requesting '%s'..." % (self.URL)

        extHeader = ""
        code, headers = self.doRequest(self.URL, self.config["p_useragent"],
                                       self.config["p_post"],
                                       self.config["header"],
                                       self.config["p_ttl"])

        if (headers != None):
            for head in headers:
                if head[0] in ("set-cookie", "set-cookie2"):
                    cookie = head[1]
                    c = Cookie.SimpleCookie()
                    c.load(cookie)
                    for k, v in c.items():
                        extHeader += "%s=%s; " % (k, c[k].value)

        if (code == None):
            print "Code == None!"
            print "Does the target exist?!"
            print "AutoAwesome mode failed. -> Aborting."
            sys.exit(1)

        if (extHeader != ""):
            print "Cookies retrieved. Using them for further requests."
            extHeader = extHeader.strip()[:-1]

        if (self.config["header"].has_key("Cookie") and extHeader != ""):
            print "WARNING: AutoAwesome mode got some cookies from the server."
            print "Your defined cookies will be overwritten!"

        if (extHeader != ""):
            print "Testing file inclusion against given cookies..."
            self.config["header"]["Cookie"] = extHeader
            single = singleScan(self.config)
            single.setURL(self.URL)
            single.setQuite(True)
            single.scan()

        soup = BeautifulSoup.BeautifulSoup(''.join(code))
        idx = 0
        for form in soup.findAll("form"):
            idx += 1
            caption = None
            desturl = None
            method = None

            if (form.has_key("action")):
                desturl = form["action"]
            else:
                desturl = self.URL

            if (form.has_key("name")):
                caption = form["name"]
            else:
                caption = "Unnamed Form #%d" % (idx)

            if (form.has_key("method")):
                if (form["method"].lower() == "get"):
                    method = 0
                else:
                    method = 1
            else:
                method = 1  # If no method is defined assume it's POST.

            params = ""
            for input in form.findAll("input"):
                if (input.has_key("name")):
                    input_name = input["name"]
                    input_val = None
                    if (input.has_key("value")):
                        input_val = input["value"]

                    if (input_val == None):
                        params += "%s=&" % (input_name)
                    else:
                        params += "%s=%s&" % (input_name, input_val)
                else:
                    print "An input field doesn't have an 'name' attribute! Skipping it."

            if ("&" in params):
                params = params[:-1]

            print "Analyzing form '%s' for file inclusion bugs." % (caption)
            modConfig = deepcopy(self.config)
            if (method == 0):
                # Append the current get params to the current URL.
                if ("?" in desturl):
                    # There are already params in the URL.
                    desturl = "%s&%s" % (desturl, params)
                else:
                    # There are no other params.
                    desturl = "%s&?%s" % (desturl, params)

            else:
                currentPost = modConfig["p_post"]
                if (currentPost == None or currentPost == ""):
                    currentPost = params
                else:
                    currentPost = currentPost + "&" + params

                modConfig["p_post"] = currentPost

            single = singleScan(modConfig)
            single.setURL(desturl)
            single.setQuite(True)
            single.scan()

        print "Starting harvester engine to get links (Depth: 0)..."
        crawl = crawler(self.config)
        crawl.crawl_url(self.URL, 0)
        if (len(crawl.urlpool) == 0):
            print "No links found."
        else:
            print "Harvesting done. %d links found. Analyzing links now..." % (
                len(crawl.urlpool))
            for url in crawl.urlpool:
                try:
                    single = singleScan(self.config)
                    single.setURL(str(url[0]))
                    single.setQuite(True)
                    single.scan()
                except:
                    print "Cought an exception. Continuing..."

        print "AutoAwesome is done."
Example #55
            m = massScan(config)
            m.startMassScan()
            show_report()

        elif (config["p_mode"] == 2):
            print("GoogleScanner is searching for Query: '%s'" %
                  config["p_query"])
            g = googleScan(config)
            g.startGoogleScan()
            show_report()

        elif (config["p_mode"] == 3):
            print(
                "Crawler is harvesting URLs from start URL: '%s' with depth: %d and writing results to: '%s'"
                % (config["p_url"], config["p_depth"], config["p_write"]))
            c = crawler(config)
            c.crawl()

        elif (config["p_mode"] == 4):
            print("AutoAwesome mode engaging URL '%s'..." % (config["p_url"]))
            awe = autoawesome.autoawesome(config)
            awe.setURL(config["p_url"])
            awe.scan()

        elif (config["p_mode"] == 5):
            print("BingScanner is searching for Query: '%s'" %
                  config["p_query"])
            b = bingScan(config)
            b.startGoogleScan()
            show_report()
Example #56
def main(title: str):
    title = str(title)
    fps = config['fps']
    result, audio_url = crawler.crawler(title)
    width = config['width']
    height = config['height']
    for key in result.keys():
        image_name = str(key)
        image_url = result[key]['image_url']
        image_dir = os.sep.join([".", "resource", title])
        crawler.save_image(image_url, image_dir, image_name)
    fourcc = VideoWriter_fourcc(*'mp4v')
    output_dir = os.sep.join(['.', 'output'])
    if not os.path.exists(output_dir):
        print("Folder", output_dir, 'does not exist. Creating...')
        os.makedirs(output_dir)
    video = VideoWriter(os.sep.join([output_dir,
                                     str(title) + '.mp4']), fourcc,
                        float(config['fps']),
                        (config['width'], config['height']))
    font = ImageFont.truetype(config['font'],
                              config['title_font_size'],
                              encoding="utf-8")
    font2 = ImageFont.truetype(config['font'],
                               config['content_font_size'],
                               encoding="utf-8")
    title_wrapper = text_processing.Wrapper(font)
    content_wrapper = text_processing.Wrapper(font2)
    keys = list(result.keys())
    keys.append(0)
    keys.sort()
    keys.append(keys[len(keys) - 1] + 10)
    print(keys)
    frame = image_processing.create_blank_frame("", "", (width, height),
                                                title_wrapper, content_wrapper,
                                                font, font2)
    total_length = keys[len(keys) - 1] * fps
    index = 0
    for i in range(total_length):
        if (index + 1 > len(keys) - 1):
            frame = image_processing.create_blank_frame(
                "", "", (width, height), title_wrapper, content_wrapper, font,
                font2)
        elif (i / fps) > keys[index + 1]:
            index += 1
            print(index, "out of", len(keys))
            key = keys[index]
            image = os.sep.join([
                '.', 'resource', title,
                str(key) +
                text_processing.find_image_suffix(result[key]['image_url'])
            ])
            header = result[key]['header']
            content = result[key]['content']
            print("标题:", header)
            if (result[key]['image_suffix'] in ['.gif', '.GIF']):
                frame = image_processing.create_blank_frame(
                    header, content, (width, height), title_wrapper,
                    content_wrapper, font, font2)
            else:
                frame = image_processing.create_frame(image, header, content,
                                                      (width, height),
                                                      title_wrapper,
                                                      content_wrapper, font,
                                                      font2)
                os.remove(image)
        else:
            pass  # no new scene this frame; keep writing the current frame
        video.write(frame)
    print(title, "finished!")
Example #57
# -*- coding: utf-8 -*-

import os
import sys

from config import init_file

from crawler import crawler
from initdb import initdb

if __name__ == '__main__':
    if not os.path.exists(init_file):
        print('Initializing database...')
        initdb()
    if len(sys.argv) > 1 and sys.argv[1] == '--clean':
        print('Initializing database...')
        initdb()
    while True:
        #try:
        crawler()
        #except Exception as e:
        #    print(e)
Example #58
import crawler
import searcher

pages = ['https://www.codechef.com/']

C = crawler.crawler('codechef.db')
C.createindextables()

print "Crawling :: \n"
C.crawl(pages)

print "Ranking Pages :: \n"
C.calculatepagerank()

S = searcher.searcher('codechef.db')

searchQuery = 'Saturday'
S.query(searchQuery)
Example #59
# -*- coding: utf-8 -*-

# Author: 泽同学
# Blog: www.orze.top
from proxy.text import *
import threading
from crawler.crawler import *
sys.path.append("/Users/wangzeqing/Desktop/python/玩玩/bilibili")
from db.dborder import *
lock = threading.Lock()
dbcon = mysqlconnect(lock)
p = threading.Thread(target=dbcon.print_list)
p.start()

print("+++++++++++++++++++++++++++++++++++\n开始验证proxy表中的数据")
#kong('proxy',dbcon)
print("+++++++++++++++++++++++++++++++++++\n开始爬虫进程")
crawler('proxy',dbcon)
print("+++++++++++++++++++++++++++++++++++\n开始测试temporary中的数据")
kong('temporary',dbcon)
print("+++++++++++++++++++++++++++++++++++\n开始验证recycle表中的数据")
kong('recycle',dbcon)





Example #60
def search():

    # Check cookies to see if we have previously saved searches
    url_cookie = request.cookies.get('urls')

    # Check cookies for errors
    url_error = request.cookies.get('url_error')
    keyword_error = request.cookies.get('keyword_error')

    # Use this delimiter for urls when they're saved as a string
    delimiter = ", "

    # Post handler - if the user has posted data from the form to this url:
    if request.method == 'POST':

        # Get variables from the form
        url = request.form['starting_url']
        method = request.form['method']
        depth = request.form['depth']
        keyword = request.form['keyword']

        # FIXME Make form object to send to crawler??
        form_data = {
            'starting_url': url,
            'method': method,
            'depth': depth,
            'keyword': keyword
        }

        # Validate url
        if url_validator(url):

            # Validate keyword
            if keyword_validator(keyword):

                # # FIXME Trace statements (DELETE)
                # print("Starting url: %s" %url)
                # print("Method: %s" %method)
                # print("Depth: %s" %depth)
                # print("Keyword: %s" %keyword)

                # Call crawler
                # crawler_thread = threading.Thread(target=crawl.crawler, args=form.data)
                # crawler_thread.start()
                # app.logger.info(form.data)
                # crawl.crawler(form.data)      # Call function to perform crawl using the Form submissions on the the search routes
                crawl.crawler(url, method, depth, keyword)

                # Use make_response to create response object so we can set cookies
                # Create response object that redirects to 'results' url
                response = make_response(redirect(url_for('results'),
                                                  code=307))

                # If url history cookie is already set, append the new url to the cookie string
                if url_cookie:
                    if url not in url_cookie:
                        # FIXME append url to cookie string with ", " delimiter
                        url_cookie += ", " + url
                        response.set_cookie('urls', url_cookie)

                # Else, if no 'urls' cookie yet, create 'urls' cookie and add new url
                else:
                    response.set_cookie('urls', url)

                # Set the cookie and redirect to the results page
                return response

            # Else if keyword is invalid, redirect back to search page and display keyword warning
            else:
                # Set error message to be displayed on search form
                keyword_error = "Invalid keyword submitted. Please enter a single word, letters only"

                # Flash the error message to session cookie and redirect back to page
                flash(keyword_error)
                return redirect(url_for('search'))

        # Else if url is not valid, redirect back to search page and display url error
        else:
            # Set error message to be displayed on search form
            url_error = "Invalid URL submitted. Please enter a valid URL"

            # Flash the error message to session cookie and redirect back to page
            flash(url_error)
            return redirect(url_for('search'))

    # Else if the user arrived via GET request from homepage, render the search form
    else:
        # Initialize url_list to None (url_error and keyword_error were read from cookies above)
        url_list = None

        # Check for previously saved searches to save as list in url_list to be
        # used in dropdown input form
        if url_cookie:
            # Split into list to send to template
            url_list = url_cookie.split(delimiter)

        # Render the search form template with either a list of url's or nothing
        return render_template('search.html',
                               url_list=url_list,
                               url_error=url_error,
                               keyword_error=keyword_error)
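The url_validator and keyword_validator helpers used in Example #60 are referenced but not shown. A plausible minimal shape for them, consistent with the error messages in this route, might be the following; this is a hypothetical sketch, not the project's actual implementation.

import re
from urllib.parse import urlparse

def url_validator(url):
    # Accept only absolute http(s) URLs that have a host.
    parts = urlparse(url)
    return parts.scheme in ('http', 'https') and bool(parts.netloc)

def keyword_validator(keyword):
    # A single word, letters only, matching the warning text above.
    return bool(re.match(r'^[A-Za-z]+$', keyword))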