Пример #1
0
 def test_remove_out_links(self):
     """Re-submitting a document with an empty link list should drop its out-links.

     Document "1" is first stored with a single out-link and then stored again
     with none; the out-link count reported by the driver is expected to fall
     from 1 to 0.
     """
     driver = queries.Driver()
     doc_id, doc_url = "1", "https://www.site-1.com"
     probed_url = "https://www.site-0.com"

     driver.update(doc_id, doc_url, ["https://www.site-2.com"])
     # NOTE(review): the document written above is site-1, yet the out-links
     # are read back for site-0 -- confirm this is intentional and not a typo.
     links_before = driver.get_outlinks(probed_url)
     assert len(links_before) == 1

     driver.update(doc_id, doc_url, [])
     links_after = driver.get_outlinks(probed_url)
     assert len(links_after) == 0
Пример #2
0
 def test_unknown_url(self):
     """A document id that was never stored should still be reported, with rank 0.

     One document ("0") is stored and PageRank is run; the ranking is then
     requested for id "100", which was never passed to ``update``.
     """
     driver = queries.Driver()
     driver.update("0", "https://www.site-0.com", ["https://www.site-1.com"])
     driver.run_pagerank()

     unknown_id = "100"
     scores = driver.get_pagerank([unknown_id])
     # The unknown id must appear in the result rather than being omitted.
     assert len(scores) == 1
     assert scores[unknown_id] == 0
Пример #3
0
 def test_efficiency(self):
     """Storing 1000 generated documents must take less than 15 seconds.

     Only the ``update`` calls are timed; creating the driver and generating
     the test data happen before the clock starts.
     """
     update_driver = queries.Driver()
     test_data = generateTestData(1000)

     # perf_counter() is monotonic, so the measured duration cannot be skewed
     # by system clock adjustments the way a time.time() difference can.
     start = time.perf_counter()
     for docid in test_data:
         doc = test_data[docid]
         update_driver.update(docid, doc["url"], doc["out_links"])
     elapsed = time.perf_counter() - start

     assert elapsed < 15
Пример #4
0
 def test_efficiency(self):
     """Running PageRank and fetching the ranks of 100 documents must take under 1 second.

     The documents are stored before the clock starts; only ``run_pagerank``
     and the following ``get_pagerank`` call are timed.
     """
     rank_driver = queries.Driver()
     test_data = generateTestData(100)
     for docid in test_data:
         doc = test_data[docid]
         rank_driver.update(docid, doc["url"], doc["out_links"])

     # perf_counter() is monotonic, so the measured duration cannot be skewed
     # by system clock adjustments the way a time.time() difference can.
     start = time.perf_counter()
     rank_driver.run_pagerank()
     # The returned rankings are not inspected; the call is made only so that
     # its latency is included in the measurement.
     rank_driver.get_pagerank(test_data.keys())
     elapsed = time.perf_counter() - start

     assert elapsed < 1
Пример #5
0
 def test_single(self):
     """A page that three other pages link to should receive a positive rank.

     Document "0" is stored with no out-links, documents "1" to "3" are each
     stored with a single out-link to it, and after PageRank runs the rank
     requested for "0" must be greater than zero.
     """
     driver = queries.Driver()
     target_url = "https://www.site-0.com"
     driver.update("0", target_url, [])

     linking_pages = (
         ("1", "https://www.site-1.com"),
         ("2", "https://www.site-2.com"),
         ("3", "https://www.site-3.com"),
     )
     for doc_id, doc_url in linking_pages:
         driver.update(doc_id, doc_url, [target_url])

     driver.run_pagerank()
     scores = driver.get_pagerank(["0"])
     assert len(scores) == 1
     assert scores["0"] > 0
Пример #6
0
import sys
from CrawlManager import CrawlManager
import time

# Flask application object for this module.
app = Flask(__name__)

# Addresses of the crawling endpoints that requests are alternated between.
CRAWLING_ENDPOINTS = ["http://lspt-crawler1.cs.rpi.edu", "http://lspt-crawler3.cs.rpi.edu:3333"]
alternator = 0 # Flag toggled to alternate between the crawling endpoints
# NOTE(review): presumably an upper bound on links handled at once -- its use
# is not visible in this part of the file; confirm before relying on it.
MAX_LINKS = 10

# Seed URLs: registered with the graph and queued in the crawl manager below.
crawl_links = ['http://rpi.edu',  'http://cs.rpi.edu', 'http://info.rpi.edu',
	'http://admissions.rpi.edu', 'http://rpiathletics.com' , 'https://research.rpi.edu',
	'https://news.rpi.edu', 'https://studentlife.rpi.edu', 'https://giving.rpi.edu', 
	'https://studenthealth.rpi.edu', 'https://sexualviolence.rpi.edu/', 'https://sll.rpi.edu/',
	'https://union.rpi.edu']
graph = queries.Driver() # Neo4j graph interface initialization

# Initialize the link manager: the seed URLs are first handed to the graph,
# then the manager is built on top of that graph and each seed URL is added
# to it individually.
graph.add_initial_urls(crawl_links)
manager = CrawlManager(graph)
for link in crawl_links:
	manager.add(link)

'''
Alternate between crawling endpoints
@modifies alternator between 0 and 1
@return endpoint address
'''
def get_crawling_endpoint():
	global alternator
	alternator = not alternator