def test_single_page_crawl(): c = Crawler("http://localhost:5000/test_one",1) print "initialized crawler.." c.crawl() print "crawled website.." assert c.urls == ["http://localhost:5000/test_one"] assert c.data == ["Hello World!"]
def test_single_page_with_complex_content(): c = Crawler("http://hackingagainstslavery.github.io",1) print "initalized crawler.." c.crawl() print "crawled website.." assert c.urls == ['https://github.com/hackingagainstslavery', 'http://hackingagainstslavery.slack.com', 'http://hackingagainstslavery.github.io'] print c.data assert c.data == ["Hacking Against Slavery HomeAboutBlog Welcome to Hacking Against Slavery! An informal organization for ending slavery Sign up for our slack channel: http://hackingagainstslavery.slack.com by emailing us at [email protected] out our github: https://github.com/hackingagainstslaverySeperately our voices are weak. Together our voices are strong. It's up to all of us to fight the evil in the world. And it starts by ensuring freedom for all. email github.com/hackingagainstslavery"]
def main(website, depth):
    """Crawl *website* to *depth*, save the index to json, and push each
    indexed document to the i14y search backend.

    :param website: hostname to crawl; "https://" is prepended automatically.
    :param depth: crawl depth (coerced to int, so a string from argv is fine).

    Backend credentials are read from the drawer_handle and
    search_secret_token environment variables.
    """
    website = "https://" + website
    crawler = Crawler(website, int(depth))
    crawler.crawl()
    crawler.save_to_json()
    # Use a context manager so the file handle is closed instead of leaked
    # (json.load(open(...)) never closes the file).
    with open('index.json', 'r') as index_file:
        index = json.load(index_file)
    for ind, elem in enumerate(index):
        i14yClient.create(ind, elem['content'], elem['path'],
                          elem['created'],
                          os.environ["drawer_handle"],
                          os.environ["search_secret_token"],
                          title=elem['title'],
                          description=elem['description'],
                          promote=elem['promote'],
                          language=elem['language'])
def main(website, depth, production=True):
    """Crawl *website* to *depth*, save the index to json, and push each
    indexed document to the i14y search backend.

    :param website: hostname to crawl; "https://" is prepended automatically.
    :param depth: crawl depth (coerced to int).
    :param production: when False, the crawler authenticates with the
        staging basic-auth credentials from the environment.

    NOTE(review): the i14y credentials come from i14y_creds.pickle and are
    used for BOTH branches — the production path still pushes with these
    creds (see the ToDo below); confirm this is intended.
    """
    # Context manager closes the pickle file instead of leaking the handle.
    with open("i14y_creds.pickle", "r") as creds_file:
        staging = pickle.load(creds_file)
    website = "https://" + website
    if production:
        c = Crawler(website, int(depth))
    else:
        # Staging sits behind basic auth; creds come from the environment.
        c = Crawler(website, int(depth),
                    username=os.environ["staging_username"],
                    password=os.environ["staging_password"],
                    basic_auth_required=True)
    c.crawl()
    c.save_to_json()
    with open('index.json', 'r') as index_file:
        index = json.load(index_file)
    # ToDo: Create a staging drawer and request a second search token
    for ind, elem in enumerate(index):
        i14yClient.create(ind, elem['content'], elem['url'],
                          elem['created'], staging[0], staging[1],
                          title=elem['title'],
                          description=elem['description'],
                          promote=elem['promote'],
                          language=elem['language'])
def test_uniqueify(): c = Crawler("https://www.vets.gov", 2) c.crawl() c.uniqueify() c.save_to_json() index = json.load(open('index.json', 'r')) urls = [] unique_urls = [] for ind, elem in enumerate(index): urls.append(elem['url']) unique_urls = list(set(urls)) urls = [str(url) for url in urls] unique_urls = [str(url) for url in unique_urls] urls.sort() unique_urls.sort() print print "unique urls", len(unique_urls) print unique_urls print print print "urls", len(urls) print urls assert urls == unique_urls
from ingestion.engine import Crawler c = Crawler("http://127.0.0.1:5000",2,testing=True,protocol="http") c.crawl() print c.data print c.urls
"""Crawl the site given on the command line and push the resulting index
to the i14y search backend.

Usage: python <script> <website> <depth>

Credentials are read from website_creds.pickle (site basic auth, currently
unused — see the commented Crawler kwargs) and backend_creds.pickle
(i14y drawer handle + secret token).
"""
from ingestion.engine import Crawler
from api.clients import i14yClient
from sys import argv
import pickle
import json

# Context managers close the pickle files instead of leaking the handles.
with open("website_creds.pickle", "r") as creds_file:
    website_creds = pickle.load(creds_file)
with open("backend_creds.pickle", "r") as creds_file:
    backend_creds = pickle.load(creds_file)

# Basic-auth kwargs kept for reference; enable when the site requires auth:
# Crawler(argv[1], int(argv[2]), username=website_creds["username"],
#         password=website_creds["password"], basic_auth_required=True)
c = Crawler(argv[1], int(argv[2]))
c.crawl()
c.save_to_json()

with open('index.json', 'r') as index_file:
    index = json.load(index_file)
for ind, elem in enumerate(index):
    i14yClient.create(ind, elem['content'], elem['path'],
                      elem['created'],
                      backend_creds["drawer_handle"],
                      backend_creds["secret_token"],
                      title=elem['title'],
                      description=elem['description'],
                      promote=elem['promote'],
                      language=elem['language'])
from ingestion.engine import Crawler c = Crawler("http://127.0.0.1:5000", 2, testing=True, protocol="http") c.crawl() print c.data print c.urls