Example #1
from ingestion.engine import Crawler
from api.clients import i14yClient
import json
import os
import pickle

def main(website, depth, production=True):
    # staging holds the i14y drawer handle and secret token as a two-element sequence.
    with open("i14y_creds.pickle", "rb") as f:
        staging = pickle.load(f)
    website = "https://" + website
    if production:
        c = Crawler(website, int(depth))
    else:
        # Staging sites sit behind HTTP basic auth.
        c = Crawler(website,
                    int(depth),
                    username=os.environ["staging_username"],
                    password=os.environ["staging_password"],
                    basic_auth_required=True)
    c.crawl()
    c.save_to_json()
    with open('index.json', 'r') as f:
        index = json.load(f)
    # TODO: Create a staging drawer and request a second search token.
    for ind, elem in enumerate(index):
        i14yClient.create(ind,
                          elem['content'],
                          elem['url'],
                          elem['created'],
                          staging[0],
                          staging[1],
                          title=elem['title'],
                          description=elem['description'],
                          promote=elem['promote'],
                          language=elem['language'])
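
This main is presumably invoked from the command line, as the standalone script in Example #4 suggests. A minimal driver sketch under that assumption (the script name ingest.py and the --staging flag are illustrative, not from the source):

if __name__ == "__main__":
    from sys import argv
    # e.g.  python ingest.py vets.gov 2 --staging
    website, depth = argv[1], argv[2]
    production = "--staging" not in argv[3:]
    main(website, depth, production=production)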
Example #2
from ingestion.engine import Crawler
from api.clients import i14yClient
import json
import os

def main(website, depth):
    website = "https://" + website
    c = Crawler(website, int(depth))
    c.crawl()
    c.save_to_json()
    with open('index.json', 'r') as f:
        index = json.load(f)
    # Push every crawled page into the i14y drawer named in the environment.
    for ind, elem in enumerate(index):
        i14yClient.create(ind, elem['content'], elem['path'],
                          elem['created'],
                          os.environ["drawer_handle"],
                          os.environ["search_secret_token"],
                          title=elem['title'], description=elem['description'],
                          promote=elem['promote'], language=elem['language'])
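
This variant reads the drawer handle and token from the environment instead of a pickle. A small pre-flight check, sketched with the exact variable names used in the call above, fails fast when they are missing:

import os

for var in ("drawer_handle", "search_secret_token"):
    if var not in os.environ:
        raise SystemExit("missing required environment variable: " + var)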
Example #3
from ingestion.engine import Crawler
import json

def test_uniqueify():
    c = Crawler("https://www.vets.gov", 2)
    c.crawl()
    c.uniqueify()
    c.save_to_json()
    with open('index.json', 'r') as f:
        index = json.load(f)
    urls = [str(elem['url']) for elem in index]
    unique_urls = sorted(set(urls))
    urls.sort()
    print()
    print("unique urls", len(unique_urls))
    print(unique_urls)
    print()
    print()
    print("urls", len(urls))
    print(urls)

    # After uniqueify(), the saved index must contain no duplicate URLs.
    assert urls == unique_urls
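
The sort-and-compare above amounts to asserting that no URL appears twice. An equivalent, terser check (a sketch using collections.Counter, so a failure also names the offending URLs):

from collections import Counter

def assert_no_duplicate_urls(index):
    # Any URL counted more than once survived uniqueify() and fails the test.
    counts = Counter(elem['url'] for elem in index)
    duplicates = {url: n for url, n in counts.items() if n > 1}
    assert not duplicates, "duplicate urls in index: %r" % duplicates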
Example #4
from ingestion.engine import Crawler
from api.clients import i14yClient
from sys import argv
import pickle
import json

# website_creds holds HTTP basic-auth details for the crawled site;
# backend_creds holds the i14y drawer handle and secret token.
with open("website_creds.pickle", "rb") as f:
    website_creds = pickle.load(f)
with open("backend_creds.pickle", "rb") as f:
    backend_creds = pickle.load(f)

# Usage: <script> WEBSITE DEPTH
c = Crawler(argv[1], int(argv[2]))  # , username=website_creds["username"], password=website_creds["password"], basic_auth_required=True
c.crawl()
# print(c.data)
# print(c.urls)
c.save_to_json()
with open('index.json', 'r') as f:
    index = json.load(f)
for ind, elem in enumerate(index):
    i14yClient.create(ind, elem['content'], elem['path'],
                      elem['created'],
                      backend_creds["drawer_handle"],
                      backend_creds["secret_token"],
                      title=elem['title'], description=elem['description'],
                      promote=elem['promote'], language=elem['language'])
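
The two credential pickles are read here but never written. A one-off sketch of how they might be produced, assuming the dictionary keys used above (username, password, drawer_handle, secret_token); the placeholder strings stand in for real secrets:

import pickle

with open("website_creds.pickle", "wb") as f:
    pickle.dump({"username": "...", "password": "..."}, f)
with open("backend_creds.pickle", "wb") as f:
    pickle.dump({"drawer_handle": "...", "secret_token": "..."}, f)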