Example No. 1
def start_broad_crawl_job(workspace_id, num_to_fetch, broadness,
                          broad_crawler_provider, broad_crawler_sources,
                          crawl_type):

    # check that there is trained data
    categorized_urls = get_seeds_urls_categorized(workspace_id)

    if 'relevant' not in categorized_urls or len(
            categorized_urls['relevant']) == 0:
        raise InvalidUsage("No trained URLS!", status_code=409)

    job_id = save_job(workspace_id,
                      num_to_fetch=int(num_to_fetch),
                      broad_crawler_provider=broad_crawler_provider,
                      broad_crawler_sources=broad_crawler_sources,
                      crawl_type=crawl_type)

    job_id = str(job_id)
    queue_broad_crawl(workspace_id,
                      job_id=job_id,
                      num_to_fetch=int(num_to_fetch),
                      broadness=broadness,
                      broad_crawler_provider=broad_crawler_provider,
                      broad_crawler_sources=broad_crawler_sources)

    return job_id
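
Every handler in these examples signals client errors by raising InvalidUsage with a message and a 409 status code, but the exception class itself is not part of the excerpt. Below is a minimal sketch of how such a class is commonly defined and registered in Flask; the class body, the app object, and the error handler follow the standard custom-API-exception pattern and are assumptions, not the project's actual code.

from flask import Flask, jsonify

app = Flask(__name__)  # hypothetical app object; the real one is not shown in the excerpt


class InvalidUsage(Exception):
    # Assumed shape: an API error carrying a message and an HTTP status code.
    status_code = 400

    def __init__(self, message, status_code=None, payload=None):
        super(InvalidUsage, self).__init__(message)
        self.message = message
        if status_code is not None:
            self.status_code = status_code
        self.payload = payload

    def to_dict(self):
        rv = dict(self.payload or ())
        rv['message'] = self.message
        return rv


@app.errorhandler(InvalidUsage)
def handle_invalid_usage(error):
    # Convert the raised exception into a JSON response with the requested status code.
    response = jsonify(error.to_dict())
    response.status_code = error.status_code
    return response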
Example No. 2
def broad_crawl_publication_api(workspace_id):
    try:
        num_to_fetch = request.json['nResults']
        broad_crawler_provider = request.json['crawlProvider']
        broad_crawler_sources = request.json['crawlSources']

        logging.info("Going to fetch %s urls with broad crawl" %
                     str(num_to_fetch))
        job_id = start_broad_crawl_job(
            workspace_id,
            num_to_fetch=int(num_to_fetch),
            broad_crawler_provider=broad_crawler_provider,
            broad_crawler_sources=broad_crawler_sources,
            crawl_type="BROADCRAWL")

        if job_id is None:
            return Response(json.dumps(
                {"errorMessage": "No keywords provided", "error": 2002}),
                            mimetype="application/json")

        return Response(json.dumps({"jobId": job_id}),
                        mimetype="application/json")

    except NameError as e:
        raise InvalidUsage(str(e), status_code=409)
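
For reference, broad_crawl_publication_api reads nResults, crawlProvider and crawlSources from the request body. A hedged client-side sketch of calling it follows; the URL, workspace id and source values are made up for illustration, and only the JSON keys come from the code above.

import requests

# Hypothetical endpoint; the actual route registration is not shown in the excerpt.
url = "http://localhost:5000/workspaces/1234/broadcrawl"

payload = {
    "nResults": 100,                            # becomes num_to_fetch
    "crawlProvider": "HH-JOOGLE",               # becomes broad_crawler_provider (value borrowed from Example No. 9)
    "crawlSources": ["source-a", "source-b"],   # becomes broad_crawler_sources (illustrative values)
}

response = requests.post(url, json=payload)
print(response.json())  # expected shape on success: {"jobId": "<job id>"}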
Example No. 3
def add_workspace_api():
    try:
        name = request.get_data(as_text=True)
        add_workspace(name)
        in_doc = list_workspace()
        out_doc = JSONEncoder().encode(in_doc)
        return Response(out_doc, mimetype="application/json")
    except AddingWorkspaceError:
        raise InvalidUsage('A workspace with that name already exists',
                           status_code=409)
Example No. 4
def edit_api(id):

    roles = None
    active = None
    if 'roles' in request.json:
        roles = request.json['roles']

    if 'active' in request.json:
        active = bool(request.json['active'])

    if active is None and roles is None:
        raise InvalidUsage("no update provided", status_code=409)

    update_user(id, active, roles)
    return Response("{}", mimetype="application/json")
Example No. 5
def start_smart_crawl_job(workspace_id, num_to_fetch, broadness):
    urls = __get_urls(workspace_id)
    if len(urls) == 0:
        raise InvalidUsage("No trained URLS!", status_code=409)

    job_id = save_smart_crawl_job(workspace_id,
                                  num_to_fetch=int(num_to_fetch),
                                  broadness=broadness)
    page_model = __get_page_model(workspace_id)
    __queue_smart_crawl_start(workspace_id,
                              job_id=job_id,
                              page_limit=int(num_to_fetch),
                              broadness=broadness,
                              urls=urls,
                              page_model=page_model)
    return job_id
Example No. 6
def create_account(username):
    password = request.json['password']
    encrypted_password = utils.encrypt_password(password)

    try:
        Singleton.getInstance().user_datastore.create_user(
            email=username,
            password=encrypted_password,
            roles=[],
            active=True,
            login_count=0)
    except NotUniqueError:
        raise InvalidUsage('A user with that email already exists',
                           status_code=409)

    return Response("{}", mimetype="application/json")
Example No. 7
def schedule_spider_searchengine_api(workspace_id):

    num_to_fetch = request.json['nResults']
    broad_crawler_provider = request.json['crawlProvider']
    broad_crawler_sources = request.json['crawlSources']

    try:
        job_id = schedule_spider_searchengine(
            workspace_id,
            num_to_fetch=int(num_to_fetch),
            broad_crawler_provider=broad_crawler_provider,
            broad_crawler_sources=broad_crawler_sources)
        return Response('{"jobId": "' + job_id + '"}',
                        mimetype="application/json")
    except NameError as e:
        raise InvalidUsage(str(e), status_code=409)
Example No. 8
def smart_crawl_publication_api(workspace_id):
    try:
        num_to_fetch = request.json['nResults']
        broadness = request.json['broadness']
        job_id = start_smart_crawl_job(workspace_id,
                                       num_to_fetch=int(num_to_fetch),
                                       broadness=broadness)

        if job_id is None:
            return Response(
                json.dumps({"errorMessage": "Job failed to start", "error": 2002}),
                mimetype="application/json")

        return Response(json.dumps({"jobId": job_id}),
                        mimetype="application/json")

    except NameError as e:
        raise InvalidUsage(str(e), status_code=409)
Example No. 9
def start_deep_crawl_job(workspace_id, num_to_fetch, selection):
    broad_crawler_provider = "HH-JOOGLE"
    crawl_type = "DEEPCRAWL"

    broad_crawler_sources = []
    for key, value in selection.items():
        if value["allSelected"] or len(value["selected"]) > 0:
            broad_crawler_sources.append(key)

    urls = __get_seeds_url_by_selection(workspace_id, selection)
    if len(urls) == 0:
        raise InvalidUsage("No Seed URLs were selected!", status_code=409)

    domains = extract_domains_from_urls(urls)

    job_id = save_job(workspace_id,
                      num_to_fetch=int(num_to_fetch),
                      crawler_provider=broad_crawler_provider,
                      crawler_sources=broad_crawler_sources,
                      crawl_type=crawl_type,
                      status="STARTED")

    login_credentials = get_successful_logins(workspace_id, domains)
    for doc in login_credentials:
        if "keyValues" in doc:
            # rename Mongo-style keys to the names the deep-crawl queue expects
            doc["key_values"] = doc.pop("keyValues")
            doc["id"] = doc.pop("_id")

    queue_deep_crawl_start(workspace_id,
                           job_id=job_id,
                           num_to_fetch=num_to_fetch,
                           urls=urls,
                           login_credentials=login_credentials)
    return job_id
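
The loop over login_credentials above converts Mongo-style keys (keyValues, _id) into the names the deep-crawl queue expects (key_values, id). A tiny self-contained illustration of that transformation on a made-up credential document:

# Made-up credential document, shaped like one returned by get_successful_logins().
doc = {
    "_id": "abc123",  # illustrative id only
    "domain": "example.com",
    "keyValues": {"user": "alice", "pass": "secret"},
}

if "keyValues" in doc:
    doc["key_values"] = doc.pop("keyValues")
    doc["id"] = doc.pop("_id")

print(doc)  # 'keyValues' and '_id' are gone; 'key_values' and 'id' carry their values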