Example #1
def dev_list(collection):
    """
    Helper method that gets called when testing the command using a mocked collection.

    Input:
        collection: collection to perform operations with/on
    """
    all_docs = list_scrapers_from_db(collection)
    return all_docs
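A minimal sketch (not part of the original examples) of how dev_list could be exercised against a mocked collection, following the same mongomock setup and field names used in Examples #4 and #5 below:
import mongomock


def test_dev_list():
    # Seed an in-memory collection the way the registration tests do
    mock_collection = mongomock.MongoClient().db.collection
    mock_collection.insert_one(dict(_id="url1", name="test"))

    # dev_list should return every registered scraper document
    docs = dev_list(mock_collection)
    assert len(docs) == 1
    assert docs[0]["name"] == "test"
    assert docs[0]["_id"] == "url1"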
Example #2
def dev_testscraper(collection, name):
    try:
        scrapers = list_scrapers_from_db(collection)
        scraper = list(
            filter(lambda scraper: scraper[SCRAPER_COLL_NAME_FIELD] == name,
                   scrapers))
        route = scraper[0]["_id"] + "/test"
    except Exception:
        return
    return requests.get(route).text
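A sketch of how dev_testscraper might be tested without hitting the network, assuming list_scrapers_from_db reads raw documents as in the tests below and patching requests.get with unittest.mock; the URL and field values here are illustrative:
import mongomock
from unittest import mock


def test_dev_testscraper():
    # Register one scraper in an in-memory collection
    mock_collection = mongomock.MongoClient().db.collection
    mock_collection.insert_one(dict(_id="http://example.com/scraper", name="test"))

    # Patch requests.get so no real HTTP request is made
    with mock.patch("requests.get") as fake_get:
        fake_get.return_value.text = "ok"
        assert dev_testscraper(mock_collection, "test") == "ok"
        fake_get.assert_called_once_with("http://example.com/scraper/test")

    # An unknown name falls into the except branch and returns None
    assert dev_testscraper(mock_collection, "missing") is None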
Example #3
def cli(ctx):
    """
    GG list lists all scrapers registered in the database. It prints out the
    name of each registered scraper.
    """
    authenticate()
    collection = db_get_collection()
    ctx.log("Listing all registered scrapers!")
    all_docs = list_scrapers_from_db(collection)
    for doc in all_docs:
        ctx.log("  {}".format(doc[SCRAPER_COLL_NAME_FIELD]))
Example #4
def test_existence():
    # Mock collection and test data
    mock_collection = mongomock.MongoClient().db.collection
    name = "test"
    url = "url1"

    send_scraper_to_db(mock_collection, name, url, test=True)
    assert mock_collection.count_documents({}) == 1

    docs = list_scrapers_from_db(mock_collection)
    assert docs[0]["name"] == "test"
    assert docs[0]["_id"] == "url1"
Example #5
def test_update():
    # Mock collection and test data
    mock_collection = mongomock.MongoClient().db.collection
    mock_collection.insert_one(dict(_id="url1", name="test"))
    name = "test"
    url = "url1"

    status = send_scraper_to_db(mock_collection, name, url, test=True)
    assert status == "Registration sent to db with id: url1"
    assert mock_collection.count_documents({}) == 1

    docs = list_scrapers_from_db(mock_collection)
    assert docs[0]["name"] == "test"
    assert docs[0]["_id"] == "url1"
Example #6
def cli(ctx, n):
    authenticate()
    collection = db_get_collection()
    search = "Finding scraper {} from list of registered scrapers..."
    ctx.log(search.format(n))
    try:
        # Test scraper by sending request to test endpoint
        scrapers = list_scrapers_from_db(collection)
        scraper = list(
            filter(lambda scraper: scraper[SCRAPER_COLL_NAME_FIELD] == n,
                   scrapers))
        route = scraper[0]["_id"] + "/test"
        ctx.log("Scraper {} found!".format(n))
    except Exception:
        ctx.log("Scraper {} not found.".format(n))
        return
    contents = requests.get(route).text
    print(contents)
Example #7
def dev_delete(collection, name):
    """
    Helper method that gets called when testing the command using a mocked collection.

    Input:
        collection: collection to perform operations with/on
        name: scraper to be deleted
    """
    try:
        scrapers = list_scrapers_from_db(collection)
        ngo_id = list(
            filter(
                lambda scraper: scraper[SCRAPER_COLL_NAME_FIELD] == str(name), scrapers
            )
        )[0]["_id"]
    except (StopIteration, IndexError):
        return
    return delete_scraper(collection, ngo_id)
Example #8
def cli(ctx, name):
    authenticate()
    collection = db_get_collection()
    search = "Finding scraper {} from list of registered scrapers . . . "
    ctx.log(search.format(name))
    try:
        scrapers = list_scrapers_from_db(collection)
        ngo_id = list(
            filter(
                lambda scraper: scraper[SCRAPER_COLL_NAME_FIELD] == str(name), scrapers
            )
        )[0]["_id"]
        ctx.log("Scraper {} found!".format(name))
    except (StopIteration, IndexError):
        ctx.log("Scraper {} not found.".format(name))
        return
    ctx.log("Deleting scraper {} . . . ".format(name))
    return delete_scraper(collection, ngo_id)
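A sketch of how the dev_delete helper from Example #7 might be tested with mongomock; it assumes delete_scraper(collection, _id) removes the matching document, which is not shown in these examples:
import mongomock


def test_dev_delete():
    # Register one scraper in an in-memory collection
    mock_collection = mongomock.MongoClient().db.collection
    mock_collection.insert_one(dict(_id="url1", name="test"))

    # An unknown name hits the IndexError branch and returns None
    assert dev_delete(mock_collection, "missing") is None
    assert mock_collection.count_documents({}) == 1

    # Deleting by name resolves the scraper's _id and hands it to delete_scraper;
    # this assertion assumes delete_scraper removes that document
    dev_delete(mock_collection, "test")
    assert mock_collection.count_documents({}) == 0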
Example #9
def cli(ctx, scraper_name):
    authenticate()
    collection = db_get_collection()
    scrapers = list_scrapers_from_db(collection)

    # If you have a scraper name, find the url for that scraper
    if scraper_name:
        for scraper in scrapers:
            if scraper[SCRAPER_COLL_NAME_FIELD] == scraper_name:
                url = requests.get(scraper["_id"] + "/url").text
                if "http" in url:
                    print(url)
                    return

    # Otherwise list the names of all sites being scraped
    else:
        for scraper in scrapers:
            print("Scraper: " + scraper[SCRAPER_COLL_NAME_FIELD])
            contents = requests.get(scraper["_id"] + "/url").text
            if "http" in contents:
                print("       " + contents)
            else:
                print("       ERROR: Scraper not available")
Example #10
def cli(ctx, n, a):
    authenticate()
    collection = db_get_collection()
    ngo_collection = db_get_collection(NGO_COLLECTION)
    client = init_s3_credentials()

    if a:
        run_all(ctx)
        return

    # Create new bucket name for log file by using hash
    h = hashlib.md5()
    h.update(n.encode("utf-8"))
    bucket_name = n + "-" + h.hexdigest()

    # Generate unique file name for new log
    filename = str(uuid.uuid4()) + ".txt"
    f = open(filename, "w+")

    search = "Finding scraper {} from list of registered scrapers..."
    f.write(search.format(n) + "\n")
    try:
        scrapers = list_scrapers_from_db(collection)
        route_data = list(
            filter(lambda scraper: scraper[SCRAPER_COLL_NAME_FIELD] == str(n), scrapers)
        )
        if len(route_data) == 0:
            print("Scraper not found")
            return
        route_data = route_data[0]
        route = route_data["_id"] + "/data"
        f.write("Scraper {} found!".format(n) + "\n")
    except StopIteration:
        f.write("Scraper {} not found!".format(n) + "\n")
        f.close()
        client.upload_file(filename, bucket_name, filename)
        os.remove(filename)
        return
    try:
        # Run scraper by getting the correct route and requesting it
        contents = requests.get(route).json()
        if "data" in contents:
            print("The data is uploaded")
            f.write(upload_data(ngo_collection, contents))
        elif "pages" in contents:
            print("Fetching all " + str(contents["pages"]) + " pages")
            f.write("Fetching all " + str(contents["pages"]) + " pages")
            for i in range(int(contents["pages"])):
                try:
                    url = str(route_data["_id"]) + "page"
                    # TODO: current hack: run the scraper on localhost and it will fetch all the pages; there is a deployment bug at the moment
                    # To run it locally, navigate to the scraper's directory and run "python3 index.py"
                    url = "http://localhost:5000/page"
                    print("Fetching " + url)
                    f.write("Fetching " + url)
                    # contents = requests.get(route).json()
                    payload = {"url": str(contents["urls"][i])}
                    print(payload)
                    data = requests.post(url, json=json.dumps(payload))
                    print(data.json())
                    f.write(upload_data(ngo_collection, data.json()))
                    print("The data is uploaded")
                except Exception as e:
                    print(e)
                    f.write("Failed on page" + str(route))
                    continue
        else:
            print("The data recieved is not structured correctly")
            f.write("The data recieved is not structured correctly")
    except Exception as e:
        print("exception")
        print(e)
        contents = str(e) + "\nFAILED"
        f.write(contents)

    f.close()

    # Call S3 to list current buckets
    response = client.list_buckets()

    # Get a list of all bucket names from the response
    buckets = [bucket["Name"] for bucket in response["Buckets"]]

    if bucket_name not in buckets:
        client.create_bucket(Bucket=bucket_name)

    client.upload_file(filename, bucket_name, filename)

    os.remove(filename)
    ctx.log("Wrote logs to file: " + filename)
def run_all(ctx):
    authenticate()
    collection = db_get_collection()
    ngo_collection = db_get_collection(NGO_COLLECTION)
    client = init_s3_credentials()

    names = []
    routes = []
    bucket_names = []
    log_files = []
    log_filenames = []

    try:
        scrapers = list_scrapers_from_db(collection)
        for scraper in scrapers:
            n = scraper[SCRAPER_COLL_NAME_FIELD]
            names.append(n)
            routes.append(scraper["_id"] + "/data")
            # Hash each name separately so every scraper gets its own bucket
            h = hashlib.md5(n.encode("utf-8"))
            bucket_names.append(n + "-" + h.hexdigest())
            filename = str(uuid.uuid4()) + ".txt"
            log_filenames.append(filename)
            log_files.append(open(filename, "w+"))
    except Exception as e:
        n = "all"  # the name in this case is effectively all, then we can just
        # use the code from the single case
        h.update(n.encode("utf-8"))
        bucket_name = n + "-" + h.hexdigest()
        filename = str(uuid.uuid4()) + ".txt"
        f = open(filename, "w+")
        f.write("Scraper not found!" + "\n")
        f.write(str(e) + "\nFAILED")
        f.close()
        client.upload_file(filename, bucket_name, filename)
        os.remove(filename)
        ctx.log("Failed. See log at {} in bucket {}.".format(filename, bucket_name))
        return

    # Call S3 to list current buckets to prepare for logging
    response = client.list_buckets()
    # Get a list of all bucket names from the response
    buckets = [bucket["Name"] for bucket in response["Buckets"]]

    for name, route, bucket_name, log, filename in zip(
        names, routes, bucket_names, log_files, log_filenames
    ):
        try:
            ctx.log("Getting information for {} . . . ".format(name))
            contents = requests.get(route).json()
            if "data" in contents:
                log.write(str(contents))
                log.write(upload_data(ngo_collection, contents))
                log.write("Upload succeeded!")
                ctx.log("Uploading {} succeeded!".format(name))
            else:
                ctx.log("Skipping this scraper")
        except Exception as e:
            log.write("Upload failed.")
            ctx.log("Uploading {} failed.".format(name))
        finally:
            log.close()

        if bucket_name not in buckets:
            client.create_bucket(Bucket=bucket_name)

        client.upload_file(filename, bucket_name, filename)
        os.remove(filename)
        ctx.log("Wrote logs for {} to file: ".format(name) + filename)