Example #1
def cli(ctx, scraper_name, filename):
    authenticate()
    client = init_s3_credentials()

    # Generate bucket name to fetch logs from
    h = hashlib.md5()
    h.update(scraper_name.encode("utf-8"))
    bucket_name = scraper_name + "-" + h.hexdigest()

    # Attempt to find bucket for logs
    if not filename:
        response = client.list_buckets()
        if bucket_name not in [bucket["Name"] for bucket in response["Buckets"]]:
            ctx.log("The provided scraper has no logs!")
            return
        else:
            objects = client.list_objects(Bucket=bucket_name).get("Contents", [])
            parsed = [[obj["Key"], obj["LastModified"]] for obj in objects]
            table = tabulate(parsed, headers=["File Name", "Last Modified"])
            print(table)
            return
    else:
        client.download_file(Bucket=bucket_name, Key=filename, Filename=filename)
        ctx.log("Downloaded file to: %s", click.format_filename(filename))
        return
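The log commands above derive the S3 bucket name by hashing the scraper name; a minimal sketch of that scheme follows (the make_bucket_name helper is hypothetical, not part of the CLI):

import hashlib

def make_bucket_name(scraper_name):
    # Appending an MD5 digest of the scraper name keeps the bucket tied to the
    # scraper while making collisions with other buckets unlikely
    digest = hashlib.md5(scraper_name.encode("utf-8")).hexdigest()
    return scraper_name + "-" + digest

print(make_bucket_name("ngo-scraper"))  # e.g. ngo-scraper-3f0c2e...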
Example #2
def cli(ctx, country, number_urls):
    authenticate()
    ranked_link = db_get_collection(CRAWL_RANKED_COLLECTION)
    if not number_urls:
        number_urls = 3
    else:
        number_urls = int(number_urls)

    # Run a Google search for NGO directories in the given country and rank
    # up to `number_urls` results
    for url in search("ngo directory " + country,
                      lang="es",
                      num=number_urls,
                      stop=number_urls):
        parsed_uri = urlparse(url)
        home_url = "{uri.scheme}://{uri.netloc}/".format(uri=parsed_uri)
        print("Crawling --- ", home_url)
        if str(home_url) not in url_rank:
            # Check if url has already been ranked before
            cursor = ranked_link.find({"url": home_url})
            document_list = [doc for doc in cursor]
            if len(document_list) == 0:
                url_rank[home_url] = []
                print("Added url " + str(home_url))
            else:
                print("Already have information for " + home_url)
        rank_all(country)

    for url in url_rank:
        print("Inserted " + str(url) + "'s information to database")
        ranked_link.insert_one(url_rank[url])
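The crawler above keys its bookkeeping on the homepage URL; a minimal sketch of that normalization step, assuming Python 3's urllib (the home_url helper is hypothetical):

from urllib.parse import urlparse

def home_url(url):
    # Reduce any crawled page URL to its scheme and host,
    # e.g. "https://example.org/ngos/list?page=2" -> "https://example.org/"
    parsed = urlparse(url)
    return "{0}://{1}/".format(parsed.scheme, parsed.netloc)

print(home_url("https://example.org/ngos/list?page=2"))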
Example #3
def cli(ctx, name, url):
    """
    Registers the given scraper with the database. It basically just gets all
    possible routes from the /routes route then sets up appropriate inputs
    for gg.db.send_scraper_to_db().
    """
    authenticate()
    collection = db_get_collection()
    result = send_scraper_to_db(collection, name, url)
    ctx.log(result)
Example #4
def cli(ctx):
    """
    GG list lists all scrapers registered on the database. It prints out each
    name and all routes associated with that scraper.
    """
    authenticate()
    collection = db_get_collection()
    ctx.log("Listing all registered scrapers!")
    all_docs = list_scrapers_from_db(collection)
    for doc in all_docs:
        ctx.log("  {}".format(doc[SCRAPER_COLL_NAME_FIELD]))
Example #5
def cli(ctx):
    """
    GG fill-ids enriches the database by inserting the registration office
    site for each NGO's country.
    """
    authenticate()
    # Specify collection to perform operations to
    collection = db_get_collection(NGO_COLLECTION)
    ctx.log(
        "Finding and inserting registration office sites into the database...")

    # Get all ngos from the database that don't have a registration ID/site and have a country
    # to get registration office site
    ngo_list = list_ngos_from_db(collection,
                                 registration=None,
                                 country={"$ne": None})
    updated_list = []

    # Keep track of all countries seen so far to reduce amount of scraping
    prev_countries = dict()

    # Update each document with a registration site; NGOs without a country
    # were already excluded by the query above, since they can't be assigned one
    for org in ngo_list:
        # Check whether this country has already been scraped
        if org[COUNTRY_FIELD].title() in prev_countries:
            # Store as a list so more registration IDs/fields can be added later
            org[REGISTRATION_FIELD] = [prev_countries[org[COUNTRY_FIELD].title()]]
            updated_list.append(org)
        else:
            # Scrape for country code and add to dictionary
            registration_url = get_registration_site(org[COUNTRY_FIELD])
            prev_countries[org[COUNTRY_FIELD].title()] = registration_url
            org[REGISTRATION_FIELD] = [registration_url]
            updated_list.append(org)

    # Only delete/re-insert NGOs that now have a registration office site;
    # filter into a new list instead of removing items while iterating,
    # which would skip elements
    orgs_with_sites = []
    for updated_org in updated_list:
        if updated_org[REGISTRATION_FIELD][0] != "":
            delete_one_ngo_from_db(collection,
                                   _id=ObjectId(updated_org["_id"]))
            orgs_with_sites.append(updated_org)
    updated_list = orgs_with_sites
    # Push updated documents to database
    ctx.log(upload_data(collection, {"data": updated_list}))
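The last loop above filters into a new list rather than calling remove() during iteration; a standalone illustration of why mutating a list while iterating over it skips elements:

# Illustration only (not part of the CLI): removing items while iterating
# skips the element that follows each removal
items = ["a", "", "", "b"]
for item in items:
    if item == "":
        items.remove(item)
print(items)  # ['a', '', 'b'] -- one empty entry survives

# Building a filtered list avoids the problem
items = ["a", "", "", "b"]
items = [item for item in items if item != ""]
print(items)  # ['a', 'b']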
Example #6
def cli(ctx, n):
    authenticate()
    collection = db_get_collection()
    search = "Finding scraper {} from list of registered scrapers..."
    ctx.log(search.format(n))
    try:
        # Test scraper by sending request to test endpoint
        scrapers = list_scrapers_from_db(collection)
        matches = list(
            filter(lambda scraper: scraper[SCRAPER_COLL_NAME_FIELD] == n,
                   scrapers))
        route = matches[0]["_id"] + "/test"
        ctx.log("Scraper {} found!".format(n))
    except Exception:
        ctx.log("Scraper {} not found.".format(n))
        return
    contents = requests.get(route).text
    print(contents)
Example #7
def cli(ctx):
    """
    Submits all NGO data stored in the MLab database to the GlobalGiving API.
    TEMPORARY STUB: Currently, the data is just stored in
    `~/globalgiving/ngo_data.json`
    """
    authenticate()
    collection = db_get_collection(NGO_COLLECTION)
    dotenv.load_dotenv(dotenv.find_dotenv())

    # get all NGOs
    ngo_list = list_ngos_from_db(collection)

    # just write to a file for now
    if not os.path.exists(os.getenv("HOME") + CLI_DIR_NAME):
        os.makedirs(os.getenv("HOME") + CLI_DIR_NAME)
    with open(os.getenv("HOME") + CLI_DIR_NAME + "ngo_data.json", "w+") as f:
        f.write(json.dumps(ngo_list, indent=4, separators=(",", ": ")))
    ctx.log("NGO data was successfully submitted!")
Example #8
def cli(ctx):
    authenticate()
    ranked_link = db_get_collection(CRAWL_RANKED_COLLECTION)
    cursor = ranked_link.find({})
    directories = list(cursor)

    ranked_ngo_directories = [(directory["url"], directory)
                              for directory in directories]

    print("Ranked Set of NGO's gathered")
    for ngo_directory in ranked_ngo_directories:
        print("   " + str(ngo_directory[0]))
        rank_info = ngo_directory[1]
        print("         Has " + str(rank_info["num_phone_numbers"]) +
              " phone numbers")
        print("         Has " + str(rank_info["num_addresses"]) + " addresses")
        print("         Has " + str(rank_info["num_subpages"]) + " subpages")
        print("         Has " + str(rank_info["num_word_ngo"]) +
              " appearances of ngo directory related words")
Example #9
def cli(ctx, name):
    authenticate()
    collection = db_get_collection()
    search = "Finding scraper {} from list of registered scrapers . . . "
    ctx.log(search.format(name))
    try:
        scrapers = list_scrapers_from_db(collection)
        ngo_id = list(
            filter(
                lambda scraper: scraper[SCRAPER_COLL_NAME_FIELD] == str(name), scrapers
            )
        )[0]["_id"]
        ctx.log("Scraper {} found!".format(name))
    except (StopIteration, IndexError):
        ctx.log("Scraper {} not found.".format(name))
        return
    ctx.log("Deleting scraper {} . . . ".format(name))
    return delete_scraper(collection, ngo_id)
Example #10
def cli(ctx, scraper_name):
    authenticate()
    collection = db_get_collection()
    scrapers = list_scrapers_from_db(collection)

    # If a scraper name was given, print the URL for that scraper
    if scraper_name:
        for scraper in scrapers:
            if scraper[SCRAPER_COLL_NAME_FIELD] == scraper_name:
                url = requests.get(scraper["_id"] + "/url").text
                if "http" in url:
                    print(url)
                    return

    # Otherwise list the names of all sites being scraped
    else:
        for scraper in scrapers:
            print("Scraper: " + scraper[SCRAPER_COLL_NAME_FIELD])
            contents = requests.get(scraper["_id"] + "/url").text
            if "http" in contents:
                print("       " + contents)
            else:
                print("       ERROR: Scraper not available")
Example #11
def cli(ctx, n, a):
    authenticate()
    collection = db_get_collection()
    ngo_collection = db_get_collection(NGO_COLLECTION)
    client = init_s3_credentials()

    if a:
        run_all(ctx)
        return

    # Create new bucket name for log file by using hash
    h = hashlib.md5()
    h.update(n.encode("utf-8"))
    bucket_name = n + "-" + h.hexdigest()

    # Generate unique file name for new log
    filename = str(uuid.uuid4()) + ".txt"
    f = open(filename, "w+")

    search = "Finding scraper {} from list of registered scrapers..."
    f.write(search.format(n) + "\n")
    try:
        scrapers = list_scrapers_from_db(collection)
        route_data = list(
            filter(lambda scraper: scraper[SCRAPER_COLL_NAME_FIELD] == str(n), scrapers)
        )
        if len(route_data) == 0:
            print("Scraper not found")
            f.close()
            os.remove(filename)
            return
        route_data = route_data[0]
        route = route_data["_id"] + "/data"
        f.write("Scraper {} found!".format(n) + "\n")
    except StopIteration:
        f.write("Scraper {} not found!".format(n) + "\n")
        f.close()
        client.upload_file(filename, bucket_name, filename)
        os.remove(filename)
        return
    try:
        # Run scraper by getting the correct route and requesting it
        contents = requests.get(route).json()
        if "data" in contents:
            print("The data is uploaded")
            f.write(upload_data(ngo_collection, contents))
        elif "pages" in contents:
            print("Fetching all " + str(contents["pages"]) + " pages")
            f.write("Fetching all " + str(contents["pages"]) + " pages")
            for i in range(int(contents["pages"])):
                try:
                    url = str(route_data["_id"]) + "/page"
                    # TODO: current hack: run the scraper on localhost and it will fetch all the
                    # pages; there is a deployment bug at the moment. To run it locally, navigate
                    # to the scraper's directory and run "python3 index.py"
                    url = "http://localhost:5000/page"
                    print("Fetching " + url)
                    f.write("Fetching " + url)
                    # contents = requests.get(route).json()
                    payload = {"url": str(contents["urls"][i])}
                    print(payload)
                    data = requests.post(url, json=json.dumps(payload))
                    print(data.json())
                    f.write(upload_data(ngo_collection, data.json()))
                    print("The data is uploaded")
                except Exception as e:
                    print(e)
                    f.write("Failed on page" + str(route))
                    continue
        else:
            print("The data recieved is not structured correctly")
            f.write("The data recieved is not structured correctly")
    except Exception as e:
        print("exception")
        print(e)
        contents = str(e) + "\nFAILED"
        f.write(contents)

    f.close()

    # Call S3 to list current buckets
    response = client.list_buckets()

    # Get a list of all bucket names from the response
    buckets = [bucket["Name"] for bucket in response["Buckets"]]

    if bucket_name not in buckets:
        client.create_bucket(Bucket=bucket_name)

    client.upload_file(filename, bucket_name, filename)

    os.remove(filename)
    ctx.log("Wrote logs to file: " + filename)
Example #12
def run_all(ctx):
    authenticate()
    collection = db_get_collection()
    ngo_collection = db_get_collection(NGO_COLLECTION)
    client = init_s3_credentials()

    names = []
    routes = []
    bucket_names = []
    log_files = []
    log_filenames = []

    try:
        scrapers = list_scrapers_from_db(collection)
        for scraper in scrapers:
            n = scraper[SCRAPER_COLL_NAME_FIELD]
            names.append(n)
            routes.append(scraper["_id"] + "/data")
            # Hash each name separately so the bucket matches the one used by
            # the single-scraper run and log commands
            digest = hashlib.md5(n.encode("utf-8")).hexdigest()
            bucket_names.append(n + "-" + digest)
            filename = str(uuid.uuid4()) + ".txt"
            log_filenames.append(filename)
            log_files.append(open(filename, "w+"))
    except Exception as e:
        n = "all"  # the name in this case is effectively all, then we can just
        # use the code from the single case
        h.update(n.encode("utf-8"))
        bucket_name = n + "-" + h.hexdigest()
        filename = str(uuid.uuid4()) + ".txt"
        f = open(filename, "w+")
        f.write("Scraper not found!" + "\n")
        f.write(str(e) + "\nFAILED")
        f.close()
        client.upload_file(filename, bucket_name, filename)
        os.remove(filename)
        ctx.log("Failed. See log at {} in bucket {}.".format(filename, bucket_name))
        return

    # Call S3 to list current buckets to prepare for logging
    response = client.list_buckets()
    # Get a list of all bucket names from the response
    buckets = [bucket["Name"] for bucket in response["Buckets"]]

    for name, route, log, filename, bucket_name in zip(
            names, routes, log_files, log_filenames, bucket_names):
        try:
            ctx.log("Getting information for {} . . . ".format(name))
            contents = requests.get(route).json()
            if "data" in contents:
                log.write(str(contents))
                log.write(upload_data(ngo_collection, contents))
                log.write("Upload succeeded!")
                ctx.log("Uploading {} succeeded!".format(name))
            else:
                ctx.log("Skipping this scraper")
        except Exception as e:
            log.write("Upload failed.")
            ctx.log("Uploading {} failed.".format(name))
        finally:
            log.close()

        if bucket_name not in buckets:
            client.create_bucket(Bucket=bucket_name)

        client.upload_file(filename, bucket_name, filename)
        os.remove(filename)
        ctx.log("Wrote logs for {} to file: ".format(name) + filename)