def cli(ctx, scraper_name, filename):
    authenticate()
    client = init_s3_credentials()

    # Generate bucket name to fetch logs from
    h = hashlib.md5()
    h.update(scraper_name.encode("utf-8"))
    bucket_name = scraper_name + "-" + h.hexdigest()

    # Attempt to find bucket for logs
    if not filename:
        response = client.list_buckets()
        if bucket_name not in [bucket["Name"] for bucket in response["Buckets"]]:
            ctx.log("The provided scraper has no logs!")
            return
        else:
            objects = client.list_objects(Bucket=bucket_name)["Contents"]
            parsed = [[i["Key"], i["LastModified"]] for i in objects]
            table = tabulate(parsed, headers=["File Name", "Last Modified"])
            print(table)
            return
    else:
        client.download_file(Bucket=bucket_name, Key=filename, Filename=filename)
        ctx.log("Downloaded file to: %s", click.format_filename(filename))
        return
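# Both the log-fetching command above and the run commands further down derive
# an S3 bucket name from the scraper name with the same md5 pattern. A minimal
# sketch of a shared helper that could centralize that convention (the helper
# name `log_bucket_name` is hypothetical, not part of the existing codebase):
def log_bucket_name(scraper_name):
    """Return the log bucket name for a scraper: "<name>-<md5(name)>"."""
    digest = hashlib.md5(scraper_name.encode("utf-8")).hexdigest()
    return scraper_name + "-" + digest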
def cli(ctx, country, number_urls):
    authenticate()
    ranked_link = db_get_collection(CRAWL_RANKED_COLLECTION)
    if not number_urls:
        number_urls = 3
    else:
        number_urls = int(number_urls)

    # Perform a Google search and start ranking results, fetching a set number of urls
    for url in search("ngo directory " + country, lang="es", num=number_urls, stop=number_urls):
        parsed_uri = urlparse(url)
        home_url = "{uri.scheme}://{uri.netloc}/".format(uri=parsed_uri)
        print("Crawling --- ", home_url)
        if str(home_url) not in url_rank:
            # Check if url has already been ranked before
            cursor = ranked_link.find({"url": home_url})
            document_list = [doc for doc in cursor]
            if len(document_list) == 0:
                url_rank[home_url] = []
                print("Added url " + str(home_url))
            else:
                print("Already have information for " + home_url)

    rank_all(country)
    for url in url_rank:
        print("Inserted " + str(url) + "'s information to database")
        ranked_link.insert_one(url_rank[url])
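# The crawl command above relies on a module-level `url_rank` dict and a
# `rank_all` helper defined elsewhere in this package. Judging by the fields the
# rank-listing command reads back out of CRAWL_RANKED_COLLECTION, each entry it
# inserts presumably ends up shaped roughly like the sketch below (field names
# are taken from that command; the exact population logic is an assumption):
example_ranked_entry = {
    "url": "https://example.org/",
    "num_phone_numbers": 12,
    "num_addresses": 4,
    "num_subpages": 37,
    "num_word_ngo": 51,
}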
def cli(ctx, name, url):
    """
    Registers the given scraper with the database. It fetches all available
    routes from the scraper's /routes endpoint and then sets up the
    appropriate inputs for gg.db.send_scraper_to_db().
    """
    authenticate()
    collection = db_get_collection()
    result = send_scraper_to_db(collection, name, url)
    ctx.log(result)
def cli(ctx):
    """
    GG list lists all scrapers registered on the database, printing the name
    of each registered scraper.
    """
    authenticate()
    collection = db_get_collection()
    ctx.log("Listing all registered scrapers!")
    all_docs = list_scrapers_from_db(collection)
    for doc in all_docs:
        ctx.log("  {}".format(doc[SCRAPER_COLL_NAME_FIELD]))
def cli(ctx):
    """
    GG fill-ids enriches the database by inserting the site of the
    registration office for each NGO's country.
    """
    authenticate()
    # Specify collection to perform operations on
    collection = db_get_collection(NGO_COLLECTION)
    ctx.log("Finding and inserting registration office sites into the database...")

    # Get all NGOs from the database that don't have a registration ID/site but
    # do have a country, so a registration office site can be looked up
    ngo_list = list_ngos_from_db(collection, registration=None, country={"$ne": None})
    updated_list = []

    # Keep track of all countries seen so far to reduce the amount of scraping
    prev_countries = dict()

    # Update each document to have a registration site, caching results by country
    for org in ngo_list:
        country = org[COUNTRY_FIELD].title()
        # Check if this country has already been scraped
        if country in prev_countries:
            # Store as a list to make it easy to add more registration IDs/fields later
            org[REGISTRATION_FIELD] = [prev_countries[country]]
        else:
            # Scrape for the registration site and cache it for later NGOs
            registration_url = get_registration_site(org[COUNTRY_FIELD])
            prev_countries[country] = registration_url
            org[REGISTRATION_FIELD] = [registration_url]
        updated_list.append(org)

    # Keep only NGOs that now have a registration office site, deleting their old
    # documents so the updated versions can be re-inserted
    orgs_with_registration = []
    for updated_org in updated_list:
        if updated_org[REGISTRATION_FIELD][0] != "":
            delete_one_ngo_from_db(collection, _id=ObjectId(updated_org["_id"]))
            orgs_with_registration.append(updated_org)

    # Push updated documents to the database
    ctx.log(upload_data(collection, {"data": orgs_with_registration}))
def cli(ctx, n):
    authenticate()
    collection = db_get_collection()
    search = "Finding scraper {} from list of registered scrapers..."
    ctx.log(search.format(n))
    try:
        # Test the scraper by sending a request to its test endpoint
        scrapers = list_scrapers_from_db(collection)
        scraper = list(
            filter(lambda scraper: scraper[SCRAPER_COLL_NAME_FIELD] == n, scrapers)
        )
        route = scraper[0]["_id"] + "/test"
        ctx.log("Scraper {} found!".format(n))
    except Exception:
        ctx.log("Scraper {} not found.".format(n))
        return
    contents = requests.get(route).text
    print(contents)
def cli(ctx):
    """
    Submits all NGO data stored in the MLab database to the GlobalGiving API.
    TEMPORARY STUB: Currently, the data is just stored in
    `~/globalgiving/ngo_data.json`.
    """
    authenticate()
    collection = db_get_collection(NGO_COLLECTION)
    dotenv.load_dotenv(dotenv.find_dotenv())

    # Get all NGOs
    ngo_list = list_ngos_from_db(collection)

    # Just write to a file for now
    if not os.path.exists(os.getenv("HOME") + CLI_DIR_NAME):
        os.makedirs(os.getenv("HOME") + CLI_DIR_NAME)
    with open(os.getenv("HOME") + CLI_DIR_NAME + "ngo_data.json", "w+") as f:
        f.write(json.dumps(ngo_list, indent=4, separators=(",", ": ")))

    ctx.log("NGO data was successfully submitted!")
def cli(ctx):
    authenticate()
    ranked_link = db_get_collection(CRAWL_RANKED_COLLECTION)
    cursor = ranked_link.find({})
    directories = [directory for directory in cursor]

    ranked_ngo_directories = []
    for directory in directories:
        ranked_ngo_directories += [(directory["url"], directory)]

    print("Ranked set of NGO directories gathered")
    for ngo_directory in ranked_ngo_directories:
        print("    " + str(ngo_directory[0]))
        rank_info = ngo_directory[1]
        print("        Has " + str(rank_info["num_phone_numbers"]) + " phone numbers")
        print("        Has " + str(rank_info["num_addresses"]) + " addresses")
        print("        Has " + str(rank_info["num_subpages"]) + " subpages")
        print("        Has " + str(rank_info["num_word_ngo"]) + " appearances of NGO-directory-related words")
def cli(ctx, name):
    authenticate()
    collection = db_get_collection()
    search = "Finding scraper {} from list of registered scrapers . . . "
    ctx.log(search.format(name))
    try:
        scrapers = list_scrapers_from_db(collection)
        ngo_id = list(
            filter(
                lambda scraper: scraper[SCRAPER_COLL_NAME_FIELD] == str(name),
                scrapers,
            )
        )[0]["_id"]
        ctx.log("Scraper {} found!".format(name))
    except (StopIteration, IndexError):
        ctx.log("Scraper {} not found.".format(name))
        return
    ctx.log("Deleting scraper {} . . . ".format(name))
    return delete_scraper(collection, ngo_id)
def cli(ctx, scraper_name):
    authenticate()
    collection = db_get_collection()
    scrapers = list_scrapers_from_db(collection)

    # If a scraper name was given, find the url for that scraper
    if scraper_name:
        for scraper in scrapers:
            if scraper[SCRAPER_COLL_NAME_FIELD] == scraper_name:
                url = requests.get(scraper["_id"] + "/url").text
                if "http" in url:
                    print(url)
                    return
    # Otherwise list the names of all sites being scraped
    else:
        for scraper in scrapers:
            print("Scraper: " + scraper[SCRAPER_COLL_NAME_FIELD])
            contents = requests.get(scraper["_id"] + "/url").text
            if "http" in contents:
                print("    " + contents)
            else:
                print("    ERROR: Scraper not available")
def cli(ctx, n, a):
    authenticate()
    collection = db_get_collection()
    ngo_collection = db_get_collection(NGO_COLLECTION)
    client = init_s3_credentials()

    if a:
        run_all(ctx)
        return

    # Create a new bucket name for the log file by hashing the scraper name
    h = hashlib.md5()
    h.update(n.encode("utf-8"))
    bucket_name = n + "-" + h.hexdigest()

    # Generate a unique file name for the new log
    filename = str(uuid.uuid4()) + ".txt"
    f = open(filename, "w+")

    search = "Finding scraper {} from list of registered scrapers..."
    f.write(search.format(n) + "\n")
    try:
        scrapers = list_scrapers_from_db(collection)
        route_data = list(
            filter(lambda scraper: scraper[SCRAPER_COLL_NAME_FIELD] == str(n), scrapers)
        )
        if len(route_data) == 0:
            print("Scraper not found")
            f.close()
            os.remove(filename)
            return
        route_data = route_data[0]
        route = route_data["_id"] + "/data"
        f.write("Scraper {} found!".format(n) + "\n")
    except StopIteration:
        f.write("Scraper {} not found!".format(n) + "\n")
        f.close()
        client.upload_file(filename, bucket_name, filename)
        os.remove(filename)
        return

    try:
        # Run the scraper by getting the correct route and requesting it
        contents = requests.get(route).json()
        if "data" in contents:
            print("The data is uploaded")
            f.write(upload_data(ngo_collection, contents))
        elif "pages" in contents:
            print("Fetching all " + str(contents["pages"]) + " pages")
            f.write("Fetching all " + str(contents["pages"]) + " pages")
            for i in range(int(contents["pages"])):
                try:
                    url = str(route_data["_id"]) + "/page"
                    # TODO: Current hack - run the scraper on localhost and it will get
                    # all the pages; there is a deployment bug at the moment.
                    # To run it locally, navigate to the scraper's directory and run
                    # "python3 index.py".
                    url = "http://localhost:5000/page"
                    print("Fetching " + url)
                    f.write("Fetching " + url)
                    payload = {"url": str(contents["urls"][i])}
                    print(payload)
                    data = requests.post(url, json=json.dumps(payload))
                    print(data.json())
                    f.write(upload_data(ngo_collection, data.json()))
                    print("The data is uploaded")
                except Exception as e:
                    print(e)
                    f.write("Failed on page " + str(route))
                    continue
        else:
            print("The data received is not structured correctly")
            f.write("The data received is not structured correctly")
    except Exception as e:
        print("exception")
        print(e)
        contents = str(e) + "\nFAILED"
        f.write(contents)

    f.close()

    # Call S3 to list current buckets
    response = client.list_buckets()

    # Get a list of all bucket names from the response
    buckets = [bucket["Name"] for bucket in response["Buckets"]]
    if bucket_name not in buckets:
        client.create_bucket(Bucket=bucket_name)
    client.upload_file(filename, bucket_name, filename)
    os.remove(filename)
    ctx.log("Wrote logs to file: " + filename)
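# The paged branch above currently posts every page to a hard-coded localhost
# instance because of the deployment bug noted in the TODO. Once that bug is
# resolved, the loop would presumably target the scraper's own /page route
# instead; a minimal sketch under that assumption (the route suffix and payload
# shape are taken from the code above, the helper itself is hypothetical):
def fetch_page(route_data, page_url):
    """POST a single page URL to the deployed scraper's /page route and return its JSON."""
    endpoint = str(route_data["_id"]) + "/page"
    # The existing code sends the payload as a JSON-encoded string, so mirror that here
    response = requests.post(endpoint, json=json.dumps({"url": str(page_url)}))
    return response.json()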
def run_all(ctx):
    authenticate()
    collection = db_get_collection()
    ngo_collection = db_get_collection(NGO_COLLECTION)
    client = init_s3_credentials()

    names = []
    routes = []
    bucket_names = []
    log_files = []
    log_filenames = []
    try:
        scrapers = list_scrapers_from_db(collection)
        for scraper in scrapers:
            n = scraper[SCRAPER_COLL_NAME_FIELD]
            names.append(n)
            routes.append(scraper["_id"] + "/data")
            # Use a fresh hash per scraper so each bucket name reflects only that scraper's name
            h = hashlib.md5()
            h.update(n.encode("utf-8"))
            bucket_names.append(n + "-" + h.hexdigest())
            filename = str(uuid.uuid4()) + ".txt"
            log_filenames.append(filename)
            log_files.append(open(filename, "w+"))
    except Exception as e:
        # The name in this case is effectively "all", so reuse the bucket-naming
        # scheme from the single-scraper case for the failure log
        n = "all"
        h = hashlib.md5()
        h.update(n.encode("utf-8"))
        bucket_name = n + "-" + h.hexdigest()
        filename = str(uuid.uuid4()) + ".txt"
        f = open(filename, "w+")
        f.write("Scraper not found!" + "\n")
        f.write(str(e) + "\nFAILED")
        f.close()
        client.upload_file(filename, bucket_name, filename)
        os.remove(filename)
        ctx.log("Failed. See log at {} in bucket {}.".format(filename, bucket_name))
        return

    # Call S3 to list current buckets to prepare for logging
    response = client.list_buckets()

    # Get a list of all bucket names from the response
    buckets = [bucket["Name"] for bucket in response["Buckets"]]
    for name, route, bucket_name, log, filename in zip(
        names, routes, bucket_names, log_files, log_filenames
    ):
        try:
            ctx.log("Getting information for {} . . . ".format(name))
            contents = requests.get(route).json()
            if "data" in contents:
                log.write(str(contents))
                log.write(upload_data(ngo_collection, contents))
                log.write("Upload succeeded!")
                ctx.log("Uploading {} succeeded!".format(name))
            else:
                ctx.log("Skipping this scraper")
        except Exception:
            log.write("Upload failed.")
            ctx.log("Uploading {} failed.".format(name))
        finally:
            log.close()
            if bucket_name not in buckets:
                client.create_bucket(Bucket=bucket_name)
            client.upload_file(filename, bucket_name, filename)
            os.remove(filename)
            ctx.log("Wrote logs for {} to file: ".format(name) + filename)