def download_nbs(notebooks, local, current_files):
    """
    Download notebooks from GitHub.
    Equivalent to Adam's 2_nb_download.ipynb.

    Args:
        notebooks: DataFrame with (at least) the columns "file", "html_url"
            and "days_since". Rows are visited in ascending "days_since"
            order.
        local: if truthy, each notebook is written to ../data/notebooks/;
            otherwise it is uploaded to the "notebook-research" S3 bucket
            under notebooks/.
        current_files: collection of file names that were already
            downloaded; those rows are skipped.

    Returns:
        None. Each success or failure is appended to ../logs/nb_log.txt.
    """
    debug_print("Downloading notebooks\n")
    already_done = 0
    checkpoints = 0
    new = 0
    count = 0
    total = len(notebooks)

    for _, row in notebooks.sort_values(by="days_since").iterrows():
        date_string = datetime.datetime.now().strftime(r"%Y-%m-%d %H:%M:%S")

        # Keep track of the download progress.
        if count % COUNT_TRIGGER == 0 or count == total:
            debug_print("{0} / {1} notebooks downloaded.".format(
                count, total))
        count += 1

        # Don't download files we already have.
        if row["file"] in current_files:
            already_done += 1
            continue

        # Don't download files in .ipynb_checkpoints. The original counted
        # these but fell through and downloaded them anyway.
        if ".ipynb_checkpoints" in row["html_url"]:
            checkpoints += 1
            continue

        try:
            # Access the raw content webpage and download the file.
            raw_url = row["html_url"].replace(
                "github.com", "raw.githubusercontent.com").replace("/blob", "")
            r = requests.get(raw_url)
            # A 404/5xx body is an error page, not a notebook: treat it as a
            # failed download instead of saving it.
            r.raise_for_status()

            # Save file.
            if local:
                filename = "../data/notebooks/{0}".format(row["file"])
                # Explicit UTF-8 so the local copy matches the S3 upload.
                with open(filename, "w", encoding="utf-8") as nb_file:
                    nb_file.write(r.text)
            else:
                obj = s3.Object("notebook-research",
                                "notebooks/{0}".format(row["file"]))
                obj.put(Body=bytes(r.text.encode("UTF-8")))

            new += 1
            msg = "{0}: downloaded {1}".format(date_string, row["file"])
            write_to_log("../logs/nb_log.txt", msg)

        except Exception:
            # Report missed files; one bad notebook must not stop the run.
            msg = "{0}: had trouble downloading {1}".format(
                date_string, row["file"])
            write_to_log("../logs/nb_log.txt", msg)
            debug_print(msg)

    debug_print("{0} were already done. {1} were in ipynb checkpoints. {2} ".
                format(already_done, checkpoints, new) +
                "new notebooks were downloaded.")
def main():
    """
    Partition the notebook list among workers and start the downloads.

    Reads notebooks1/repos1/owners1 (from PATH when --local is given,
    otherwise from S3), hands them to parallelize_download, stores the set
    of already-downloaded notebook names in S3 when new partitions were
    made, and finally launches one background download.py process per
    worker.
    """
    # Parse command line arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument("--local",
                        action="store_const",
                        dest="local",
                        const=True,
                        default=False,
                        help="Saves output locally instead of in S3.")
    local = parser.parse_args().local

    # Open DataFrames.
    try:
        if local:
            notebooks = pd.read_csv("{0}/notebooks1.csv".format(PATH))
            repos = pd.read_csv("{0}/repos1.csv".format(PATH))
            owners = pd.read_csv("{0}/owners1.csv".format(PATH))
        else:
            notebooks = s3_to_df("csv/notebooks1.csv")
            owners = s3_to_df("csv/owners1.csv")
            repos = s3_to_df("csv/repos1.csv")
        print("notebooks1.csv, repos1.csv, owners1.csv found and opened.")
    except Exception:
        print("notebooks1.csv, repos1.csv, owners1.csv not found.",
              "Please run query_git.py first and try again.")
        sys.exit(0)

    # Randomize notebook distribution among workers. A truthy result means
    # every notebook had already been assigned to a partition.
    everything_partitioned = parallelize_download(
        notebooks, repos, owners, local)

    # List files already downloaded, used in download.py
    if not everything_partitioned:
        current_files = {
            entry.key.split("/")[1]
            for entry in bucket.objects.filter(Prefix='notebooks/')
        }
        listing = s3.Object("notebook-research", "current_notebooks.pickle")
        listing.put(Body=bytes(pickle.dumps(current_files)))
        print("saved current files")

    # Format and run one command per worker, pausing between launches.
    local_flag = ' --local' if local else ''
    for worker in range(NUM_WORKERS):
        command = ("nohup python3 -u download.py --worker {0}{1}"
                   " > download_{0}.log &").format(worker, local_flag)
        print(command)
        os.system(command)
        time.sleep(10)
def clean_metadata(num_needed, updating, local):
    """
    Extract information from metadata JSON files and save to CSVs.
    Equivalent to Adam's 1_nb_metadata_cleaning.ipynb.

    Args:
        num_needed: when `updating`, processing stops as soon as this many
            new notebooks have been collected.
        updating: truthy to enable the early stop described above.
        local: truthy to read query files from JSON_PATH and read/write the
            CSVs under PATH; otherwise everything goes through S3.

    Query file names are split on "_": component 2 must start with an
    integer followed by ".." and component 3 must look like "p<page>".

    Returns:
        None. Writes notebooks1.csv, owners1.csv and repos1.csv (new rows
        followed by any previously saved rows).
    """
    # Load what was processed on earlier runs. If anything is missing we
    # start from empty frames so the final concat still works (previously
    # the *_done frames were left undefined, which raised NameError at save
    # time, and local mode never loaded them at all).
    try:
        if local:
            notebooks_done = pd.read_csv("{0}/notebooks1.csv".format(PATH))
            owners_done = pd.read_csv("{0}/owners1.csv".format(PATH))
            repos_done = pd.read_csv("{0}/repos1.csv".format(PATH))
        else:
            notebooks_done = s3_to_df("csv/notebooks1.csv")
            owners_done = s3_to_df("csv/owners1.csv")
            repos_done = s3_to_df("csv/repos1.csv")

        notebook_files_done = set(notebooks_done.file)
        owner_ids_done = set(owners_done.owner_id)
        repo_ids_done = set(repos_done.repo_id)

        print(
            'Metadata already processed for {0} notebooks, {1} owners, and {2} repos.'
            .format(len(notebook_files_done), len(owner_ids_done),
                    len(repo_ids_done)))
    except Exception:
        notebooks_done = pd.DataFrame()
        owners_done = pd.DataFrame()
        repos_done = pd.DataFrame()
        notebook_files_done = set()
        owner_ids_done = set()
        repo_ids_done = set()
        print("Metadata not processed for any files.")

    # Get all query files.
    if local:
        nb_search_files = os.listdir(JSON_PATH)
    else:
        nb_search_files = list_s3_dir('json/')

    # Sort query files by size then by page number.
    nb_search_files = sorted(
        nb_search_files,
        key=lambda x: (int(x.split("_")[2].split("..")[0]),
                       int(x.split("_")[3][1:].split(".")[0])))

    debug_print("We have {0} query files.".format(len(nb_search_files)))

    notebooks = {}
    repos = {}
    owners = {}
    log_path = "../logs/nb_metadata_cleaning_log.txt"

    for j, json_file_name in enumerate(nb_search_files):
        # Keep track of progress. (The original `% COUNT_TRIGGER / 100 == 0`
        # is true exactly when the remainder is zero.)
        if (j + 1) % COUNT_TRIGGER == 0 or j + 1 == len(nb_search_files):
            debug_print("{0} / {1} data files processed".format(
                j + 1, len(nb_search_files)))

        # Parse file name to get size and query page.
        file_components = json_file_name.replace(".json", "").split("_")
        filesize = file_components[2]
        query_page = int(file_components[3][1:])

        if local:
            with open(JSON_PATH + json_file_name, "r") as json_file:
                file_dict = json.load(json_file)
        else:
            obj = s3.Object("notebook-research",
                            "json/{0}".format(json_file_name))
            file_dict = json.loads(obj.get()["Body"].read().decode("UTF-8"))

        # Report missing data.
        if "incomplete_results" in file_dict:
            if file_dict["incomplete_results"] == True:
                msg = "{0} has incomplete results".format(json_file_name)
                write_to_log(log_path, msg)

        days_since = file_dict["days_since"]

        if "items" in file_dict:
            if len(file_dict["items"]) == 0:
                msg = "{0} has 0 items".format(json_file_name)
                write_to_log(log_path, msg)
            else:
                # Save data for each item.
                for item in file_dict["items"]:
                    item_repo = item["repository"]
                    repo_id = item_repo["id"]
                    owner_id = item_repo["owner"]["id"]

                    # Don't save forked notebooks.
                    if item_repo["fork"]:
                        continue

                    # Full path is unique for each file.
                    name = "{0}/{1}/{2}".format(
                        item_repo["owner"]["login"], item_repo["name"],
                        item["path"]).replace("/", "..")

                    if name not in notebook_files_done:
                        notebooks[name] = {
                            "file": name,
                            "html_url": item["html_url"],
                            "name": item["name"],
                            "path": item["path"],
                            "repo_id": repo_id,
                            "owner_id": owner_id,
                            "filesize": filesize,
                            "query_page": query_page,
                            "days_since": days_since
                        }

                    if repo_id not in repos and repo_id not in repo_ids_done:
                        repos[repo_id] = {
                            "repo_name": item_repo["name"],
                            "owner_id": owner_id,
                            "repo_description": item_repo["description"],
                            "repo_fork": item_repo["fork"],
                            "repo_html_url": item_repo["html_url"],
                            "repo_private": item_repo["private"],
                        }

                    if (owner_id not in owners
                            and owner_id not in owner_ids_done):
                        owners[owner_id] = {
                            "owner_html_url": item_repo["owner"]["html_url"],
                            "owner_login": item_repo["owner"]["login"],
                        }

                    # If updating we don't always need the full page.
                    if updating and len(notebooks) == num_needed:
                        break
        else:
            msg = "{0} has no items object".format(json_file_name)
            write_to_log(log_path, msg)

        # Once enough notebooks were collected, skip the remaining files.
        if updating and len(notebooks) == num_needed:
            break

    # Display status
    debug_print(("\nAfter processing all query files, "
                 "we have {0} new notebooks.").format(len(notebooks)))
    debug_print("Written by {0} owners.".format(len(owners)))
    debug_print("Held in {0} repositories.".format(len(repos)))

    # Translate dictionaries to DataFrames and save to CSV.
    # Ordered by days since, if duplicates keep the most recent
    # (i.e. keep last, which was found more days since 1-1-19).
    if notebooks:
        notebooks_df = pd.DataFrame(notebooks).transpose()\
            .sort_values(by=["days_since", "file"]).drop_duplicates(
                subset=["file"],
                keep="last"
            )
    else:
        # An empty frame has no "days_since" column to sort by.
        notebooks_df = pd.DataFrame()
    owners_df = pd.DataFrame(owners).transpose().reset_index().rename(
        columns={"index": "owner_id"}, index=str)
    repos_df = pd.DataFrame(repos).transpose().reset_index().rename(
        columns={"index": "repo_id"}, index=str)

    if local:
        pd.concat([notebooks_df, notebooks_done]).to_csv(
            "{0}/notebooks1.csv".format(PATH), index=False)
        pd.concat([owners_df, owners_done]).to_csv(
            "{0}/owners1.csv".format(PATH), index=False)
        pd.concat([repos_df, repos_done]).to_csv(
            "{0}/repos1.csv".format(PATH), index=False)
    else:
        df_to_s3(pd.concat([notebooks_df, notebooks_done]),
                 "csv/notebooks1.csv")
        df_to_s3(pd.concat([owners_df, owners_done]), "csv/owners1.csv")
        df_to_s3(pd.concat([repos_df, repos_done]), "csv/repos1.csv")
def main():
    """
    Download notebooks and/or repository metadata for one worker.

    Command line flags:
        -w/--worker N: use partition N and the matching entry of HEADERS;
            without it, all notebooks are used with HEADERS[0].
        -r/--repos: download repository metadata only.
        -n/--notebooks: download notebooks only.
        --local: read and write data locally instead of in S3.

    Raises:
        Exception: if both --repos and --notebooks are given.

    Exits via sys.exit(0) when the required input files cannot be opened.
    """
    # Parse command line arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-w",
        "--worker",
        metavar="N",
        type=int,
        help=("GITHUB_TOKEN assigned to these files " +
              "(will use partition N stored in download_partitions.pickle)."))
    parser.add_argument("-r",
                        "--repos",
                        action="store_const",
                        dest="only_repos",
                        const=True,
                        default=False,
                        help="Download repos only.")
    parser.add_argument("-n",
                        "--notebooks",
                        action="store_const",
                        dest="only_nbs",
                        const=True,
                        default=False,
                        help="Download notebooks only.")
    parser.add_argument("--local",
                        action="store_const",
                        dest="local",
                        const=True,
                        default=False,
                        help="Save data locally instead of in S3.")
    args = parser.parse_args()
    worker = args.worker
    only_repos = args.only_repos
    only_nbs = args.only_nbs
    local = args.local

    # The two "only" flags are mutually exclusive.
    if only_repos and only_nbs:
        raise Exception(
            "Cannot use both --repos and --notebooks flags. Use --help flag for more information."
        )

    # If a worker was specified, get partition data and correct header.
    if worker is not None:
        print("Worker {0}".format(worker))

        try:
            if local:
                with open("download_partitions.pickle", "rb") as f:
                    partitions_download = pickle.load(f)
                partition = partitions_download[worker]
            else:
                obj = s3.Object(
                    "notebook-research",
                    "download_partitions_{0}.pickle".format(worker))
                partition = pickle.load(BytesIO(obj.get()["Body"].read()))
        except Exception:
            print(("Download Partitions data were not found {0}. ".format(
                "locally" if local else "in s3") +
                   "Please run parallelize_download.py and try again."))
            sys.exit(0)

        notebooks1 = partition["notebooks"]

        # Owner/repo lookups come from the same place as in the no-worker
        # branch below; previously this branch always went to S3, even with
        # --local.
        if local:
            owners = pd.read_csv("{0}/owners1.csv".format(PATH))
            repos = pd.read_csv("{0}/repos1.csv".format(PATH))
        else:
            obj = s3.Object("notebook-research", "csv/owners1.csv")
            owners = pd.read_csv(BytesIO(obj.get()["Body"].read()))
            obj = s3.Object("notebook-research", "csv/repos1.csv")
            repos = pd.read_csv(BytesIO(obj.get()["Body"].read()))

        # Keep only the owners/repos referenced by this partition.
        owners1 = notebooks1[["owner_id"]].merge(
            owners[['owner_id', 'owner_login']],
            on="owner_id",
            how='left').drop_duplicates()
        repos1 = notebooks1[["repo_id", 'owner_id']].merge(
            repos[['repo_id', 'repo_name']],
            on='repo_id',
            how='left').drop_duplicates()

        header = HEADERS[partition["id"]]

        debug_print("Partition data for downloads found and opened. " +
                    "Notebooks1, Owners1, and Repos1 were found and opened." +
                    BREAK)

    # If a worker was not specified, get all data and use first header.
    else:
        try:
            if local:
                notebooks1 = pd.read_csv("{0}/notebooks1.csv".format(PATH))
                owners1 = pd.read_csv("{0}/owners1.csv".format(PATH))
                repos1 = pd.read_csv("{0}/repos1.csv".format(PATH))
            else:
                notebooks1 = s3_to_df("csv/notebooks1.csv")
                owners1 = s3_to_df("csv/owners1.csv")
                repos1 = s3_to_df("csv/repos1.csv")
        except Exception:
            print("The files 'notebooks1.csv','repos1.csv', and " +
                  "'owners1.csv' were not found. Please run query_git.py " +
                  "and try again.")
            sys.exit(0)

        header = HEADERS[0]

    # Check time and display status.
    print("{0} notebooks, {1} repos, {2} owners".format(
        len(notebooks1), len(repos1), len(owners1)))
    check1 = datetime.datetime.now()
    write_to_log("../logs/timing.txt",
                 "download CHECKPOINT 1: {0}".format(check1))

    # With --repos the notebook stage is skipped; seed check2 so the repo
    # timing below has a start point instead of raising NameError.
    check2 = check1

    # Download full notebooks from github.
    if not only_repos:
        if local:
            current_files = set(os.listdir("../data/notebooks"))
        else:
            obj = s3.Object("notebook-research", "current_notebooks.pickle")
            current_files = pickle.load(BytesIO(obj.get()["Body"].read()))

        num_done = len(current_files)
        debug_print(
            "{0} notebooks have already been downloaded.".format(num_done))

        download_nbs(notebooks1, local, current_files)
        check2 = datetime.datetime.now()
        write_to_log("../logs/timing.txt", "CHECKPOINT 2: {0}".format(check2))
        debug_print("\nNotebooks have been downloaded. Time: {0}{1}".format(
            check2 - check1, BREAK))

    # Download data from github.
    if not only_nbs:
        download_repo_data(repos1, owners1, header, local)
        check3 = datetime.datetime.now()
        write_to_log("../logs/timing.txt", "CHECKPOINT 3: {0}".format(check3))
        debug_print("\nRepos have been downloaded. " +
                    "Time: {0}{1}".format(check3 - check2, BREAK))
def download_repo_data(repos, owners, header, local):
    """
    Download repository metadata files from GitHub.

    Args:
        repos: DataFrame with "repo_id", "repo_name" and "owner_id" columns.
        owners: DataFrame with "owner_id" and "owner_login" columns.
        header: headers dict sent with every API request.
        local: truthy to store each result as ../data/repos/repo_<id>.json;
            otherwise it is uploaded to S3 under repos/.

    Returns:
        None. Returns immediately when either frame is empty. Each repo is
        attempted until it is saved, fails, or hits the rate limit twice in
        a row on the same URL.
    """
    if len(repos) == 0 or len(owners) == 0:
        return

    data_frame = repos.merge(owners, on="owner_id")

    # List files already downloaded.
    current_repos = os.listdir("../data/repos") if local else list_s3_dir(
        "repos/")
    debug_print(("There are currently {0} repo metadata files saved.").format(
        len(current_repos)))
    # A set gives constant-time membership tests in the loop below.
    current_repos = set(current_repos)

    num_recorded_since = 0
    num_tried_since = 0
    hit_url = ''
    log_path = "../logs/repo_metadata_query_log.txt"

    for i, row in data_frame.iterrows():
        num_tried_since += 1

        # Keep track of the download progress.
        if i % COUNT_TRIGGER == 0 or i == len(data_frame):
            debug_print("{0} / {1} repos downloaded.".format(
                i, len(data_frame)))

        # Skip repository metadata we already have.
        if "repo_{0}.json".format(row["repo_id"]) in current_repos:
            continue

        repo_recorded = False
        wait_time = 0
        while not repo_recorded:
            time.sleep(wait_time)
            date_string = datetime.datetime.now().strftime(
                r"%Y-%m-%d %H:%M:%S")
            url = "https://api.github.com/repos/{0}/{1}".format(
                row["owner_login"], row["repo_name"])
            try:
                # Query the api.
                r = requests.get(url, headers=header)
                j = r.json()
                h = r.headers

                # Handle rate limiting. The numeric status code is used
                # rather than a "Status" header so that a response without
                # that header is not reported as a failed download.
                if r.status_code == 403:
                    debug_print(
                        "{0}: Hit rate limit. Retry at {1}. {2} tried and {3} saved since last hit."
                        .format(h["Date"],
                                time.ctime(int(h["X-RateLimit-Reset"])),
                                num_tried_since, num_recorded_since))
                    # Two consecutive 403s on the same URL: give up on it.
                    if hit_url == url:
                        print('Same one again, skipping')
                        repo_recorded = True
                        continue

                    # Never hand time.sleep a negative value if the reset
                    # moment has already passed.
                    wait_time = max(
                        0, int(h["X-RateLimit-Reset"]) - time.time() + 1)
                    num_tried_since = 0
                    num_recorded_since = 0
                    hit_url = url
                    continue

                if "message" in j and (j["message"] == "Not Found"
                                       or j["message"] == "Bad credentials"):
                    print(url, 'Message:', j['message'])
                    raise Exception(j["message"])

                # Save JSON File.
                if local:
                    filename = "../data/repos/repo_{0}.json".format(
                        row["repo_id"])
                    with open(filename, "w") as repo_file:
                        json.dump(j, repo_file)
                else:
                    obj = s3.Object(
                        "notebook-research",
                        "repos/repo_{0}.json".format(row["repo_id"]))
                    obj.put(Body=bytes(json.dumps(j).encode("UTF-8")))

                # Report Status.
                msg = "{0}: downloaded repo {1}".format(
                    date_string, row["repo_id"])
                write_to_log(log_path, msg)
                repo_recorded = True
                wait_time = 0
                num_recorded_since += 1

            except Exception as e:
                # Report missed files and move on to the next repo.
                msg = "{0}: had trouble downloading repo {1}".format(
                    date_string, row["repo_id"])
                write_to_log(log_path, msg)
                debug_print(msg)
                debug_print(e)
                repo_recorded = True
def save_page(
    url, size, header, query_status,
    saved_urls, current_notebooks
):
    """
    Save results page to json file.

    Args:
        url: search URL to request.
        size: size range string of the form "<low>..<high>".
        header: headers dict sent with the request.
        query_status: dict that is read and updated in place; keys used
            here are "num_results", "page", "updating", "local",
            "another_page", "done", "all_items" and "num_needed".
        saved_urls: html_urls that were already downloaded.
        current_notebooks: file names that already exist in storage.

    Returns:
        (response, limit_status, query_status). `response` is None after a
        request timeout. `limit_status` holds "reset_time", "limited",
        "wait_time" and "remaining_queries".
    """
    # Set initial rate limiting management variables.
    limit_status = {
        "reset_time": time.time(),
        "limited": False,
        "wait_time": 0,
        "remaining_queries": 30
    }

    # Query GitHub API.
    try:
        r = requests.get(url, headers=header)
        payload = r.json()
        resp_headers = r.headers
    except requests.exceptions.Timeout:
        debug_print("Request timeout.")
        limit_status["limited"] = True
        limit_status["wait_time"] = 60
        return None, limit_status, query_status

    # Handle 403 error if we have hit query rate.
    # NOTE(review): a response without a "Status" header is also treated as
    # rate limited - confirm the API still sends that header.
    if "Status" not in resp_headers or resp_headers["Status"] == "403 Forbidden":
        # Set to limited and update wait time.
        limit_status["limited"] = True
        try:
            debug_print(
                "{0}: Hit rate limit. Retry after {1} seconds".format(
                    resp_headers["Date"],
                    resp_headers["Retry-After"]
                )
            )
            limit_status["wait_time"] = int(resp_headers["Retry-After"])
        except Exception:
            # Default wait time to 1 minute.
            limit_status["wait_time"] = 60
        return r, limit_status, query_status

    # Update rate limiting management variables.
    date = r.headers["Date"]
    query_status["num_results"] = int(payload["total_count"])
    limit_status["remaining_queries"] = resp_headers["X-RateLimit-Remaining"]
    limit_status["reset_time"] = int(resp_headers["X-RateLimit-Reset"])

    # Write progress to log and display status.
    log_string = "{0}: {1} bytes {2} results".format(
        date, size, query_status["num_results"]
    )
    write_to_log("../logs/nb_metadata_query_log.txt", log_string)
    debug_print(log_string)

    # Check if query result is acceptable: few enough results, a follow-up
    # page, or a size range that cannot be narrowed any further.
    size_bounds = size.split("..")
    acceptable = (
        query_status["num_results"] <= QUERY_CUTOFF
        or query_status["page"] > 1
        or size_bounds[0] == size_bounds[1]
    )
    if not acceptable:
        return r, limit_status, query_status

    # Add days since 2019-01-01, as a fractional number of days.
    diff = datetime.datetime.now() - datetime.datetime(2019, 1, 1)
    j_days = (diff.days
              + (diff.seconds + diff.microseconds/(10**6))/(60*60*24))
    payload["days_since"] = j_days

    # Save this page. Update runs get a timestamped name.
    if query_status["updating"]:
        filename = "github_notebooks_{0}_p{1}_{2}.json".format(
            size, query_status["page"], datetime.datetime.now()
        )
    else:
        filename = "github_notebooks_{0}_p{1}.json".format(
            size, query_status["page"]
        )

    if query_status["local"]:
        with open(JSON_PATH+filename, "w") as json_file:
            json.dump(payload, json_file)
    else:
        page_obj = s3.Object("notebook-research", "json/"+filename)
        page_obj.put(Body=bytes(json.dumps(payload).encode("UTF-8")))

    # Display status.
    debug_print("Saved {0} bytes, p{1}".format(size, query_status["page"]))

    for item in payload["items"]:
        # If updating, done if this html_url has already been downloaded.
        if query_status["updating"] and "file" in item:
            html_url = item["html_url"].replace("#", "%23")
            file_name = item["file"]

            # If the same version of an existing notebook, done.
            if html_url in saved_urls:
                debug_print(("This notebook has already been "
                             "downloaded! Stop looking here."))
                query_status["another_page"] = False
                query_status["done"] = True
                break

            # If new version of an existing notebook, delete old.
            if file_name in current_notebooks:
                if query_status["local"]:
                    os.remove("../data/notebooks/{0}".format(file_name))
                else:
                    s3.Object(
                        "notebook-research",
                        "notebooks/{0}".format(file_name)
                    ).delete()

        # If we've retrieved num_results notebooks, we're done.
        path = item["repository"]["full_name"] + "/" + item["path"]
        query_status["all_items"].append(path)
        if len(query_status["all_items"]) == query_status["num_results"]:
            query_status["another_page"] = False
            query_status["done"] = True
            break

        query_status["num_needed"] += 1

    # Write progress to log and display
    log_string = "{0}: {1} bytes p{2} {3} items".format(
        date, size, query_status["page"], len(payload["items"])
    )
    write_to_log("../logs/nb_metadata_query_log.txt", log_string)
    debug_print(log_string)

    # if less than 100 items on the page, it's the last page
    # at most 10 pages
    if len(payload["items"]) < 100 or query_status["page"] == 10:
        query_status["done"] = True

    return r, limit_status, query_status
def s3_to_df(path):
    """
    Read a CSV object from the "notebook-research" bucket into a DataFrame.

    Args:
        path: key of the CSV object inside the bucket.

    Returns:
        DataFrame parsed with the first row as the header.
    """
    raw_bytes = s3.Object("notebook-research", path).get()["Body"].read()
    buffer = BytesIO(raw_bytes)
    return pd.read_csv(buffer, header=0)
def df_to_s3(data_frame, path):
    """
    Upload a DataFrame as CSV text to the "notebook-research" bucket.

    Args:
        data_frame: DataFrame to serialize (the index is not written).
        path: key of the destination object inside the bucket.
    """
    # With no target given, to_csv returns the CSV text as a string.
    csv_text = data_frame.to_csv(index=False)
    s3.Object("notebook-research", path).put(Body=csv_text)
def _save_frame_chunk(frame, stem, chunk_label, local, part=None):
    """Write one frame as <stem>_<EXTENSION>_<chunk_label>[_<part>].csv."""
    suffix = "" if part is None else "_{0}".format(part)
    if local:
        frame.to_csv("{0}/{1}_{2}_{3}{4}.csv".format(
            PATH, stem, EXTENSION, chunk_label, suffix), index=False)
    else:
        df_to_s3(frame, "{0}/{1}_{2}_{3}{4}.csv".format(
            S3_PATH, stem, EXTENSION, chunk_label, suffix))


def _save_frame_in_parts(frame, stem, chunk_label, local, parts):
    """Write `frame` as `parts` consecutive row slices numbered from 1."""
    size = len(frame)
    for k in range(parts):
        piece = frame.iloc[k * size // parts:(k + 1) * size // parts]
        _save_frame_chunk(piece, stem, chunk_label, local, part=k + 1)


def get_all_nb_cells(notebooks, local, done):
    """
    Get cell and notebook data for each notebook.

    Args:
        notebooks: DataFrame with a "file" column; its index values are used
            as the progress counter.
        local: truthy to write intermediate CSVs under PATH, otherwise they
            are uploaded under S3_PATH.
        done: number added to len(notebooks) in the progress message.

    Returns:
        (new_nb_info, all_cells_info): dicts holding only the rows gathered
        since the last intermediate save. new_nb_info maps file name to a
        notebook row; all_cells_info maps (file name, cell id) to a cell row.

    NOTE(review): notebooks are always read from S3, regardless of `local` -
    confirm that is intended.
    """
    new_nb_info = {}
    all_cells_info = {}
    missing = []

    for count, row in notebooks.iterrows():
        file_name = row["file"]
        data = None

        # Track progress.
        if count % COUNT_TRIGGER == 0 or count == len(notebooks) - 1:
            print("{0} / {1} notebooks processed for cell data".format(
                count,
                len(notebooks) + done))

            # Save data and reset. (In chunks to avoid MemoryError).
            if count > 0:
                # Transform data to DataFrame.
                notebooks_temp = pd.DataFrame(new_nb_info).transpose()
                cells_temp = pd.DataFrame(
                    all_cells_info).transpose().reset_index(drop=True)

                # The label is formatted straight from this division, as in
                # the original, so existing file names keep their form.
                chunk_label = count / COUNT_TRIGGER

                # Save data to CSV.
                try:
                    _save_frame_chunk(notebooks_temp, "notebooks2",
                                      chunk_label, local)
                    _save_frame_chunk(cells_temp, "cells1", chunk_label,
                                      local)
                except MemoryError:
                    # Split notebooks into 4 and cells into 8 sections and
                    # try saving again.
                    _save_frame_in_parts(notebooks_temp, "notebooks2",
                                         chunk_label, local, 4)
                    _save_frame_in_parts(cells_temp, "cells1", chunk_label,
                                         local, 8)

                # Empty current dictionaries.
                new_nb_info = {}
                all_cells_info = {}
                print("CSVs saved")

        # Initialize row of data.
        nb_info = {
            "file": file_name,
            "google_collab": False,
            "nbformat": "",
            "nbformat_minor": "",
            "num_cells": 0,
            "kernel_lang": "",
            "kernel_name": "",
            "lang_name": "",
            "lang_version": ""
        }

        # Open notebooks as json.
        try:
            obj = s3.Object("notebook-research",
                            "notebooks/{0}".format(file_name))
            data = json.loads(obj.get()["Body"].read().decode("UTF-8"))
        except Exception:
            # Report missed files.
            msg = "Notebook {0} did not open.".format(file_name)
            write_to_log("../logs/repo_metadata_cleaning_log.txt", msg)
            missing.append(file_name)

            # Add row with missing values.
            if file_name not in new_nb_info:
                new_nb_info[file_name] = nb_info
            continue

        # If data was able to load as JSON, extract information.
        if data and isinstance(data, dict):
            # Get nb top level format metadata.
            if "nbformat" in data:
                nb_info["nbformat"] = data["nbformat"]
            if "nbformat_minor" in data:
                nb_info["nbformat_minor"] = data["nbformat_minor"]

            # Get info from the metadata dictionary.
            metadata = data.get("metadata")
            if isinstance(metadata, dict):
                is_colab = "colab" in metadata

                # Access language data.
                kernelspec = metadata.get("kernelspec")
                if isinstance(kernelspec, dict):
                    if is_colab:
                        # If Google colab notebook, only Python 2.7 or 3.6
                        # are possible.
                        nb_info["google_collab"] = True
                        if ("name" in kernelspec
                                and "display_name" in kernelspec):
                            nb_info["kernel_lang"] = kernelspec["name"]
                            nb_info["kernel_name"] = kernelspec[
                                "display_name"]
                            if nb_info["kernel_lang"] == "python3":
                                nb_info["lang_name"] = "python"
                                nb_info["lang_version"] = "3.6"
                            elif nb_info["kernel_lang"] == "python2":
                                nb_info["lang_name"] = "python"
                                nb_info["lang_version"] = "2.7"
                    else:
                        # Not Google colab, access kernel language and
                        # display name.
                        if "language" in kernelspec:
                            nb_info["kernel_lang"] = kernelspec["language"]
                        if "display_name" in kernelspec:
                            nb_info["kernel_name"] = kernelspec[
                                "display_name"]

                # Access language info.
                if "language_info" in metadata and not is_colab:
                    language_info = metadata["language_info"]
                    # Same dict guard as kernelspec above: a null or
                    # malformed entry is skipped instead of crashing the run.
                    if isinstance(language_info, dict):
                        if "name" in language_info:
                            nb_info["lang_name"] = language_info["name"]
                        if "version" in language_info:
                            nb_info["lang_version"] = language_info[
                                "version"]
                elif "language" in metadata:
                    nb_info["lang_name"] = metadata["language"]

            # Get information about individual cells. Cells are either in a
            # top-level "cells" list or nested under "worksheets".
            # NOTE(review): "num_cells" is only set for the "cells" layout,
            # as in the original - confirm that is intended.
            if "cells" in data:
                nb_info["num_cells"] = len(data["cells"])
                cells = data["cells"]
            elif "worksheets" in data:
                cells = (cell for sheet in data["worksheets"]
                         for cell in sheet["cells"])
            else:
                cells = ()

            cells_info = {}
            for cell_id, cell in enumerate(cells):
                cell_info, nb_language = get_single_cell(
                    cell_id, file_name, cell, nb_info["lang_name"])

                # The first cell-level language fills in a missing one.
                if nb_info["lang_name"] == "":
                    nb_info["lang_name"] = nb_language.lower()

                cells_info[(file_name, cell_id)] = cell_info

            all_cells_info.update(cells_info)

        # Record the row even when the JSON was not a dict, so every
        # notebook gets a row just like the failed-to-open path above.
        if file_name not in new_nb_info:
            new_nb_info[file_name] = nb_info

    debug_print("{0} notebooks are missing cell data.".format(len(missing)))
    return new_nb_info, all_cells_info
def update_owners_repos(owners, repos, local):
    """
    Add information on Owners and Repos.

    Reads repos/repo_<id>.json from S3 for every id in repos.repo_id and
    collects repository and owner attributes for the non-forked ones. Files
    whose message is "Not Found" or "Bad credentials" are moved to
    repos_bad/.

    Args:
        owners: DataFrame with an "owner_id" column.
        repos: DataFrame with a "repo_id" column.
        local: not used by this function.

    Returns:
        (updated_owners, updated_repos): the inputs merged with the new
        attributes, or two empty lists when nothing new was collected.
    """
    log_path = "../logs/repo_metadata_cleaning_log.txt"
    # Attributes copied verbatim from the repo JSON, in output column order.
    repo_fields = (
        "language", "forks_count", "stargazers_count", "watchers_count",
        "subscribers_count", "size", "open_issues_count", "has_issues",
        "has_wiki", "has_pages", "has_downloads", "pushed_at", "created_at",
        "updated_at",
    )

    new_repo_info = {}
    new_owner_info = {}
    repo_ids = list(repos.repo_id)
    missing = 0
    forked = 0
    moved = 0

    for position, repo_id in enumerate(repo_ids):
        # Keep track of progress.
        if position % COUNT_TRIGGER == 0:
            debug_print("{0} / {1} repo data files processed".format(
                position, len(repo_ids)))

        try:
            stored = s3.Object("notebook-research",
                               "repos/repo_{0}.json".format(repo_id))
            repo_json = json.loads(
                stored.get()["Body"].read().decode("UTF-8"))
        except Exception:
            # Report missed files.
            missing += 1
            write_to_log(
                log_path,
                "Repo {0} metadata did not process.".format(repo_id))
            continue

        # A file that parsed to null carries no data.
        if repo_json is None:
            missing += 1
            continue

        if "message" in repo_json and (
                repo_json["message"] == "Not Found"
                or repo_json["message"] == "Bad credentials"):
            # Report missed files.
            missing += 1
            msg = "Repo {0} metadata file did not download well.".format(
                repo_id)

            # Move bad file
            s3.Object(
                'notebook-research',
                'repos_bad/repo_{0}.json'.format(repo_id)).copy_from(
                    CopySource='notebook-research/repos/repo_{0}.json'.
                    format(repo_id))
            s3.Object('notebook-research',
                      'repos/repo_{0}.json'.format(repo_id)).delete()
            moved += 1

            write_to_log(log_path, msg)
            continue

        if "owner" not in repo_json:
            # Report missed files.
            write_to_log(
                log_path,
                "Repo {0} metadata file not complete.".format(repo_id))
            continue
        owner_id = repo_json["owner"]["id"]

        if repo_json["fork"]:
            forked += 1
            continue

        # Add repo info (first occurrence of an id wins).
        repo_info = {"repo_id": repo_id}
        for field in repo_fields:
            repo_info[field] = repo_json[field]
        new_repo_info.setdefault(repo_id, repo_info)

        # Add owner info
        owner_info = {
            "owner_id": owner_id,
            "type": repo_json["owner"]["type"]
        }
        new_owner_info.setdefault(owner_id, owner_info)

    # Display status.
    debug_print("We have {0} new repos.".format(len(new_repo_info)))
    debug_print("Couldn't process {0} files.".format(missing))
    debug_print("{0} new repos were forked.".format(forked))
    debug_print("{0} files had to be moved".format(moved))

    # Translate dictionaries to DataFrames.
    if len(new_owner_info) == 0 or len(new_repo_info) == 0:
        return [], []

    owner_frame = pd.DataFrame(new_owner_info).transpose().reset_index(
        drop=True)
    repo_frame = pd.DataFrame(new_repo_info).transpose().reset_index(
        drop=True)
    updated_owners = owners.merge(owner_frame, on="owner_id")
    updated_repos = repos.merge(repo_frame, on="repo_id")

    return updated_owners, updated_repos
def parallelize_download(notebooks, repos, owners, local):
    """
    Distribute not-yet-partitioned notebooks randomly among NUM_WORKERS.

    Existing partitions are loaded (from download_partitions.pickle when
    `local`, otherwise from one S3 object per worker) and extended. If they
    cannot be loaded, fresh partitions are created and every notebook is
    treated as new.

    Args:
        notebooks: DataFrame with "file", "repo_id" and "owner_id" columns.
        repos: DataFrame with a "repo_id" column.
        owners: DataFrame with an "owner_id" column.
        local: truthy to load/save the partitions as a local pickle file.

    Returns:
        True if every notebook was already partitioned (nothing is saved),
        False after new partitions have been saved.
    """
    # Open existing partitions if they are present.
    try:
        if local:
            # Context manager so the handle is closed even if unpickling
            # fails.
            with open("download_partitions.pickle", "rb") as f:
                partitions = pickle.load(f)
        else:
            partitions = []
            for i in range(NUM_WORKERS):
                obj = s3.Object("notebook-research",
                                "download_partitions_{0}.pickle".format(i))
                partitions.append(
                    pickle.load(BytesIO(obj.get()['Body'].read())))
        print("Paritions opened")

        # List already partitioned notebooks
        notebooks_partitioned = []
        for partition in partitions:
            notebooks_partitioned += list(partition['notebooks']['file'])
        debug_print("{0} notebooks have already been partitioned.".format(
            len(notebooks_partitioned)))

        # Isolate notebooks not yet partitioned
        notebooks_new = notebooks[~notebooks.file.isin(notebooks_partitioned)]
        if len(notebooks_new) == 0:
            print("All notebooks have already been partitioned.")
            return True

    except Exception as e:
        print(e)
        # All notebooks are new
        notebooks_new = notebooks
        partitions = [{
            "id": i,
            "notebooks": [],
            "repos": [],
            "owners": []
        } for i in range(NUM_WORKERS)]

    # Shuffle new notebooks
    notebooks_new = notebooks_new.sample(frac=1).reset_index(drop=True)

    # Randomly assign notebooks and the repos/owners that go with them.
    partition_notebooks = np.array_split(notebooks_new, NUM_WORKERS)
    for i in range(NUM_WORKERS):
        p = partitions[i]
        new_nbs = partition_notebooks[i]

        # Add new notebooks to whatever the partition already holds.
        if len(p["notebooks"]) > 0:
            p["notebooks"] = pd.concat([p["notebooks"], new_nbs])
        else:
            p["notebooks"] = new_nbs

        # Add the repos and owners referenced by the new notebooks.
        for key, frame, id_column in (("repos", repos, "repo_id"),
                                      ("owners", owners, "owner_id")):
            related = frame[frame[id_column].isin(
                new_nbs[id_column])].reset_index(drop=True)
            if len(p[key]) > 0:
                p[key] = pd.concat([p[key], related])
            else:
                p[key] = related

        print('done with', i)

    # Save partition data.
    print('saving...')
    if local:
        with open("download_partitions.pickle", "wb") as f:
            pickle.dump(partitions, f)
    else:
        for i, partition in enumerate(partitions):
            obj = s3.Object("notebook-research",
                            "download_partitions_{0}.pickle".format(i))
            obj.put(Body=bytes(pickle.dumps(partition)))
    print('...saved')

    return False