Пример #1
0
def download_nbs(notebooks, local, current_files):
    """
    Download notebooks from GitHub.
    Equivalent to Adam's 2_nb_download.ipynb.

    Rows of ``notebooks`` are visited in ascending ``days_since`` order.
    A row is skipped when its ``file`` is already in ``current_files`` or
    when its ``html_url`` points inside an ``.ipynb_checkpoints`` directory.
    Every other notebook is fetched from raw.githubusercontent.com and
    stored either under ``../data/notebooks/`` (``local`` true) or in the
    ``notebook-research`` S3 bucket under ``notebooks/``.

    Args:
        notebooks: DataFrame with (at least) the columns ``file``,
            ``html_url`` and ``days_since``.
        local: Save to the local data directory instead of S3.
        current_files: Container of file names that were already downloaded.

    Failures for individual notebooks are logged and reported, never raised.
    """
    debug_print("Downloading notebooks\n")
    already_done = 0
    checkpoints = 0
    new = 0
    total = len(notebooks)

    ordered_rows = notebooks.sort_values(by="days_since").iterrows()
    for count, (_, row) in enumerate(ordered_rows):
        date_string = datetime.datetime.now().strftime(r"%Y-%m-%d %H:%M:%S")

        # Keep track of the download progress.
        if count % COUNT_TRIGGER == 0:
            debug_print("{0} / {1} notebooks downloaded.".format(
                count, total))

        # Don't download files we already have.
        if row["file"] in current_files:
            already_done += 1
            continue

        # Don't download files in .ipynb_checkpoints. (The original counted
        # these but fell through and downloaded them anyway.)
        if ".ipynb_checkpoints" in row["html_url"]:
            checkpoints += 1
            continue

        try:
            # Access the raw content webpage and download the file.
            raw_url = row["html_url"].replace(
                "github.com",
                "raw.githubusercontent.com").replace("/blob", "")
            r = requests.get(raw_url)
            # Treat HTTP errors (e.g. 404 for a deleted notebook) as a failed
            # download instead of saving the error page as notebook content.
            r.raise_for_status()

            # Save file.
            if local:
                filename = "../data/notebooks/{0}".format(row["file"])
                # Explicit UTF-8 so the result does not depend on the
                # platform's default encoding.
                with open(filename, "w", encoding="utf-8") as nb_file:
                    nb_file.write(r.text)
            else:
                obj = s3.Object("notebook-research",
                                "notebooks/{0}".format(row["file"]))
                obj.put(Body=r.text.encode("UTF-8"))

            new += 1
            msg = "{0}: downloaded {1}".format(date_string, row["file"])
            write_to_log("../logs/nb_log.txt", msg)

        except Exception:
            # Report missed files; one bad notebook must not stop the run.
            msg = "{0}: had trouble downloading {1}".format(
                date_string, row["file"])
            write_to_log("../logs/nb_log.txt", msg)
            debug_print(msg)

    debug_print("{0} were already done. {1} were in ipynb checkpoints. {2} ".
                format(already_done, checkpoints, new) +
                "new notebooks were downloaded.")
Пример #2
0
def clean_metadata(num_needed, updating, local):
    """
    Extract information from metadata JSON files and save to CSVs.
    Equivalent to Adam's 1_nb_metadata_cleaning.ipynb.

    Reads every query-result JSON file (from ``JSON_PATH`` when ``local``,
    otherwise from the ``json/`` prefix in S3), collects the notebooks,
    repositories and owners that have not been recorded yet, and writes
    them, together with the previously recorded rows, to
    ``notebooks1.csv``, ``owners1.csv`` and ``repos1.csv``.

    Args:
        num_needed: When ``updating``, stop as soon as this many new
            notebooks have been collected.
        updating: Whether this run only tops up an existing data set.
        local: Read/write on the local file system instead of S3.
    """
    # Load the results of earlier runs. If they cannot be loaded we start
    # from scratch with empty frames so the final concat still works.
    # NOTE(review): any load failure (not only "file missing") lands in the
    # fallback, after which the CSVs are rewritten with new rows only.
    try:
        if local:
            notebooks_done = pd.read_csv("{0}/notebooks1.csv".format(PATH))
            owners_done = pd.read_csv("{0}/owners1.csv".format(PATH))
            repos_done = pd.read_csv("{0}/repos1.csv".format(PATH))
        else:
            notebooks_done = s3_to_df("csv/notebooks1.csv")
            owners_done = s3_to_df("csv/owners1.csv")
            repos_done = s3_to_df("csv/repos1.csv")

        notebook_files_done = set(notebooks_done.file)
        owner_ids_done = set(owners_done.owner_id)
        repo_ids_done = set(repos_done.repo_id)

        print(
            'Metadata already processed for {0} notebooks, {1} owners, and {2} repos.'
            .format(len(notebook_files_done), len(owner_ids_done),
                    len(repo_ids_done)))

    except Exception:
        notebooks_done = pd.DataFrame()
        owners_done = pd.DataFrame()
        repos_done = pd.DataFrame()
        notebook_files_done = set()
        owner_ids_done = set()
        repo_ids_done = set()

        print("Metadata not processed for any files.")

    # Get all query files.
    if local:
        nb_search_files = os.listdir(JSON_PATH)
    else:
        nb_search_files = list_s3_dir('json/')

    def query_file_order(file_name):
        """Sort key: (lower size bound, page number) parsed from the name."""
        parts = file_name.split("_")
        return (int(parts[2].split("..")[0]),
                int(parts[3][1:].split(".")[0]))

    # Sort query files by size then by page number.
    nb_search_files = sorted(nb_search_files, key=query_file_order)
    num_files = len(nb_search_files)

    debug_print("We have {0} query files.".format(num_files))

    notebooks = {}
    repos = {}
    owners = {}

    for file_number, json_file_name in enumerate(nb_search_files, start=1):
        # Keep track of progress.
        # NOTE(review): `%` and `/` bind left to right, so this fires every
        # COUNT_TRIGGER files; if "every COUNT_TRIGGER / 100 files" was
        # intended, the expression needs parentheses.
        if file_number % COUNT_TRIGGER / 100 == 0 or file_number == num_files:
            debug_print("{0} / {1} data files processed".format(
                file_number, num_files))

        # Parse file name to get size and query page.
        file_components = json_file_name.replace(".json", "").split("_")
        filesize = file_components[2]
        query_page = int(file_components[3][1:])

        if local:
            with open(JSON_PATH + json_file_name, "r") as json_file:
                file_dict = json.load(json_file)
        else:
            obj = s3.Object("notebook-research",
                            "json/{0}".format(json_file_name))
            file_dict = json.loads(obj.get()["Body"].read().decode("UTF-8"))

        # Report missing data.
        if file_dict.get("incomplete_results") == True:  # noqa: E712
            msg = "{0} has incomplete results".format(json_file_name)
            write_to_log("../logs/nb_metadata_cleaning_log.txt", msg)

        days_since = file_dict["days_since"]

        if "items" not in file_dict:
            msg = "{0} has no items object".format(json_file_name)
            write_to_log("../logs/nb_metadata_cleaning_log.txt", msg)

        elif len(file_dict["items"]) == 0:
            msg = "{0} has 0 items".format(json_file_name)
            write_to_log("../logs/nb_metadata_cleaning_log.txt", msg)

        else:
            # Save data for each item.
            for item in file_dict["items"]:
                item_repo = item["repository"]
                item_owner = item_repo["owner"]
                repo_id = item_repo["id"]
                owner_id = item_owner["id"]

                # Don't save forked notebooks.
                if item_repo["fork"]:
                    continue

                # Full path is unique for each file.
                name = "{0}/{1}/{2}".format(item_owner["login"],
                                            item_repo["name"],
                                            item["path"]).replace("/", "..")

                if name not in notebook_files_done:
                    notebooks[name] = {
                        "file": name,
                        "html_url": item["html_url"],
                        "name": item["name"],
                        "path": item["path"],
                        "repo_id": repo_id,
                        "owner_id": owner_id,
                        "filesize": filesize,
                        "query_page": query_page,
                        "days_since": days_since
                    }

                if repo_id not in repos and repo_id not in repo_ids_done:
                    repos[repo_id] = {
                        "repo_name": item_repo["name"],
                        "owner_id": owner_id,
                        "repo_description": item_repo["description"],
                        "repo_fork": item_repo["fork"],
                        "repo_html_url": item_repo["html_url"],
                        "repo_private": item_repo["private"],
                    }

                if owner_id not in owners and owner_id not in owner_ids_done:
                    owners[owner_id] = {
                        "owner_html_url": item_owner["html_url"],
                        "owner_login": item_owner["login"],
                    }

                # If updating we don't always need the full page.
                if updating and len(notebooks) == num_needed:
                    break

        if updating and len(notebooks) == num_needed:
            break

    # Display status
    debug_print(("\nAfter processing all query files, "
                 "we have {0} new notebooks.").format(len(notebooks)))
    debug_print("Written by {0} owners.".format(len(owners)))
    debug_print("Held in {0} repositories.".format(len(repos)))

    # Translate dictionaries to DataFrames and save to CSV.
    # Ordered by days since, if duplicates keep the most recent
    # (i.e. keep last, which was found more days since 1-1-19).
    if notebooks:
        notebooks_df = pd.DataFrame(notebooks).transpose()\
            .sort_values(by=["days_since", "file"]).drop_duplicates(
                subset=["file"],
                keep="last"
            )
    else:
        # An empty frame has no columns to sort by; nothing new to add.
        notebooks_df = pd.DataFrame()
    owners_df = pd.DataFrame(owners).transpose().reset_index().rename(
        columns={"index": "owner_id"}, index=str)
    repos_df = pd.DataFrame(repos).transpose().reset_index().rename(
        columns={"index": "repo_id"}, index=str)

    outputs = (
        ("notebooks1.csv", notebooks_df, notebooks_done),
        ("owners1.csv", owners_df, owners_done),
        ("repos1.csv", repos_df, repos_done),
    )
    for csv_name, new_rows, old_rows in outputs:
        combined = pd.concat([new_rows, old_rows])
        if local:
            combined.to_csv("{0}/{1}".format(PATH, csv_name), index=False)
        else:
            df_to_s3(combined, "csv/{0}".format(csv_name))
Пример #3
0
def main():
    """
    Command-line entry point: download notebooks and/or repo metadata.

    Loads the notebook, owner and repo tables (a single worker's partition
    when ``--worker N`` is given, otherwise the full CSVs), then runs
    ``download_nbs`` and/or ``download_repo_data`` depending on the
    ``--repos`` / ``--notebooks`` flags, logging a timestamp after each
    stage.

    Raises:
        Exception: If both ``--repos`` and ``--notebooks`` are given.
    """
    # Parse command line arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-w",
        "--worker",
        metavar="N",
        type=int,
        help=("GITHUB_TOKEN assigned to these files " +
              "(will use partition N stored in download_partitions.pickle)."))
    parser.add_argument("-r",
                        "--repos",
                        action="store_true",
                        dest="only_repos",
                        help="Download repos only.")
    parser.add_argument("-n",
                        "--notebooks",
                        action="store_true",
                        dest="only_nbs",
                        help="Download notebooks only.")
    parser.add_argument("--local",
                        action="store_true",
                        dest="local",
                        help="Save data locally instead of in S3.")
    args = parser.parse_args()
    worker = args.worker
    only_repos = args.only_repos
    only_nbs = args.only_nbs
    local = args.local

    # The two "only" flags contradict each other.
    if only_repos and only_nbs:
        raise Exception(
            "Cannot use both --repos and --notebooks flags. Use --help flag for more information."
        )

    # If a worker was specified, get partition data and correct header.
    if worker is not None:
        print("Worker {0}".format(worker))

        # NOTE: pickle.load can execute arbitrary code from its input; this
        # is only safe while the pickle file / bucket contents are trusted.
        try:
            if local:
                with open("download_partitions.pickle", "rb") as f:
                    partitions_download = pickle.load(f)
                partition = partitions_download[worker]
            else:
                obj = s3.Object(
                    "notebook-research",
                    "download_partitions_{0}.pickle".format(worker))
                partition = pickle.load(BytesIO(obj.get()["Body"].read()))
        except Exception:
            print(("Download Partitions data were not found {0}. ".format(
                "locally" if local else "in s3") +
                   "Please run parallelize_download.py and try again."))
            # NOTE(review): exits with status 0 although this is a failure.
            sys.exit(0)

        notebooks1 = partition["notebooks"]

        # NOTE(review): owners/repos are read from S3 even with --local,
        # unlike the no-worker branch below; confirm this is intended.
        obj = s3.Object("notebook-research", "csv/owners1.csv")
        owners = pd.read_csv(BytesIO(obj.get()["Body"].read()))

        obj = s3.Object("notebook-research", "csv/repos1.csv")
        repos = pd.read_csv(BytesIO(obj.get()["Body"].read()))

        # Restrict owners and repos to those referenced by this partition.
        owners1 = notebooks1[["owner_id"]].merge(
            owners[['owner_id', 'owner_login']],
            on="owner_id",
            how='left').drop_duplicates()
        repos1 = notebooks1[["repo_id", 'owner_id']].merge(
            repos[['repo_id', 'repo_name']],
            on='repo_id',
            how='left').drop_duplicates()
        header = HEADERS[partition["id"]]

        debug_print("Partition data for downloads found and opened. " +
                    "Notebooks1, Owners1, and Repos1 were found and opened." +
                    BREAK)

    # If a worker was not specified, get all data and use first header.
    else:
        try:
            if local:
                notebooks1 = pd.read_csv("{0}/notebooks1.csv".format(PATH))
                owners1 = pd.read_csv("{0}/owners1.csv".format(PATH))
                repos1 = pd.read_csv("{0}/repos1.csv".format(PATH))
            else:
                notebooks1 = s3_to_df("csv/notebooks1.csv")
                owners1 = s3_to_df("csv/owners1.csv")
                repos1 = s3_to_df("csv/repos1.csv")
        except Exception:
            print("The files 'notebooks1.csv','repos1.csv', and " +
                  "'owners1.csv' were not found. Please run query_git.py " +
                  "and try again.")
            sys.exit(0)

        header = HEADERS[0]

    # Check time and display status.
    print("{0} notebooks, {1} repos, {2} owners".format(
        len(notebooks1), len(repos1), len(owners1)))
    check1 = datetime.datetime.now()
    write_to_log("../logs/timing.txt",
                 "download CHECKPOINT 1: {0}".format(check1))

    # Start of the stage currently being timed. Needed because the notebook
    # stage (and with it checkpoint 2) is skipped when --repos is given.
    last_check = check1

    # Download full notebooks from github.
    if not only_repos:
        if local:
            current_files = set(os.listdir("../data/notebooks"))
        else:
            obj = s3.Object("notebook-research", "current_notebooks.pickle")
            current_files = pickle.load(BytesIO(obj.get()["Body"].read()))

        num_done = len(current_files)
        debug_print(
            "{0} notebooks have already been downloaded.".format(num_done))

        download_nbs(notebooks1, local, current_files)
        check2 = datetime.datetime.now()
        write_to_log("../logs/timing.txt", "CHECKPOINT 2: {0}".format(check2))
        debug_print("\nNotebooks have been downloaded. Time: {0}{1}".format(
            check2 - check1, BREAK))
        last_check = check2

    # Download data from github.
    if not only_nbs:
        download_repo_data(repos1, owners1, header, local)
        check3 = datetime.datetime.now()
        write_to_log("../logs/timing.txt", "CHECKPOINT 3: {0}".format(check3))
        debug_print("\nRepos have been downloaded. " +
                    "Time: {0}{1}".format(check3 - last_check, BREAK))
Пример #4
0
def download_repo_data(repos, owners, header, local):
    """
    Download repository metadata files from GitHub.

    For every repo/owner pair that does not yet have a saved
    ``repo_<id>.json``, queries ``https://api.github.com/repos/<owner>/<repo>``
    and stores the JSON response locally (``../data/repos/``) or in S3
    (``repos/``). When rate limited, waits until the advertised reset time
    and retries once; a second consecutive rate-limit response for the same
    URL skips that repo.

    Args:
        repos: DataFrame with ``repo_id``, ``repo_name`` and ``owner_id``.
        owners: DataFrame with ``owner_id`` and ``owner_login``.
        header: Request headers passed to the GitHub API call.
        local: Save to the local data directory instead of S3.

    Failures for individual repos are logged and reported, never raised.
    """
    if len(repos) == 0 or len(owners) == 0:
        return

    data_frame = repos.merge(owners, on="owner_id")
    total = len(data_frame)

    # Files already downloaded; a set because it is probed once per row.
    current_repos = set(
        os.listdir("../data/repos") if local else list_s3_dir("repos/"))

    debug_print(("There are currently {0} repo metadata files saved.").format(
        len(current_repos)))

    num_recorded_since = 0
    num_tried_since = 0
    hit_url = ''

    for i, row in data_frame.iterrows():
        num_tried_since += 1

        # Keep track of the download progress.
        if i % COUNT_TRIGGER == 0 or i == total:
            debug_print("{0} / {1} repos downloaded.".format(i, total))

        # Skip repos whose metadata file already exists.
        if "repo_{0}.json".format(row["repo_id"]) in current_repos:
            continue

        url = "https://api.github.com/repos/{0}/{1}".format(
            row["owner_login"], row["repo_name"])

        # Download repository metadata, retrying after a rate-limit wait.
        repo_recorded = False
        wait_time = 0
        while not repo_recorded:
            time.sleep(wait_time)
            date_string = datetime.datetime.now().strftime(
                r"%Y-%m-%d %H:%M:%S")
            try:
                # Query the api.
                r = requests.get(url, headers=header)
                j = r.json()
                h = r.headers

                # Handle rate limiting. The HTTP status code is used rather
                # than a "Status" response header, which is not guaranteed
                # to be present.
                if r.status_code == 403:
                    debug_print(
                        "{0}: Hit rate limit. Retry at {1}. {2} tried and {3} saved since last hit."
                        .format(h["Date"],
                                time.ctime(int(h["X-RateLimit-Reset"])),
                                num_tried_since, num_recorded_since))
                    if hit_url == url:
                        # Waiting did not help for this URL; give up on it.
                        print('Same one again, skipping')
                        repo_recorded = True
                        continue

                    # Never negative: time.sleep() rejects negative values
                    # and the reset time may already have passed.
                    wait_time = max(
                        0, int(h["X-RateLimit-Reset"]) - time.time() + 1)
                    num_tried_since = 0
                    num_recorded_since = 0
                    hit_url = url
                    continue

                if "message" in j and j["message"] in ("Not Found",
                                                       "Bad credentials"):
                    print(url, 'Message:', j['message'])
                    # Handled by the generic failure path below.
                    raise Exception

                # Save JSON File.
                if local:
                    filename = "../data/repos/repo_{0}.json".format(
                        row["repo_id"])
                    with open(filename, "w") as repo_file:
                        json.dump(j, repo_file)
                else:
                    obj = s3.Object(
                        "notebook-research",
                        "repos/repo_{0}.json".format(row["repo_id"]))
                    obj.put(Body=json.dumps(j).encode("UTF-8"))

                # Report Status.
                msg = "{0}: downloaded repo {1}".format(
                    date_string, row["repo_id"])
                write_to_log("../logs/repo_metadata_query_log.txt", msg)
                repo_recorded = True
                wait_time = 0
                num_recorded_since += 1

            except Exception as e:
                # Report missed files and move on to the next repo.
                msg = "{0}: had trouble downloading repo {1}".format(
                    date_string, row["repo_id"])
                write_to_log("../logs/repo_metadata_query_log.txt", msg)
                debug_print(msg)
                debug_print(e)
                repo_recorded = True
Пример #5
0
def save_page(
    url, size, header, query_status, 
    saved_urls, current_notebooks
):
    """
    Save results page to json file.

    Requests ``url`` from the GitHub API, updates ``query_status`` in place
    with what was found, and stores the page as JSON (under ``JSON_PATH``
    when ``query_status["local"]`` is true, otherwise under ``json/`` in S3)
    if the result set is acceptable: small enough, a follow-up page, or a
    size range that cannot be narrowed further.

    Args:
        url: API URL of the results page to fetch.
        size: Size range being queried, formatted ``"<min>..<max>"``.
        header: Request headers passed to the GitHub API call.
        query_status: Mutable dict tracking the progress of this query.
        saved_urls: ``html_url`` values recorded by earlier runs.
        current_notebooks: File names of notebooks already downloaded.

    Returns:
        Tuple ``(response, limit_status, query_status)``. ``response`` is
        ``None`` after a request timeout. ``limit_status["limited"]`` is
        true when the caller has to wait ``limit_status["wait_time"]``
        seconds before retrying.
    """
    
    # Set initial rate limiting management variables.
    limit_status = {
        "reset_time": time.time(),
        "limited": False,
        "wait_time": 0,
        "remaining_queries": 30
    }
    
    # Query GitHub API.
    try:
        r = requests.get(url, headers = header)
        j = r.json()
        h = r.headers
    except requests.exceptions.Timeout:
        debug_print("Request timeout.")
        limit_status["limited"] = True
        limit_status["wait_time"] = 60
        return None, limit_status, query_status

    # Handle rate limiting. The HTTP status code is used rather than a
    # "Status" response header: that header is not guaranteed to exist, and
    # treating its absence as "limited" makes callers retry forever.
    if r.status_code in (403, 429):
        try:
            wait_time = int(h["Retry-After"])
        except (KeyError, ValueError):
            # Default wait time to 1 minute.
            wait_time = 60

        debug_print(
            "{0}: Hit rate limit. Retry after {1} seconds".format(
                h.get("Date"), 
                wait_time
            )
        )

        # Set to limited and update wait time.
        limit_status["limited"] = True
        limit_status["wait_time"] = wait_time
        return r, limit_status, query_status

    # Update rate limiting management variables.
    date = h["Date"]
    query_status["num_results"] = int(j["total_count"])
    limit_status["remaining_queries"] = h["X-RateLimit-Remaining"]
    limit_status["reset_time"] = int(h["X-RateLimit-Reset"])

    # Write progress to log and display status.
    log_string = "{0}: {1} bytes {2} results".format(
        date, size, query_status["num_results"]
    )
    write_to_log("../logs/nb_metadata_query_log.txt", log_string)
    debug_print(log_string)

    # Check if query result is acceptable. Otherwise the caller is expected
    # to narrow the size range and query again.
    acceptable = (
        query_status["num_results"] <= QUERY_CUTOFF or 
        query_status["page"] > 1 or
        size.split("..")[0] == size.split("..")[1]
    )
    if not acceptable:
        return r, limit_status, query_status

    # Add days since 2019-01-01, as a fractional number of days.
    diff = datetime.datetime.now() - datetime.datetime(2019,1,1)
    j["days_since"] = (diff.days 
        + (diff.seconds + diff.microseconds/(10**6))/(60*60*24)
    )
    
    # Save this page. Update runs get a timestamp suffix in the file name.
    if query_status["updating"]:
        filename = "github_notebooks_{0}_p{1}_{2}.json".format(
            size, query_status["page"], datetime.datetime.now()
        )
    else:
        filename = "github_notebooks_{0}_p{1}.json".format(
            size, query_status["page"]
        )

    if query_status["local"]:
        with open(JSON_PATH+filename, "w") as json_file:
            json.dump(j, json_file)
    else:
        obj = s3.Object("notebook-research","json/"+filename)
        obj.put(Body = json.dumps(j).encode("UTF-8"))
    
    # Display status.
    debug_print("Saved {0} bytes, p{1}".format(size, query_status["page"]))

    for item in j["items"]:
        # If updating, done if this html_url has already been downloaded.
        if query_status["updating"] and "file" in item:
            html_url = item["html_url"].replace("#","%23")
            file_name = item["file"]
            # If the same version of an existing notebook, done.
            if html_url in saved_urls:
                debug_print(("This notebook has already been "
                    "downloaded! Stop looking here.")
                )
                query_status["another_page"] = False
                query_status["done"] = True
                break
            # If new version of an existing notebook, delete old.
            elif file_name in current_notebooks:
                if query_status["local"]:
                    os.remove("../data/notebooks/{0}".format(file_name))
                else:
                    s3.Object(
                        "notebook-research",
                        "notebooks/{0}".format(file_name)
                    ).delete()

        # If we've retrieved num_results notebooks, we're done.
        path = item["repository"]["full_name"] + "/" + item["path"]
        query_status["all_items"].append(path)
        if len(query_status["all_items"]) == query_status["num_results"]:
            query_status["another_page"] = False
            query_status["done"] = True
            break

        query_status["num_needed"] += 1

    # Write progress to log and display status.
    num_items = len(j["items"])
    log_string = "{0}: {1} bytes p{2} {3} items".format(
        date, size, query_status["page"], num_items
    )
    write_to_log("../logs/nb_metadata_query_log.txt", log_string)
    debug_print(log_string)

    # If less than 100 items on the page, it's the last page.
    # At most 10 pages.
    if num_items < 100 or query_status["page"] == 10:
        query_status["done"] = True

    return r, limit_status, query_status
Пример #6
0
def main():
    """
    Command-line entry point: query GitHub for notebooks in a size range.

    Parses the ``min`` / ``max`` size bounds and the ``--update``,
    ``--local`` and ``--worker`` options, fetches the query-result JSON
    files through ``get_json``, and then either saves the number of needed
    notebooks to ``num_needed_<worker>.save`` (worker runs) or launches
    ``process.py`` in the background (single-process runs). Timing
    checkpoints are written to ``../logs/timing.txt``.
    """
    # Parse command line arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "min", type=int, 
        help="Minimum size to search."
    )
    parser.add_argument(
        "max", type=int, 
        help="Maximum size to search."
    )
    parser.add_argument(
        "--update", action="store_true", 
        dest="updating", 
        help=(
            "Search notebooks that have been added "
            + "or updated since last search, along with new "
            + "notebooks"
        )
    )
    parser.add_argument(
        "--local", action="store_true", 
        dest="local", 
        help="Saves output locally instead of in S3."
    )
    parser.add_argument(
        "--worker", metavar="N", type=int, 
        help=(
            "GITHUB_TOKEN assigned to these sizes (workers "
            + "sorted in alphabetical order: {0}).".format(
                list(TOKENS.keys())
            )
        )
    )
    args = parser.parse_args()
    min_size = args.min
    max_size = args.max
    updating = args.updating
    worker = args.worker
    local = args.local

    # If updating, look at saved_urls to determine a duplicate.
    # New versions of notebooks will overwrite earlier downloads.
    notebooks1 = None
    saved_urls = []
    current_csvs = os.listdir(PATH) if local else list_s3_dir('csv')
    if updating and "notebooks1.csv" in current_csvs:
        if local:
            notebooks1 = pd.read_csv("{0}/notebooks1.csv".format(PATH))
        else:
            notebooks1 = s3_to_df('csv/notebooks1.csv')
        saved_urls = list(notebooks1.html_url)

    # Set worker: each worker uses its own request header.
    header = HEADERS[worker] if worker is not None else HEADERS[0]

    # Log and display status.
    write_to_log(
        "../logs/timing.txt", 
        "Testing on the size range {0} to {1}".format(min_size, max_size)
    )
    start = datetime.datetime.now()
    write_to_log("../logs/timing.txt", "START: {0}".format(start))
    debug_print(
        BREAK 
        + "Downloading and formatting data for all Jupyter "
        + "Notebooks publicly stored on github." 
        + BREAK
    )        

    # List notebooks already downloaded. notebooks1 is only loaded when
    # updating AND notebooks1.csv exists, so guard on the frame itself; an
    # update run without an earlier CSV simply has nothing recorded yet.
    current_notebooks = set(notebooks1.file) if notebooks1 is not None else []

    # Get json query files for given size range.
    num_needed = get_json(min_size, max_size, saved_urls, header, 
                          updating, local, current_notebooks)

    if worker is not None:
        # Worker runs only record how many notebooks they need.
        with open('num_needed_{0}.save'.format(worker),'w') as f:
            f.write(str(num_needed))
    else:
        # Single-process runs hand over to process.py in the background.
        command = 'nohup python3 -u process.py --needed {0}'.format(num_needed)
        if updating:
            command += ' --updating'
        if local:
            command += ' --local'
        
        os.system(command + ' > process.log &')

    # Check time, log, and display status.
    check1 = datetime.datetime.now()
    write_to_log("../logs/timing.txt", "CHECKPOINT 1: {0}".format(check1))
    debug_print(
        "\nJson query files have been downloaded. "
        + "Time: {0}{1}".format(check1 - start, BREAK)
    )
    
    # Check time, log, and display status.
    check2 = datetime.datetime.now()
    write_to_log("../logs/timing.txt","CHECKPOINT 2: {0}".format(check2))
        
    debug_print("All together, {0}".format(check2 - start))
Пример #7
0
def query(url, size, header, 
        saved_urls, updating, 
        local, current_notebooks):
    """ 
    Query GitHub for notebooks of a given size and return query status.

    Fetches the first results page for ``url`` through ``save_page`` and
    then follows the ``next`` links until every result has been seen, the
    last page is reached, or ``save_page`` reports that paging should stop.
    If the query returns more than ``QUERY_CUTOFF`` results and the size
    range can still be narrowed (min != max), returns immediately so the
    caller can split the range.

    Args:
        url: API URL of the first results page.
        size: Size range being queried, formatted ``"<min>..<max>"``.
        header: Request headers passed to the GitHub API call.
        saved_urls: ``html_url`` values recorded by earlier runs.
        updating: Whether this run only tops up an existing data set.
        local: Save pages locally instead of in S3.
        current_notebooks: File names of notebooks already downloaded.

    Returns:
        The ``query_status`` dict describing what was found.
    """

    # Set initial rate limiting management variables.
    limit_status = {
        "reset_time": time.time(),
        "limited": False,
        "wait_time": 0,
        "remaining_queries": 30
    }
    
    # Set initial query status variables.
    query_status = {
        "done": False,
        "page": 1,
        "another_page": False,
        "updating": updating,
        "local": local,
        "num_results": 0,
        "num_needed": 0,
        "all_items": []
    }

    while not query_status["done"]:
        # Handle rate limiting status.
        limit_status = check_limit(limit_status)

        # Save this page of results.
        r, limit_status, query_status = save_page(
            url, size, header, query_status,
            saved_urls, current_notebooks
        )
        if r is None:
            # Request timed out; retry the first page.
            continue

        # If too many results, return. Handled in get_json.
        if (query_status["num_results"] > QUERY_CUTOFF and  # Too many results.
            size.split("..")[0] != size.split("..")[1]      # Can decrease query range (min!=max).
        ):
            query_status["done"] = True
            return query_status

        # Handle rate limiting status.
        if limit_status["limited"] and limit_status["wait_time"] != 0:
            continue
            
        # Move to the next page of results.
        if "next" in r.links:
            next_url = r.links["next"]["url"]
            query_status["another_page"] = True
            
            while (
                query_status["another_page"] and 
                len(query_status["all_items"]) != query_status["num_results"]
            ):
                query_status["page"] += 1
                debug_print("{0} to find, {1} found, {2} unique".format(
                    query_status["num_results"], 
                    len(query_status["all_items"]), 
                    len(set(query_status["all_items"]))
                ))
            
                # Handle rate limiting status.
                limit_status = check_limit(limit_status)

                # Save this page of results.
                r, limit_status, query_status = save_page(
                    next_url, size, header, query_status, 
                    saved_urls, current_notebooks
                )

                # The page was not retrieved (timeout or rate limit): undo
                # the page increment so the retry targets the same page.
                # Without this, a timeout would skip page numbers and hit
                # the page cap early.
                if r is None or (
                    limit_status["limited"] and limit_status["wait_time"] != 0
                ):
                    query_status["page"] -= 1
                    continue

                if "next" in r.links:
                    # Move on to next page of results.
                    next_url = r.links["next"]["url"]
                else:
                    # Completed last page of results.
                    query_status["another_page"] = False

        query_status["done"] = True

        # Report if too many results within a single size (e.g. 1200..1200).
        if (
            query_status["num_results"] > QUERY_CUTOFF and 
            size.split("..")[0] == size.split("..")[1]
        ):
            msg = "TOO MANY RESULTS: {0} bytes, {1} results".format(
                size.split("..")[0],
                query_status["num_results"]
            )
            write_to_log("../logs/nb_metadata_query_log.txt", msg)
            debug_print(msg)
    
    return query_status
Пример #8
0
def _split_nb_frame(frame, parts):
    """ Slice a DataFrame into `parts` consecutive row ranges covering it. """
    total = len(frame)
    return [
        frame.iloc[i * total // parts:(i + 1) * total // parts]
        for i in range(parts)
    ]


def _write_nb_chunk(frame, stem, chunk_id, local, part=None):
    """
    Write one frame to `<root>/<stem>_<EXTENSION>_<chunk_id>[_<part>].csv`,
    where root is PATH (local CSV) or S3_PATH (uploaded with df_to_s3).
    """
    suffix = "" if part is None else "_{0}".format(part)
    root = PATH if local else S3_PATH
    target = "{0}/{1}_{2}_{3}{4}.csv".format(
        root, stem, EXTENSION, chunk_id, suffix)
    if local:
        frame.to_csv(target, index=False)
    else:
        df_to_s3(frame, target)


def _save_nb_cell_chunks(new_nb_info, all_cells_info, chunk_id, local):
    """ Persist the accumulated notebook and cell dictionaries as CSVs. """
    # Transform data to DataFrames (one row per notebook / per cell).
    notebooks_temp = pd.DataFrame(new_nb_info).transpose()
    cells_temp = pd.DataFrame(
        all_cells_info).transpose().reset_index(drop=True)

    try:
        _write_nb_chunk(notebooks_temp, "notebooks2", chunk_id, local)
        _write_nb_chunk(cells_temp, "cells1", chunk_id, local)
    except MemoryError:
        # Too large to write in one go: split notebooks into 4 pieces and
        # cells into 8 pieces, and save each piece under a numbered suffix.
        for part, piece in enumerate(_split_nb_frame(notebooks_temp, 4), 1):
            _write_nb_chunk(piece, "notebooks2", chunk_id, local, part)
        for part, piece in enumerate(_split_nb_frame(cells_temp, 8), 1):
            _write_nb_chunk(piece, "cells1", chunk_id, local, part)


def _fill_nb_metadata(data, nb_info):
    """ Copy format, kernel and language metadata from `data` into nb_info. """
    # Get nb top level format metadata.
    if "nbformat" in data:
        nb_info["nbformat"] = data["nbformat"]
    if "nbformat_minor" in data:
        nb_info["nbformat_minor"] = data["nbformat_minor"]

    # Everything else lives in the metadata dictionary.
    metadata = data.get("metadata")
    if not isinstance(metadata, dict):
        return
    is_colab = "colab" in metadata

    # Access kernel data.
    kernelspec = metadata.get("kernelspec")
    if isinstance(kernelspec, dict):
        if is_colab:
            # If Google colab notebook, only Python 2.7 or 3.6 are possible.
            nb_info["google_collab"] = True
            if "name" in kernelspec and "display_name" in kernelspec:
                nb_info["kernel_lang"] = kernelspec["name"]
                nb_info["kernel_name"] = kernelspec["display_name"]
                if nb_info["kernel_lang"] == "python3":
                    nb_info["lang_name"] = "python"
                    nb_info["lang_version"] = "3.6"
                elif nb_info["kernel_lang"] == "python2":
                    nb_info["lang_name"] = "python"
                    nb_info["lang_version"] = "2.7"
        else:
            # Not Google colab, access kernel language and display name.
            if "language" in kernelspec:
                nb_info["kernel_lang"] = kernelspec["language"]
            if "display_name" in kernelspec:
                nb_info["kernel_name"] = kernelspec["display_name"]

    # Access language info.
    if "language_info" in metadata and not is_colab:
        language_info = metadata["language_info"]
        # A null / non-object language_info is skipped instead of crashing
        # the whole batch on `.keys()`.
        if isinstance(language_info, dict):
            if "name" in language_info:
                nb_info["lang_name"] = language_info["name"]
            if "version" in language_info:
                nb_info["lang_version"] = language_info["version"]
    elif "language" in metadata:
        nb_info["lang_name"] = metadata["language"]


def _extract_nb_cells(data, file_name, nb_info):
    """
    Build {(file_name, cell_id): cell_info} for every cell in `data`.
    Also fills nb_info["num_cells"] (only for the top-level "cells" layout)
    and nb_info["lang_name"] when it is still empty.
    """
    if "cells" in data:
        nb_info["num_cells"] = len(data["cells"])
        cells = data["cells"]
    elif "worksheets" in data:
        # Cells are nested one level down, inside each worksheet; cell ids
        # keep counting across worksheets.
        cells = (cell for w in data["worksheets"] for cell in w["cells"])
    else:
        return {}

    cells_info = {}
    for cell_id, cell in enumerate(cells):
        cell_info, nb_language = get_single_cell(
            cell_id, file_name, cell, nb_info["lang_name"])

        # Fall back to the language reported for the cell when the notebook
        # metadata did not provide one.
        if nb_info["lang_name"] == "":
            nb_info["lang_name"] = nb_language.lower()

        cells_info.setdefault((file_name, cell_id), cell_info)
    return cells_info


def get_all_nb_cells(notebooks, local, done):
    """
    Get cell and notebook data for each notebook.

    Notebooks are always read from the "notebook-research" S3 bucket.
    Whenever the row index hits a multiple of COUNT_TRIGGER (or the value
    len(notebooks) - 1), everything accumulated so far is written out as
    CSVs and the accumulators are reset (in chunks to avoid MemoryError).

    Parameters:
        notebooks: DataFrame with a "file" column; its index values are
            used as the progress counter.
        local: if truthy, CSV chunks are written under PATH; otherwise
            they are uploaded under S3_PATH with df_to_s3.
        done: count added to len(notebooks) in the progress message only.

    Returns:
        (new_nb_info, all_cells_info): dictionaries holding only the
        notebooks and cells gathered since the last chunk was saved.
    """
    new_nb_info = {}
    all_cells_info = {}
    missing = []

    for count, row in notebooks.iterrows():
        # Track progress.
        file_name = row["file"]
        if count % COUNT_TRIGGER == 0 or count == len(notebooks) - 1:
            print("{0} / {1} notebooks processed for cell data".format(
                count,
                len(notebooks) + done))

            # Save data and reset. (In chunks to avoid MemoryError).
            if count > 0:
                # NOTE: true division is kept on purpose so chunk file names
                # stay exactly as before (the id may render as a float).
                _save_nb_cell_chunks(new_nb_info, all_cells_info,
                                     count / COUNT_TRIGGER, local)

                # Empty current dictionaries.
                new_nb_info = {}
                all_cells_info = {}
                print("CSVs saved")

        # Initialize row of data.
        nb_info = {
            "file": file_name,
            "google_collab": False,
            "nbformat": "",
            "nbformat_minor": "",
            "num_cells": 0,
            "kernel_lang": "",
            "kernel_name": "",
            "lang_name": "",
            "lang_version": ""
        }

        # Open notebooks as json.
        try:
            obj = s3.Object("notebook-research",
                            "notebooks/{0}".format(file_name))
            data = json.loads(obj.get()["Body"].read().decode("UTF-8"))
        except Exception:
            # Report missed files.
            msg = "Notebook {0} did not open.".format(file_name)
            write_to_log("../logs/repo_metadata_cleaning_log.txt", msg)
            missing.append(file_name)

            # Add row with missing values.
            new_nb_info.setdefault(file_name, nb_info)
            continue

        # Start from an empty result for every notebook so that JSON which
        # is not a non-empty object (list, null, {}) contributes no cells,
        # instead of raising NameError or re-adding the previous notebook's.
        cells_info = {}

        # If data was able to load as JSON, extract information.
        if data and isinstance(data, dict):
            _fill_nb_metadata(data, nb_info)
            cells_info = _extract_nb_cells(data, file_name, nb_info)

        all_cells_info.update(cells_info)
        new_nb_info.setdefault(file_name, nb_info)

    debug_print("{0} notebooks are missing cell data.".format(len(missing)))
    return new_nb_info, all_cells_info
Пример #9
0
def update_owners_repos(owners, repos, local):
    """
    Add information on Owners and Repos.

    For every id in repos.repo_id, the file "repos/repo_<id>.json" is read
    from the "notebook-research" S3 bucket. Non-forked repos contribute one
    row of repo details and one row of owner details; these rows are then
    merged into `repos` (on "repo_id") and `owners` (on "owner_id").
    Files holding a "Not Found" / "Bad credentials" message are copied to
    "repos_bad/" and deleted from "repos/".

    Parameters:
        owners: DataFrame merged on "owner_id".
        repos: DataFrame with a repo_id column, merged on "repo_id".
        local: not used by this function.

    Returns:
        (updated_owners, updated_repos): the merged DataFrames, or two
        empty lists when no new owner or no new repo rows were collected.
    """
    log_file = "../logs/repo_metadata_cleaning_log.txt"
    bucket = "notebook-research"
    # Fields copied verbatim from the repo JSON into the repo row.
    copied_fields = (
        "language", "forks_count", "stargazers_count", "watchers_count",
        "subscribers_count", "size", "open_issues_count", "has_issues",
        "has_wiki", "has_pages", "has_downloads", "pushed_at",
        "created_at", "updated_at",
    )

    repo_rows = {}
    owner_rows = {}
    all_ids = list(repos.repo_id)
    n_missing = 0
    n_forked = 0
    n_moved = 0

    for position, repo_id in enumerate(all_ids):
        # Keep track of progress.
        if position % COUNT_TRIGGER == 0:
            debug_print("{0} / {1} repo data files processed".format(
                position, len(all_ids)))

        repo_key = "repos/repo_{0}.json".format(repo_id)
        try:
            body = s3.Object(bucket, repo_key).get()["Body"]
            repo_json = json.loads(body.read().decode("UTF-8"))
        except Exception:
            # Report missed files.
            n_missing += 1
            write_to_log(
                log_file,
                "Repo {0} metadata did not process.".format(repo_id))
            continue

        # A JSON null counts as a missing file.
        if repo_json is None:
            n_missing += 1
            continue

        # The download stored an API error message instead of repo data.
        if ("message" in repo_json
                and repo_json["message"] in ("Not Found", "Bad credentials")):
            n_missing += 1
            msg = "Repo {0} metadata file did not download well.".format(
                repo_id)

            # Move bad file: copy it under repos_bad/, then delete the
            # original.
            bad_key = "repos_bad/repo_{0}.json".format(repo_id)
            s3.Object(bucket, bad_key).copy_from(
                CopySource="notebook-research/repos/repo_{0}.json".format(
                    repo_id))
            s3.Object(bucket, repo_key).delete()
            n_moved += 1

            write_to_log(log_file, msg)
            continue

        if "owner" not in repo_json:
            # Report incomplete files.
            write_to_log(
                log_file,
                "Repo {0} metadata file not complete.".format(repo_id))
            continue
        owner_id = repo_json["owner"]["id"]

        # Forks are only counted, never recorded.
        if repo_json["fork"]:
            n_forked += 1
            continue

        # Add repo info (first occurrence of a repo_id wins).
        repo_info = {"repo_id": repo_id}
        for field in copied_fields:
            repo_info[field] = repo_json[field]
        repo_rows.setdefault(repo_id, repo_info)

        # Add owner info (first occurrence of an owner_id wins).
        owner_info = {
            "owner_id": owner_id,
            "type": repo_json["owner"]["type"],
        }
        owner_rows.setdefault(owner_id, owner_info)

    # Display status.
    debug_print("We have {0} new repos.".format(len(repo_rows)))
    debug_print("Couldn't process {0} files.".format(n_missing))
    debug_print("{0} new repos were forked.".format(n_forked))
    debug_print("{0} files had to be moved".format(n_moved))

    # Nothing new on either side: hand back empty lists.
    if not (owner_rows and repo_rows):
        return [], []

    # Translate dictionaries to DataFrames (one row per id) and merge.
    owner_frame = pd.DataFrame(owner_rows).transpose().reset_index(drop=True)
    repo_frame = pd.DataFrame(repo_rows).transpose().reset_index(drop=True)
    updated_owners = owners.merge(owner_frame, on="owner_id")
    updated_repos = repos.merge(repo_frame, on="repo_id")

    return updated_owners, updated_repos