示例#1
0
def load_basics():
    """Read the ``title.basics.tsv.gz`` dataset.

    The file is located through ``directories.data`` and parsed as a
    gzip-compressed, tab-separated table with ``low_memory=False``.

    Returns:
        Whatever ``pd.read_csv`` returns for that file (presumably a pandas
        DataFrame -- confirm that ``pd`` is pandas in this module).
    """
    basics_path = directories.data("title.basics.tsv.gz")
    table = pd.read_csv(
        basics_path,
        sep="\t",
        compression="gzip",
        low_memory=False,
    )
    return table
def run():
    """Fetch the four IMDb dataset files, then build and pickle the graph.

    Each dataset is passed to ``retrieve_imdb_data`` in a fixed order. The
    object returned by ``make_professional_graph`` is then serialized with
    ``pickle`` to ``directories.data("professional.pkl")``.
    """
    print("Downloading data")
    dataset_files = (
        "name.basics.tsv.gz",
        "title.basics.tsv.gz",
        "title.ratings.tsv.gz",
        "title.principals.tsv.gz",
    )
    for dataset_file in dataset_files:
        retrieve_imdb_data(dataset_file)

    print("Creating graph")
    graph = make_professional_graph()
    output_path = directories.data("professional.pkl")
    with open(output_path, "wb") as sink:
        pickle.dump(graph, sink)
def retrieve_imdb_data(filename):
    """Download ``filename`` from datasets.imdbws.com unless it already exists.

    The destination is ``directories.data(filename)``. If that path exists,
    a notice is printed and nothing is downloaded. Otherwise the response
    body is streamed in 1024-byte blocks into a ``.part`` file next to the
    destination while a tqdm progress bar is updated, and the ``.part`` file
    is moved onto the destination only after the whole body was written.

    Args:
        filename: Name of the dataset file; appended to
            ``https://datasets.imdbws.com/`` to form the download URL.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    path = directories.data(filename)
    print(path)
    if os.path.exists(path):
        print("{} already exists".format(path))
        return
    url = "https://datasets.imdbws.com/{}".format(filename)
    print("Down loading {}".format(url))
    # Download into a side file so an interrupted transfer never leaves a
    # truncated file at ``path`` that the exists-check above would accept.
    # A stale ``.part`` file is simply overwritten by the next attempt.
    partial_path = "{}.part".format(path)
    block_size = 1024
    with requests.get(url, stream=True) as response:
        # Fail loudly instead of saving an HTTP error page as the dataset.
        response.raise_for_status()
        total_size_in_bytes = int(response.headers.get("content-length", 0))
        with tqdm(
            total=total_size_in_bytes, unit="iB", unit_scale=True
        ) as progress_bar, open(partial_path, "wb") as f:
            for data in response.iter_content(block_size):
                progress_bar.update(len(data))
                f.write(data)
    # Publish the finished download in a single rename step.
    os.replace(partial_path, path)
示例#4
0
def load_principals():
    """Read the ``title.principals.tsv.gz`` dataset.

    The file is located through ``directories.data`` and parsed as a
    gzip-compressed, tab-separated table.

    Returns:
        Whatever ``pd.read_csv`` returns for that file (presumably a pandas
        DataFrame -- confirm that ``pd`` is pandas in this module).
    """
    principals_path = directories.data("title.principals.tsv.gz")
    table = pd.read_csv(principals_path, sep="\t", compression="gzip")
    return table
示例#5
0
def load_ratings():
    """Read the ``title.ratings.tsv.gz`` dataset.

    The file is located through ``directories.data`` and parsed as a
    gzip-compressed, tab-separated table.

    Returns:
        Whatever ``pd.read_csv`` returns for that file (presumably a pandas
        DataFrame -- confirm that ``pd`` is pandas in this module).
    """
    ratings_path = directories.data("title.ratings.tsv.gz")
    table = pd.read_csv(ratings_path, sep="\t", compression="gzip")
    return table
示例#6
0
def load_names():
    """Read the ``name.basics.tsv.gz`` dataset.

    The file is located through ``directories.data`` and parsed as a
    gzip-compressed, tab-separated table.

    Returns:
        Whatever ``pd.read_csv`` returns for that file (presumably a pandas
        DataFrame -- confirm that ``pd`` is pandas in this module).
    """
    names_path = directories.data("name.basics.tsv.gz")
    table = pd.read_csv(names_path, sep="\t", compression="gzip")
    return table