# Question 6b | 1
# Total | 19

# ---
#
# Part 1: Importing the Data
#
# We will again use the `fetch_and_cache` utility to download the dataset.

# In[3]:

# Download the dataset.
# NOTE(review): `json` and `zipfile` are used below but were never imported in
# this chunk — presumably an earlier notebook cell imported them; importing
# here makes the cell self-contained either way.
import json
import zipfile

from ds100_utils import fetch_and_cache

data_url = 'http://www.ds100.org/sp19/assets/datasets/hw3-realdonaldtrump_tweets.json.zip'
file_name = 'hw3-realdonaldtrump_tweets.json.zip'

dest_path = fetch_and_cache(data_url=data_url, file=file_name)
print(f'Located at {dest_path}')

# Now that we've downloaded the tweets, let's unzip them and load them into our
# notebook. Run the cell below to unzip and read tweets from the json file into
# a list named `all_tweets`.

# In[4]:

# Unzip the dataset: read the inner JSON member directly from the archive
# (no extraction to disk) and parse it into a list of tweet dicts.
my_zip = zipfile.ZipFile(dest_path, 'r')
with my_zip.open('hw3-realdonaldtrump_tweets.json', 'r') as f:
    all_tweets = json.load(f)

# Here is what a typical tweet from `all_tweets` looks like:

# In[5]:
# casual | count of casual users
# registered | count of registered users
# cnt | count of total rental bikes including casual and registered

# ### Download the Data

# In[3]:

# Run this cell to download the data. No further action is needed
# NOTE(review): `zipfile`, `Path`, and `ds100_utils` are referenced below but
# never imported in this chunk — presumably imported in an earlier cell;
# importing here makes the cell self-contained either way.
import zipfile
from pathlib import Path

import ds100_utils

data_url = 'https://github.com/DS-100/fa18/raw/gh-pages/assets/datasets/hw2-bikeshare.zip'
file_name = 'data.zip'
data_dir = '.'

# Download (or reuse the cached copy of) the zip next to this notebook.
dest_path = ds100_utils.fetch_and_cache(data_url=data_url, data_dir=data_dir, file=file_name)
print('Saved at {}'.format(dest_path))

# Extract every member of the archive into a local `data/` directory.
# Note `data_dir` is deliberately rebound from the download location ('.')
# to the extraction target (Path('data')).
zipped_data = zipfile.ZipFile(dest_path, 'r')
data_dir = Path('data')
zipped_data.extractall(data_dir)

# List what was extracted so the reader can confirm the contents.
print("Extracted Files:")
for f in data_dir.glob("*"):
    print("\t",f)

# ### Examining the file contents
#
# - data_dir: (default="data") the location to save the data
# - force: if true the file is always re-downloaded
#
# The way this function works is that it checks to see if `data_dir/file`
# already exists. If it does not exist already or if `force=True`, the file at
# `data_url` is downloaded and placed at `data_dir/file`. The process of
# storing a data file for reuse later is called caching. If `data_dir/file`
# already exists and `force=False`, nothing is downloaded, and instead a
# message is printed letting you know the date of the cached file.
#
# The function returns a `pathlib.Path` object representing the location of
# the file ([pathlib docs](https://docs.python.org/3/library/pathlib.html#basic-use)).

# In[399]:

import ds100_utils

source_data_url = 'http://www.ds100.org/sp19/assets/datasets/proj1-SFBusinesses.zip'
target_file_name = 'data.zip'

# Change the force=False -> force=True in case you need to force redownload the data
dest_path = ds100_utils.fetch_and_cache(
    data_url=source_data_url,
    data_dir='.',
    file=target_file_name,
    force=False,
)

# After running the cell above, if you list the contents of the directory
# containing this notebook, you should see `data.zip`.

# In[400]:

# IPython shell escape (`!ls`): list the notebook directory to confirm the
# download landed. Only runs inside an IPython/Jupyter kernel.
get_ipython().system('ls')

# ---
# ## 1: Loading Food Safety Data
#
# We have data, but we don't have any specific questions about the data yet,
# so let's focus on understanding the structure of the data. This involves
# answering questions such as:
#
# * Is the data in a standard format or encoding?
# * Is the data organized in records?