def get_raw_data(data_params):
    """
    Collate all scraping logic: crawl every configured state and persist its
    raw route data to a per-state JSON file.

    :param: data_params A dictionary containing all data parameters. The only
        ones used are the location at which to save raw data, and the states
        to scrape
    """
    for state, url in data_params["states"]:
        # collect routes for the current state only; a fresh list per state
        # replaces the shared-list-then-reset pattern
        state_routes = []
        for route_url in tqdm(find_all_routes_in_area(url)):
            route = get_route_data(route_url)
            if route:
                state_routes.append(route)

        # persist this state's raw data before moving on to the next one
        out_path = make_absolute(data_params["raw_data_folder"] + state + ".json")
        with open(out_path, "w") as f:
            json.dump(state_routes, f)
def run_data(config):
    """
    Download and save raw and processed data to the folders "data/raw/" and
    "data/processed/" by default. These save locations can be changed in the
    config file.

    :param: config The config file. Default settings can be found in
        "config/default.json"
    """
    # the folders in which to save data
    data_path = make_absolute(config["data_path"])
    raw_path = os.path.join(data_path, config["raw_folder"])
    adj_close_path = os.path.join(raw_path, config["adj_close_folder"])
    iv_path = os.path.join(raw_path, config["iv_folder"])
    processed_path = os.path.join(data_path, config["processed_folder"])

    # delete then recreate the data folders
    # this is to completely overwrite all data if it exists
    shutil.rmtree(data_path, ignore_errors=True)
    # makedirs creates data_path and raw_path implicitly while building the
    # leaf folders, and does not fail when an intermediate already exists —
    # the original chain of os.mkdir calls broke if any parent was missing
    os.makedirs(adj_close_path)
    os.makedirs(iv_path)
    os.makedirs(processed_path)

    # download raw data
    download_data(config)

    # process the data
    process_data(config)
def download_data(config):
    """
    Download the raw data needed for every ticker.

    Tickers not present in Quandl's EOD database are logged and removed from
    config["tickers"] so later stages skip them.

    :param: config The config file
    """
    # the folders in which to save data
    data_path = make_absolute(config["data_path"])
    raw_path = os.path.join(data_path, config["raw_folder"])
    adj_close_path = os.path.join(raw_path, config["adj_close_folder"])
    iv_path = os.path.join(raw_path, config["iv_folder"])

    # initialize quandl with the api key
    quandl.ApiConfig.api_key = os.environ["QUANDL_API_KEY"]

    print("Downloading data...")

    # store iv metadata here
    iv_metadata = []

    # iterate over a snapshot of the ticker list: unknown tickers are removed
    # from config["tickers"] inside the loop, and removing from the list being
    # iterated would silently skip the ticker that follows each removed one
    for ticker in tqdm(list(config["tickers"])):
        # this try/except should catch tickers that do not exist in Quandl's
        # EOD database
        try:
            # for each ticker download and save adj_close data
            data = get_ticker_adj_close(ticker)
            data.to_csv(os.path.join(adj_close_path, ticker + ".csv"), index=False)

            # for each ticker get iv data/metadata
            # save data and store metadata
            data, metadata = get_ticker_iv(ticker)
            data.to_csv(os.path.join(iv_path, ticker + ".csv"), index=False)
            iv_metadata.append(metadata)
        except quandl.errors.quandl_error.NotFoundError:
            error_str = f"Ticker {ticker} does not exist in Quandl's EOD database. It " + \
                "will be removed for the rest of the current run."

            # log the error
            if (config["log"]):
                log(error_str)

            # print out an error statement
            print()
            print(error_str)

            # remove the ticker from the config file
            config["tickers"].remove(ticker)

    # save metadata
    iv_metadata = pd.DataFrame(iv_metadata, columns=[
        "ticker", "next_earnings_day", "trading_days", "calendar_days",
        "crush_rate"
    ])
    # join with os.path.join — the original `iv_path + "metadata.csv"` had no
    # path separator, so process_data (which reads
    # os.path.join(iv_path, "metadata.csv")) could never find the file
    iv_metadata.to_csv(os.path.join(iv_path, "metadata.csv"), index=False)

    print("Done\n")
def get_clean_data(data_params):
    """
    Collate all cleaning logic: load each state's raw JSON, flatten every
    climb into a row, and write the rows out as a per-state CSV.

    :param: data_params A dictionary containing all data parameters. The only
        ones used are the location at which to download raw data and the
        location at which to save clean data
    """
    # iterate over every state
    # it is assumed that data is saved and named according to get_raw_data.py
    # NOTE(review): this reads data_params["state"] while get_raw_data reads
    # data_params["states"] (as (state, url) pairs) — confirm both keys exist
    # in the params dict before changing either
    for state in data_params["state"]:
        # get the path at which raw data will be found
        raw_data_path = make_absolute(data_params["raw_data_folder"] + state + ".json")

        # get the data
        with open(raw_data_path, "r") as f:
            raw_data = json.load(f)

        # store all clean data as a list of lists
        # note that the first row is the column names
        climb_data = [[
            "climb_id", "name", "description", "image_url", "latitude",
            "longitude", "avg_rating", "num_ratings", "url", "climb_type",
            "height_ft", "height_m", "pitches", "grade", "protection",
            "difficulty", "rock_climb", "boulder_climb"
        ]]

        # process the data: flatten each raw climb dict into a csv row
        for climb in raw_data:
            climb_data.append(split_into_user_climb(climb))

        # save the list of lists as csv data in the proper location
        clean_data_path = str(make_absolute(data_params["clean_data_folder"])) + "/"
        with open(clean_data_path + state + "_climbs.csv", "w",
                  encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            writer.writerows(climb_data)
def run_sheet(config):
    """
    Collate the logic needed to put processed data into the sheet.

    :param: config The config file
    :return: str The name of the wb saved
    """
    # resolve where processed data lives and where the workbook will be saved
    base_dir = make_absolute(config["data_path"])
    processed_dir = os.path.join(base_dir, config["processed_folder"])
    workbook_path = make_absolute(config["save_location"])

    # create the workbook with one page per ticker
    workbook = get_workbook(config["tickers"])

    # short term metadata shared across all ticker sheets
    meta = pd.read_csv(os.path.join(processed_dir, "metadata.csv"))

    print("Adding short term data...")
    for ticker in tqdm(config["tickers"]):
        # load this ticker's processed short term data and write its sheet,
        # passing only the metadata rows that belong to this ticker
        ticker_frame = pd.read_csv(os.path.join(processed_dir, ticker + ".csv"))
        add_short_term_sheet(ticker, workbook[ticker], ticker_frame,
                             meta[meta["ticker"] == ticker], config)
    print("Done\n")

    print("Adding long term data...")
    add_long_term_sheets(workbook, config)
    print("Done\n")

    # at the end save the sheet
    return save(workbook, workbook_path)
def process_data(config):
    """
    Change raw data into processed data: per-ticker short term stats combined
    with IV columns, a merged metadata file, and long term monthly
    percent-change / std / frequency tables.

    :param: config The config file
    """
    print("Processing Data...")

    # the folders in which to download/save data
    data_path = make_absolute(config["data_path"])
    raw_path = os.path.join(data_path, config["raw_folder"])
    adj_close_path = os.path.join(raw_path, config["adj_close_folder"])
    iv_path = os.path.join(raw_path, config["iv_folder"])
    processed_path = os.path.join(data_path, config["processed_folder"])

    # store short term data statistics here
    short_term_stats = pd.DataFrame(columns=[
        "ticker", "n", "mean", "20 Day STD", "40 Day STD", "60 Day STD"
    ])

    # iterate over all the raw data
    for ticker in config["tickers"]:
        # build the path once (the original computed it a second time in a
        # dead statement whose result was discarded)
        adj_close_csv = os.path.join(adj_close_path, ticker + ".csv")

        # open the adj_close csv file and select the first 60 rows
        # (assumed to be the most recent 60 trading days — TODO confirm the
        # download order in get_ticker_adj_close)
        data = pd.read_csv(adj_close_csv)[:60].reset_index(drop=True)

        # compute the various short term data stats
        n = 60
        mean = data["Adj_Close"].mean()
        std_20 = data["Adj_Close"][-20:].std()
        std_40 = data["Adj_Close"][-40:].std()
        std_60 = data["Adj_Close"].std()

        # add the short term stats to the df
        short_term_stats.loc[len(short_term_stats.index)] = [
            ticker, n, mean, std_20, std_40, std_60
        ]

        # open the iv csv file and add the columns to data
        iv_data = pd.read_csv(os.path.join(iv_path, ticker + ".csv"))
        data[["IV30 %", "IV30 Rank", "IV30 Rating"
              ]] = iv_data[["Iv30Percentile", "Iv30Rank", "Iv30Rating"]]

        # save the combined columns to a csv file
        data.to_csv(os.path.join(processed_path, ticker + ".csv"), index=False)

    # add short term data statistics to the metadata then save
    metadata = pd.read_csv(os.path.join(iv_path, "metadata.csv"))
    metadata = metadata.merge(short_term_stats, on="ticker", how="inner")
    metadata.to_csv(os.path.join(processed_path, "metadata.csv"), index=False)

    # then compute and save the long term data
    # create the structures to hold the data
    percent_change = []
    std = []
    freq = []

    # get the percent change for each ticker
    for ticker in config["tickers"]:
        data = pd.read_csv(os.path.join(adj_close_path, ticker + ".csv"),
                           parse_dates=["Date"])

        # get the percentage change for every month in the last 10 years
        monthly = get_monthly_for_stock(data)

        # add the various values to the proper lists
        percent_change.append([ticker] + monthly[0])
        std.append([ticker] + monthly[1])
        freq.append([ticker] + monthly[2])

    # change the lists of lists to dfs
    columns = [
        "Ticker", "Jan (1)", "Feb (2)", "Mar (3)", "Apr (4)", "May (5)",
        "Jun (6)", "Jul (7)", "Aug (8)", "Sep (9)", "Oct (10)", "Nov (11)",
        "Dec (12)"
    ]
    percent_change = pd.DataFrame(percent_change, columns=columns)
    std = pd.DataFrame(std, columns=columns)
    freq = pd.DataFrame(freq, columns=columns)

    # save the long term data; processed_path is already absolute (derived
    # from make_absolute(data_path)), so no extra make_absolute is needed
    percent_change.to_csv(os.path.join(processed_path, "perc.csv"), index=False)
    std.to_csv(os.path.join(processed_path, "std.csv"), index=False)
    freq.to_csv(os.path.join(processed_path, "freq.csv"), index=False)

    print("Done\n")
def add_long_term_sheet(wb, config, file_name, sheet_name, sheet_location,
                        avg_function):
    """
    Add a specific long term sheet using the data from the file_name parameter.

    :param: wb The workbook to add the sheet to
    :param: config The config file. By default "config/default.json"
    :param: file_name The name (no extension) of the file where data for this
        sheet can be found. The file_name should also reflect the default
        settings in "config/default.json" since the coloring looks for the key
        "{file_name}_low_high" in the config file for how to color
    :param: sheet_name The name of the sheet to insert
    :param: sheet_location Where in the workbook to put the sheet
    :param: avg_function The function used to generate the "4 Month" column.
        This function should take in a series with four elements and return a
        number (usually float)
    """
    # get the current year and month
    current_year = datetime.date.today().year
    current_month = datetime.date.today().month

    # create the sheet at the correct location
    sheet = wb.create_sheet(sheet_name, sheet_location)

    # get the data to fill into the sheet
    data_path = os.path.join(make_absolute(config["data_path"]),
                             config["processed_folder"], file_name + ".csv")
    data = pd.read_csv(data_path)

    # get the correct order of months (presumably rotated so the current month
    # is last, with an "Empty" spacer column at the year boundary — behavior
    # of get_month_order, confirm there)
    month_order = get_month_order(current_month)

    # pick the four month-columns used for the "4 Month" average
    four_month = None
    if (current_month >= 10):
        # if the month is Oct, Nov, or Dec, the "Empty" spacer lands inside
        # the four-month window; take five slots and drop the spacer so the
        # averaging function sees four real months
        four_month = month_order.copy()[1:6]
        four_month.remove("Empty")
    else:
        four_month = month_order[1:5]

    # compute the four month average based on the specific average function
    data["4 Month"] = data[four_month].apply(avg_function, axis=1)

    # add the empty column
    data["Empty"] = ""

    # reorder the columns
    data = data[month_order]

    # add all the data to the sheet
    for row in dataframe_to_rows(data, index=False, header=True):
        sheet.append(row)

    # remove the "Empty" column header:
    # get the letter column it appears in and set its value to ""
    column_letter = integer_to_letter(month_order.index("Empty") + 1)
    sheet[column_letter + "1"] = ""

    # format all column headers:
    # don't touch ticker (column A), set all other headers to center alignment
    for letter_int in range(2, 16):
        sheet[integer_to_letter(letter_int) + "1"].alignment = Alignment(
            horizontal="center")

    # move all cells down one to make room for the year super-header row
    sheet.move_range("A1:O" + str(1 + len(data.index)), rows=1, cols=0)

    # merge super header cells and add the years
    # if the current month is January, then all data comes from the previous
    # year
    if (current_month == 1):
        sheet.merge_cells("B1:N1")
        sheet["B1"] = current_year - 1
    # otherwise the "Empty" spacer splits last year's months from this year's
    else:
        # merge all cells up to but not including the empty column
        sheet.merge_cells("B1:" + integer_to_letter(month_order.index("Empty"))
                          + "1")
        sheet["B1"] = current_year - 1
        sheet["B1"].alignment = Alignment(horizontal="center")

        # merge all cells after the empty column
        start_cell = integer_to_letter(month_order.index("Empty") + 2) + "1"
        end_cell = integer_to_letter((month_order.index("Empty") + 2)
                                     + (current_month - 2)) + "1"
        sheet.merge_cells(start_cell + ":" + end_cell)
        sheet[start_cell] = current_year
        sheet[start_cell].alignment = Alignment(horizontal="center")

    # color the sheet
    # NOTE(review): the source formatting was lost; this coloring pass is
    # placed after the if/else so January sheets are colored too — confirm
    # against the original layout
    # get the starting cell (top left) and ending cell (bottom right)
    start_cell = "B3"
    end_cell = "O" + str(len(data.index) + 2)

    # get the low/high color thresholds and the color gradient
    low_high = config[file_name + "_low_high"]
    color_gradient = config["color_gradient"]

    # iterate over every cell
    for row_of_cells in sheet[start_cell + ":" + end_cell]:
        for cell in row_of_cells:
            # color the cell based on its value, ignoring empty cells
            if (cell.value != ""):
                cell.fill = get_color(low_high[0], low_high[1],
                                      float(cell.value), color_gradient)