import logging
import os
import time

import pandas as pd
from halo import Halo

# apply_nomatin, combine_features, navigate, remove_duplicates_and_na and
# remove_outliers are project-local helpers, assumed to be imported elsewhere
# in this package.


def add_features(input_file, output_file, force):
    """
    Runs the feature-building scripts to turn processed data from
    (../processed) into improved data (saved in ../processed as well).

    Parameters
    ----------
    input_file: str
        Input file to be processed
    output_file: str
        Output processed file
    force: bool
        Force processing of the input file even if the output already exists
    """
    spinner = Halo(text='Building features...', spinner='dots')
    clean_data = pd.read_csv(input_file)

    # Add lat/lon columns
    if force or not os.path.exists(output_file):
        spinner.start("Adding Latitude and Longitude columns")
        transformed_data = apply_nomatin(clean_data)
        transformed_data.to_csv(output_file, index=False)
        spinner.succeed("Latitude and Longitude features added!")
    else:
        spinner.start("Loading transformed file...")
        time.sleep(2)
        transformed_data = pd.read_csv(output_file)
        spinner.stop_and_persist(text="Transformed file already exists!")

    # Combine features
    transformed_data = combine_features(transformed_data)
    transformed_data.to_csv(output_file, index=False)
    return transformed_data
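
# The apply_nomatin helper above is project-local; its name suggests it
# geocodes each row against OpenStreetMap's Nominatim service. A minimal
# sketch of such a helper, assuming an 'address' column and using geopy --
# both the column name and the library are assumptions, not the project's
# actual implementation. Imports are kept inside the function so the module
# stays importable without geopy installed.
def _geocode_sketch(df):
    from geopy.extra.rate_limiter import RateLimiter
    from geopy.geocoders import Nominatim

    geolocator = Nominatim(user_agent="data-pipeline-sketch")
    # Nominatim's usage policy allows at most one request per second.
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
    locations = df['address'].apply(geocode)
    # geocode() returns None when an address cannot be resolved.
    df['latitude'] = locations.apply(lambda loc: loc.latitude if loc else None)
    df['longitude'] = locations.apply(lambda loc: loc.longitude if loc else None)
    return df
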
def process_dataset(input_file, output_file, scrape):
    """
    Runs data processing scripts to turn raw data from (../raw) into
    cleaned data ready to be analyzed (saved in ../processed).

    Parameters
    ----------
    input_file: str
        Input file to be processed
    output_file: str
        Output processed file
    scrape: bool
        Force the scraping process even if the raw file already exists
    """
    spinner = Halo(text='Making dataset...', spinner='dots')
    logger = logging.getLogger(__name__)
    logger.info('Making final dataset from raw data')

    # Scrape data
    if scrape or not os.path.exists(input_file):
        spinner.start("Scraping data")
        with open('./references/urls.txt', 'r') as f:
            urls = f.readlines()
        scraped_dfs = [navigate(url, 1, 500) for url in urls]

        # Save results
        raw_data = pd.concat(scraped_dfs)
        raw_data.to_csv(input_file, index=False)
        spinner.succeed("Data scraped!")
    else:
        spinner.start("Loading scraped file...")
        raw_data = pd.read_csv(input_file)
        spinner.succeed("Scraped file already exists!")

    # Remove duplicates
    spinner.start("Removing duplicates and invalid values...")
    time.sleep(1)
    interim_data = remove_duplicates_and_na(raw_data)
    interim_data.to_csv(output_file.replace("processed", "interim"), index=False)
    spinner.succeed("Done removing duplicates!")

    # Remove outliers
    spinner.start("Removing outliers and inconsistent values...")
    time.sleep(1)
    final_data = remove_outliers(interim_data)
    final_data.to_csv(output_file, index=False)
    spinner.succeed("Done removing outliers!")

    spinner.stop_and_persist(symbol='✔', text="Cleaning process done!")
    return final_data
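

# A minimal end-to-end usage sketch: clean the raw data first, then build
# features on the processed output. The file paths here are hypothetical;
# the project's actual entry point (e.g. a CLI wrapper) may wire these
# arguments differently.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    process_dataset(
        input_file='data/raw/raw_data.csv',
        output_file='data/processed/clean_data.csv',
        scrape=False,
    )
    add_features(
        input_file='data/processed/clean_data.csv',
        output_file='data/processed/featured_data.csv',
        force=False,
    )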