def clean_merge_and_save(data_filename='df1.feather',
                         weather_filename='weather_df.feather',
                         save_as=None,
                         raw_dir=utils.path_to('data', 'raw'),
                         interim_dir=utils.path_to('data', 'interim')):
    """Read and clean raw data, merge, and save the resulting DataFrame.

    Arguments:
        data_filename (str): name of the raw data feather file
        weather_filename (str): name of the raw weather feather file
        save_as (str, optional): filename for the merged feather file
        raw_dir (str): directory containing the raw feather files
        interim_dir (str): directory where the merged file is written
    """
    # read data
    df_data = pd.read_feather(os.path.join(raw_dir, data_filename))
    df_weather = pd.read_feather(os.path.join(raw_dir, weather_filename))

    # call clean functions
    df_data = data.clean_dataframe_p1(df_data)
    df_weather = data.clean_weather_df(df_weather)

    # merge data
    df_concat = data.merge_df1_and_weather(df_data, df_weather)

    # save data
    if save_as is not None:
        feather_filepath = os.path.join(interim_dir, save_as)
        print('Writing feather file to {}'.format(feather_filepath))
        df_concat.to_feather(feather_filepath)

    return df_concat
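# Usage sketch for clean_merge_and_save (not part of the original module): the
# filenames below reuse the defaults above, and 'clean.feather' is an assumed
# output name chosen only to illustrate the call.
df_clean = clean_merge_and_save(data_filename='df1.feather',
                                weather_filename='weather_df.feather',
                                save_as='clean.feather')
print(df_clean.shape)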
def unzipping_zip_files(file_name, unzip_dir=utils.path_to('data', 'external')):
    file_path = os.path.join(unzip_dir, file_name)
    with ZipFile(file_path, 'r') as zipObj:
        # Extract all the contents of the zip file into the target directory
        zipObj.extractall(unzip_dir)
    print('file unzipped')
def read(self):
    data = []
    with open(utils.path_to(self.filename), newline='') as f:
        reader = DictReader(f)
        for row in reader:
            # keep rows that contain at least one non-empty value
            if not all(v == '' for v in row.values()):
                data.append(row)
    return data
def test_various(self):
    result = unittest.TestResult()
    suite = QUnitSuite(path_to('success.html'))
    suite(result)
    self.assertEqual(result.skipped, [])
    self.assertEqual(result.errors, [])
    self.assertEqual(result.failures, [])
    self.assertEqual(result.testsRun, 6)
def test_polyfills(self):
    result = unittest.TestResult()
    suite = QUnitSuite(path_to('polyfill.html'))
    suite(result)
    self.assertEqual(result.skipped, [])
    self.assertEqual(result.errors, [])
    self.assertEqual(result.failures, [])
    self.assertEqual(result.testsRun, 1)
def execute_query_and_save_df(
        query_filename,
        feather_filename=None,
        *,
        query_dir=utils.path_to('src', 'data'),
        feather_dir=utils.path_to('data', 'raw')):
    """Read a SQL query from file, execute it, and return a pandas DataFrame,
    optionally saving the DataFrame at the given file path in feather format.

    Arguments:
        query_filename (str): name of the query file
        feather_filename (str, optional): name of the feather file to write

    Keyword Arguments:
        query_dir (str, optional): directory where the query is stored
        feather_dir (str, optional): directory where the feather file is written

    Returns:
        pd.DataFrame: pandas DataFrame with the query result
    """
    print('Opening database connection')
    db_connection = sql.connect(
        host='35.233.4.203',
        user='******',
        passwd='ier2rJZte8rt4fGHj2Sfi',
        database='s2ds'
    )

    query_filepath = os.path.join(query_dir, query_filename)
    print('Querying database with query in ' + query_filepath)
    query_string = utils.read_file_as_string(query_filepath)
    df = pd.read_sql(query_string, con=db_connection)

    print('Closing database connection')
    db_connection.close()

    if feather_filename is not None:
        feather_filepath = os.path.join(feather_dir, feather_filename)
        write_feather_file(df, feather_filepath)

    return df
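# Usage sketch for execute_query_and_save_df: 'jobs.sql' and 'jobs.feather' are
# illustrative names, not confirmed project files; only the function itself and
# its defaults come from this module.
df_jobs = execute_query_and_save_df('jobs.sql', feather_filename='jobs.feather')
print(df_jobs.head())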
def test_various(self):
    result = unittest.TestResult()
    suite = QUnitSuite(path_to('failure.html'))
    suite(result)
    self.assertEqual(result.skipped, [])
    self.assertEqual(result.errors, [])
    self.assertEqual(len(result.failures), 10)
    # used to check messages, but a source line may be added
    # depending on the exact phantomjs version, so skip that
    self.assertEqual(result.testsRun, 8)
def test_timeout(self):
    result = unittest.TestResult()
    # lower the timeout so this test does not inflate the suite runtime further
    suite = QUnitSuite(path_to('timeout.html'), timeout=500)
    suite(result)
    self.assertEqual(result.skipped, [])
    self.assertEqual(result.testsRun, 1)
    self.assertEqual(result.failures, [])
    self.assertEqual(len(result.errors), 1)
    test, message = result.errors[0]
    self.assertEqual(str(test), "phantomjs: startup")
    self.assertTrue(message.startswith("PhantomJS timed out"))
def save_model(classifier, save_as, model_dir=utils.path_to('models')):
    """Save a classifier to disk.

    Arguments:
        classifier (sklearn API classifier): classifier object
        save_as (string): filename
        model_dir (string): directory to save into
    """
    try:
        with open(os.path.join(model_dir, save_as), 'wb') as f:
            pickle.dump(obj=classifier, file=f)
        print('Saved: {}.'.format(type(classifier)))
    except Exception:
        print('Not successful! {}.'.format(type(classifier)))
def get_point_heatmap(df, location_lng, location_lat,
                      shape_file_name='london_shape.bin',
                      shape_file_path=utils.path_to('src', 'viz')):
    """Build a point GeoDataFrame from longitude/latitude values and load the
    London shape file to plot the points against.
    """
    # load shape file
    london_shp = _load_shape_file(shape_file_name, shape_file_path)

    # convert data points to a GeoDataFrame
    gdf = gpd.GeoDataFrame(
        df,
        crs={'init': 'epsg:4326'},
        geometry=[Point(xy) for xy in zip(location_lng, location_lat)])

    return gdf, london_shp
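# Plotting sketch for get_point_heatmap: assumes the shape file loads as a
# GeoDataFrame and that 'pickup_lng'/'pickup_lat' columns exist in df; both
# column names are illustrative, not taken from the original code.
gdf, london_shp = get_point_heatmap(df, df['pickup_lng'], df['pickup_lat'])
ax = london_shp.plot(color='white', edgecolor='grey')
gdf.plot(ax=ax, markersize=1)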
def test_ensure_directories(self):
    folder_name = '-temp-test-folder-should-be-removed'
    data_path = utils.path_to('data', folder_name, 'data.csv')

    # Make sure the path does not exist
    path = os.path.dirname(data_path)
    if os.path.exists(path):
        os.rmdir(path)

    utils.ensure_directories(data_path)
    self.assertTrue(os.path.exists(path))

    # Cleanup
    os.rmdir(path)
def get_postcode_heatmap(df, target_str, postcode='pickup_postcode_outer',
                         shape_file_name='london_shape.bin',
                         shape_file_path=utils.path_to('src', 'viz')):
    """Average the target column per postcode and merge it onto the London
    shape file, returning a GeoDataFrame ready for a choropleth heatmap.
    """
    # load shape file
    london_shp = _load_shape_file(shape_file_name, shape_file_path)

    # average data within each postcode
    avg_target = df.groupby(postcode)[target_str].mean().round(1)

    # merge on postcode index
    heatmap_gdf = london_shp.merge(avg_target, left_index=True,
                                   right_index=True, how='left')

    return heatmap_gdf
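# Usage sketch for get_postcode_heatmap: assumes the averaged column keeps the
# target name after the merge; the target column 'price' is an illustrative
# assumption, not a confirmed column in the project data.
heatmap_gdf = get_postcode_heatmap(df, 'price')
ax = heatmap_gdf.plot(column='price', legend=True)
ax.set_axis_off()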
def load_model(filename, model_dir=utils.path_to('models')):
    """Load a classifier from disk.

    Arguments:
        filename (string): filename to load
        model_dir (string): directory to load from

    Returns:
        classifier (sklearn API classifier): loaded classifier, or None on failure
    """
    try:
        with open(os.path.join(model_dir, filename), 'rb') as f:
            classifier = pickle.load(f)
        print('Loaded: {}.'.format(type(classifier)))
        return classifier
    except Exception:
        print('Not successful!')
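# Round-trip sketch combining save_model and load_model above; the
# RandomForestClassifier and the filename 'rf_model.pkl' are illustrative
# assumptions, not part of the original code.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10)
save_model(clf, save_as='rf_model.pkl')
clf_loaded = load_model('rf_model.pkl')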
    print(str_series[del_pattern].value_counts())
    postcode_df = postcode_df[~del_pattern]
    str_series = str_series[~del_pattern]

    # Delete districts outside London
    del_pattern = ~postcode_df.is_london
    if del_pattern.sum() > 0:
        print(
            'Deleting {} districts outside London, listing districts with freq >= 100'
            .format(del_pattern.sum()))
        print_series = postcode_df.district[del_pattern].value_counts()
        print(print_series[print_series >= 100])
    postcode_df = postcode_df[~del_pattern]
    postcode_df.drop(columns='is_london', inplace=True)
    str_series = str_series[~del_pattern]

    # Delete unrecognized London outcodes
    del_pattern = postcode_df.outcode.isna()
    if del_pattern.sum() > 0:
        print('Deleting {} unrecognized London outcodes'.format(
            del_pattern.sum()))
        print(str_series[del_pattern].value_counts().head(20))
    postcode_df = postcode_df[~del_pattern]

    return postcode_df


if __name__ == "__main__":
    df = pd.read_feather(utils.path_to('data', 'raw', 'jobs.feather'))
    clean_dataframe_p1(df.head(100))
def downloading_scripts(file_name, url, download_dir=utils.path_to('data', 'external')):
    download_path = os.path.join(download_dir, file_name)
    utils.ensure_directories(download_path)
    urllib.request.urlretrieve(url, download_path)
    print('file downloaded')
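# Combined usage sketch for downloading_scripts and unzipping_zip_files above;
# the URL and archive name are placeholders, not values from the project.
downloading_scripts('london_shapes.zip', 'https://example.com/london_shapes.zip')
unzipping_zip_files('london_shapes.zip')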
import pandas as pd

import utils


def add_event_outcome(jobs, jobs_history):
    df = jobs.merge(jobs_history, on='job_id', suffixes=['', '_history'])
    df['event'] = pd.Categorical(df.event, categories=['accepted', 'rejected'])
    return df


if __name__ == '__main__':
    INPATH = utils.path_to('data', 'final', 'df_clean_jobs.feather')
    INPATH_HISTORY = utils.path_to('data', 'raw', 'jobs_history.feather')
    OUTPATH = utils.path_to('data', 'final', 'df_clean_event.feather')

    print('Reading feather file from ' + INPATH)
    jobs = pd.read_feather(INPATH)
    print('Reading feather file from ' + INPATH_HISTORY)
    jobs_history = pd.read_feather(INPATH_HISTORY)

    print('Merging dataframes')
    df = add_event_outcome(jobs, jobs_history)

    print('Writing feather file to ' + OUTPATH)
    utils.ensure_directories(OUTPATH)
    df.to_feather(OUTPATH)
    print('Unchanged features: ' + ', '.join(colnames))
    # return unchanged columns of the input dataframe if include_pass is True,
    # an empty dataframe otherwise
    df_out = pd.DataFrame()
    if include_pass:
        df_out = df[colnames]
    return df_out


def _timed_categories(df, colnames, *, cycletypes):
    print('Timed categorising: ' + ', '.join(colnames))
    # extract attributes of datetime vars
    df_out = pd.DataFrame()
    for col in colnames:
        for cycle in cycletypes:
            retriever = attrgetter(cycle.attribute)
            df_out['_'.join([col, cycle.section_name])] = retriever(df[col])
    return df_out


if __name__ == "__main__":
    df = pd.read_feather(utils.path_to('data', 'interim', 'clean.feather'))

    print('Use case 1: Add columns to original df')
    feature_df, feature_names = generate_features(df.head(100))

    print('\nUse case 2: Return only feature columns')
    intermediate_df = pd.concat(
        [df.head(100), intermediate_variables(df.head(100))], axis=1)
    feature_df, feature_names = feature_encoding(intermediate_df)
def test_path_to(self):
    project_path = utils.project_path()
    data_path = utils.path_to('data')
    self.assertEqual(data_path, f'{project_path}/data')
def read_postcode_csv(csv_file):
    csv_table = pd.read_csv(utils.path_to('src', 'features', csv_file))
    zone_lookup = dict(zip(csv_table['postcode'], csv_table['zone']))
    return zone_lookup
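# Usage sketch for read_postcode_csv: 'postcode_zones.csv' is a hypothetical
# filename; the function above only assumes a CSV with 'postcode' and 'zone'
# columns under src/features.
zone_lookup = read_postcode_csv('postcode_zones.csv')
print(zone_lookup.get('SW1'))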