def load_data(data_dir: WindowsPath, filename: str = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the data either from a single saved file or from the two
    original Excel workbooks.

    Keyword arguments
    data_dir -- a pathlib path to the DATA folder
    filename -- name of a saved data file inside data_dir; a '.csv'
                suffix is read with pandas, anything else is assumed to
                be a pickle. If None, the two original Excel files are
                loaded via xlwings and returned as a (cl_df, wf_df) tuple.
    """
    if filename:
        path = data_dir.joinpath(filename)
        # Suffix check (not substring) so e.g. 'data.csv.bak' is not
        # mistaken for a csv file.
        if filename.endswith('.csv'):
            result = pd.read_csv(path)
        else:
            # Anything that is not csv is assumed to be a pickle file.
            with open(path, 'rb') as infile:
                result = pickle.load(infile)
    else:
        # xlwings drives an Excel instance; both workbooks share one app,
        # which is closed once both sheets have been read.
        wb = xw.Book(str(data_dir.joinpath('CL_20190823.xlsx')))
        cl_df = wb.sheets[0].used_range.options(
            pd.DataFrame, index=False, header=True).value
        wb = xw.Book(str(data_dir.joinpath('WF_20190826.xlsx')))
        wf_df = wb.sheets[0].used_range.options(
            pd.DataFrame, index=False, header=True).value
        wb.app.quit()
        result = (cl_df, wf_df)
    return result
def request_api(student_comments: pd.Series, endpoint: str,
                num_batches: int = 50, save: bool = True,
                folder: WindowsPath = None) -> None:
    """Send student comments to the LUIS.ai API in batches and optionally
    save the intermediate results to disk.

    Keyword arguments
    student_comments -- pd.Series of student comments in natural language
    endpoint -- the luis endpoint (query url prefix, see get_luis_url)
    num_batches -- number of batches into which the comments are split
    save -- if True, pickle each batch's raw responses to local disk
    folder -- destination for the response data, defaults to ./OUTPUT/LUIS/
    """
    if save and folder is None:
        folder = Path.cwd().joinpath('OUTPUT').joinpath('LUIS')
    for i, batch in enumerate(np.array_split(student_comments, num_batches)):
        print(f'Processing batch {i} of {num_batches}:')
        # NOTE(review): the comment text is appended to the url unescaped;
        # a comment containing '&' or '#' would corrupt the query string.
        # Consider urllib.parse.quote -- confirm against the LUIS api.
        luis_result = batch.apply(lambda x: requests.get(f'{endpoint}{x}'))
        # Only persist when asked to; previously this ran unconditionally
        # and crashed on folder=None when save=False.
        if save:
            filename = f'luis_result_{str(i).zfill(4)}'
            luis_result.to_pickle(folder.joinpath(filename))
            print(f'Saved to {folder.joinpath(filename)}.')
def path_settings(directory):
    """Return the platform-specific settings directory for *directory*,
    creating it (and any parents) if it does not exist.

    On Windows the directory lives under %APPDATA%; elsewhere under /etc.
    """
    if system() == 'Windows':
        # APPDATA is assumed to be set on any Windows install.
        dest = WindowsPath(os.getenv('APPDATA')).joinpath(directory)
    else:
        # NOTE(review): writing under /etc requires elevated privileges.
        dest = PosixPath('/etc').joinpath(directory)
    dest.mkdir(parents=True, exist_ok=True)
    return dest
def load_config(filename: str, config_dir: WindowsPath) -> pd.DataFrame:
    """Load a csv config file from the /CONFIG/ folder.

    Keyword arguments
    filename -- name of the config file; a '.csv' suffix is appended
                if not already present
    config_dir -- a pathlib path to the CONFIG folder
    """
    # Suffix check (not substring) so e.g. 'settings.csv.old' is not
    # treated as already being a csv file name.
    if not filename.endswith('.csv'):
        filename = filename + '.csv'
    config_path = config_dir.joinpath(filename)
    return pd.read_csv(config_path)
def load_column(path: WindowsPath, filename: str):
    """Unpickle a Series stored at path/filename.

    The returned Series is renamed after the file it was loaded from.

    Keyword arguments
    path -- directory containing the pickle file
    filename -- name of the pickle file; also used as the Series name
    """
    source = path.joinpath(filename)
    with open(source, 'rb') as handle:
        series = pickle.load(handle)
    series.name = filename
    return series
def save_object(obj, filename: str, output_dir: WindowsPath):
    """Pickle obj into output_dir under the given filename.

    Keyword arguments
    obj -- the object to be pickled
    filename -- name of the destination file
    output_dir -- pathlib directory the pickle file is written into
    """
    path = output_dir.joinpath(filename)
    print(f'Pickling to {path}.')
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)
def get_luis_url(folder: WindowsPath = None) -> str:
    """Assemble the luis api query url from a csv of credentials.

    The csv must provide 'endpoint', 'app_id' and 'subscription_key'
    rows (indexed by the 'key' column). These can be obtained from the
    luis.ai site:
    https://www.luis.ai/applications/{app_id}/versions/0.1/manage/endpoints
    """
    config_folder = Path.cwd().joinpath('CONFIG') if folder is None else folder
    keys = pd.read_csv(config_folder.joinpath('luis_keys.csv'),
                       index_col='key')
    values = keys['value']
    return (f"{values['endpoint']}luis/v2.0/apps/{values['app_id']}"
            f"?verbose=true&timezoneOffset"
            f"=0&subscription-key={values['subscription_key']}&q=")
def save_sets(X_train: pd.DataFrame = None, y_train: pd.Series = None,
              X_val: pd.DataFrame = None, y_val: pd.Series = None,
              X_test: pd.DataFrame = None, y_test: pd.Series = None,
              path: WindowsPath = None):
    """Save the different sets locally as csv files (index dropped).

    Parameters
    ----------
    X_train, X_val, X_test : pd.DataFrame
        Features for the training / validation / testing sets
    y_train, y_val, y_test : pd.Series
        Target for the training / validation / testing sets
    path : pathlib path
        Folder where the sets will be saved
        (default: '../data/processed/')

    Returns
    -------
    None. Only the sets that are not None are written, each to
    '<name>.csv' inside path.
    """
    # Honour the documented default; previously path=None crashed with
    # AttributeError on the first joinpath call.
    if path is None:
        path = Path('../data/processed/')
    sets = {'X_train': X_train, 'y_train': y_train,
            'X_val': X_val, 'y_val': y_val,
            'X_test': X_test, 'y_test': y_test}
    for name, data in sets.items():
        if data is not None:
            data.to_csv(path.joinpath(f'{name}.csv'), index=False)