def full_run_kfold_from_df(self, train_df, method='kfold'):
    """
    Given a training df, runs the k-fold tests and outputs all metrics.
    """
    # store the training data on the instance so the fold helpers can use it
    self.training_df = train_df

    self.check_sufficient_workspaces()

    folds = self.create_folds(method)
    self.create_kfold_WA(folds)

    available_flag = False
    while not available_flag:
        logger.info("Checking workspaces..")
        available_flag = self.check_workspaces_status()
        time.sleep(20)

    try:
        # results per utterance
        results_kfold = self.run_kfold_test(folds)
        # metrics per intent
        metrics = Metrics(self.threshold)
        classification_report = metrics.get_all_metrics_CV(
            results_kfold, fold_col='fold', detailed_results=False)
    finally:
        self.delete_kfold_workspaces()

    return results_kfold, classification_report
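# Example usage (illustrative sketch, not part of the library): mirrors the
# flow in run_kfold() below. APIKEY, URL and WORKSPACE_ID are placeholders,
# and the 0.4 threshold is only an example value.
#
#   kf = kfoldtest(n_folds=5, apikey=APIKEY, url=URL, threshold=0.4,
#                  version=CONVERSATION_VERSION)
#   train_df = kf.intent_df_from_watson(WORKSPACE_ID)
#   results_kfold, classification_report = kf.full_run_kfold_from_df(train_df)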
def check_sufficient_workspaces(self):
    """
    Counts the existing workspaces and checks whether there is room for k more.
    """
    response = self.assistant.list_workspaces().get_result()
    k_fold_number = self.n_folds
    max_workspaces = config.max_workspaces

    if len(response['workspaces']) + k_fold_number <= max_workspaces:
        logger.info("You have space to perform the k-fold test")
    else:
        remove = len(response['workspaces']) + k_fold_number - max_workspaces
        raise ValueError(
            "The k-fold test would make you exceed the {} workspaces limit. "
            "Make sure to remove {} workspaces before creating the k-fold "
            "workspaces".format(max_workspaces, remove))
def delete_kfold_workspaces(self):
    """
    Deletes the k-fold workspaces once they are no longer needed.
    """
    workspaces = self.workspaces
    logger.info("Deleting temporary workspaces")

    for workspace_id in workspaces:
        self.assistant.delete_workspace(
            workspace_id=workspace_id).get_result()

    self.workspaces = []
def create_kfold_WA(self, folds):
    """
    Creates the k-fold workspaces in WA.

    :param folds: the folds created in the function `create_folds`
    :return workspaces: a list of the workspace IDs that were generated
    """
    logger.info("Creating kfold workspaces..")

    for i in range(len(folds)):
        train = folds[i]["train"]
        intents = self.create_intents(train)
        workspace_id = self.create_workspace(intents, i)
        self.workspaces.append(workspace_id)

    return self.workspaces
def create_workspace_from_df(assistant, name, train_df, description="", poll_interval=20):
    """
    Wraps create_intents and create_workspace, then returns the skill ID once
    the skill has finished training.
    """
    intent_json = create_intents(assistant, train_df)
    skill_id = create_workspace(assistant, name, intent_json, description)

    status = ""
    while status != 'Available':
        logger.info("Waiting for {} skill to finish training..".format(name))
        status = assistant.get_workspace(
            workspace_id=skill_id).get_result()['status']
        time.sleep(poll_interval)

    return skill_id
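# Example usage (sketch): mirrors the call made in the multi-skill test below,
# where `assistant` is an authenticated AssistantV1 client and `master_train`
# is a dataframe holding the combined training utterances and intents.
#
#   skill_id = create_workspace_from_df(
#       assistant, name="master", train_df=master_train,
#       description="generated by intent_training_tools")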
def run_kfold_test(self, folds):
    """
    Runs the k-fold test: takes the folds as input and sends each test
    dataframe to the corresponding workspace.

    :param folds: output list from the function `create_folds`
    :return test_results: a dataframe of per-utterance results for every fold,
        with a `fold` column identifying the fold each row came from.
    """
    test_results = pd.DataFrame()

    for i in range(len(folds)):
        logger.info("Running test for fold {}".format(i + 1))
        test_index = folds[i]['test']
        df_test = self.training_df.iloc[test_index]
        df_test_reindexed = df_test.reset_index()
        results = self.test_kfold(df_test_reindexed, i)
        results["fold"] = i + 1
        test_results = test_results.append(results)

    test_results["intent_correct"] = np.where(
        (test_results["confidence1"] < self.threshold),
        "BELOW_THRESHOLD", test_results["intent1"])

    return test_results
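# For reference, run_kfold_test expects `folds` to be a list of dicts whose
# 'test' entry holds positional indices into self.training_df, and
# create_kfold_WA reads a 'train' entry for each fold. A minimal sketch of how
# such folds could be built (an assumption for illustration only; the repo's
# own create_folds may store the 'train' entry differently, and the sketch
# assumes the training dataframe has an `intent` column to stratify on):
#
#   from sklearn.model_selection import StratifiedKFold
#   skf = StratifiedKFold(n_splits=self.n_folds, shuffle=True)
#   folds = [
#       {"train": train_idx, "test": test_idx}
#       for train_idx, test_idx in skf.split(self.training_df,
#                                            self.training_df["intent"])
#   ]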
def main(topic_list, conf_matrix, save_master_data):
    skill_list = process_list_argument(topic_list, val_type=str)
    master_skill_id = None  # so the exception handler works
    master_thresh = Credentials.calculate_workspace_thresh("master")

    try:
        id_dict = {
            skill: Credentials.workspace_id[active_adoption][skill]
            for skill in skill_list
        }
        timestr = generate_timestamp()  # for use in all filenames

        # authenticate
        if "apikey" in instance_creds:
            logger.debug("Authenticating (apikey)")
            bs = blindset.blindset(
                apikey=instance_creds["apikey"],
                url=instance_creds["url"],
                version=conversation_version,
            )
        elif "password" in instance_creds:
            logger.debug("Authenticating (username/password)")
            bs = blindset.blindset(
                username=instance_creds["username"],
                password=instance_creds["password"],
                url=instance_creds["url"],
                version=conversation_version,
            )

        # check skills exist
        check_skills_exist(skill_list)

        # import blindsets and generate master
        logger.info("Importing all blindsets and combining into master")
        blind_dict = dict()
        for skill in skill_list:
            bs_path = os.path.join(config.data_dir, f"{skill}_blindset.csv")
            blind_dict[skill] = bs.import_blindset(bs_path)

        master_blind_allcols = pd.concat(
            [v.assign(topic=k) for k, v in blind_dict.items()],
            axis=0,
            ignore_index=True,
            sort=False,
        )
        master_blind = master_blind_allcols[[
            "utterance", "topic"
        ]].rename(columns={"topic": "expected intent"})

        # generate master from topic training and push to WA
        logger.info("Getting training data from WA")
        train_dict = dict()
        for skill in skill_list:
            train_dict[skill] = wa_utils.get_training_data(
                bs.assistant, id_dict[skill])

        logger.info("Creating temporary master skill")
        master_train = pd.concat(
            [
                v.drop(columns=["intent"]).assign(intent=k)
                for k, v in train_dict.items()
            ],
            axis=0,
            ignore_index=True,
            sort=False,
        )
        master_skill_id = wa_utils.create_workspace_from_df(
            bs.assistant,
            name="master",
            train_df=master_train,
            description="generated by intent_training_tools",
        )

        # run blindset on master
        logger.info("Running blindset on master..")
        results_master = bs.run_blind_test(master_blind,
                                           master_skill_id,
                                           threshold=master_thresh)
        results_master["routing"] = results_master["intent1"]
        results_master.loc[results_master["confidence1"] < master_thresh,
                           "routing"] = "anything_else"

        # create blindsets for topics based on master results
        newblind_dict = dict()
        for skill in skill_list:
            # the blindset for each skill is made up of the utterances that landed in that skill for the master
            blind_utterances = results_master.loc[
                (results_master["intent1"] == skill)
                & (results_master["confidence1"] >= master_thresh),
                "original_text",
            ].tolist()
            newblind = master_blind_allcols[
                master_blind_allcols["utterance"].isin(blind_utterances)].copy()
            newblind.loc[newblind["topic"] != skill,
                         "expected intent"] = "anything_else"
            newblind_dict[skill] = newblind[[
                "utterance", "expected intent"
            ]].reset_index(drop=True)

        # run blindsets on topics
        logger.info("Running blindset on topic skills..")
        results_dict = dict()
        for skill in skill_list:
            results_dict[skill] = bs.run_blind_test(
                newblind_dict[skill],
                id_dict[skill],
                threshold=Credentials.calculate_workspace_thresh(skill),
            )

        # plot confusion matrices
        if conf_matrix:
            from conversation_test.confusionmatrix import ConfusionMatrix

            conf_output_path = lambda s: os.path.join(
                config.output_folder, f"{s}_multi_confmat_{timestr}.png")
            # master
            cfn = ConfusionMatrix(workspace_thresh=master_thresh)
            cfn.create(results_master, fig_path=conf_output_path("master"))
            # topics
            for skill in skill_list:
                cfn = ConfusionMatrix(
                    workspace_thresh=Credentials.calculate_workspace_thresh(skill))
                cfn.create(results_dict[skill], fig_path=conf_output_path(skill))

            logger.info("Confusion matrices saved to results folder")

        # calculate metrics
        # master
        met = Metrics(workspace_thresh=master_thresh)
        metrics_master, _ = met.get_all_metrics(results_master,
                                                detailed_results=True)
        # topics
        metrics_dict = dict()
        res_with_conf_dict = dict()
        for skill in skill_list:
            met = Metrics(
                workspace_thresh=Credentials.calculate_workspace_thresh(skill))
            metrics_dict[skill], res_with_conf_dict[skill] = met.get_all_metrics(
                results_dict[skill], detailed_results=True)

        # topics - create an overall view as if it were a single skill
        topics_res_with_conf = pd.concat(
            [v for k, v in res_with_conf_dict.items()],
            ignore_index=True,
            sort=False)
        results_master.loc[results_master["routing"] == "anything_else",
                           'confusion'] = 'FN'
        topics_res_with_conf = topics_res_with_conf.append(
            results_master,
            ignore_index=True,
            sort=False,
        )
        metrics_overall = met.calculate_metrics_per_intent(
            topics_res_with_conf, detailed_results=True)
        metrics_overall.loc[metrics_overall.index.isin(skill_list),
                            'threshold'] = master_thresh
        metrics_overall = metrics_overall.rename(
            index={s: s + ' - anything else' for s in skill_list})

        # export results
        for skill in skill_list:
            results_dict[skill].to_csv(
                os.path.join(config.output_folder,
                             f"{skill}_multi_results_{timestr}.csv"),
                index=None,
            )
            metrics_dict[skill].to_csv(
                os.path.join(config.output_folder,
                             f"{skill}_multi_metrics_{timestr}.csv"))

        results_master.to_csv(
            os.path.join(config.output_folder,
                         f"master_multi_results_{timestr}.csv"),
            index=None,
        )
        metrics_master.to_csv(
            os.path.join(config.output_folder,
                         f"master_multi_metrics_{timestr}.csv"))
        metrics_overall.to_csv(
            os.path.join(config.output_folder,
                         f"overall_multi_metrics_{timestr}.csv"))
        logger.info("Results and metrics saved to output folder")

        if save_master_data:
            # export master blindset with both intent and topic labels to CSV
            master_blind_allcols.to_csv(
                os.path.join(config.data_dir,
                             f"master_blindset_{timestr}.csv"),
                index=None,
            )
            # export master training to CSV
            master_train.to_csv(
                os.path.join(config.data_dir,
                             f"master_training_{timestr}.csv"),
                header=None,
                index=None,
            )
            logger.info(
                "Master blindset and training have also been saved to the data folder"
            )

        # delete master skill
        logger.info("Deleting temporary master skill")
        wa_utils.delete_workspace(bs.assistant, master_skill_id)

    except Exception as e:
        if master_skill_id is not None:
            # make sure the master skill is deleted anyway
            logger.info("Deleting temporary master skill before exit")
            wa_utils.delete_workspace(bs.assistant, master_skill_id)
        raise e
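# Example invocation (sketch): topic_list is passed through
# process_list_argument, so a comma-separated string of skill names is assumed
# here; "billing" and "loans" are placeholder names that would need to exist in
# Credentials.workspace_id.
#
#   main(topic_list="billing,loans", conf_matrix=True, save_master_data=False)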
def main(input_name, output_name, skill_name, sample_limit):
    import config  # data and output dirs

    # read the input file
    input_path = os.path.join(config.data_dir, input_name)
    logger.info("Using input {}".format(input_path))
    df = pd.read_csv(input_path, header=None)
    outputFrameP = pd.DataFrame()

    # create output path
    if output_name is not None and skill_name is not None:
        raise ValueError(
            "Please only specify one of output_name and skill_name.")

    skill_name = skill_name or 'test'
    gen_test_set_folder = os.path.join(config.data_dir, 'generated_test_sets')

    if output_name is not None:
        output_path = os.path.join(gen_test_set_folder, output_name)
    else:
        timestr = time.strftime("%Y%m%d-%H%M")
        output_path = os.path.join(
            gen_test_set_folder,
            f"{skill_name}_generator_output_{timestr}.csv")

    # for every row in the file
    logger.info("Splitting {} original utterances into permutations..".format(
        df.shape[0]))
    for n in range(df.shape[0]):
        # get the sentence and its intent
        sen = df.iat[n, 0]
        intent = df.iat[n, 1]
        # generate all the different sentences
        res = sentenceSplitter(sen)
        # flatten until flat (from nested lists to a single list)
        while any(isinstance(el, list) for el in res):
            res = flatten(res)
        # distinguish between a list of utterances and a single element
        if isinstance(res, list):
            res = [(x, intent) for x in res]
            outputFrame = pd.DataFrame(res)
            outputFrameP = outputFrameP.append(outputFrame)
        else:  # single element
            res = [(res, intent)]
            outputFrameP = outputFrameP.append(res)

    # reduce the number of samples in the final test set, to limit API calls when running tests
    # TODO: better alternative than random?
    if sample_limit is not None:
        # Take a stratified sample over intents, returning all samples for an intent if there aren't enough.
        # Make up the difference by randomly sampling from the remaining records that haven't been chosen yet.
        logger.info("Returned {} utterances in total; reducing to {}".format(
            len(outputFrameP), sample_limit))
        samples_per_intent = np.floor(
            outputFrameP[1].value_counts() * sample_limit /
            len(outputFrameP)).apply(int)

        df_sampled = pd.DataFrame()
        for intent in outputFrameP[1].unique():
            df_intent = outputFrameP[outputFrameP[1] == intent]
            if samples_per_intent[intent] > len(df_intent):
                df_sampled = df_sampled.append(df_intent)
            else:
                df_sampled = df_sampled.append(
                    df_intent.sample(samples_per_intent[intent]))

        # make up for rounding errors
        lendiff = sample_limit - len(df_sampled)
        extra_records = pd.concat(
            [outputFrameP,
             df_sampled]).drop_duplicates(keep=False).sample(lendiff)
        df_sampled = df_sampled.append(extra_records)
        output_df = df_sampled
    else:
        output_df = outputFrameP

    # write the frame to CSV without the indexes
    output_df.to_csv(output_path, header=None, index=False)
    logger.info("{} new utterances saved to {}".format(len(output_df),
                                                       output_path))
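# `sentenceSplitter` and `flatten` are defined elsewhere in this module. For
# context, a minimal flatten step compatible with the loop above could look
# like this (an illustrative sketch, not necessarily the repo's implementation):
#
#   def flatten(nested):
#       """Flatten one level of nesting in a list."""
#       flat = []
#       for el in nested:
#           if isinstance(el, list):
#               flat.extend(el)
#           else:
#               flat.append(el)
#       return flat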
def run_kfold(topic, no_folds, results_type, conf_matrix):
    """
    Runs the k-fold test using credentials in ../Credentials.py
    """
    # get credentials
    import Credentials
    active_adoption = Credentials.active_adoption
    instance_creds = Credentials.ctx[active_adoption]
    workspace_id = Credentials.workspace_id[active_adoption][topic]
    workspace_thresh = Credentials.calculate_workspace_thresh(topic)
    conversation_version = Credentials.conversation_version

    # import + export folders
    import config
    import time
    data_folder = config.data_dir
    export_folder = config.output_folder
    timestr = time.strftime("%Y%m%d-%H%M")

    output_loc_results = os.path.join(
        export_folder, "{}_kfold_results_raw_{}.csv".format(topic, timestr))
    output_loc_metrics = os.path.join(
        export_folder, "{}_kfold_results_metrics_{}.csv".format(topic, timestr))
    output_loc_confmat = os.path.join(
        export_folder, "{}_kfold_confmat_{}.png".format(topic, timestr))

    # authenticate
    if 'apikey' in instance_creds:
        logger.debug("Authenticating (apikey)")
        kf = kfoldtest(n_folds=no_folds,
                       apikey=instance_creds['apikey'],
                       url=instance_creds['url'],
                       threshold=workspace_thresh,
                       version=conversation_version)
    elif 'password' in instance_creds:
        logger.debug("Authenticating (username/password)")
        kf = kfoldtest(n_folds=no_folds,
                       username=instance_creds['username'],
                       password=instance_creds['password'],
                       url=instance_creds['url'],
                       threshold=workspace_thresh,
                       version=conversation_version)

    # get the training df from Watson + check there are sufficient workspaces to run the test
    train_df = kf.intent_df_from_watson(workspace_id)
    kf.check_sufficient_workspaces()

    # create the folds in WA once the check above passes
    folds = kf.create_folds(method='kfold')
    kf.create_kfold_WA(folds)

    available_flag = False
    while not available_flag:
        logger.info("Checking workspaces..")
        available_flag = kf.check_workspaces_status()
        time.sleep(20)

    # run kfold test
    try:
        results = kf.run_kfold_test(folds)

        if (results_type == 'raw') or (results_type == 'all'):
            results.to_csv(output_loc_results)

        classification_report = kf.create_classification_report(results)

        if (results_type == 'metrics') or (results_type == 'all'):
            metrics = Metrics(workspace_thresh)
            metric_df = metrics.get_all_metrics_CV(results,
                                                   fold_col='fold',
                                                   detailed_results=False)
            metric_df.to_csv(output_loc_metrics)

        # confusion matrix
        if conf_matrix:
            from confusionmatrix import ConfusionMatrix
            cfn = ConfusionMatrix(workspace_thresh=workspace_thresh)
            cfn.create(results, fig_path=output_loc_confmat)
            logger.info("Confusion matrix saved to {}".format(
                output_loc_confmat))
    finally:
        # regardless of what happens above, delete the temporary workspaces before exiting
        kf.delete_kfold_workspaces()
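# Example invocation (sketch): runs a 5-fold test on a placeholder "billing"
# topic, exporting raw results, metrics and a confusion matrix. The topic name
# must exist in Credentials.workspace_id for the active adoption.
#
#   run_kfold(topic="billing", no_folds=5, results_type="all", conf_matrix=True)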
def run_blindset(topic, results_type, conf_matrix, blindset_name):
    """
    Runs the blindset test using credentials in ../Credentials.py
    """
    # get credentials
    import Credentials
    active_adoption = Credentials.active_adoption
    instance_creds = Credentials.ctx[active_adoption]
    workspace_id = Credentials.workspace_id[active_adoption][topic]
    workspace_thresh = Credentials.calculate_workspace_thresh(topic)
    conversation_version = Credentials.conversation_version

    # import + export folders
    import config
    import time
    data_folder = config.data_dir
    export_folder = config.output_folder
    timestr = time.strftime("%Y%m%d-%H%M")

    blindset_name = blindset_name or topic + "_blindset.csv"
    output_loc_results = os.path.join(
        export_folder, "{}_results_raw_{}.csv".format(topic, timestr))
    output_loc_metrics = os.path.join(
        export_folder, "{}_results_metrics_{}.csv".format(topic, timestr))
    output_loc_confmat = os.path.join(
        export_folder, "{}_confmat_{}.png".format(topic, timestr))

    # authenticate
    if 'apikey' in instance_creds:
        logger.debug("Authenticating (apikey)")
        bs = blindset(apikey=instance_creds['apikey'],
                      url=instance_creds['url'],
                      threshold=workspace_thresh,
                      version=conversation_version)
    elif 'password' in instance_creds:
        logger.debug("Authenticating (username/password)")
        bs = blindset(username=instance_creds['username'],
                      password=instance_creds['password'],
                      url=instance_creds['url'],
                      threshold=workspace_thresh,
                      version=conversation_version)

    # run test
    blindset_df = bs.import_blindset(os.path.join(data_folder, blindset_name))
    # TODO: check blindset df
    results = bs.run_blind_test(blindset_df, workspace_id)

    # exports + metrics
    if (results_type == 'raw') or (results_type == 'all'):
        cols_export = [
            col for col in results.columns.values if col != 'intent_correct'
        ]
        results[cols_export].to_csv(output_loc_results, encoding='utf-8')
        logger.info("Raw results exported to {}".format(output_loc_results))

    if (results_type == 'metrics') or (results_type == 'all'):
        met = Metrics(workspace_thresh)
        metric_df, _ = met.get_all_metrics(results, detailed_results=True)
        metric_df.to_csv(output_loc_metrics, encoding='utf-8')
        logger.info(
            "Metrics per intent exported to {}".format(output_loc_metrics))

    # confusion matrix
    if conf_matrix:
        from confusionmatrix import ConfusionMatrix
        cfn = ConfusionMatrix(workspace_thresh=workspace_thresh)
        cfn.create(results, fig_path=output_loc_confmat)
        logger.info("Confusion matrix saved to {}".format(output_loc_confmat))

    # print high-level metrics
    overall_metrics = bs.calculate_overall_metrics(results,
                                                   av_method="weighted")
    logger.info("Overall metrics for the workspace (weighted):")
    logger.info(overall_metrics)
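# Example invocation (sketch): "billing" is a placeholder topic key in
# Credentials.workspace_id. With blindset_name=None the default
# <topic>_blindset.csv in the data folder is used.
#
#   run_blindset(topic="billing", results_type="all", conf_matrix=True,
#                blindset_name=None)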
def run_blind_test(self, test_set_df, workspace_id, **kwargs):
    """
    Runs the blind set test and returns a results df.

    Parameter:
        test_set_df: the regression test set as a dataframe with `utterance`
            and `expected intent` columns

    Return:
        results: a Pandas dataframe with the original text, the expected
            intent and the top three intents/confidences returned by WA
    """
    # if no threshold has been passed into the object, take one from the function args
    if self.threshold == False and 'threshold' not in kwargs:
        raise ValueError(
            "Must provide a threshold either to the blindset object or this function."
        )
    elif 'threshold' in kwargs:
        # a threshold in the function args overrides one provided to the object, even if one has been set
        threshold = kwargs['threshold']
    else:
        threshold = self.threshold

    results = pd.DataFrame(columns=[
        'original_text', 'expected intent', 'r@1', 'TP', 'intent1',
        'confidence1', 'intent2', 'confidence2', 'intent3', 'confidence3'
    ])

    logger.info("Running blind test...")
    for i in tqdm(range(len(test_set_df))):
        text = test_set_df["utterance"][i]
        response = self.assistant.message(
            workspace_id=workspace_id,
            input={'text': text},
            context={'metadata': {
                'user_id': 'intent_test_user1'
            }},
            alternate_intents=True)
        dumps = json.dumps(response.get_result(), indent=2)
        data = json.loads(dumps)

        no_intents = len(data['intents'])
        intent1 = data['intents'][0]['intent']
        confidence1 = data['intents'][0]['confidence']

        if no_intents >= 2:
            intent2 = data['intents'][1]['intent']
            confidence2 = data['intents'][1]['confidence']
        else:
            intent2 = confidence2 = ""

        if no_intents >= 3:
            intent3 = data['intents'][2]['intent']
            confidence3 = data['intents'][2]['confidence']
        else:
            intent3 = confidence3 = ""

        r_1 = (test_set_df["expected intent"][i] == intent1)
        # use the resolved threshold (not self.threshold) so a threshold passed
        # into this function is respected
        tp = r_1 and (confidence1 >= threshold)

        results = results.append(
            {
                'original_text': test_set_df["utterance"][i],
                'expected intent': test_set_df["expected intent"][i],
                'r@1': 1 * r_1,
                'TP': 1 * tp,
                'intent1': intent1,
                'confidence1': confidence1,
                'intent2': intent2,
                'confidence2': confidence2,
                'intent3': intent3,
                'confidence3': confidence3,
            },
            ignore_index=True)

    results["intent_correct"] = np.where(
        (results["confidence1"] < threshold), "BELOW_THRESHOLD",
        results["intent1"])

    return results
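# Example usage (sketch): using a blindset object `bs` as created in
# run_blindset above. The utterances and intent names below are placeholder
# data; WORKSPACE_ID is a placeholder, and the test set only needs the
# `utterance` and `expected intent` columns read by this method.
#
#   test_set_df = pd.DataFrame({
#       "utterance": ["i want to pay my bill", "reset my password"],
#       "expected intent": ["pay_bill", "reset_password"],
#   })
#   results = bs.run_blind_test(test_set_df, WORKSPACE_ID, threshold=0.4)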