def get_external_data(topic):
    """
    Get and clean production data for a specified topic, for out-of-scope
    utterance detection.
    """
    utils = for_csv.utils(topic)

    # external lists of responses/utterances to exclude
    responses_todrop = utils.import_csv_to_list(responses_todrop_path)
    button_clicks = utils.import_csv_to_list(buttons_path).tolist()
    follow_up_questions = utils.import_csv_to_list(follow_up_path).tolist()
    utterances_todrop = button_clicks + other_drop + follow_up_questions

    # production data
    csv_path = os.path.join(data_dir, data_file)
    df_external = utils.import_external_data(csv_path, topic)

    # clean: drop excluded utterances, numeric/date utterances, duplicates,
    # and rows whose response is on the drop list
    process = for_csv.process_dataframe(df_external)
    process.drop_rows_with_column_value(utterance_col, utterances_todrop)
    process.remove_numeric_utterances(chars_toignore=dfclean_specialchars)
    process.remove_date_utterances()
    process.drop_duplicate_utterances(duplicate_thresh=1)
    process.drop_rows_with_column_value(response_col, responses_todrop, lower=False)
    production_df = process.get_df()

    return production_df
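# Usage sketch ('billing' is a hypothetical topic name; assumes the config-level
# paths used above — data_dir, data_file, responses_todrop_path, etc. — resolve
# to real files):
#
#   production_df = get_external_data('billing')
#   print(production_df.shape)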
def import_training_df(self, train_df):
    """
    Instead of using import_training_data, an existing dataframe can be used
    directly in the class instance. Topic is set to None.
    """
    ut = for_csv.utils(None)
    train_df = ut.check_questions_df_consistency(
        train_df, to_lower=False, intent_col=self.intent_col)
    self.intents = train_df[self.intent_col].unique()
    self.df_training = train_df
def main(topic, n_list, absolute):
    def process_list_argument(list_arg):
        """
        An argument entered as a list is received as a string, e.g. '[1,2,3]'.
        This function transforms it into a list of ints, e.g. [1, 2, 3].
        """
        list_out = list(map(int, list_arg.strip('[]').split(',')))
        return list_out

    # normalise frequencies unless absolute counts were requested
    norm = not absolute

    n_list = process_list_argument(n_list)
    utils = for_csv.utils(topic)

    # import training data for topic, and clean
    print('Importing training data for topic ' + topic + '...')
    file_name = topic + '_questions.csv'
    training_path = os.path.join(training_dir, file_name)
    df_training = utils.import_training_data(training_path)
    df_training = utils.check_questions_df_consistency(df_training, to_lower=False)

    # get ngrams for each intent and append to main dataframe
    print('Getting ngrams for each intent...')
    intents = df_training['Intent'].unique()
    ngram_freq_frames = []
    for intent in intents:
        df_intent = df_training[df_training['Intent'] == intent]
        # TODO: is there an overhead in creating a new class instance for each intent?
        ngrams = for_csv.nlp.ngrams_df(df_intent, stopwords=stopwords,
                                       utterance_col=utterance_col,
                                       chars_remove=chars_remove)
        temp_freq_df = ngrams.get_ngram_frequencies(n_list, top_a, norm, norm_thresh)
        temp_freq_df['intent'] = intent
        ngram_freq_frames.append(temp_freq_df)
    # concatenate once at the end (DataFrame.append was removed in pandas 2.0)
    ngram_freq_df = pd.concat(ngram_freq_frames)

    timestr = time.strftime("%Y%m%d-%H%M")
    filename = ('ngrams_' + topic + '_' + timestr + '_'
                + str(n_list).strip('[]').replace(' ', '') + '.csv')
    file_path = os.path.join(output_folder, filename)
    ngram_freq_df.to_csv(file_path, index=False)
    print('Exported csv to ' + file_path)
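# Usage sketch (hypothetical values): n_list arrives as the string '[1,2,3]',
# e.g. from a CLI flag, and process_list_argument parses it to [1, 2, 3].
#
#   main('billing', '[1,2,3]', absolute=False)
#   # writes e.g. ngrams_billing_<timestamp>_1,2,3.csv to output_folder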
def import_training_data(self, topic):
    """
    Gets and cleans training data from the file specified in config.
    Also creates a list of the intents within training.
    TODO: pull this out into a separate import module
    """
    file_name = topic + '_questions.csv'
    training_path = os.path.join(training_dir, file_name)
    ut = for_csv.utils(topic)
    df_training = ut.import_training_data(training_path)
    df_training = ut.check_questions_df_consistency(df_training, to_lower=False)
    self.intents = df_training[self.intent_col].unique()
    self.df_training = df_training
def get_training_data(topic):
    """
    Get and clean training data for a specified topic, for out-of-scope
    utterance detection. If topic is None, a dataframe with all the topics
    specified in config is returned, with a 'topic' column indicating the
    topic.
    """
    if topic is None:
        df_training = join_all_training_data()
    else:
        utils = for_csv.utils(topic)
        file_name = topic + '_questions.csv'
        training_path = os.path.join(training_dir, file_name)
        df_training = utils.import_training_data(training_path)
        df_training = utils.check_questions_df_consistency(df_training, to_lower=False)
    return df_training
def get_train_test_data(topic, data_type):
    """
    Get and clean training/test data for a specified topic. If topic is None
    for training data, a dataframe with all the topics specified in config is
    returned, with a 'topic' column indicating the topic.
    """
    if data_type == 'test':
        file_ext = '_blindset.csv'
    elif data_type == 'train':
        file_ext = '_questions.csv'
    else:
        # guard against file_ext being referenced unassigned below
        raise ValueError("data_type must be 'train' or 'test'")

    if (topic is None) and (data_type != 'test'):
        df = join_all_training_data()
    else:
        from for_csv import utils
        ut = utils(topic)
        file_name = topic + file_ext
        training_path = os.path.join(training_dir, file_name)
        df = ut.import_training_data(training_path)
    return df
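# Usage sketch (hypothetical topic name): a single topic's blindset, and the
# combined training data across all topics listed in config.
#
#   df_test = get_train_test_data('billing', 'test')
#   df_train_all = get_train_test_data(None, 'train')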
def __init__(self, master_df, training_df):
    self.utils = for_csv.utils(topic='master')
    self.master_df = master_df
    self.training_df = training_df
def get_utterances(trainortest, method, topic, no_utterances, intents):
    import os
    import time

    # TODO: don't need to prompt for no_utterances if the test option is used.
    utils = for_csv.utils(topic, margin_params=margin_params,
                          minhash_params=minhash_params, lowconf_max=lowconf_max)

    # import external lists
    responses_todrop = utils.import_csv_to_list(responses_todrop_path)
    button_clicks = utils.import_csv_to_list(buttons_path).tolist()
    follow_up_questions = utils.import_csv_to_list(follow_up_path).tolist()
    utterances_todrop = button_clicks + other_drop + follow_up_questions

    # import data
    print('Importing external data...')
    csv_path = os.path.join(data_dir, data_file)
    df_external = utils.import_external_data(csv_path, topic)

    print('Importing training data for topic ' + topic + '...')
    file_name = topic + '_questions.csv'
    training_path = os.path.join(training_dir, file_name)
    df_training = utils.import_training_data(training_path)

    # clean training data
    df_training = utils.check_questions_df_consistency(df_training, to_lower=False)

    # clean external data (remove button clicks, dates, ..?)
    # TODO: better management of column names. Mapping in dict, in config?
    process = for_csv.process_dataframe(df_external, utterance_col=utterance_col,
                                        conf1_col=conf1_col)
    process.remove_numeric_utterances(chars_toignore=dfclean_specialchars)
    process.remove_date_utterances()
    if trainortest == 'train':
        process.drop_confidence_greaterthan(max_conf1)
    process.drop_rows_with_column_value(utterance_col, utterances_todrop)
    process.drop_duplicate_utterances(duplicate_thresh=1)
    process.drop_rows_with_column_value(response_col, responses_todrop, lower=False)
    df_external = process.get_df()

    # filter by intent (optional)
    if intents:
        intents_list = for_csv.process_list_argument(intents, val_type=str)
        df_external = utils.df_select_specific_intents(
            df_external, intents_list, include_second_intent=True)
        print('Filtered by intents ' + str(intents_list))

    # select utterances
    if trainortest == 'train':
        print('Retrieving utterances for training using method ' + method + '...')
        priority_utterances = utils.get_priority_utterances(
            no_utterances, df_external, df_training, method=method)
    elif trainortest == 'test':
        print("Dropping utterances that exist in training...")
        utterances_in_train = df_training[utterance_col].tolist()
        # new class instance as df_external may have been filtered by intent
        process2 = for_csv.process_dataframe(df_external, utterance_col=utterance_col,
                                             conf1_col=conf1_col)
        process2.drop_rows_with_column_value(utterance_col, utterances_in_train)
        priority_utterances = process2.get_df()

    # export to csv
    timestr = time.strftime("%Y%m%d-%H%M")
    base_filename = trainortest + '_candidates_' + topic + '_'
    if intents:
        base_filename += intents + '_'
    if trainortest == 'train':
        base_filename += method + '_'
    output_filename = base_filename + timestr + '.csv'
    out_path = os.path.join(output_folder, output_filename)
    priority_utterances.to_csv(out_path, index=False)
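# Usage sketch (all argument values hypothetical): candidate training
# utterances for one topic, restricted to two intents. The intents argument
# uses the same '[...]' list syntax parsed by for_csv.process_list_argument,
# and the method name here is illustrative only.
#
#   get_utterances('train', 'margin', 'billing', 50, '[intent_a,intent_b]')
#   # writes train_candidates_billing_[intent_a,intent_b]_margin_<timestamp>.csv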