def get_external_data(topic):
    """
    Load the production data for `topic` and clean it for out of scope
    utterance detection.

    Rows are removed when their utterance is a button click, a follow up
    question or one of the `other_drop` values, when the utterance is
    numeric or a date, when it is a duplicate, or when the response is in
    the responses-to-drop list. Returns the cleaned dataframe.
    """

    csv_utils = for_csv.utils(topic)

    # values that mark a row for removal
    drop_responses = csv_utils.import_csv_to_list(responses_todrop_path)
    clicks = csv_utils.import_csv_to_list(buttons_path).tolist()
    follow_ups = csv_utils.import_csv_to_list(follow_up_path).tolist()
    drop_utterances = clicks + other_drop + follow_ups

    # production data
    raw_df = csv_utils.import_external_data(
        os.path.join(data_dir, data_file), topic)

    # cleaning steps are applied in this order on the same processing object
    cleaner = for_csv.process_dataframe(raw_df)
    cleaner.drop_rows_with_column_value(utterance_col, drop_utterances)
    cleaner.remove_numeric_utterances(chars_toignore=dfclean_specialchars)
    cleaner.remove_date_utterances()
    cleaner.drop_duplicate_utterances(duplicate_thresh=1)
    cleaner.drop_rows_with_column_value(
        response_col, drop_responses, lower=False)

    return cleaner.get_df()
    def import_training_df(self, train_df):
        """
        Use an existing dataframe as the training data of this instance,
        as an alternative to import_training_data.

        The dataframe is passed through check_questions_df_consistency
        (utils created with topic None, no lowercasing). The unique values
        of the intent column are stored in self.intents and the checked
        dataframe in self.df_training.
        """

        checker = for_csv.utils(None)
        checked_df = checker.check_questions_df_consistency(
            train_df,
            to_lower=False,
            intent_col=self.intent_col,
        )

        unique_intents = checked_df[self.intent_col].unique()
        self.intents = unique_intents
        self.df_training = checked_df
示例#3
0
def main(topic, n_list, absolute):
    """
    Export the ngram frequencies of every intent in a topic's training data.

    topic: topic name; the training file read is '<topic>_questions.csv'
        inside training_dir.
    n_list: the ngram sizes given as a string, e.g. '[1,2,3]'.
    absolute: when falsy, norm=True is passed to get_ngram_frequencies;
        when truthy, norm=False is passed.

    Writes a single csv to output_folder holding the frequencies of all
    intents, with an 'intent' column identifying the intent of each row.
    """

    def process_list_argument(list_arg):
        """
        An argument entered as a list will be processed as a string.
        This function transforms it into a list of ints.
        """
        return [int(item) for item in list_arg.strip('[]').split(',')]

    norm = not absolute

    n_list = process_list_argument(n_list)
    utils = for_csv.utils(topic)

    # import training data for topic, and clean
    print('Importing training data for topic ' + topic + '...')
    file_name = topic + '_questions.csv'

    training_path = os.path.join(training_dir, file_name)
    df_training = utils.import_training_data(training_path)
    df_training = utils.check_questions_df_consistency(df_training,
                                                       to_lower=False)

    # get ngrams for each intent and collect them for the main dataframe
    print('Getting ngrams for each intent..')
    intents = df_training['Intent'].unique()
    intent_frames = []

    for intent in intents:
        df_intent = df_training[df_training['Intent'] == intent]
        # TODO: is there an overhead in creating a new class instance for each intent?
        ngrams = for_csv.nlp.ngrams_df(df_intent,
                                       stopwords=stopwords,
                                       utterance_col=utterance_col,
                                       chars_remove=chars_remove)
        temp_freq_dict = ngrams.get_ngram_frequencies(n_list, top_a, norm,
                                                      norm_thresh)
        temp_freq_dict['intent'] = intent

        intent_frames.append(temp_freq_dict)

    # DataFrame.append was removed in pandas 2.0 and copied the accumulated
    # frame on every iteration; concatenate once instead. pd.concat raises
    # on an empty list, so keep the empty-dataframe result for no intents.
    if intent_frames:
        ngram_freq_dict = pd.concat(intent_frames)
    else:
        ngram_freq_dict = pd.DataFrame()

    timestr = time.strftime("%Y%m%d-%H%M")
    filename = 'ngrams_' + topic + '_' + timestr + '_' + str(n_list).strip(
        '[]').replace(' ', '') + '.csv'
    file_path = os.path.join(output_folder, filename)
    ngram_freq_dict.to_csv(file_path, index=False)
    print('Exported csv to ' + file_path)
    def import_training_data(self, topic):
        """
        Load the training data of `topic` and keep it on the instance.

        Reads '<topic>_questions.csv' from training_dir and runs it through
        check_questions_df_consistency without lowercasing. The unique
        values of the intent column are stored in self.intents and the
        dataframe in self.df_training.
        TODO: pull this out into separate import module
        """
        questions_path = os.path.join(training_dir, topic + '_questions.csv')

        csv_utils = for_csv.utils(topic)
        checked_df = csv_utils.check_questions_df_consistency(
            csv_utils.import_training_data(questions_path),
            to_lower=False,
        )

        unique_intents = checked_df[self.intent_col].unique()
        self.intents = unique_intents
        self.df_training = checked_df
def get_training_data(topic):
    """
    Get and clean training data for a specified topic, for out
    of scope utterance detection.

    topic: topic name; the file read is '<topic>_questions.csv' inside
        training_dir. If topic is None, the result of
        join_all_training_data() is returned instead (a dataframe with
        all the topics specified in config, with a 'topic' column
        indicating the topic).

    Returns the training dataframe. For a single topic it has been passed
    through check_questions_df_consistency without lowercasing.
    """

    # identity check: None is a singleton, and `==` can be overridden
    if topic is None:
        return join_all_training_data()

    utils = for_csv.utils(topic)

    file_name = topic + '_questions.csv'
    training_path = os.path.join(training_dir, file_name)
    df_training = utils.import_training_data(training_path)
    df_training = utils.check_questions_df_consistency(df_training,
                                                       to_lower=False)

    return df_training
示例#6
0
def get_train_test_data(topic, data_type):
    """
    Get training/test data for a specified topic.

    topic: topic name, used as the file name prefix. May be None only when
        data_type is not 'test'; then the result of
        join_all_training_data() is returned (a dataframe with all the
        topics specified in config, with a 'topic' column indicating the
        topic).
    data_type: 'train' reads '<topic>_questions.csv', 'test' reads
        '<topic>_blindset.csv'; both are looked up in training_dir.

    Returns the imported dataframe.

    Raises ValueError if a file has to be read and data_type is neither
    'train' nor 'test', or if topic is None for test data.
    """

    # `and` short-circuits on plain booleans; `&` is the bitwise operator
    if topic is None and data_type != 'test':
        return join_all_training_data()

    file_extensions = {'test': '_blindset.csv', 'train': '_questions.csv'}
    if data_type not in file_extensions:
        # previously surfaced as an UnboundLocalError on file_ext
        raise ValueError(
            "data_type must be 'train' or 'test', got " + repr(data_type))
    if topic is None:
        # previously surfaced as a TypeError when building the file name
        raise ValueError("a topic is required when data_type is 'test'")

    # aliased so the class is not shadowed by its own instance
    from for_csv import utils as csv_utils
    topic_utils = csv_utils(topic)

    file_name = topic + file_extensions[data_type]
    data_path = os.path.join(training_dir, file_name)
    df = topic_utils.import_training_data(data_path)

    return df
 def __init__(self, master_df, training_df):
     """
     Keep the master and training dataframes on the instance, together
     with a for_csv utils object created for the 'master' topic.
     """
     self.utils = for_csv.utils(topic='master')
     self.master_df, self.training_df = master_df, training_df
示例#8
0
def get_utterances(trainortest, method, topic, no_utterances, intents):
    """
    Select candidate utterances from the external data for a topic and
    export them to a csv in output_folder.

    trainortest: 'train' or 'test'.
        'train': external rows are additionally filtered with
        drop_confidence_greaterthan(max_conf1), and the candidates come
        from utils.get_priority_utterances.
        'test': the candidates are the cleaned external rows whose
        utterance does not appear in the training data.
    method: passed to get_priority_utterances, and added to the output
        file name; only used for 'train'.
    topic: topic name; training data is read from '<topic>_questions.csv'
        inside training_dir.
    no_utterances: passed to get_priority_utterances; only used for 'train'.
    intents: optional list argument given as a string, parsed with
        for_csv.process_list_argument. When truthy the external data is
        filtered to these intents and the raw string is added to the
        output file name.

    Raises ValueError if trainortest is neither 'train' nor 'test'.
    """
    import os
    import time
    # TODO: don't need to prompt for no_utterances if test option used.

    # fail before any data is loaded; otherwise an invalid value only
    # surfaced at export time as an unbound priority_utterances
    if trainortest not in ('train', 'test'):
        raise ValueError(
            "trainortest must be 'train' or 'test', got " + repr(trainortest))

    utils = for_csv.utils(topic,
                          margin_params=margin_params,
                          minhash_params=minhash_params,
                          lowconf_max=lowconf_max)

    # import external lists
    responses_todrop = utils.import_csv_to_list(responses_todrop_path)
    button_clicks = utils.import_csv_to_list(buttons_path).tolist()
    follow_up_questions = utils.import_csv_to_list(follow_up_path).tolist()
    utterances_todrop = button_clicks + other_drop + follow_up_questions

    # import data
    print('Importing external data...')
    csv_path = os.path.join(data_dir, data_file)
    df_external = utils.import_external_data(csv_path, topic)

    print('Importing training data for topic ' + topic + '...')
    file_name = topic + '_questions.csv'

    training_path = os.path.join(training_dir, file_name)
    df_training = utils.import_training_data(training_path)

    # clean training data
    df_training = utils.check_questions_df_consistency(df_training,
                                                       to_lower=False)

    # clean external data (remove button clicks, dates, ..?)
    # TODO: better management of column names. Mapping in dict, in config?
    process = for_csv.process_dataframe(df_external,
                                        utterance_col=utterance_col,
                                        conf1_col=conf1_col)
    process.remove_numeric_utterances(chars_toignore=dfclean_specialchars)
    process.remove_date_utterances()
    if trainortest == 'train':
        process.drop_confidence_greaterthan(max_conf1)
    process.drop_rows_with_column_value(utterance_col, utterances_todrop)
    process.drop_duplicate_utterances(duplicate_thresh=1)
    process.drop_rows_with_column_value(response_col,
                                        responses_todrop,
                                        lower=False)
    df_external = process.get_df()

    # filter by intent (optional)
    if intents:
        intents_list = for_csv.process_list_argument(intents, val_type=str)
        df_external = utils.df_select_specific_intents(
            df_external, intents_list, include_second_intent=True)
        print('Filtered by intents ' + str(intents_list))

    # select utterances
    if trainortest == 'train':
        print('Retrieving utterances for training using method ' + method +
              '...')
        priority_utterances = utils.get_priority_utterances(no_utterances,
                                                            df_external,
                                                            df_training,
                                                            method=method)
    else:
        print("Dropping utterances that exist in training...")
        utterances_in_train = df_training[utterance_col].tolist()
        # new class instance as df_external may have been filtered by intent
        process2 = for_csv.process_dataframe(df_external,
                                             utterance_col=utterance_col,
                                             conf1_col=conf1_col)
        process2.drop_rows_with_column_value(utterance_col,
                                             utterances_in_train)
        # read the result from process2: `process` still holds the frame
        # from before the intent filter and the training-utterance drop
        priority_utterances = process2.get_df()

    # export to csv
    timestr = time.strftime("%Y%m%d-%H%M")
    base_filename = trainortest + '_candidates_' + topic + '_'
    if intents:
        base_filename += intents + '_'

    if trainortest == 'train':
        base_filename += method + '_'

    output_filename = base_filename + timestr + '.csv'
    out_path = os.path.join(output_folder, output_filename)
    priority_utterances.to_csv(out_path, index=False)