Example #1
def get_single_rep_acc_by_dataset(param_bench, onTestSet=True):
    """
    Compute accuracy on each dataset for a given set of model parameters (representations, schema).
    Use check_experiments_runs() to verify which experiment parameters are available in the interim data dir.
    """

    from sklearn.metrics import accuracy_score
    from load_data import load_data_from_dir
    from load_data import load_y_pred

    accuracy_scores = {}
    for dataset_name in param_bench['dataset_names']:
        if onTestSet:
            _, test_df = load_data_from_dir(dataset_name,
                                            rootdir=param_bench['input_dir'])
        else:
            test_df, _ = load_data_from_dir(dataset_name,
                                            rootdir=param_bench['input_dir'])

        y_test = test_df[0].tolist()

        for label_pred_dict in load_y_pred(dataset_name,
                                           rootdir=param_bench['output_dir'] +
                                           "/single_rep_expe"):
            y_pred = label_pred_dict['preds']
            accuracy_scores[dataset_name] = accuracy_score(y_test, y_pred)
            break  # keep only the first stored run

    return accuracy_scores
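A minimal call sketch (directory paths and dataset names below are hypothetical; param_bench only needs the keys the function reads):

param_bench = {
    'dataset_names': ['GunPoint', 'Coffee'],  # hypothetical dataset names
    'input_dir': 'data/raw',                  # hypothetical raw-data dir
    'output_dir': 'data/interim',             # hypothetical experiment-output dir
}
acc_by_dataset = get_single_rep_acc_by_dataset(param_bench, onTestSet=True)
# -> dict mapping dataset name to the accuracy of the first stored run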
Example #2
def get_auc_by_dataset(dataset_names, representations, schema, n_features,
                       n_trees, root_dir_raw, root_dir_interim):
    """
    Compute the average per-class AUC on each dataset for a given set of model parameters (representations, schema).
    Use check_experiments_runs() to verify which experiment parameters are available in the interim data dir.
    """

    from load_data import load_data_from_dir
    from load_data import load_y_scores
    from analyze_res import avg_auc

    auc_scores = {}
    for dataset_name in dataset_names:
        _, test_df = load_data_from_dir(dataset_name, rootdir=root_dir_raw)
        y_test = test_df[0].tolist()

        for label_pred_dict in load_y_scores(dataset_name,
                                             rootdir=root_dir_interim):

            #label_pred_dict = { params, preds}
            #label_score_dict = { params, scores}
            if not (
                    #set(label_pred_dict['params']['representations']) == set(representations) and
                    label_pred_dict['params']['schema'] == schema
                    and label_pred_dict['params']['n_features'] == n_features
                    and label_pred_dict['params']['n_trees'] == n_trees):
                continue

            y_score = label_pred_dict['scores']
            auc_scores[dataset_name] = avg_auc(y_test, y_score)
            break  # keep only the first run matching these parameters

    return auc_scores
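A hedged call sketch for the AUC helper (all argument values are illustrative and must match runs that actually exist under root_dir_interim):

auc_by_dataset = get_auc_by_dataset(
    dataset_names=['GunPoint', 'Coffee'],   # hypothetical dataset names
    representations=['TS', 'D', 'ACF'],     # illustrative (currently unused by the filter)
    schema=0,
    n_features=20000,
    n_trees=50,                             # illustrative value
    root_dir_raw='data/raw',
    root_dir_interim='data/interim')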
Example #3
def get_info_by_dataset(dataset_names, root_dir_raw):
    """
    Retrieve basic dataset info (class count, train/test sizes, minority class count).
    Note: the same info is already available in
    "data\\results\\all_sota_models_accuracy.csv"
    """

    from load_data import load_data_from_dir

    dataset_info = {
        'dataset_name': dataset_names,
        'n_classes': [],
        'test_size': [],
        'train_size': [],
        'min_class_count': []
    }

    for dataset_name in dataset_names:

        train_df, test_df = load_data_from_dir(dataset_name,
                                               rootdir=root_dir_raw)
        # Train/test set size
        train_size = len(train_df)
        test_size = len(test_df)

        # Labels
        y_train = train_df[0].tolist()
        y_test = test_df[0].tolist()

        # Class count
        n_classes = len(set(y_test))

        # Minority class count (classes taken from train, counted on test)
        count_per_class = {c: 0 for c in set(y_train)}
        for el in y_test:
            count_per_class[el] += 1

        min_class_count = min(count_per_class.values())

        dataset_info['min_class_count'].append(min_class_count)
        dataset_info['n_classes'].append(n_classes)
        dataset_info['test_size'].append(test_size)
        dataset_info['train_size'].append(train_size)

    return dataset_info
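For quick inspection, the returned dict of lists can be wrapped in a DataFrame (the dataset names and path are hypothetical):

import pandas as pd
info = get_info_by_dataset(['GunPoint', 'Coffee'], root_dir_raw='data/raw')
info_df = pd.DataFrame(info)  # one row per dataset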
Example #4
def get_inputs(dataset_name,
               data_root,
               representations,
               schema,
               test_size,
               shuffle=False,
               nb_sample=None):
    """
    prepare the Train and Test sets including secondary tables
    """

    import pandas as pd
    from load_data import load_data_from_dir

    train_df, test_df = load_data_from_dir(dataset_name,
                                           data_root)  # load entire dataset

    # Optionally sub-sample the training set
    if nb_sample is not None and train_df.shape[0] > nb_sample:
        train_df = train_df.sample(n=nb_sample)

    if test_size is not None:
        from sklearn.model_selection import train_test_split
        print("appending train + test")
        input_df = train_df.append(test_df, ignore_index=True)
        print("splitting train/test")
        train_df, test_df = train_test_split(input_df,
                                             test_size=test_size,
                                             shuffle=shuffle)

    y_train, y_test = get_y(train_df), get_y(test_df)  # extract labels

    train_ids = pd.DataFrame(
        index=train_df.index)  # extract ids of train samples
    test_ids = pd.DataFrame(index=test_df.index)  # extract ids of test samples

    #print("[+] Generating representations for TRAIN")
    additional_tables_train = get_additional_tables(train_df, representations,
                                                    schema)

    #print("[+] Generating representations for TEST")
    additional_tables_test = get_additional_tables(test_df, representations,
                                                   schema)

    return additional_tables_train, additional_tables_test, train_ids, test_ids, y_train, y_test
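A usage sketch (dataset name, representations and sampling values are illustrative; get_y and get_additional_tables are assumed to live in the same module):

(tables_train, tables_test,
 train_ids, test_ids,
 y_train, y_test) = get_inputs('GunPoint',            # hypothetical dataset
                               data_root='data/raw',  # hypothetical path
                               representations=['TS', 'D'],
                               schema=0,
                               test_size=0.3,         # re-split 70/30
                               shuffle=True,
                               nb_sample=1000)        # cap the training size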
Example #5
    def __init__(self, file, max_length, max_topic_length, word2idx, tword2idx,
                 topic_bs):
        """
        A PyTorch Dataset
        What we have to do is to implement the 2 abstract methods:

            - __len__(self): in order to let the DataLoader know the size
                of our dataset and to perform batching, shuffling and so on...
            - __getitem__(self, index): we have to return the properly
                processed data-item from our dataset with a given index

        Args:
            file (str): path to the data file
            max_length (int): the max length for each sentence.
                if 0 then use the maximum length in the dataset
            max_topic_length (int): the max length for each topic.
                if 0 then use the maximum topic length in the dataset
            word2idx (dict): a dictionary which maps words to indexes
            tword2idx (dict): a dictionary which maps topic words to indexes
            topic_bs (bool): whether each data item also carries a topic field
        """

        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=[
                'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
                'date', 'number'
            ],
            # terms that will be annotated
            annotate={
                "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
                'censored'
            },
            fix_html=True,  # fix HTML tokens

            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter="twitter",

            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector="twitter",
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=False,  # spell correction for elongated words

            # select a tokenizer. You can use SocialTokenizer, or pass your own
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,

            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionaries.
            dicts=[emoticons])

        self.word2idx = word2idx
        self.tword2idx = tword2idx

        print("loading dataset from {}...".format(file))
        _data = load_data_from_dir(file)
        if topic_bs:
            self.data = [x[2] for x in _data]
            self.labels = [x[1] for x in _data]
            self.topics = [x[0] for x in _data]
        else:
            self.data = [x[1] for x in _data]
            self.labels = [x[0] for x in _data]
            self.topics = []  # no topic field in this setting

        print("Tokenizing...")
        # self.data = [tokenize(x) for x in self.data]
        self.data = [self.text_processor.pre_process_doc(x) for x in self.data]
        self.topics = [
            self.text_processor.pre_process_doc(x) for x in self.topics
        ]

        # if max_length == 0, then set max_length
        # to the maximum sentence length in the dataset
        if max_length == 0:
            self.max_length = max([len(x) for x in self.data])
        else:
            self.max_length = max_length

        if max_topic_length == 0:
            self.max_topic_length = max(
                (len(x) for x in self.topics), default=0)
        else:
            self.max_topic_length = max_topic_length

        # define a mapping for the labels,
        # for transforming the string labels to numbers
        self.label_encoder = preprocessing.LabelEncoder()
        self.label_encoder = self.label_encoder.fit(self.labels)

        self.label_count = Counter(self.labels)
        self.weights = [
            self.label_count['-1'], self.label_count['2'],
            self.label_count['0'], self.label_count['1'], self.label_count['2']
        ]
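The docstring above says the Dataset must also implement __len__ and __getitem__; a minimal sketch of what they could look like for this class is given below (the padding scheme and the use of index 0 for unknown/padding tokens are assumptions, not the original implementation):

    def __len__(self):
        # let the DataLoader know how many samples we have
        return len(self.data)

    def __getitem__(self, index):
        # map tokens to indices, pad/truncate to max_length,
        # and encode the string label with the fitted LabelEncoder
        tokens = self.data[index][:self.max_length]
        ids = [self.word2idx.get(tok, 0) for tok in tokens]  # 0 assumed to be <unk>/<pad>
        ids = ids + [0] * (self.max_length - len(ids))
        label = self.label_encoder.transform([self.labels[index]])[0]
        return ids, label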
Example #6
def get_summary_df(dataset_names, params, root_dir_raw, root_dir_interim):
    """
    Build a per-dataset summary (best accuracy, best average AUC, class count,
    train/test sizes, minority class count) for runs matching the given params.
    """

    from load_data import load_data_from_dir
    from load_data import load_y_pred
    from load_data import load_y_scores

    from analyze_res import avg_auc
    from sklearn.metrics import accuracy_score

    accuracy_score_data = {
        'dataset_name': dataset_names,
        'accuracy': [],
        'avg_auc': [],
        'n_classes': [],
        'test_size': [],
        'train_size': [],
        'min_class_count': []
    }

    for dataset_name in dataset_names:

        accs = []
        aucs = []

        train_df, test_df = load_data_from_dir(dataset_name,
                                               rootdir=root_dir_raw)
        train_size = len(train_df)
        test_size = len(test_df)

        y_test = test_df[0].tolist()
        y_train = train_df[0].tolist()

        # Compute aucs, accs
        for label_pred_dict, label_score_dict in zip(
                load_y_pred(dataset_name, rootdir=root_dir_interim),
                load_y_scores(dataset_name, rootdir=root_dir_interim)):

            # label_pred_dict = { params, preds}
            # label_score_dict = { params, scores}
            if not (set(label_pred_dict['params']['representations']) == set(
                    params['representations']) and
                    label_pred_dict['params']['schema'] == params['schema']):
                #print(set(label_pred_dict['params']['representations']))
                #print(label_pred_dict['params']['schema'])
                continue

            y_pred = label_pred_dict['preds']
            y_score = label_score_dict['scores']

            accs.append(accuracy_score(y_test, y_pred))
            aucs.append(avg_auc(y_test, y_score))

        # Minority class count
        count_per_class = {c: 0 for c in set(y_train)}
        for el in y_test:
            count_per_class[el] += 1

        min_class_count = min(count_per_class.values())

        # Fill in result data (keep the best run for this parameter set)
        accuracy_score_data['accuracy'].append(round(max(accs), 4))
        accuracy_score_data['avg_auc'].append(round(max(aucs), 4))

        accuracy_score_data['min_class_count'].append(min_class_count)
        accuracy_score_data['n_classes'].append(len(set(y_test)))
        accuracy_score_data['test_size'].append(test_size)
        accuracy_score_data['train_size'].append(train_size)

    import pandas as pd
    result_df = pd.DataFrame(data=accuracy_score_data).sort_values(
        by='accuracy', ascending=False)

    return result_df
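A call sketch (the params dict mirrors the keys the function filters on; values and paths are illustrative):

params = {'representations': ['TS', 'D', 'ACF'], 'schema': 0}  # illustrative
summary_df = get_summary_df(['GunPoint', 'Coffee'], params,
                            root_dir_raw='data/raw',
                            root_dir_interim='data/interim')
print(summary_df.head())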
Example #7
def argmax_cg_model_select(dataset_names, representations):
    """
	eg: representations=['TS', 'D', 'DD', 'CUMSUM', 'DCUMSUM', 'ACF']
	"""

    import os
    import simplejson as json
    import pandas as pd
    from load_data import get_dataset_names

    scores = {}

    params_set_0 = {
        'representations': set(representations),
        'schema': 0,
        'n_features': 20000
    }

    params_set_1 = {
        'representations': set(representations),
        'schema': 1,
        'n_features': 20000
    }

    data_dpath = "C:\\Users\\rdwp8532\\Desktop\\workThat\\data\\interim"

    for dname in dataset_names:
        dpath_interim = f"{data_dpath}\\{dname}"

        for run_dname in os.listdir(dpath_interim):
            run_dpath = f"{dpath_interim}\\{run_dname}"

            for fname in os.listdir(run_dpath):
                if fname.endswith('.json'):
                    json_fpath = f"{run_dpath}\\{fname}"
                    with open(json_fpath, 'r') as fp:
                        json_content = json.load(fp)

                    params = json_content['params']
                    params['representations'] = set(
                        json_content['params']['representations'])

                    if params == params_set_0 or params == params_set_1:
                        pykhiops_dirname = [
                            name for name in os.listdir(run_dpath)
                            if name.startswith('pykhiops_tmp')
                        ][0]
                        pykhiops_dirpath = f"{run_dpath}\\{pykhiops_dirname}"
                        TrainEvaluationReport_fpath = f"{pykhiops_dirpath}\\TrainEvaluationReport.xls"
                        CG = get_cg(TrainEvaluationReport_fpath)

                        if params == params_set_0:
                            CG_0 = CG
                            params_set_0_dpath = run_dpath
                            results_0 = json_content['results']
                        elif params == params_set_1:
                            CG_1 = CG
                            params_set_1_dpath = run_dpath
                            results_1 = json_content['results']

        # Keep the parameter set whose run has the higher train CG;
        # assumes runs for both parameter sets were found for this dataset
        if CG_0 < CG_1:
            best_params = params_set_1
            best_run_dpath = params_set_1_dpath
            best_labs_pred = pd.DataFrame.from_dict(
                results_1)['Predictedtarget'].tolist()
        else:
            best_params = params_set_0
            best_run_dpath = params_set_0_dpath
            best_labs_pred = pd.DataFrame.from_dict(
                results_0)['Predictedtarget'].tolist()

        # Compute accuracy score of model that has best train CG
        # Read actual labels
        from load_data import load_data_from_dir
        _, test_df = load_data_from_dir(dname)
        labs_actual = test_df[0].values
        # Compute accuracy
        from sklearn.metrics import accuracy_score
        acc_score = accuracy_score(labs_actual, best_labs_pred)

        scores[dname] = acc_score

    return scores
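A hedged usage sketch (dataset names are hypothetical; the interim directory layout is the one hard-coded in the function):

scores = argmax_cg_model_select(
    ['GunPoint', 'Coffee'],
    representations=['TS', 'D', 'DD', 'CUMSUM', 'DCUMSUM', 'ACF'])
for name, acc in scores.items():
    print(name, acc)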