def get_single_rep_acc_by_dataset(param_bench, onTestSet=True):
    """
    Computes the accuracy for each dataset, for a given set of model
    parameters (representations, schema).
    Use check_experiments_runs() to verify which experiment parameters are
    available in the interim data dir.
    """
    from sklearn.metrics import accuracy_score
    from load_data import load_data_from_dir
    from load_data import load_y_pred

    accuracy_scores = {}
    for dataset_name in param_bench['dataset_names']:
        if onTestSet:
            _, test_df = load_data_from_dir(dataset_name,
                                            rootdir=param_bench['input_dir'])
        else:
            test_df, _ = load_data_from_dir(dataset_name,
                                            rootdir=param_bench['input_dir'])
        y_test = test_df[0].tolist()
        for label_pred_dict in load_y_pred(
                dataset_name,
                rootdir=param_bench['output_dir'] + "/single_rep_expe"):
            y_pred = label_pred_dict['preds']
            accuracy_scores[dataset_name] = accuracy_score(y_test, y_pred)
            break  # keep only the first available run for this dataset
    return accuracy_scores
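
# Usage sketch for get_single_rep_acc_by_dataset(): a minimal, hypothetical
# example; the dataset names and directories below are placeholders
# (assumptions), not values taken from this repo.
param_bench = {
    'dataset_names': ['GunPoint', 'Coffee'],  # hypothetical dataset names
    'input_dir': 'data/raw',                  # hypothetical raw-data dir
    'output_dir': 'data/interim',             # hypothetical experiments dir
}
accs = get_single_rep_acc_by_dataset(param_bench, onTestSet=True)
for name, acc in sorted(accs.items(), key=lambda kv: kv[1], reverse=True):
    print(f"{name}: {acc:.4f}")
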
def get_auc_by_dataset(dataset_names, representations, schema, n_features,
                       n_trees, root_dir_raw, root_dir_interim):
    """
    Computes the average per-class AUC for each dataset, for a given set of
    model parameters (schema, n_features, n_trees).
    Use check_experiments_runs() to verify which experiment parameters are
    available in the interim data dir.
    """
    from load_data import load_data_from_dir
    from load_data import load_y_scores
    from analyze_res import avg_auc

    auc_scores = {}
    for dataset_name in dataset_names:
        _, test_df = load_data_from_dir(dataset_name, rootdir=root_dir_raw)
        y_test = test_df[0].tolist()
        for label_score_dict in load_y_scores(dataset_name,
                                              rootdir=root_dir_interim):
            # label_score_dict = {'params': ..., 'scores': ...}
            if not (
                    # set(label_score_dict['params']['representations'])
                    # == set(representations) and
                    label_score_dict['params']['schema'] == schema
                    and label_score_dict['params']['n_features'] == n_features
                    and label_score_dict['params']['n_trees'] == n_trees):
                continue
            y_score = label_score_dict['scores']
            auc_scores[dataset_name] = avg_auc(y_test, y_score)
            break  # keep the first run matching the requested parameters
    return auc_scores
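
# Usage sketch for get_auc_by_dataset(): the parameter values below are
# hypothetical. Note that only schema / n_features / n_trees are actually
# filtered on (the representations check is commented out in the body).
aucs = get_auc_by_dataset(
    dataset_names=['GunPoint', 'Coffee'],  # hypothetical dataset names
    representations=['TS', 'D', 'DD'],     # hypothetical representations
    schema=0,
    n_features=20000,
    n_trees=0,                             # hypothetical value
    root_dir_raw='data/raw',               # hypothetical directory
    root_dir_interim='data/interim')       # hypothetical directory
print(aucs)
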
def get_info_by_dataset(dataset_names, root_dir_raw):
    """
    Retrieves basic dataset info (sizes, class counts).
    Note: this info is already available in
    "data\\results\\all_sota_models_accuracy.csv".
    """
    import operator
    from load_data import load_data_from_dir

    dataset_info = {
        'dataset_name': dataset_names,
        'n_classes': [],
        'test_size': [],
        'train_size': [],
        'min_class_count': []
    }
    for dataset_name in dataset_names:
        train_df, test_df = load_data_from_dir(dataset_name,
                                               rootdir=root_dir_raw)
        # Train/test set size
        train_size = len(train_df)
        test_size = len(test_df)
        # Labels (column 0 holds the target)
        y_train = train_df[0].tolist()
        y_test = test_df[0].tolist()
        # Class count
        n_classes = len(set(y_test))
        # Minority class count over the test set
        count_per_class = {c: 0 for c in set(y_train)}
        for el in y_test:
            count_per_class[el] += 1
        min_class_count = sorted(count_per_class.items(),
                                 key=operator.itemgetter(1))[0][1]
        dataset_info['min_class_count'].append(min_class_count)
        dataset_info['n_classes'].append(n_classes)
        dataset_info['test_size'].append(test_size)
        dataset_info['train_size'].append(train_size)
    return dataset_info
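
# Usage sketch for get_info_by_dataset(): builds a DataFrame from the
# returned dict; the directory and dataset names are hypothetical
# placeholders.
import pandas as pd

info = get_info_by_dataset(['GunPoint', 'Coffee'], root_dir_raw='data/raw')
info_df = pd.DataFrame(data=info)
print(info_df[['dataset_name', 'n_classes', 'train_size', 'test_size',
               'min_class_count']])
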
def get_inputs(dataset_name, data_root, representations, schema, test_size,
               shuffle=False, nb_sample=None):
    """ Prepares the train and test sets, including the secondary tables. """
    import pandas as pd
    from load_data import load_data_from_dir

    # Load the entire dataset
    train_df, test_df = load_data_from_dir(dataset_name, data_root)

    # Sub-sample the training dataset
    if nb_sample is not None and train_df.shape[0] > nb_sample:
        train_df = train_df.sample(n=nb_sample)

    # Optionally re-split the whole dataset with a custom test size
    if test_size is not None:
        from sklearn.model_selection import train_test_split
        print("appending train + test")
        input_df = pd.concat([train_df, test_df], ignore_index=True)
        print("splitting train/test")
        train_df, test_df = train_test_split(input_df,
                                             test_size=test_size,
                                             shuffle=shuffle)

    y_train, y_test = get_y(train_df), get_y(test_df)  # extract labels
    train_ids = pd.DataFrame(index=train_df.index)  # ids of the train samples
    test_ids = pd.DataFrame(index=test_df.index)  # ids of the test samples

    # Generate the representations (secondary tables) for train and test
    additional_tables_train = get_additional_tables(train_df, representations,
                                                    schema)
    additional_tables_test = get_additional_tables(test_df, representations,
                                                   schema)

    return (additional_tables_train, additional_tables_test, train_ids,
            test_ids, y_train, y_test)
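
# Usage sketch for get_inputs(): the representations and directories are
# hypothetical; get_y() and get_additional_tables() are assumed to be
# defined elsewhere in this module.
(tables_train, tables_test,
 train_ids, test_ids, y_train, y_test) = get_inputs(
    dataset_name='GunPoint',        # hypothetical dataset name
    data_root='data/raw',           # hypothetical directory
    representations=['TS', 'D'],    # hypothetical representations
    schema=0,
    test_size=0.3,                  # re-split with a 30% test set
    shuffle=True,
    nb_sample=1000)                 # cap the train set at 1000 rows
print(len(y_train), len(y_test))
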
def __init__(self, file, max_length, max_topic_length, word2idx, tword2idx,
             topic_bs):
    """
    A PyTorch Dataset.
    We have to implement the 2 abstract methods:
        - __len__(self): lets the DataLoader know the size of the dataset,
          in order to perform batching, shuffling and so on
        - __getitem__(self, index): returns the properly processed data item
          from the dataset for a given index

    Args:
        file (str): path to the data file
        max_length (int): the max length for each sentence;
            if 0, use the maximum sentence length in the dataset
        max_topic_length (int): the max length for each topic;
            if 0, use the maximum topic length in the dataset
        word2idx (dict): a dictionary which maps words to indexes
        tword2idx (dict): a dictionary which maps topic words to indexes
        topic_bs (bool): whether the data file also contains topics
    """
    self.text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=[
            'url', 'email', 'percent', 'money', 'phone', 'user', 'time',
            'date', 'number'
        ],
        # terms that will be annotated
        annotate={
            "hashtag", "allcaps", "elongated", "repeated", 'emphasis',
            'censored'
        },
        fix_html=True,  # fix HTML tokens
        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",
        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",
        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words
        # select a tokenizer. You can use SocialTokenizer, or pass your own;
        # the tokenizer should take as input a string and return a list of
        # tokens
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        # list of dictionaries for replacing tokens extracted from the text
        # with other expressions. You can pass more than one dictionary.
        dicts=[emoticons])

    self.word2idx = word2idx
    self.tword2idx = tword2idx

    print("loading dataset from {}...".format(file))
    _data = load_data_from_dir(file)

    if topic_bs:
        self.data = [x[2] for x in _data]
        self.labels = [x[1] for x in _data]
        self.topics = [x[0] for x in _data]
    else:
        self.data = [x[1] for x in _data]
        self.labels = [x[0] for x in _data]
        self.topics = []  # no topics in this setting

    print("Tokenizing...")
    self.data = [self.text_processor.pre_process_doc(x) for x in self.data]
    self.topics = [
        self.text_processor.pre_process_doc(x) for x in self.topics
    ]

    # if max_length == 0, then set max_length
    # to the maximum sentence length in the dataset
    if max_length == 0:
        self.max_length = max([len(x) for x in self.data])
    else:
        self.max_length = max_length

    if max_topic_length == 0:
        self.max_topic_length = (max([len(x) for x in self.topics])
                                 if self.topics else 0)
    else:
        self.max_topic_length = max_topic_length

    # define a mapping for the labels,
    # for transforming the string labels to numbers
    self.label_encoder = preprocessing.LabelEncoder()
    self.label_encoder = self.label_encoder.fit(self.labels)

    self.label_count = Counter(self.labels)
    # class counts in the LabelEncoder's lexicographic label order
    # ('-1' < '-2' < '0' < '1' < '2'); the original listed '2' twice,
    # which looks like a typo for '-2'
    self.weights = [
        self.label_count['-1'], self.label_count['-2'],
        self.label_count['0'], self.label_count['1'], self.label_count['2']
    ]
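
# Usage sketch: assuming this __init__ belongs to a torch Dataset subclass,
# here called TopicTweetDataset (a hypothetical name), wired to a DataLoader.
# The file path and the toy vocabularies below are placeholders.
from torch.utils.data import DataLoader

word2idx = {'<pad>': 0, '<unk>': 1}   # placeholder word vocabulary
tword2idx = {'<pad>': 0, '<unk>': 1}  # placeholder topic vocabulary
dataset = TopicTweetDataset(file='data/train.tsv',  # hypothetical path
                            max_length=0,           # infer from the data
                            max_topic_length=0,
                            word2idx=word2idx,
                            tword2idx=tword2idx,
                            topic_bs=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
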
def get_summary_df(dataset_names, params, root_dir_raw, root_dir_interim):
    """ Summarizes the best accuracy / average AUC per dataset. """
    import operator
    import pandas as pd
    from sklearn.metrics import accuracy_score
    from load_data import load_data_from_dir
    from load_data import load_y_pred
    from load_data import load_y_scores
    from analyze_res import avg_auc

    accuracy_score_data = {
        'dataset_name': dataset_names,
        'accuracy': [],
        'avg_auc': [],
        'n_classes': [],
        'test_size': [],
        'train_size': [],
        'min_class_count': []
    }
    for dataset_name in dataset_names:
        accs = []
        aucs = []
        train_df, test_df = load_data_from_dir(dataset_name,
                                               rootdir=root_dir_raw)
        train_size = len(train_df)
        test_size = len(test_df)
        y_test = test_df[0].tolist()
        y_train = train_df[0].tolist()

        # Compute accuracies and AUCs for the runs matching the requested
        # params (representations and schema)
        for label_pred_dict, label_score_dict in zip(
                load_y_pred(dataset_name, rootdir=root_dir_interim),
                load_y_scores(dataset_name, rootdir=root_dir_interim)):
            # label_pred_dict = {'params': ..., 'preds': ...}
            # label_score_dict = {'params': ..., 'scores': ...}
            if not (set(label_pred_dict['params']['representations'])
                    == set(params['representations'])
                    and label_pred_dict['params']['schema']
                    == params['schema']):
                continue
            y_pred = label_pred_dict['preds']
            y_score = label_score_dict['scores']
            accs.append(accuracy_score(y_test, y_pred))
            aucs.append(avg_auc(y_test, y_score))

        # Minority class count over the test set
        count_per_class = {c: 0 for c in set(y_train)}
        for el in y_test:
            count_per_class[el] += 1
        min_class_count = sorted(count_per_class.items(),
                                 key=operator.itemgetter(1))[0][1]

        # Fill in the result data
        accuracy_score_data['accuracy'].append(round(max(accs), 4))
        accuracy_score_data['avg_auc'].append(round(max(aucs), 4))
        accuracy_score_data['min_class_count'].append(min_class_count)
        accuracy_score_data['n_classes'].append(len(set(y_test)))
        accuracy_score_data['test_size'].append(test_size)
        accuracy_score_data['train_size'].append(train_size)

    result_df = pd.DataFrame(data=accuracy_score_data).sort_values(
        by='accuracy', ascending=False)
    return result_df
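
# Usage sketch for get_summary_df(): a hypothetical params dict; only
# 'representations' and 'schema' are used when matching runs.
params = {'representations': ['TS', 'D', 'DD'],  # hypothetical values
          'schema': 0}
summary_df = get_summary_df(['GunPoint', 'Coffee'], params,
                            root_dir_raw='data/raw',
                            root_dir_interim='data/interim')
print(summary_df.head())
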
def argmax_cg_model_select(dataset_names, representations):
    """
    For each dataset, picks between schema 0 and schema 1 the model with the
    best train compression gain (CG), then computes its test accuracy.
    e.g. representations = ['TS', 'D', 'DD', 'CUMSUM', 'DCUMSUM', 'ACF']
    """
    import os
    import simplejson as json
    import pandas as pd
    from sklearn.metrics import accuracy_score
    from load_data import load_data_from_dir

    scores = {}
    params_set_0 = {
        'representations': set(representations),
        'schema': 0,
        'n_features': 20000
    }
    params_set_1 = {
        'representations': set(representations),
        'schema': 1,
        'n_features': 20000
    }
    data_dpath = "C:\\Users\\rdwp8532\\Desktop\\workThat\\data\\interim"
    for dname in dataset_names:
        dpath_interim = f"{data_dpath}\\{dname}"
        # Scan the runs of this dataset for the two parameter sets.
        # NB: assumes both parameter sets are found for every dataset;
        # otherwise CG_0 / CG_1 below would be unbound or stale.
        for run_dname in os.listdir(dpath_interim):
            run_dpath = f"{dpath_interim}\\{run_dname}"
            for fname in os.listdir(run_dpath):
                if not fname.endswith('.json'):
                    continue
                json_fpath = f"{run_dpath}\\{fname}"
                with open(json_fpath, 'r') as fp:
                    json_content = json.load(fp)
                params = json_content['params']
                params['representations'] = set(
                    json_content['params']['representations'])
                if params != params_set_0 and params != params_set_1:
                    continue
                pykhiops_dirname = [
                    name for name in os.listdir(run_dpath)
                    if name.startswith('pykhiops_tmp')
                ][0]
                pykhiops_dirpath = f"{run_dpath}\\{pykhiops_dirname}"
                TrainEvaluationReport_fpath = (
                    f"{pykhiops_dirpath}\\TrainEvaluationReport.xls")
                CG = get_cg(TrainEvaluationReport_fpath)
                if params == params_set_0:
                    CG_0 = CG
                    params_set_0_dpath = run_dpath
                    results_0 = json_content['results']
                else:  # params == params_set_1
                    CG_1 = CG
                    params_set_1_dpath = run_dpath
                    results_1 = json_content['results']

        # Keep the model with the best train CG
        if CG_0 < CG_1:
            best_params = params_set_1
            best_run_dpath = params_set_1_dpath
            best_labs_pred = pd.DataFrame.from_dict(
                results_1)['Predictedtarget'].tolist()
        else:
            best_params = params_set_0
            best_run_dpath = params_set_0_dpath
            best_labs_pred = pd.DataFrame.from_dict(
                results_0)['Predictedtarget'].tolist()

        # Compute the test accuracy of the selected model
        # against the actual labels
        _, test_df = load_data_from_dir(dname)
        labs_actual = test_df[0].values
        acc_score = accuracy_score(labs_actual, best_labs_pred)
        scores[dname] = acc_score
    return scores
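
# Usage sketch for argmax_cg_model_select(): assumes the hardcoded interim
# directory exists and that every dataset has runs for both schema 0 and
# schema 1 with the given representations; get_cg() is assumed to be defined
# elsewhere in this module. The dataset names are hypothetical.
scores = argmax_cg_model_select(
    dataset_names=['GunPoint', 'Coffee'],
    representations=['TS', 'D', 'DD', 'CUMSUM', 'DCUMSUM', 'ACF'])
print(scores)
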