# Ten folds will be sufficient and leave 90% of the data for training in every fold.
n_split = 10
k_fold = IterativeStratification(n_splits=n_split, order=1)

# The loop below is a sanity check that the splits are roughly stratified and,
# crucially, it collects the training and validation indices of each fold so
# that they can be saved to a file and reused on the server.
train_indices = []
val_indices = []
i = 1

# NOTE(review): the split is computed from (total_x, svm_y) while the label
# proportions are reported for `y` - confirm both label arrays are row-aligned.
for train, val in k_fold.split(total_x, svm_y):
    print("Fold{}".format(i))
    temp_1 = proportions(y[train])
    temp_2 = proportions(y[val])
    print("Train set")
    print(temp_1)
    print("Validation set")
    print(temp_2)
    train_indices.append(train)
    val_indices.append(val)
    i += 1

# In[ ]:

# Now save the indices for GPU use on server
#with open('RNN Training Indices.pkl', 'wb') as f:
def _featurecollection(self) -> Tuple[np.array, np.array, list]:
    """
    Runs the feature collection workflow.

    Builds the feature/label dataframe, shuffles it, optionally merges the
    selected RACs columns, optionally splits off a holdout set that shares no
    structure with the training set, and optionally subsamples a fixed-size
    training set (the remaining training rows become the validation set).

    Results are stored on the instance:
        ``x``, ``y``, ``names`` -- always
        ``x_test``, ``y_test``, ``names_test`` -- if ``percentage_holdout > 0``
        ``x_valid``, ``y_valid``, ``names_valid`` -- if ``training_set_size`` is set

    NOTE(review): the annotation promises a tuple of features, labels and
    names, but the body contains no ``return`` statement, so callers receive
    ``None``. Confirm whether callers rely on the attributes instead.
    """
    feature_list = FeatureCollector.create_feature_list(
        self.picklefiles, self.forbidden_list, self.old_format)
    label_raw = read_pickle(self.labelpath)
    # collectorlogger.info(f'found {len(label_raw)} labels')
    label_list = FeatureCollector.make_labels_table(label_raw)
    df = FeatureCollector._create_clean_dataframe(feature_list, label_list,
                                                  self.drop_duplicates)

    # shuffle dataframe for the next steps to ensure randomization
    df = df.sample(frac=1).reset_index(drop=True)

    # set offset of select features
    offset = 0
    if self.racsdf is not None:
        offset = len(self.selected_racs)
        df = FeatureCollector._merge_racs_frame(df, self.racsdf,
                                                self.selected_racs)

    if self.percentage_holdout > 0:
        # Make a stratified split that also makes sure that no structure from
        # the training set is in the test set. This is important as the
        # chemical environments in structures can be quite similar (parsimony
        # principle of Pauling). We do not want to leak this information from
        # the training into the test set.
        df["base_name"] = [n.strip("0123456789") for n in df["name"]]
        # .copy() so that adding the "numbers" column below writes to an
        # independent frame rather than to a possible view of ``df``.
        df_name_select = df.drop_duplicates(subset=["base_name"]).copy()
        df_name_select["numbers"] = (
            df_name_select["metal"].astype("category").cat.codes)

        stratifier = IterativeStratification(
            n_splits=2,
            order=2,
            sample_distribution_per_fold=[
                self.percentage_holdout,
                1.0 - self.percentage_holdout,
            ],
        )
        train_indexes, test_indexes = next(
            stratifier.split(df_name_select,
                             df_name_select[["oxidationstate", "numbers"]]))

        train_names = list(df_name_select.iloc[train_indexes]["base_name"])
        test_names = list(df_name_select.iloc[test_indexes]["base_name"])

        df_train = df[df["base_name"].isin(train_names)]
        df_test = df[df["base_name"].isin(test_names)]

        x, self.y, self.names = FeatureCollector._get_x_y_names(df_train)
        self.x = FeatureCollector._select_features(self.selected_features, x,
                                                   self.outdir_helper, offset)

        x_test, self.y_test, self.names_test = FeatureCollector._get_x_y_names(
            df_test)
        self.x_test = FeatureCollector._select_features(
            self.selected_features, x_test, self.outdir_helper, offset)
    else:  # no separate holdout set
        # Without a holdout the whole frame is the training frame. Binding it
        # here keeps the subsampling step below from raising NameError.
        df_train = df
        x, self.y, self.names = FeatureCollector._get_x_y_names(df)

    if self.training_set_size:
        # select a fixed number of training points from the feature matrix
        collectorlogger.debug(
            "will now perform farthest point sampling on the feature matrix"
        )
        # Write one additional holdout set
        assert self.training_set_size < len(df_train)
        x, self.y, self.names = FeatureCollector._get_x_y_names(df_train)
        x = FeatureCollector._select_features(self.selected_features, x,
                                              self.outdir_helper, offset)

        # indices = greedy_farthest_point_samples(x, self.training_set_size)
        indices = apricot_select(x, self.training_set_size)

        # NOTE(review): ``indices`` is matched against the index *labels* of
        # the training frame. After the holdout filter those labels are no
        # longer 0..n-1, so if apricot_select returns row positions this
        # selects the wrong rows - TODO confirm.
        _df_train = df_train
        good_indices = _df_train.index.isin(indices)

        df_train = _df_train[good_indices]
        x, self.y, self.names = FeatureCollector._get_x_y_names(df_train)

        # everything that was not selected becomes the validation set
        df_validation = _df_train[~good_indices]
        x_valid, self.y_valid, self.names_valid = FeatureCollector._get_x_y_names(
            df_validation)
        self.x_valid = FeatureCollector._select_features(
            self.selected_features, x_valid, self.outdir_helper, offset)

    self.x = FeatureCollector._select_features(self.selected_features, x,
                                               self.outdir_helper, offset)
    collectorlogger.debug("the feature matrix shape is %s", self.x.shape)
# Will be sufficient and leaves 90% of the data for training purposes n_split = 10 k_fold = IterativeStratification(n_splits=n_split, order=1) i = 1 # This is just to check that splits are somewhat correct and then crucially saves the data set splits for each fold # Into a file for analysis later # Save the training and validation indices for each fold so that we can use them on the server train_indices = [] val_indices = [] for train, val in k_fold.split(x, y): print("Fold" + str(i)) temp_1 = proportions(y[train]) temp_2 = proportions(y[val]) print("Train set") print(temp_1) print("Validation set") print(temp_2) i += 1 train_indices.append(train) val_indices.append(val) # In[ ]: # Now save the indices for GPU use on server as we cannot download the multilearn on there with open('RNN Training Indices.pkl', 'wb') as f:
def test_if_stratification_works(self):
    """Splitting four two-label samples with ``n_splits=2`` yields two folds."""
    features = np.matrix([[0], [1], [2], [3]])
    labels = np.matrix([[0, 0], [1, 0], [0, 1], [1, 1]])
    stratifier = IterativeStratification(n_splits=2, order=1)

    folds = list(stratifier.split(features, labels))

    self.assertEqual(len(folds), 2)
def _metric_row(label, values):
    """Return a table row: ``label`` followed by ``values`` with 3 decimals."""
    return [label] + ['{0:.3f}'.format(value) for value in values]


def cross_validation(clfs, X, y_true, cardio_dict, gender_dict, num_fold=10):
    """Cross-validate every classifier and print per-fold metric tables.

    For each entry of ``clfs`` the data is split with iterative stratification
    into ``num_fold`` folds. The classifier is fitted on each training part,
    scored with ``model_evaluation`` on the validation part and its weights are
    reset afterwards. Two tables (cardio and gender) with one row per fold and
    a final ``avg`` row are printed.

    Args:
        clfs: mapping of classifier name to classifier; each classifier is
            called as ``fit(X, y, verbose=0, epochs=100, batch_size=8192)``.
        X: feature array, indexed with the fold index arrays.
        y_true: label array, indexed with the fold index arrays.
        cardio_dict: passed through to ``model_evaluation``.
        gender_dict: passed through to ``model_evaluation``.
        num_fold: number of folds.

    Returns:
        None. The results are only printed.
    """
    from prettytable import PrettyTable
    from skmultilearn.model_selection import IterativeStratification

    skf = IterativeStratification(n_splits=num_fold, random_state=10)
    columns = ['Fold', 'acc', 'precision', 'recall', 'f_score']

    for clf_name in clfs:
        print('we are trying classifier: ', clf_name)
        clf = clfs[clf_name]

        # running sums of acc, precision, recall and f_score over the folds
        avg_matrics_cardio = [0, 0, 0, 0]
        avg_matrics_gender = [0, 0, 0, 0]
        t_cardio = PrettyTable(list(columns))
        t_gender = PrettyTable(list(columns))

        n_folds = 0
        for train_index, val_index in skf.split(X, y_true):
            n_folds += 1
            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y_true[train_index], y_true[val_index]

            clf.fit(X_train, y_train, verbose=0, epochs=100, batch_size=8192)
            acc, precision, recall, f_score = model_evaluation(
                clf, X_val, y_val, cardio_dict, gender_dict)
            # start the next fold from fresh weights
            data_util.reset_weights(clf)

            # element 0 of every metric is reported as cardio, element 1 as gender
            fold_label = ' ' + str(n_folds) + ' '
            cardio_values = [acc[0], precision[0], recall[0], f_score[0]]
            gender_values = [acc[1], precision[1], recall[1], f_score[1]]
            t_cardio.add_row(_metric_row(fold_label, cardio_values))
            t_gender.add_row(_metric_row(fold_label, gender_values))

            for position in range(4):
                avg_matrics_cardio[position] += cardio_values[position]
                avg_matrics_gender[position] += gender_values[position]

        t_cardio.add_row(
            _metric_row('avg', [total / n_folds for total in avg_matrics_cardio]))
        # The gender f_score average previously used the cardio total.
        t_gender.add_row(
            _metric_row('avg', [total / n_folds for total in avg_matrics_gender]))

        print('+--------------------------------------------+')
        print('|                   cardio                   |')
        print('+--------------------------------------------+')
        print(t_cardio)
        print('+--------------------------------------------+')
        print('|                   gender                   |')
        print('+--------------------------------------------+')
        print(t_gender)
    return
def _log_average_scores(log_loss_, auc, f1):
    """Log the three averaged metrics reported by ``eval_model``."""
    logging.info("The Average Log Loss is {}".format(log_loss_))
    logging.info("The Average AUC is {}".format(auc))
    logging.info("The Average f1 is {}".format(f1))


def eval_model(model, X_train, y_train, id_=None):
    """Cross-validate ``model`` with iterative stratification and average the scores.

    When ``id_`` is given, results are cached under ``output/<id_>/``: an
    existing ``score.pkl`` is returned immediately without refitting, and
    otherwise the per-fold ``(y_val, y_pred)`` pairs and the final scores are
    pickled there.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: features; indexed with ``.iloc`` and converted via ``.values``.
        y_train: labels; indexed with ``.iloc`` and converted via ``.values``.
        id_: optional identifier used as the cache directory name.

    Returns:
        Tuple ``(log_loss, auc, f1)`` averaged over the folds.
    """
    start_time = time()
    logging.info('*' * 20)
    logging.info("Evaluating model {}".format(id_ if id_ else model))
    n_splits = 3
    output = None

    # Try to load saved result from disk if exists
    if id_:
        output = Path('output') / id_
        output.mkdir(parents=True, exist_ok=True)
        score_file = output / 'score.pkl'
        if score_file.exists():
            logging.debug("Loading result from disk")
            # NOTE: pickle is only safe for files this function wrote itself;
            # never point the output directory at untrusted data.
            with open(score_file, 'rb') as handle:
                log_loss_, auc, f1 = pickle.load(handle)
            _log_average_scores(log_loss_, auc, f1)
            return log_loss_, auc, f1

    # Deprecated sklearn k-fold
    # kf = StratifiedKFold(n_splits=n_splits)
    # kf.get_n_splits(X_train)

    # Use the same fold count for splitting and for averaging below.
    kf = IterativeStratification(n_splits=n_splits, order=1)
    log_loss_, auc, f1 = 0.0, 0.0, 0.0
    for i, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
        X_train_ = X_train.iloc[train_index].values
        X_val_ = X_train.iloc[test_index].values
        y_train_ = y_train.iloc[train_index].values
        y_val_ = y_train.iloc[test_index].values

        # Add dummy sample to make sure every column has 2 labels
        X_train_ = np.vstack((X_train_, np.zeros((1, X_train_.shape[1]))))
        y_train_ = np.vstack((y_train_, np.ones((1, y_train_.shape[1]))))

        model.fit(X_train_, y_train_)
        y_pred_ = model.predict(X_val_)
        log_loss_val, auc_val, f1_val = scorer(y_val_, y_pred_)

        # Pickle y_val_ and y_pred_
        if id_:
            with open(output / "val_{}.pkl".format(i), 'wb') as handle:
                pickle.dump((y_val_, y_pred_), handle)

        # Update the scores
        log_loss_ += log_loss_val
        auc += auc_val
        f1 += f1_val

    log_loss_ /= n_splits
    auc /= n_splits
    f1 /= n_splits

    if id_:
        with open(output / 'score.pkl', 'wb') as handle:
            pickle.dump((log_loss_, auc, f1), handle)

    _log_average_scores(log_loss_, auc, f1)
    logging.info("Used {:.2f}s".format(time() - start_time))
    return log_loss_, auc, f1
def evaluate(model,
             adjacency_matrix,
             features,
             labels,
             labels_mask,
             proportions,
             n_trials=1,
             random_state=None):
    """Score node embeddings for several train/test proportions.

    For every ratio in ``proportions`` and every trial, the labelled nodes are
    split with iterative stratification and the unlabelled nodes with
    ``train_test_split``. The model is re-initialised and fitted on the
    training sub-graph, embeddings are computed for the labelled train and
    test nodes, and ``train_and_predict`` scores them.

    Args:
        model: object with ``__init__()``, ``fit(adjacency, features)`` and
            ``get_embeddings_new(features)``.
        adjacency_matrix: square matrix supporting row/column fancy indexing,
            ``copy()`` and ``eliminate_zeros()``.
        features: per-node features, indexable by node id.
        labels: label matrix with one row per labelled node.
        labels_mask: boolean mask over all nodes selecting the labelled ones.
        proportions: training ratios to evaluate.
        n_trials: number of repetitions per ratio.
        random_state: seed used for both the labelled and the unlabelled split.

    Returns:
        dict with ``proportion`` (the input) and, per ratio, the mean ``micro``,
        ``macro`` and ``c`` scores plus the ``std`` of the micro score.
    """
    n_proportions = len(proportions)
    scores = {
        'proportion': proportions,
        'micro': np.zeros(n_proportions),
        'macro': np.zeros(n_proportions),
        'std': np.zeros(n_proportions),
        'c': np.zeros(n_proportions)
    }

    # The node partition does not depend on the ratio, so compute it once.
    indices = np.arange(adjacency_matrix.shape[0])
    labeled_indices = indices[labels_mask]
    not_labeled_indices = np.setdiff1d(indices, labeled_indices)

    for i, train_ratio in enumerate(proportions):
        micro_per_trial = []
        for _ in range(n_trials):
            stratifier = IterativeStratification(
                n_splits=2,
                order=2,
                sample_distribution_per_fold=[1.0 - train_ratio, train_ratio],
                random_state=random_state)
            # reset the model so that no state leaks between trials
            model.__init__()

            train_ind_l, test_ind_l = next(stratifier.split(labels, labels))
            # Forward the seed so the unlabelled split is reproducible as well;
            # with random_state=None this is the previous behaviour.
            train_ids_nl, _test_ids_nl = sklearn.model_selection.train_test_split(
                not_labeled_indices,
                train_size=train_ratio,
                test_size=1 - train_ratio,
                random_state=random_state)

            # order is important, labeled first
            train_ids = np.concatenate(
                [labeled_indices[train_ind_l], train_ids_nl])
            test_ids = labeled_indices[test_ind_l]

            adjacency_matrix_train = adjacency_matrix[
                train_ids][:, train_ids].copy()
            adjacency_matrix_train.eliminate_zeros()

            features_train = [features[node] for node in train_ids]
            features_test = [features[node] for node in test_ids]
            labels_train = labels[train_ind_l]
            labels_test = labels[test_ind_l]

            model.fit(adjacency_matrix_train, features_train)
            # labelled nodes come first in train_ids, so the leading rows are
            # exactly the embeddings that have labels
            vectors_train = np.array(
                model.get_embeddings_new(features_train))[:len(train_ind_l)]
            vectors_test = np.array(model.get_embeddings_new(features_test))

            logger.debug(
                f"train: {train_ids.shape} nodes, {labels_train.sum()} labels")
            logger.debug(
                f"test: {test_ids.shape} nodes, {labels_test.sum()} labels, (+{_test_ids_nl.shape} forgotten nodes)"
            )
            logger.debug(
                f"adjacency: {adjacency_matrix.shape}, {adjacency_matrix_train.shape}"
            )
            logger.debug(
                f"train vectors: {vectors_train.shape}, test vectors: {vectors_test.shape}"
            )

            mi, ma, c = train_and_predict(vectors_train, vectors_test,
                                          labels_train, labels_test)
            micro_per_trial.append(mi)
            scores['micro'][i] += mi / n_trials
            scores['macro'][i] += ma / n_trials
            scores['c'][i] += c / n_trials
        scores['std'][i] += np.array(micro_per_trial).std()
    return scores
# #Feature Engineering
# sns.pairplot(dataf.sample(2))
# sns_plot.savefig("pairplot.png")
# plt.clf() # Clean pairplot figure from sns
# Image(filename='pairplot.png') # Show pairplot as image

dataf.dtypes
# NOTE(review): the result of drop() is discarded, so "Patient Name" stays in
# `dataf`; the column slices below appear to rely on that - confirm.
dataf.drop(["Patient Name"], axis=1)
# print(dataf.iloc[:,52:88])

# columns 1..52 are used as features, columns 53..89 as labels
X = dataf.iloc[:, 1:53]
y = dataf.iloc[:, 53:90]

# K-fold Iterative Stratification
k_fold = IterativeStratification(n_splits=40, order=1)
for train_index, test_index in k_fold.split(X, y):
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]
    # classifier.fit(X_train, y_train)
    # result = classifier.predict(X_test)

# NOTE(review): only the split produced by the last loop iteration is used
# from here on - confirm that training on a single fold is intended.
mlb = MultiLabelBinarizer()
start = time.time()
classifier = BinaryRelevance(
    classifier=RandomForestClassifier(n_estimators=100, criterion='gini'),
    require_dense=[False, True],
)
classifier.fit(X_train, y_train)
def train_test_split(data,
                     test_size=0.2,
                     group='patient_id',
                     labels=None,
                     seed=None):
    # Author: Kristian & Tobias
    """
    Split a dataset into train and test subsets while keeping all the samples
    of one group (e.g. one patient) in the same subset.

    Without ``labels`` a ``GroupShuffleSplit`` is used. With ``labels`` the rows
    are first split by iterative stratification on the label columns; every
    group that ended up in both subsets is then moved entirely to the subset
    that already holds more of its rows (ties go to the test subset).

    Parameters:
        data (pd.DataFrame): The data that should be split. Needs to contain a
            column with the name specified by the group parameter.
        test_size (float): (Default: 0.2) A number between 0.0 and 1.0
            specifying the relative size of the test set.
        group (string): (Default: 'patient_id') The name of the column that the
            data should be split by. All rows sharing a value in this column
            end up in the same subset.
        labels (list): Label columns taken into consideration when performing
            the stratified split.
        seed (int): (Default: None) Controls the shuffling applied to the data
            before it is split.

    Returns:
        train_data, test_data (tuple of pd.DataFrame): The train and test split.

    Raises:
        ValueError: If ``group`` is not a column of ``data``.
    """
    if group not in data.columns:
        raise ValueError('The column ' + group + ' does not exist')

    if not labels:
        # plain group-aware shuffle split, groups are the values of `group`
        shuffle_split = GroupShuffleSplit(test_size=test_size,
                                          random_state=seed)
        groups = data[group].to_numpy()
        train_idx, test_idx = next(shuffle_split.split(data, groups=groups))
        return (data.iloc[train_idx], data.iloc[test_idx])

    stratifier = IterativeStratification(
        n_splits=2,
        order=2,
        sample_distribution_per_fold=[test_size, 1.0 - test_size],
        random_state=seed)

    # split into stratified test and train set
    train_idx, test_idx = next(stratifier.split(data, data[labels]))
    train_data = data.iloc[train_idx]
    test_data = data.iloc[test_idx]

    # get group ids that are found in both sets
    splitted_group_ids = np.intersect1d(train_data[group].to_numpy(),
                                        test_data[group].to_numpy())

    # counts are taken once: every group is moved at most once, so later moves
    # never change the counts of the groups still to be processed
    train_value_counts = train_data[group].value_counts()
    test_value_counts = test_data[group].value_counts()

    # iterate through groups and move either to test or train based on
    # where the value count of the group is higher
    for group_id in splitted_group_ids:
        if train_value_counts[group_id] > test_value_counts[group_id]:
            rows = test_data[test_data[group] == group_id]
            train_data = pd.concat([train_data, rows])
            test_data = test_data[test_data[group] != group_id]
        else:
            rows = train_data[train_data[group] == group_id]
            test_data = pd.concat([test_data, rows])
            train_data = train_data[train_data[group] != group_id]

    return (train_data, test_data)
def perform_five_fold(self, model, documents, annotations, doc_ids,
                      pipeline_parameters):
    """Run a stratified five-fold evaluation of ``model``.

    The label of every annotation (its element at index 2) is turned into a
    multi-label indicator matrix that drives the iterative stratification.
    Each fold is evaluated with ``self.perform_fold``.

    Args:
        model: passed through to ``self.perform_fold``.
        documents: sequence of documents.
        annotations: per-document sequences of annotations; element 2 of each
            annotation is used as its label.
        doc_ids: sequence of document ids, parallel to ``documents``.
        pipeline_parameters: passed through to ``self.perform_fold``.

    Returns:
        Tuple ``(metrics, folds, average_metrics)``: the per-fold metrics, the
        ids of the documents used for training in each fold, and per label the
        fold-averaged FN/FP/TP/TN counts with recall, precision, f1 and acc
        derived from them.
    """
    n_splits = 5
    metrics = list()
    # store list of documents ids per fold
    folds = list()

    # turning into numpy arrays to be able to access values with index array
    documents_np_array = np.array(documents)
    annotations_np_array = np.array(annotations, dtype=object)
    doc_ids_np_array = np.array(doc_ids)

    # Give every label a column number in first-seen order. This replaces the
    # quadratic list concatenation / list.index lookups and makes the column
    # order independent of set hashing.
    label_columns = {}
    multilabel_array = []
    for ann in annotations_np_array:
        multilabel_array.append(
            [label_columns.setdefault(x[2], len(label_columns)) for x in ann])

    label_matrix = MultiLabelBinarizer().fit_transform(multilabel_array)

    skf = IterativeStratification(n_splits=n_splits, order=1)
    count_keys = ("FN", "FP", "TP", "TN")
    total_metrics = {}
    for train_index, test_index in skf.split(documents_np_array, label_matrix):
        # get annotations train and test datasets
        train_annotations = annotations_np_array[train_index]
        test_annotations = annotations_np_array[test_index]

        # get documents train and test datasets
        train_documents = documents_np_array[train_index]
        test_documents = documents_np_array[test_index]

        fold_metrics = self.perform_fold(
            model,
            [train_documents.tolist(), train_annotations.tolist()],
            [test_documents.tolist(), test_annotations.tolist()],
            pipeline_parameters)

        # saving docs used to train fold
        folds.append(doc_ids_np_array[train_index].tolist())
        # saving fold metrics
        metrics.append(fold_metrics)

        # accumulate the confusion counts per label over all folds
        for key in fold_metrics.keys():
            totals = total_metrics.setdefault(
                key, {name: 0 for name in count_keys})
            for name in count_keys:
                totals[name] += fold_metrics[key][name]

    average_metrics = {}
    for label, totals in total_metrics.items():
        avg_metric = {name: totals[name] / n_splits for name in count_keys}
        tp, tn = avg_metric["TP"], avg_metric["TN"]
        fp, fn = avg_metric["FP"], avg_metric["FN"]

        # no positives at all counts as perfect recall
        avg_metric["recall"] = tp / (tp + fn) if (tp + fn) != 0 else 1.0
        avg_metric["precision"] = tp / (tp + fp) if (tp + fp) != 0 else 0.0

        precision, recall = avg_metric["precision"], avg_metric["recall"]
        if (precision + recall) != 0:
            avg_metric["f1"] = 2 * (precision * recall) / (precision + recall)
        else:
            avg_metric["f1"] = 0

        # guard against a label whose counts are all zero
        n_observations = tp + tn + fp + fn
        avg_metric["acc"] = (tp + tn) / n_observations if n_observations != 0 else 0.0

        average_metrics[label] = avg_metric

    return metrics, folds, average_metrics
def get_english_annotated_tweets_sets(config, folder, only_tokenizer=False): samples_limit_for_stratification = 8 print config path = "./datasets/english_annotated_tweets/english_tweets/2_tokenized/" + config[ 'tweets_file'] annotated_tweets = json.load(open(path, 'r')) print "Total number of annotated english tweets is", len(annotated_tweets) print "Sample tweet", annotated_tweets[0] samples_distribution_domain_all = {} samples_distribution_subdomain_all = {} samples_distribution_domain = {} samples_distribution_subdomain = {} total_codes = 0 for tweet in annotated_tweets: tweet['domain_codes'] = [] tweet['subdomain_codes'] = [] for code in tweet['codes']: total_codes += 1 code = code.strip() if code not in samples_distribution_subdomain_all: samples_distribution_subdomain_all[code] = 1 else: samples_distribution_subdomain_all[code] += 1 subdomain = code if subdomain not in GLOBAL_MANIFESTOS_SUBDOMAINS: #print tweet subdomain = SUBSUB_TO_SUB[subdomain.replace('_', '.')] domain = code[0] tweet['domain_codes'].append(domain) tweet['subdomain_codes'].append(subdomain) if 'domain' not in tweet: tweet['domain'] = domain tweet['subdomain'] = subdomain if subdomain not in samples_distribution_subdomain: samples_distribution_subdomain[subdomain] = 1 else: samples_distribution_subdomain[subdomain] += 1 if domain not in samples_distribution_domain: samples_distribution_domain[domain] = 1 else: samples_distribution_domain[domain] += 1 if domain not in DOMAIN_CLASSES: print tweet if domain not in samples_distribution_domain_all: samples_distribution_domain_all[domain] = 1 else: samples_distribution_domain_all[domain] += 1 """print "Annotated tweets distribution taking into account multi-label annotation-----------------------------------" for c in DOMAIN_CLASSES: print("Domain %s; Number of samples: %s; Percentage: %.2f " % (c, samples_distribution_domain_all[c], samples_distribution_domain_all[c]/float(total_codes)*100)) for c in GLOBAL_MANIFESTOS_SUBDOMAINS: if c in 
samples_distribution_subdomain_all: print("Subdomain %s; Number of samples: %s; Percentage: %.2f " % (c, samples_distribution_subdomain_all[c],samples_distribution_subdomain_all[c]/float(total_codes)*100)) print "-----------------------------------------------------------------------------------------------------------" """ print "Annotated tweets distribution with multiclass-annotation" banned_codes_for_classification = [] for c in DOMAIN_CLASSES: print("Domain %s; Number of samples: %s; Percentage: %.2f " % (c, samples_distribution_domain[c], samples_distribution_domain[c] / float(len(annotated_tweets)) * 100)) for c in GLOBAL_MANIFESTOS_SUBDOMAINS: if c in samples_distribution_subdomain: print("Subdomain %s; Number of samples: %s; Percentage: %.2f " % (c, samples_distribution_subdomain[c], samples_distribution_subdomain[c] / float(len(annotated_tweets)) * 100)) if samples_distribution_subdomain[ c] < samples_limit_for_stratification: print "NOT ENOUGH SAMPLES!!!!" banned_codes_for_classification.append(c) print "BANNNED!!!!!!!!!!!!!!!!" 
print banned_codes_for_classification if config['party'] and config['previous_phrase']: tweets_X = [[], [], []] elif config['party'] or config['previous_phrase']: tweets_X = [[], []] else: tweets_X = [[]] tweets_y = [] party_encoder = one_hot_encoder(COUNTRY_PARTIES['english']) for tweet in annotated_tweets: if config['architecture'] == 'multi_label': class_to_pick = 'domain_codes' else: class_to_pick = config['class'] if config['class'] == 'manifestos_subdomain': if config['architecture'] == 'multi_label': class_to_pick = 'subdomain_codes' else: class_to_pick = 'subdomain' if config['architecture'] == 'multi_label': tweets_y.append( multilabel_array_to_onehot(config['class'], tweet[class_to_pick])) else: tweets_y.append(tweet[class_to_pick]) tweets_X[0].append(tweet['cleaned_text']) if config['previous_phrase']: if 'previous_tweet' in tweet: tweets_X[1].append(tweet['previous_tweet']['cleaned_text']) else: tweets_X[1].append([]) if config['party']: tweets_X[2].append(party_enconder[tweet['party']]) elif config['party']: tweets_X[1].append(party_encoder[tweet['party']]) tweets_y = [tweets_y] o_phrases = tweets_X[ 0] #Original phrases without its convertion to indexes o_eval_test_phrases = [] o_train_phrases = [] o_test_phrases = [] o_train_eval_phrases = [] if config['previous_phrase']: o_prev_phrases = tweets_X[1] o_prev_eval_test_phrases = [] o_prev_train_phrases = [] o_prev_test_phrases = [] o_prev_train_eval_phrases = [] data_X, data_y = load_json_data( config['dataset_folder'] + config['dataset'], config['previous_phrase'], config['previous_previous'], config['post_phrase'], config['party'], config['party_as_deconv'], config['class'], config['class_2'], False, False, False, config['language']) tokenizer = Tokenizer() tokenizer.fit_on_texts(data_X[0] + tweets_X[0]) if only_tokenizer: return tokenizer sequences_phrases = tokenizer.texts_to_sequences(tweets_X[0]) if config['previous_phrase']: prev_phrases = tokenizer.texts_to_sequences(tweets_X[1]) if 
config['party']: tweets_X_party = tweets_X[2] elif config['party']: tweets_X_party = tweets_X[1] if not config['no_padding_for_lstms']: tweets_X = pad_sequences(sequences_phrases, maxlen=config['max_phrase_length'], padding='post') if config['previous_phrase']: tweets_X_prev = pad_sequences(prev_phrases, maxlen=config['max_phrase_length'], padding='post') else: tweets_X = pad_sequences(sequences_phrases, maxlen=config['max_phrase_length']) if config['previous_phrase']: tweets_X_prev = pad_sequences(prev_phrases, maxlen=config['max_phrase_length']) tweets_X_tmp = [] tweets_X_prev_tmp = [] tweets_X_party_tmp = [] tweets_y_tmp = [] o_phrases_tmp = [] o_prev_phrases_tmp = [] if config['previous_phrase']: if not config['party']: for tweet, prev_tweet, y_label, o_phrase, o_prev_phrase in zip( tweets_X, tweets_X_prev, tweets_y[0], o_phrases, o_prev_phrases): if y_label in banned_codes_for_classification: continue tweets_X_tmp.append(tweet) tweets_X_prev_tmp.append(prev_tweet) tweets_y_tmp.append(y_label) o_phrases_tmp.append(o_phrase) o_prev_phrases_tmp.append(o_prev_phrase) else: for tweet, prev_tweet, party, y_label, o_phrase, o_prev_phrase in zip( tweets_X, tweets_X_prev, tweets_X_party, tweets_y[0], o_phrases, o_prev_phrases): if y_label in banned_codes_for_classification: continue tweets_X_tmp.append(tweet) tweets_X_prev_tmp.append(prev_tweet) tweets_X_party_tmp.append(party) tweets_y_tmp.append(y_label) o_phrases_tmp.append(o_phrase) o_prev_phrases_tmp.append(o_prev_phrase) elif config['party']: for tweet, party, y_label, o_phrase in zip(tweets_X, tweets_X_party, tweets_y[0], o_phrases): if y_label in banned_codes_for_classification: continue tweets_X_tmp.append(tweet) tweets_X_party_tmp.append(party) tweets_y_tmp.append(y_label) o_phrases_tmp.append(o_phrase) else: for tweet, y_label, o_phrase in zip(tweets_X, tweets_y[0], o_phrases): if y_label in banned_codes_for_classification: continue tweets_X_tmp.append(tweet) tweets_y_tmp.append(y_label) 
o_phrases_tmp.append(o_phrase) tweets_X = [tweets_X_tmp] if config['previous_phrase']: tweets_X.append(tweets_X_prev_tmp) if config['party']: tweets_X.append(tweets_X_party_tmp) tweets_y = [tweets_y_tmp] o_phrases = o_phrases_tmp if config['previous_phrase']: o_prev_phrases = o_prev_phrases_tmp tweets_train_X = generate_placeholder(len(tweets_X)) tweets_train_y = generate_placeholder(len(tweets_y)) tweets_eval_test_X = generate_placeholder(len(tweets_X)) tweets_eval_test_y = generate_placeholder(len(tweets_y)) if config['big_test']: sss_1 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=config['seed']) train_indexes, eval_test_indexes = next( sss_1.split(tweets_X[0], tweets_y[0])) elif not config['architecture'] == 'multi_label': sss_1 = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=config['seed']) train_indexes, eval_test_indexes = next( sss_1.split(tweets_X[0], tweets_y[0])) elif config['architecture'] == 'multi_label': sss_1 = IterativeStratification( n_splits=2, order=2, sample_distribution_per_fold=[0.3, 0.7], random_state=config['seed']) train_indexes, eval_test_indexes = next( sss_1.split(np.array(tweets_X[0]), np.array(tweets_y[0]))) #for train_indexes, eval_test_indexes in sss_1.split(tweets_X[0], tweets_y[0]): np.savetxt(folder + '/statistics/train_indexes' + config['class'] + '.out', train_indexes, delimiter=',') np.savetxt(folder + '/statistics/eval_test_indexes' + config['class'] + '.out', eval_test_indexes, delimiter=',') for train_index in train_indexes: o_train_phrases.append(o_phrases[train_index]) if config['previous_phrase']: o_prev_train_phrases.append(o_prev_phrases[train_index]) for i in range(len(tweets_X)): tweets_train_X[i].append(tweets_X[i][train_index]) for i in range(len(tweets_y)): tweets_train_y[i].append(tweets_y[i][train_index]) for eval_test_index in eval_test_indexes: o_eval_test_phrases.append(o_phrases[eval_test_index]) if config['previous_phrase']: 
o_prev_eval_test_phrases.append(o_prev_phrases[eval_test_index]) for i in range(len(tweets_X)): tweets_eval_test_X[i].append(tweets_X[i][eval_test_index]) for i in range(len(tweets_y)): tweets_eval_test_y[i].append(tweets_y[i][eval_test_index]) print "Number of train indexes: " + str(len(train_indexes)) print "Number of eval_test indexes: " + str(len(eval_test_indexes)) print "Number of unique indexes after the stratifying", str( len( np.unique( np.concatenate((train_indexes, eval_test_indexes), axis=0)))) tweets_eval_X = generate_placeholder(len(tweets_X)) tweets_eval_y = generate_placeholder(len(tweets_y)) tweets_test_X = generate_placeholder(len(tweets_X)) tweets_test_y = generate_placeholder(len(tweets_y)) """Split the 30% of the previous splitting into 50-50 (15-15) evaluation-test""" if config['architecture'] == 'multi_label': sss_2 = IterativeStratification( n_splits=2, order=2, sample_distribution_per_fold=[0.5, 0.5], random_state=config['seed']) eval_indexes, test_indexes = next( sss_2.split(np.array(tweets_eval_test_X[0]), np.array(tweets_eval_test_y[0]))) else: sss_2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=config['seed']) eval_indexes, test_indexes = next( sss_2.split(tweets_eval_test_X[0], tweets_eval_test_y[0])) #for eval_indexes, test_indexes in sss_2.split(tweets_eval_test_X[0], tweets_eval_test_y[0]): np.savetxt(folder + '/statistics/eval_indexes' + config['class'] + '.out', eval_indexes, delimiter=',') np.savetxt(folder + '/statistics/test_indexes' + config['class'] + '.out', test_indexes, delimiter=',') for eval_index in eval_indexes: for i in range(len(tweets_X)): tweets_eval_X[i].append(tweets_eval_test_X[i][eval_index]) for i in range(len(tweets_y)): tweets_eval_y[i].append(tweets_eval_test_y[i][eval_index]) for test_index in test_indexes: o_test_phrases.append(o_eval_test_phrases[test_index]) if config['previous_phrase']: o_prev_test_phrases.append(o_prev_eval_test_phrases[test_index]) for i in range(len(tweets_X)): 
tweets_test_X[i].append(tweets_eval_test_X[i][test_index]) for i in range(len(tweets_y)): tweets_test_y[i].append(tweets_eval_test_y[i][test_index]) #tweets_y = to_one_hot_encoding(tweets_y, get_classes_from_target_class(config['class'])) if config['architecture'] != 'multi_label': tweets_train_y[0] = to_one_hot_encoding( tweets_train_y[0], get_classes_from_target_class(config['class'])) tweets_eval_y[0] = to_one_hot_encoding( tweets_eval_y[0], get_classes_from_target_class(config['class'])) tweets_test_y[0] = to_one_hot_encoding( tweets_test_y[0], get_classes_from_target_class(config['class'])) print '------------------ PADRE LO DE LOS TUITS TRAIN -------------------------------' print tweets_train_X[0][0] print tweets_train_X[0][1] print tweets_train_X[0][2] print '------------------ PADRE LO DE LOS TUITS EVAL -------------------------------' print tweets_eval_X[0][0] print tweets_eval_X[0][1] print tweets_eval_X[0][2] print '------------------ PADRE LO DE LOS TUITS TESTS -------------------------------' print o_test_phrases[0] print o_test_phrases[1] print o_test_phrases[2] if config['big_test']: return tweets_eval_X, tweets_eval_y, tweets_test_X, tweets_test_y, tweets_train_X, tweets_train_y else: return tweets_train_X, tweets_train_y, tweets_eval_X, tweets_eval_y, tweets_test_X, tweets_test_y
class StackedPPB2(BaseEstimator, ClassifierMixin):
    """Stacked ensemble of PPB2 classifiers with a meta-estimator on top.

    One ``PPB2`` base classifier is built per entry of ``models``.  During
    ``fit`` the out-of-fold predictions of every base classifier are
    collected with cross-validation and used as the training features of the
    final estimator (one final estimator per target in the multi-target
    case).  The base classifiers are then refitted on the full training set.

    Args:
        models: iterable of PPB2 model names, one base classifier per name.
        n_splits: number of cross-validation folds used to build the
            meta-features.
        stack_method: ``"predict_proba"`` or ``"predict"``; the base
            classifier method whose output feeds the final estimator.
        final_estimator: estimator fitted on the meta-features.  ``None``
            (the default) creates a fresh ``LogisticRegression(max_iter=1000)``
            for this instance.
        n_proc: number of processes/threads handed to the base classifiers
            and to the joblib threading backend.
        passthrough: not supported; a truthy value raises
            ``NotImplementedError``.
    """

    def __init__(self,
                 models=("morg2-nn+nb", "morg3-nn+nb"),
                 n_splits=5,
                 stack_method="predict_proba",
                 final_estimator=None,
                 n_proc=8,
                 passthrough=False):
        # Keep the constructor argument as an attribute:
        # BaseEstimator.get_params() (and therefore repr()/clone()) reads
        # every __init__ parameter back from an attribute of the same name.
        self.models = models
        # A default estimator instance in the signature would be shared by
        # every StackedPPB2 object and fitted in place; build one per object.
        if final_estimator is None:
            final_estimator = LogisticRegression(max_iter=1000)

        self.classifiers = [(model, PPB2(model=model, n_proc=n_proc))
                            for model in models]
        print(
            "building stacked PPB2 classifier",
            "using the following models:",
        )
        for model_name, classifier in self.classifiers:
            print(model_name, classifier)
        print()

        self.n_splits = n_splits
        assert stack_method in {"predict_proba", "predict"}
        self.stack_method = stack_method
        self.final_estimator = final_estimator
        self.n_proc = n_proc
        self.passthrough = passthrough
        if passthrough:
            raise NotImplementedError

    @staticmethod
    def _take_rows(data, positions):
        """Select rows of ``data`` by integer position.

        pandas objects are indexed through ``.iloc`` so that the positional
        indices produced by the cross-validation splitters are never
        interpreted as index labels; everything else is indexed directly.
        """
        indexer = getattr(data, "iloc", None)
        if indexer is None:
            return data[positions]
        return indexer[positions]

    def _base_predictions(self, X, announce=False):
        """Stack the base classifiers' outputs for ``X`` into one array.

        Returns an array of shape ``(n_samples, n_classifiers)`` in the
        single-target setting and ``(n_samples, n_classifiers, n_targets)``
        in the multi-target setting.  ``announce`` controls whether a
        progress line is printed per classifier.
        """
        shape = (X.shape[0], len(self.classifiers))
        if self.multi_label:
            shape += (self.n_targets,)
        meta_preds = np.empty(shape)

        for i, (name, classifier) in enumerate(self.classifiers):
            if announce:
                print("performing prediction with classifier:", name)
            assert classifier.check_is_fitted()
            if self.stack_method == "predict_proba":
                meta_preds[:, i] = classifier.predict_proba(X)
            else:
                meta_preds[:, i] = classifier.predict(X)
        return meta_preds

    def fit(self, X, y):
        """Fit the meta-estimator(s) on out-of-fold predictions, then refit
        every base classifier on the full training set.

        Args:
            X: ``pd.Series`` of inputs handed unchanged to the PPB2
                classifiers (presumably SMILES strings -- confirm with PPB2).
            y: binary label array; 1-D for a single target, 2-D
                ``(n_samples, n_targets)`` for multiple targets.  Every
                target needs at least one positive and one negative example.

        Returns:
            self
        """
        assert isinstance(X, pd.Series)
        assert y.any(axis=0).all(), "At least one positive example is needed"
        assert (1 - y).any(axis=0).all(), "At least one negative example is needed"

        print("Fitting meta-estimators using cross-validation")

        if len(y.shape) == 1:
            print("fitting in the single-target setting")
            self.multi_label = False
            self.split = StratifiedKFold(n_splits=self.n_splits)
            meta_preds = np.empty((
                X.shape[0],
                len(self.classifiers),
            ))
        else:
            print("fitting in the multi-target setting")
            print("number of targets:", y.shape[1])
            self.multi_label = True
            self.split = IterativeStratification(n_splits=self.n_splits,
                                                 order=1)
            self.n_targets = y.shape[1]
            meta_preds = np.empty((
                X.shape[0],
                len(self.classifiers),
                self.n_targets,
            ))

        for i, (name, classifier) in enumerate(self.classifiers):
            print("fitting classifier:", name)
            for split_no, (train, test) in enumerate(self.split.split(X, y)):
                print("processing split", split_no + 1, "/", self.n_splits)
                # The splitters yield positional indices; select by position
                # so a Series with a non-default index is handled correctly.
                X_train = self._take_rows(X, train)
                X_test = self._take_rows(X, test)
                classifier.fit(X_train, self._take_rows(y, train))
                if self.stack_method == "predict_proba":
                    # multi target predict probs (for positive class)
                    meta_preds[test, i] = classifier.predict_proba(X_test)
                else:
                    # multi target predict
                    meta_preds[test, i] = classifier.predict(X_test)
                print("completed split", split_no + 1, "/", self.n_splits)
                print()
            print(
                "completed classifier",
                name,
            )
            print()

        # The meta-estimators are trained on a dense label array.  Containers
        # exposing ``toarray()`` (e.g. scipy sparse matrices) use it; any
        # other non-ndarray input goes through ``np.asarray``.
        if not isinstance(y, np.ndarray):
            y = y.toarray() if hasattr(y, "toarray") else np.asarray(y)

        if self.multi_label:
            print("fitting meta estimators")
            # One independent copy of the final estimator per target.
            if not isinstance(self.final_estimator, list):
                self.final_estimator = [
                    clone(self.final_estimator)
                    for _ in range(self.n_targets)
                ]
            for target_id in range(self.n_targets):
                with parallel_backend('threading', n_jobs=self.n_proc):
                    self.final_estimator[target_id].fit(
                        meta_preds[..., target_id], y[:, target_id])
                print("completed fitting meta estimator for target",
                      target_id + 1, "/", self.n_targets, "targets")
        else:
            print("fitting meta estimator")
            self.final_estimator.fit(meta_preds, y)
        print("completed fitting of meta estimator(s)")
        print()

        print("fitting base estimator(s) using full training set")
        for name, classifier in self.classifiers:
            print("fitting classifier", name)
            classifier.fit(X, y)
            print(
                "completed classifier",
                name,
            )
            print()
        print()
        return self

    def predict(self, X):
        """Predict labels for ``X`` with the fitted stack.

        Args:
            X: ``pd.Series`` of inputs for the base classifiers.

        Returns:
            The final estimator's ``predict`` output: an
            ``(n_samples, n_targets)`` array in the multi-target setting,
            otherwise whatever the single final estimator returns.
        """
        assert isinstance(X, pd.Series)
        # assert (X.dtype==pd.StringDtype()), "X should be a vector of smiles"
        meta_preds = self._base_predictions(X, announce=True)

        # final estimator
        if self.multi_label:
            final_pred = np.empty((X.shape[0], self.n_targets))
            for target_id in range(self.n_targets):
                check_is_fitted(self.final_estimator[target_id])
                with parallel_backend('threading', n_jobs=self.n_proc):
                    final_pred[:, target_id] = self.final_estimator[
                        target_id].predict(meta_preds[..., target_id])
            return final_pred

        check_is_fitted(self.final_estimator)
        return self.final_estimator.predict(meta_preds)

    def predict_proba(self, X):
        """Predict the probability of the positive class for ``X``.

        Args:
            X: ``pd.Series`` of inputs for the base classifiers.

        Returns:
            Column 1 of the final estimator's ``predict_proba`` output: an
            ``(n_samples, n_targets)`` array in the multi-target setting, a
            1-D array of length ``n_samples`` otherwise.
        """
        assert isinstance(X, pd.Series)
        # assert (X.dtype==pd.StringDtype()), "X should be a vector of smiles"
        meta_preds = self._base_predictions(X)

        # final estimator
        if self.multi_label:
            final_pred = np.empty((X.shape[0], self.n_targets))
            for target_id in range(self.n_targets):
                estimator = self.final_estimator[target_id]
                check_is_fitted(estimator)
                # Column 1 is only the positive class if classes_[1] is
                # truthy (i.e. the label 1).
                assert estimator.classes_[1]
                with parallel_backend('threading', n_jobs=self.n_proc):
                    final_pred[:, target_id] = estimator.predict_proba(
                        meta_preds[..., target_id])[:, 1]
            return final_pred

        check_is_fitted(self.final_estimator)
        assert self.final_estimator.classes_[1]
        with parallel_backend('threading', n_jobs=self.n_proc):
            return self.final_estimator.predict_proba(meta_preds)[:, 1]

    def set_n_proc(self, n_proc):
        """Set the worker count on this object and on every base classifier."""
        self.n_proc = n_proc
        for _, classifier in self.classifiers:
            classifier.set_n_proc(n_proc)
# Keep only the image id and label columns of the test frame and drop
# repeated rows.  Reassigning (instead of an in-place drop on a freshly
# sliced frame) avoids pandas' copy/view ambiguity.
tstdf = tstdf[['Image', 'Label']].drop_duplicates()
tstdf.to_csv(os.path.join(path_data, 'test.csv.gz'),
             index=False,
             compression='gzip')

# Some small EDA (bare expressions: they only display something when run
# as the last statement of a notebook cell)
trndf.columns
trndf.iloc[:, 1:].hist(figsize=(10, 10))
trndf['Image'].drop_duplicates().shape[0] == trndf['Image'].shape[0]
trndf.shape
tstdf.shape
trndf.head()
tstdf.head()

# http://scikit.ml/api/skmultilearn.model_selection.iterative_stratification.html
k_fold = IterativeStratification(n_splits=4, order=1, random_state=100)
splits = k_fold.split(trndf[['Image']], trndf.iloc[:, 1:])
# For every split keep the image ids selected by its first index array
# (the training indices, per the scikit-multilearn API).
folds = [trndf['Image'].iloc[x].tolist() for (x, y) in splits]
trndf['fold'] = 0
for t, f in enumerate(folds):
    # Rows NOT in split t's first index set are its held-out rows; tag them
    # with fold t.  A single .loc assignment writes to trndf itself, whereas
    # the chained form trndf['fold'][mask] = t may write to a temporary copy.
    trndf.loc[~trndf.Image.isin(f), 'fold'] = t
# Per-fold sums of the label columns, to eyeball the stratification.
trndf.groupby('fold')[trndf.columns.tolist()[1:-1]].sum()

# Write out the training file
trndf.shape
trndf.to_csv(os.path.join(path_data, 'train.csv.gz'),
             index=False,
             compression='gzip')