def __init__(self, config):
    """
    The constructor of the DataGenerator class. It loads the training labels
    and the images.

    Parameters
    ----------
    config: object
        configuration with necessary information for the dataloader
        (e.g batch size).
        NOTE(review): accessed via attributes (config.augment,
        config.val_split, config.batch_size, ...), so a namespace-like
        object is expected rather than a plain dict.
    """
    # Dataset root comes from the environment; abort early if unset.
    cwd = os.getenv("DATA_PATH")
    if cwd is None:
        print("Set your DATA_PATH env first")
        sys.exit(1)
    self.config = config
    # Default `augment` to False when the config does not define it.
    try:
        if self.config.augment:
            pass
    except AttributeError:
        self.config.augment = False
    # Read csv file
    tmp = pd.read_csv(os.path.abspath(os.path.join(cwd, 'train.csv')),
                      delimiter=',',
                      engine='python')
    # A vector of images id.
    image_ids = tmp["Id"]
    data_path = os.path.join(cwd, 'train')
    print(data_path)
    self.n = len(image_ids)
    # For each id sublist of the 4 filenames [batch_size, 4]
    # NOTE(review): the comprehension variable `id` shadows the builtin.
    self.filenames = np.asarray([[
        os.path.join(cwd, 'train', id + '_' + c + '.png')
        for c in ['red', 'green', 'yellow', 'blue']
    ] for id in image_ids])
    # Labels
    self.labels = tmp["Target"].values
    # To one-hot representation of labels
    # e.g. before e.g. ['22 0' '12 23 0']
    # after split [['22', '0'], ['12', '23', '0']]
    # after binarize it is one hot representation
    binarizer = MultiLabelBinarizer(classes=np.arange(28))
    self.labels = [[int(c) for c in l.split(' ')] for l in self.labels]
    self.labels = binarizer.fit_transform(self.labels)
    # Build a validation set; fall back to a 10% split when the config
    # does not provide val_split (AttributeError on self.config.val_split).
    try:
        self.train_filenames, self.val_filenames,\
            self.train_labels, self.val_labels = train_test_split(
                self.filenames,
                self.labels,
                test_size=self.config.val_split,
                random_state=42)
    except AttributeError:
        print('WARN: val_split not set - using 0.1')
        self.train_filenames, self.val_filenames,\
            self.train_labels, self.val_labels = train_test_split(
                self.filenames, self.labels, test_size=0.1, random_state=42)
    print("Shape of training data: {}".format(self.train_filenames.shape))
    print("Shape of training labels: {}".format(self.train_labels.shape))
    # Get list of all possible images (incl. augmented if exist)
    data_train_folder = os.path.join(cwd, 'train')
    # Augment training data if specified in config file (and if possible)
    if self.config.augment:
        print("Getting augmented dataset...")
        filter_list = ['yellow', 'red', 'blue', 'green']
        aug_train_list = []
        aug_train_labels = []
        for i in range(0, self.train_filenames.shape[0]):
            # Recover the bare image id from the first channel's path.
            filename = self.train_filenames[i][0] \
                .rsplit('/')[-1].rsplit('_')[0]
            print("Augmenting {}".format(filename))
            temp_rot = []
            temp_rev = []
            counter = 1
            # Probe _rot1, _rot2, ... until a rotation file is missing.
            while True:
                test_f = os.path.join(
                    data_train_folder, filename +
                    '_rot{}'.format(counter) + '_' + filter_list[0] + '.png')
                if os.path.isfile(test_f) is False:
                    break
                temp_rot = [
                    os.path.join(
                        data_train_folder, filename +
                        '_rot{}'.format(counter) + '_' + f + '.png')
                    for f in filter_list
                ]
                temp_rev = [
                    os.path.join(
                        data_train_folder, filename +
                        '_rev{}'.format(counter) + '_' + f + '.png')
                    for f in filter_list
                ]
                flag = True
                # NOTE(review): SKIP_CHECK is a module-level flag defined
                # outside this view; when False every augmented PNG's magic
                # number is verified before it is accepted.
                if SKIP_CHECK is False:
                    try:
                        for fname in temp_rev:
                            with open(fname, 'rb') as f:
                                # Check header of file
                                flag = flag and (f.read(4) == b'\x89PNG')
                        for fname in temp_rot:
                            with open(fname, 'rb') as f:
                                # Check header of file
                                flag = flag and (f.read(4) == b'\x89PNG')
                    except IOError as e:
                        print(e)
                        flag = False
                if flag is True:
                    # Augmented samples inherit the original image's labels.
                    aug_train_list.append(temp_rot)
                    aug_train_labels.append(self.train_labels[i])
                    aug_train_list.append(temp_rev)
                    aug_train_labels.append(self.train_labels[i])
                else:
                    print("corrupted images found")
                    print(temp_rot)
                    print(temp_rev)
                counter += 1
        try:
            # Append list of all aug filenames to training set
            self.train_filenames = np.vstack(
                (self.train_filenames, np.asarray(aug_train_list)))
            self.train_labels = np.vstack(
                (self.train_labels, np.asarray(aug_train_labels)))
            # Append list of all aug filenames to 'all' set
            self.filenames = np.vstack(
                (self.filenames, np.asarray(aug_train_list)))
            self.labels = np.vstack(
                (self.labels, np.asarray(aug_train_labels)))
        # aug_train_list is empty (no aug data available)
        except ValueError:
            print('No augmented data found. Please augment first')
        # New label frequency
        print("New label distribution: {}".format(
            self.train_labels.sum(axis=0)))
    self.n_train = len(self.train_labels)
    self.n_val = len(self.val_labels)
    self.n = len(self.labels)
    # Optional reproducibility / bootstrap-resampling controls.
    if hasattr(config, 'random_state'):
        random_state = config.random_state
    else:
        random_state = 42
    np.random.seed(random_state)
    if hasattr(config, 'bootstrap_size'):
        # Resample the training set (with replacement) to the requested size.
        n_samples = int(config.bootstrap_size * self.n_train)
        new_indices = resample(np.arange(self.n_train),
                               n_samples=n_samples,
                               random_state=random_state)
        self.train_filenames = self.train_filenames[new_indices]
        self.train_labels = self.train_labels[new_indices]
        self.n_train = len(self.train_labels)
    print('Size of training set is {}'.format(self.n_train))
    print('Size of validation set is {}'.format(self.n_val))
    # Compute class weigths (inverse label frequency, scaled by n_train).
    self.class_weights = (self.n_train) * np.reshape(
        1 / np.sum(self.train_labels, axis=0), (1, -1))
    # Number batches per epoch (ceiling division).
    self.train_batches_per_epoch = int(
        (self.n_train - 1) / self.config.batch_size) + 1
    self.val_batches_per_epoch = int(
        (self.n_val - 1) / self.config.batch_size) + 1
    self.all_batches_per_epoch = int(
        (self.n - 1) / self.config.batch_size) + 1
def pedicting_tag(request):
    # NOTE(review): Python 2 only — this function uses `print` statements.
    # The name keeps the original (misspelled) identifier since callers may
    # reference it. Relies on module-level globals defined outside this view:
    # d_set, y_labels, preprocessing, pd, np, ast, collections, re,
    # SnowballStemmer, CountVectorizer, TfidfVectorizer, MultiLabelBinarizer,
    # train_test_split, KNeighborsClassifier, RandomForestClassifier,
    # MultiOutputClassifier. Trains KNN and RandomForest multi-label tag
    # predictors over tf-idf title features and prints their metrics, then
    # predicts tags for hard-coded sample questions.
    print 'inside predicting tag'

    class lemmatokenizer(object):
        # Callable tokenizer: regex word split + Snowball stemming.
        def __init__(self):
            self.stemmer = SnowballStemmer('english')
            self.token_pattern = r"(?u)\b\w\w+\b"
            # self.wnl = WordNetLemmatizer()

        def __call__(self, doc):
            # here, doc is one string sentence
            token_pattern = re.compile(self.token_pattern)
            return [self.stemmer.stem(t) for t in token_pattern.findall(doc)]
            # return lambda doc: token_pattern.findall(doc)
            # return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

    # Count and tf-idf vectorizers over 1-3 grams of stemmed tokens.
    vect_title = CountVectorizer(max_df=0.5, min_df=5, stop_words='english',
                                 tokenizer=lemmatokenizer(),
                                 ngram_range=(1, 3))
    # In[9]:
    tfidf_vect_title = TfidfVectorizer(smooth_idf=False, max_df=0.5, min_df=5,
                                       stop_words='english',
                                       tokenizer=lemmatokenizer(),
                                       ngram_range=(1, 3))
    # Encode string tags to integer ids, then multi-hot encode per question.
    le = preprocessing.LabelEncoder()
    le.fit(y_labels)
    d_set['label_num'] = pd.Series(
        [le.transform(ast.literal_eval(i)) for i in d_set['tag']])
    d_set.head()
    new_y_labels = d_set['label_num'].values.tolist()
    mlb = MultiLabelBinarizer()
    mlb.fit(new_y_labels)
    y_tag_dtm = mlb.transform(new_y_labels)
    y_tag_dtm.shape
    # In[14]:
    X_labels = d_set['title'].values.tolist()
    # print (X_labels)
    # In[15]:
    vect_title.fit(X_labels)
    X_title_dtm = vect_title.transform(X_labels)
    X_title_dtm
    from sklearn.decomposition import PCA
    # Exploratory PCA of the count matrix (result only printed).
    pca = PCA(n_components=100).fit(X_title_dtm.toarray())
    pca_samples = pca.transform(X_title_dtm.toarray())
    pca_df = pd.DataFrame(np.round(pca_samples, 4))
    print (pca_df.head())
    # In[ ]:
    # In[17]:
    new_df = pd.DataFrame(X_title_dtm.toarray(),
                          columns=vect_title.get_feature_names())
    new_df.shape
    d = collections.Counter(vect_title.get_feature_names())
    new_df['target_list'] = [i for i in y_tag_dtm]
    # tf-idf features are the ones actually used for training below.
    tfidf_vect_title.fit(X_labels)
    X_title_dtm_tfidf = tfidf_vect_title.transform(X_labels)
    X_title_dtm_tfidf
    # In[23]:
    new_df_of_tfidf = pd.DataFrame(X_title_dtm_tfidf.toarray(),
                                   columns=tfidf_vect_title.get_feature_names())
    # In[24]:
    new_df_of_tfidf['target_list'] = [i for i in y_tag_dtm]
    # In[25]:
    y = new_df_of_tfidf['target_list']
    X = new_df_of_tfidf.drop('target_list', axis=1)
    X = np.array(X.values.tolist())  # it will convert list to numpy ndarray
    y = np.array(y.values.tolist())
    # In[28]:
    # print (X[0])
    # In[29]:
    # NOTE(review): these PCA projections are computed and printed but the
    # split below uses the un-reduced X/y.
    pca_X = PCA(n_components=200).fit_transform(X)
    pca_X = np.round(pca_X, 4)
    pca_y = PCA(n_components=50).fit_transform(y)
    pca_y = np.round(pca_y, 4)
    # In[30]:
    print (pca_y)
    # In[31]:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=1)
    # In[32]:
    # X_train, X_test, y_train, y_test = train_test_split(pca_X, pca_y, test_size=0.2, random_state=1)
    # In[ ]:
    # In[33]:
    # clf = Pipeline([('classifier',OneVsRestClassifier(SVC(probability=True,random_state=0)))]) # just to for Pipeline example
    knn_clf = KNeighborsClassifier(n_neighbors=5)
    # mnb_clf = MultinomialNB() # not working for MultiLabelinput
    # svc_clf = OneVsRestClassifier(SVC(probability=True,random_state=0))
    # time_pass_y = np.random.randint(2,size=(2838,1)) # produce ndarray of size 2838 X 1
    knn_clf.fit(X_train, y_train)
    # mnb_clf.fit(X_train, y_train)
    knn_pred = knn_clf.predict(X_test)
    # mnb_pred = mnb_clf.predict(X_test)
    # svc_pred = svc_clf.predict(X_test)
    # In[34]:
    knn_clf.score(X_test, y_test)
    # In[53]:
    from sklearn import metrics
    knn_report = metrics.classification_report(y_test[:100], knn_pred[:100])
    knn_f1_score = metrics.f1_score(y_test[:], knn_pred[:], average='samples')
    knn_precision_recall_fscore = metrics.precision_recall_fscore_support(
        y_test, knn_pred, average='samples')  # on full data-set
    knn_avg_precision_score = metrics.average_precision_score(
        y_test, knn_pred, average='samples')
    knn_roc_auc_score = metrics.roc_auc_score(y_test, knn_pred,
                                              average='samples')
    # mnb_report = metrics.classification_report(y_test[:100], mnb_pred[:100]) #throwing error mnb_clf can't work on multilabel O/P
    # In[36]:
    metrics.accuracy_score(y_true=y_test[:100], y_pred=knn_pred[:100])  # I think it's same as calculating hamming_score
    # In[37]:
    # print (knn_report) # its type is str
    print "For knn_clf (KNearestNeighbours) : "
    print "precision, recall, fbeta_score, support : ", knn_precision_recall_fscore
    print "f1_score : ", knn_f1_score
    print "avg. precision_score : ", knn_avg_precision_score
    print "roc_auc_score : ", knn_roc_auc_score
    # In[38]:
    # def does_test_tag_match(d, list_of_tags): # no need for this function
    # In[39]:
    test = ["how to use policy iteration in ml ?"]
    # test = ["what is lstm ?"]
    # test_dtm = vect_title.transform(test) # without tfidf
    test_dtm = tfidf_vect_title.transform(test)  # with tfidf
    # print (test_dtm.toarray()[0])
    # `status` flags whether the query shares any vocabulary with training.
    status = False
    for i in test_dtm.toarray()[0]:
        if (i != 0):
            status = True
            break
    ans = knn_clf.predict(test_dtm.toarray())
    ans = mlb.inverse_transform(ans)
    if (len(ans[0]) == 0 or status == False):
        print ("sorry, we can't predict your category!!!")
    else:
        # NOTE(review): le.inverse_transform receives a list of tuples here —
        # verify this round-trips as intended on the installed sklearn.
        ans = le.inverse_transform(ans)
        print (ans)
    # RandomForest wrapped for multi-output prediction.
    forest = RandomForestClassifier(n_estimators=100, random_state=0)
    rf_clf = MultiOutputClassifier(forest, n_jobs=-1)
    rf_clf.fit(X_train, y_train)
    rf_pred = rf_clf.predict(X_test)
    # In[41]:
    rf_clf
    # In[42]:
    metrics.accuracy_score(y_true=y_test[:100], y_pred=rf_pred[:100])  # I think it's same as calculating hamming_score
    # In[43]:
    rf_clf.score(X_test, y_test)
    rf_report = metrics.classification_report(y_test[:100], rf_pred[:100])
    rf_f1_score = metrics.f1_score(y_test, rf_pred, average='samples')
    rf_precision_recall_fscore = metrics.precision_recall_fscore_support(
        y_test, rf_pred, average='samples')  # on full data-set
    rf_avg_precision_score = metrics.average_precision_score(
        y_test, rf_pred, average='samples')
    rf_roc_auc_score = metrics.roc_auc_score(y_test, rf_pred,
                                             average='samples')
    # In[47]:
    # print (rf_report)
    print "For rf_clf (RandomForest) : "
    print "precision, recall, fbeta_score, support : ", rf_precision_recall_fscore
    print "f1_score : ", rf_f1_score
    print "avg. precision_score : ", rf_avg_precision_score
    print "roc_auc_score : ", rf_roc_auc_score
    # test = ["what is reinforcement learning ?"]
    test = ["what is ai,lstm and data visualization ?"]
    # test_dtm = vect_title.transform(test) # without tfidf
    test_dtm = tfidf_vect_title.transform(test)  # with tfidf
    status = False
    for i in test_dtm.toarray()[0]:
        if (i != 0):
            status = True
            break
    ans = rf_clf.predict(test_dtm.toarray())
    ans = mlb.inverse_transform(ans)
    if (len(ans[0]) == 0 or status == False):
        print ("sorry, we can't predict your category!!!")
    else:
        ans = le.inverse_transform(ans)
        print (ans)
Imports """ import pandas as pd import sklearn from sklearn.preprocessing import MultiLabelBinarizer import pyarrow as pa import pyarrow.parquet as pq """ global variables """ #need to coordinate with Jason on how this is created and how to refactor using the API instead of a document. #it is a dataframe with the entities infile = 'jason_mimc-554_new.csv' outputfile = 'tpot_prep-diagnosis_names_one_hot_encoded.parquet' mlb = MultiLabelBinarizer() def load_dataframe(): with open(infile) as json_file: df = pd.read_csv(infile) return df def write_dataframe(df): table = pa.Table.from_pandas(df) pq.write_table(table, outputfile) def diagnoses_one_hot_encoding(): df = load_dataframe() #create boolean mask matched non NaNs values mask = df['diagnosis'].notnull()
def main():
    """Train a document classifier and report per-label and aggregate metrics.

    Command-line driven: loads documents from --inJSON, the label vocabulary
    from --categoriesFile, and hyper-parameters from the --params JSON string.
    Trains on annotated, non-testset documents and evaluates on either a
    held-out 25% split or the designated test set (--useTestSet), printing
    per-label precision/recall/F1, micro/macro aggregates, and a final JSON
    result blob.
    """
    parser = argparse.ArgumentParser('Build a model for a classifier')
    parser.add_argument('--categoriesFile', required=True, type=str, help='Category list file')
    parser.add_argument('--params', required=True, type=str, help='JSON string with parameters')
    parser.add_argument('--useTestSet', action='store_true', help='Whether to use the test set instead of the validation set')
    parser.add_argument('--inJSON', required=True, type=str, help='Filename of JSON documents')
    args = parser.parse_args()

    print("Running with --params %s" % args.params)
    params = json.loads(args.params)

    with open(args.inJSON) as f:
        documents = json.load(f)
    with open(args.categoriesFile) as f:
        categories = [ line.strip() for line in f ]

    #test_docs = [ d for d in documents if 'phase4' in d['annotations'] ]
    #documents = [ d for d in documents if not 'phase4' in d['annotations'] ]
    #viruses = {'SARS-CoV-2','SARS-CoV','MERS-CoV'}
    #documents = [ d for d in documents if any(entity['type'] == 'Virus' for entity in d['entities']) or any( v in d['annotations'] for v in viruses) ]

    # Annotated documents outside the designated test set form the train pool.
    train_docs = [ d for d in documents if len(d['annotations']) > 0 and d['phase'] != 'testset' ]
    test_docs = [ d for d in documents if d['phase'] == 'testset' ]
    #other_docs = [ d for d in documents if len(d['annotations']) == 0 ]

    # Documents flagged with curation problems are excluded from training.
    toRemoveFromTraining = {'RemoveFromCorpus?','NotAllEnglish','NotRelevant','FixAbstract'}
    train_docs = [ d for d in train_docs if not any (f in d['annotations'] for f in toRemoveFromTraining) ]

    if not args.useTestSet:
        # Validation mode: carve a fixed 25% split out of the training pool.
        train_docs, test_docs = train_test_split(train_docs, test_size=0.25, random_state=42)

    # Keep only annotations that belong to the known category vocabulary.
    train_categories = [ [ a for a in d['annotations'] if a in categories ] for d in train_docs ]
    test_categories = [ [ a for a in d['annotations'] if a in categories ] for d in test_docs ]

    encoder = MultiLabelBinarizer()
    train_targets = encoder.fit_transform(train_categories)
    # FIX: previously this called fit_transform on the test labels as well,
    # which refits the encoder on the test set; if the train and test label
    # sets differ, the two target matrices end up with misaligned columns.
    # transform() reuses the class order learned from the training labels.
    test_targets = encoder.transform(test_categories)
    target_names = encoder.classes_
    # Guard: training must have exhibited every category in the vocabulary.
    assert len(target_names) == len(categories)

    print("len(train_docs):",len(train_docs))
    print("len(test_docs):",len(test_docs))
    print("class balance for train:", 100*sum(train_targets)/len(train_targets))
    print("class balance for test:", 100*sum(test_targets)/len(test_targets))
    sys.stdout.flush()

    clf = DocumentClassifier(params)
    print('train_targets.shape=',train_targets.shape)
    sys.stdout.flush()
    clf.fit(train_docs, train_targets, target_names)

    predictions = clf.predict(test_docs)
    print('predictions.shape=',predictions.shape)
    sys.stdout.flush()

    results = {}
    all_tn, all_fp, all_fn, all_tp = 0,0,0,0
    all_precisions, all_recalls, all_f1_scores = [],[],[]
    for i,label in enumerate(target_names):
        gold_for_label = test_targets[:,i]
        # Classifier scores are thresholded at 0.5 to get hard predictions.
        predictions_for_label = predictions[:,i] > 0.5
        tn, fp, fn, tp = sklearn.metrics.confusion_matrix(gold_for_label, predictions_for_label).ravel()
        tn, fp, fn, tp = map(int, [tn, fp, fn, tp])
        all_tn += tn
        all_fp += fp
        all_fn += fn
        all_tp += tp
        precision = sklearn.metrics.precision_score(gold_for_label,predictions_for_label)
        recall = sklearn.metrics.recall_score(gold_for_label,predictions_for_label)
        f1_score = sklearn.metrics.f1_score(gold_for_label,predictions_for_label)
        all_precisions.append(precision)
        all_recalls.append(recall)
        all_f1_scores.append(f1_score)
        print(f"{label}\t{precision}\t{recall}\t{f1_score}")
        sys.stdout.flush()
        results[label] = {'tn':tn,'fp':fp,'fn':fn,'tp':tp,'precision':precision,'recall':recall,'f1_score':f1_score}

    # Micro averages pool the confusion counts; macro averages the per-label
    # scores. Guards avoid division by zero on degenerate label columns.
    micro_precision = all_tp / (all_tp + all_fp) if (all_tp + all_fp) > 0 else 0
    micro_recall = all_tp / (all_tp + all_fn) if (all_tp + all_fn) > 0 else 0
    micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall) if (micro_precision + micro_recall) > 0 else 0
    macro_precision = sum(all_precisions) / len(all_precisions)
    macro_recall = sum(all_recalls) / len(all_recalls)
    macro_f1 = sum(all_f1_scores) / len(all_f1_scores)
    results['MICRO'] = {'tn':all_tn,'fp':all_fp,'fn':all_fn,'tp':all_tp,'precision':micro_precision,'recall':micro_recall,'f1_score':micro_f1}
    results['MACRO'] = {'precision':macro_precision,'recall':macro_recall,'f1_score':macro_f1}

    print("-"*30)
    print(f"MICRO\t{micro_precision}\t{micro_recall}\t{micro_f1}")
    print(f"MACRO\t{macro_precision}\t{macro_recall}\t{macro_f1}")
    print("-"*30)

    output = {'params':params, 'results':results}
    print(json.dumps(output))
    print("Done")
# NOTE(review): this chunk starts mid-script — the Daylight block below most
# likely sits inside a preceding `if args.featurizer == 'daylight':` branch
# whose header is above this view.
print("Calculating Daylight fingerprint...")
data['fingerprint'] = data['mol'].apply(ft.daylight_fingerprint)
# Pad the variable-length Daylight bit vectors to a fixed width.
data['fingerprint'] = data['fingerprint'].apply(
    ft.daylight_fingerprint_padding)
print("Daylight fingerprint calculation done.")
if args.featurizer == 'ecfp':
    print("Calculating ECFP...")
    data['fingerprint'] = data['mol'].apply(ft.get_ecfp)
    print("ECFP calculation done.")

# Input (x) and label (y)
X = data['fingerprint']
X = np.array(np.stack(X), dtype=float)
# Multi-hot encode the agrochemical label sets.
mlb = MultiLabelBinarizer().fit(data['agrochemical'])
Y = mlb.transform(data['agrochemical'])

# Build neural network model
# Widths: input -> 512 -> 128 -> 16 -> 4 -> one sigmoid output per label.
layers_dim = [X.shape[1], 512, 128, 16, 4, Y.shape[1]]
activation = ['relu', 'relu', 'relu', 'relu', 'sigmoid']
# Save the training-curve plot next to the input file (extension swapped).
image_name = filename[:filename.rfind('.')] + '.png'
training_acc, training_loss, validation_acc, validation_loss = \
    model_func.plot_nn_loss_against_epoch(X, Y, layers_dim, activation,
                                          args.epochs, image_name,
                                          loss=args.loss,
                                          optimizer=args.optimizer)
print("Number of epochs for maximum training accuracy:",
      np.argmax(training_acc))
print("Number of epochs for minimum training loss:",
      np.argmin(training_loss))
def get_word_vec(df, train_or_test='train'):
    """Multi-hot encode the 'word_list' column of *df*.

    Returns the fitted MultiLabelBinarizer together with a one-column
    DataFrame whose cells each wrap a single indicator row.
    NOTE(review): ``train_or_test`` is currently unused — fit_transform is
    applied regardless; confirm whether a transform-only path was intended.
    """
    binarizer = MultiLabelBinarizer()
    indicator_rows = binarizer.fit_transform(df['word_list'])
    wrapped_rows = [[row] for row in indicator_rows]
    return binarizer, pd.DataFrame(wrapped_rows)
# NOTE(review): chunk starts mid-script — T, E, RawData, CVectorizer and
# TfIdfVectorizer are defined above this view.
Labels = []
for i in range(T):
    # NOTE(review): under Python 3 `map` is a lazy iterator; it is consumed
    # exactly once (by MLB.fit_transform below), so this works, but the
    # Labels entries cannot be re-iterated afterwards.
    labels = map(int, input().split(' '))
    RawData.append(input())
    Labels.append(labels)
Queries = []
for i in range(E):
    Queries.append(input())
# Vectorize training + query documents together so they share a vocabulary.
RawData.extend(Queries)
X = CVectorizer.fit_transform(RawData)
Xtf = TfIdfVectorizer.fit_transform(X)
del X
MLB = MultiLabelBinarizer()
Yt = MLB.fit_transform(Labels)
# First T rows are training documents, the remainder are queries.
XtfTrain = Xtf[0:T]
XtfTest = Xtf[T:]
# NOTE(review): LinearSVC's 'l1' loss alias has been removed in modern
# scikit-learn; this only runs against an old release.
Clf = OneVsRestClassifier(LinearSVC(loss='l1', class_weight={
    1: 100,
    0: 1
})).fit(XtfTrain, Yt)
Classes = list(MLB.classes_)
for xTest in XtfTest:
    y = Clf.decision_function(xTest)
    y1 = list(y[0])
    c1 = Classes
    # Top-10 classes by decision score, highest first after the reverse.
    lbls = [x for (y, x) in sorted(zip(y1, c1))][-10:]
    list.reverse(lbls)
JOIN section_content_75pct t2 ON t1.file_id=t2.file_id AND t1.section_id=t2.section_id
"""
# NOTE(review): the opening of the triple-quoted SQL string above lies before
# this chunk; only its tail is visible here.
df = pandas.read_sql_query(con=conn, sql=sql_text)
# Shuffle rows deterministically so later slicing is unbiased.
df_randomized_order = df.sample(frac=1, random_state=rng_seed)
heading_plus_content_corpus = df_randomized_order[
    'abstracted_heading_plus_content']
content_corpus = df_randomized_order['content_text_w_o_tags']
heading_text_corpus = df_randomized_order['heading_text']
url_corpus = df_randomized_order['url']
# Class '2' has been merged into class '1'
label_set = ['-', '1', '3', '4', '5', '6', '7', '8']
# section_code may hold comma-separated multi-labels; split into lists.
labels = [
    str(x).split(',') for x in df_randomized_order['section_code']
]
mlb = MultiLabelBinarizer(classes=label_set)
labels_matrix = mlb.fit_transform(labels)
tfidf = TfidfVectorizer(ngram_range=(1, 1),
                        analyzer='word',
                        stop_words='english')
tfidfX = tfidf.fit_transform(heading_plus_content_corpus)
logging.info('tfidf matrix shape: ')
logging.info(tfidfX.shape)
features_tfidf = pandas.DataFrame(tfidfX.todense())
# Assign column names to make it easier to print most useful features later
features_tfidf.columns = tfidf.get_feature_names()
features_combined = features_tfidf
# NOTE(review): chunk starts mid-loop — this print belongs to an image
# preprocessing loop whose header is above this view.
print(i)
valid_X = np.array(valid_X)
# NOTE(review): data_path is also used below as a directory prefix; np.save
# here writes '<data_path>.npy' beside that directory — confirm intended.
np.save(data_path, valid_X)

# process label
print("label preprocessing")
train_y = []
for train_id in train_list:
    train_y.append(get_labels(train_id))
valid_y = []
for valid_id in valid_list:
    valid_y.append(get_labels(valid_id))
# Fit on the union of train+valid label sets so both splits share columns.
encoder = MultiLabelBinarizer()
encoder.fit(train_y + valid_y)
train_y_onehot = encoder.transform(train_y)
valid_y_onehot = encoder.transform(valid_y)
train_y_onehot = np.delete(train_y_onehot, [2, 3, 5, 6, 7, 10, 12],
                           1)  # delete out 8 and "No Finding" column
valid_y_onehot = np.delete(valid_y_onehot, [2, 3, 5, 6, 7, 10, 12],
                           1)  # delete out 8 and "No Finding" column
# Persist the encoded targets and the fitted encoder for later runs.
with open(data_path + "/train_y_onehot.pkl", "wb") as f:
    pickle.dump(train_y_onehot, f)
with open(data_path + "/valid_y_onehot.pkl", "wb") as f:
    pickle.dump(valid_y_onehot, f)
with open(data_path + "/label_encoder.pkl", "wb") as f:
    pickle.dump(encoder, f)
def _label_matrix(tr_target, te_target):
    """Binarize train/test label lists into sparse indicator matrices.

    The binarizer is fitted on the training labels only; the test labels are
    mapped onto that same class set. The learned class order is echoed to
    stdout for inspection.
    """
    binarizer = MultiLabelBinarizer(sparse_output=True)
    train_matrix = binarizer.fit_transform(tr_target)
    test_matrix = binarizer.transform(te_target)
    print(binarizer.classes_)
    return train_matrix, test_matrix
from os.path import dirname, join
import sys
from languageflow.flow import Flow
from languageflow.model import Model
from languageflow.transformer.tfidf import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from languageflow.validation.validation import TrainTestSplitValidation
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from load_data import load_dataset

if __name__ == '__main__':
    # The training corpus lives four directories above this script.
    data_file = join(dirname(dirname(dirname(dirname(__file__)))),
                     "data", "fb_bank", "corpus", "train.xlsx")
    X, y = load_dataset(data_file)

    flow = Flow()
    flow.data(X, y)

    tfidf_features = TfidfVectorizer(ngram_range=(1, 2), max_df=0.8, min_df=8)
    # NOTE(review): the label binarizer is registered before the tf-idf step;
    # confirm languageflow applies it to y rather than X.
    flow.transform(MultiLabelBinarizer())
    flow.transform(tfidf_features)

    # One-vs-rest logistic regression, validated on a held-out 10% split.
    flow.add_model(
        Model(OneVsRestClassifier(LogisticRegression()),
              "LogisticRegression"))
    flow.set_validation(TrainTestSplitValidation(test_size=0.1))
    flow.train()
    flow.export(model_name="LogisticRegression", export_folder="model")
def evaluate():
    """Score saved predictions against Test.csv and compare to dummy baselines.

    Reads the raw train/test CSVs (5th column holds space-separated source
    labels) plus the pre-vectorized feature arrays, computes subset accuracy
    and Hamming loss for the stored answers in reports/Test.ans, then repeats
    the scoring for sklearn DummyClassifier strategies as baselines.
    All results are printed; nothing is returned.
    """
    warnings.filterwarnings("ignore", category=UserWarning)

    train_title_feature = np.load('data/vectorized/Train_title.npy')
    train_summary_feature = np.load('data/vectorized/Train_summary.npy')
    test_title_feature = np.load('data/vectorized/Test_title.npy')
    test_summary_feature = np.load('data/vectorized/Test_summary.npy')

    true_ans_test = []
    true_ans_train = []
    # FIX: the raw CSV handles were previously opened and never closed;
    # context managers guarantee the descriptors are released.
    with open('data/raw/Test.csv') as f_test:
        lines_test = csv.reader(f_test)
        #next(lines) #to skip the header of the csv
        for line in lines_test:
            source_uri = line[4]
            true_ans_test.append(source_uri.split(' '))
    with open('data/raw/Train.csv') as f_train:
        lines_train = csv.reader(f_train)
        for line in lines_train:
            source_uri = line[4]
            true_ans_train.append(source_uri.split(' '))

    pred_ans = []
    with open('reports/Test.ans') as f:
        lines = csv.reader(f)
        #next(lines) #to skip the header of the csv
        for line in lines:
            pred_ans.append(line[0].split(' '))

    # Fixed class order so the indicator matrices are comparable.
    classes = ['nytimes', 'indiatimes', 'washingtonpost']
    mlb = MultiLabelBinarizer(classes)
    pred_ans_b = mlb.fit_transform(pred_ans)
    true_ans_b = mlb.transform(true_ans_test)
    print('\n\nMLB:')
    Sub_accuracy_score = accuracy_score(true_ans_b, pred_ans_b)
    Sub_accuracy_score = str(round(Sub_accuracy_score, 3))
    print('\nSubset Accuracy: ' + Sub_accuracy_score)
    hamming_score = hamming_loss(true_ans_b, pred_ans_b)
    hamming_score = str(round(hamming_score, 3))
    print('\nHamming Loss: ' + hamming_score + '\n\n')

    strategies = ['stratified', 'uniform']
    # Title + summary vectors are concatenated along the feature axis.
    X_test = np.squeeze(
        np.concatenate((test_title_feature, test_summary_feature), 2))
    y_test = true_ans_test
    X_train = np.squeeze(
        np.concatenate((train_title_feature, train_summary_feature), 2))
    y_train = true_ans_train
    test_scores = []
    for s in strategies:
        dclf = DummyClassifier(strategy=s, random_state=0)
        dclf = dclf.fit(X_train, y_train)
        pred_ans = []
        ans = dclf.predict(X_test)
        for a in ans:
            pred_ans.append(a)
        # With the fixed `classes` list, refitting yields the same columns.
        pred_ans_b = mlb.fit_transform(pred_ans)
        print('\n\n' + s + ':')
        Sub_accuracy_score = accuracy_score(true_ans_b, pred_ans_b)
        Sub_accuracy_score = str(round(Sub_accuracy_score, 3))
        print('\nSubset Accuracy: ' + Sub_accuracy_score)
        hamming_score = hamming_loss(true_ans_b, pred_ans_b)
        hamming_score = str(round(hamming_score, 3))
        print('\nHamming Loss: ' + hamming_score)
        print('\n\n')
# Earlier sketch (kept for reference): fitting
# OneVsRestClassifier(SVC(class_weight='auto')) directly on raw lists of
# label lists (X1/y1, then predicting y2) — superseded by the
# MultiLabelBinarizer-based flow below, which also demonstrates
# inverse_transform to recover label tuples from the indicator matrix.
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# One binary SVC per label, trained on the multi-hot label matrix.
c = OneVsRestClassifier(SVC())
X = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 1]]
Y = [[1], [2], [3], [1, 3]]

binarizer = MultiLabelBinarizer()
yyy = binarizer.fit_transform(Y)  # fit + transform in one step

estimator = c.fit(X, yyy)
result = estimator.predict([[1, 0, 0]])
# Map the predicted indicator row back to the original label tuples.
print(binarizer.inverse_transform(result))
def __init__(self, vectors, clf):
    """Keep node embeddings and wrap *clf* in a TopKRanker for evaluation.

    A sparse-output MultiLabelBinarizer is prepared for encoding the
    multi-label targets later on.
    """
    # Sparse output keeps the label indicator matrix memory-friendly.
    self.binarizer = MultiLabelBinarizer(sparse_output=True)
    self.embeddings = vectors
    self.clf = TopKRanker(clf)
def generate_vectors(train_url, test_url=None, column='article',
                     trans_type=None, max_n=1, min_df=1, max_df=1.0,
                     max_features=1, sublinear_tf=True, balanced=False,
                     re_weight=0, verbose=False, drop_words=0,
                     multilabel_out=False, label_col='subjects',
                     only_single=True, shuffle=True, apply_fun=None):
    """ generate X, y, X_test vectors with csv(with header) url

    use pandas and CountVectorizer

    Args:
        train_url: url to train csv
        test_url: url to test csv, set to None if not need X_test
        column: column to use as feature
        trans_type: specific transformer, {'dc','idf', 'hashing'}
        max_n: max_n for ngram_range
        min_df: min_df for CountVectorizer
        max_df: max_df for CountVectorizer
        max_features: max_features for CountVectorizer
        sublinear_tf: sublinear_tf for default TfdcTransformer
        balanced: balanced for default TfdcTransformer, for idf transformer,
            it is use_idf
        re_weight: re_weight for TfdcTransformer
        verbose: True to show more information
        drop_words: probability of randomly deleting words from sentences
        multilabel_out: return y as multilabel format
        label_col: col name of label
        only_single: only keep records of single label
        shuffle: re sample train data
        apply_fun: callable to be applied on label column

    Returns:
        X, y, X_test
    """
    verbose and print("loading '%s' level data from %s with pandas" %
                      (column, train_url))
    train_df = pd.read_csv(train_url)
    if shuffle:
        train_df = train_df.sample(frac=1)
    if only_single:
        # FIX: the label column name was hard-coded to 'subjects' here;
        # honor label_col so the filter follows the configured column.
        train_df = train_df[train_df[label_col].apply(lambda x: len(x) < 2)]

    # vectorizer
    s_time = time()
    analyzer = 'word' if column == 'word_seg' else 'char'
    # FIX: raw string for the regex avoids the invalid-escape warning.
    vec = CountVectorizer(analyzer=analyzer,
                          ngram_range=(1, max_n),
                          min_df=min_df,
                          max_df=max_df,
                          max_features=max_features,
                          token_pattern=r'\w+')
    verbose and print("finish loading, vectorizing")
    verbose and print("vectorizer params:", vec.get_params())
    sequences = train_df[column]
    # delete some words randomly
    for i, row in enumerate(sequences):
        if drop_words <= 0:
            break
        if np.random.ranf() < drop_words:
            row = np.array(row.split())
            # FIX: use positional .iat — after sample(frac=1) the index
            # labels are shuffled, so label-based .at[i] wrote the dropped
            # text onto a different row than the one being inspected.
            sequences.iat[i] = ' '.join(row[np.random.ranf(row.shape) > 0.35])
    X = sequences if trans_type == 'hashing' else vec.fit_transform(sequences)
    e_time = time()
    verbose and print("finish vectorizing in %.3f seconds, transforming" %
                      (e_time - s_time))

    # transformer
    if trans_type is None or trans_type == 'idf':
        trans = TfidfTransformer(sublinear_tf=sublinear_tf, use_idf=balanced)
    elif trans_type == 'dc':
        trans = TfdcTransformer(sublinear_tf=sublinear_tf,
                                balanced=balanced,
                                re_weight=re_weight)
    else:
        # 'hashing': vectorize the raw text directly, skipping CountVectorizer.
        trans = HashingVectorizer(analyzer=analyzer,
                                  ngram_range=(1, max_n),
                                  n_features=max_features,
                                  token_pattern=r'\w+',
                                  binary=not balanced)
    verbose and print(trans_type, "transformer params:", trans.get_params())

    if multilabel_out:
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(train_df[label_col].apply(str.split))
        verbose and print("multilabel columns:\n", mlb.classes_)
    else:
        y = train_df[label_col].apply(apply_fun).values if apply_fun is not None \
            else train_df[label_col].values
    X = trans.fit_transform(X, y)

    X_test = None
    if test_url:
        verbose and print("transforming test set")
        test_df = pd.read_csv(test_url)
        X_test = test_df[column] if trans_type == 'hashing' else vec.transform(
            test_df[column])
        X_test = trans.transform(X_test)
    s_time = time()
    verbose and print("finish transforming in %.3f seconds\n" %
                      (s_time - e_time))
    return X, y, X_test
if args.algo == 'pane':
    # PANE stores forward/backward embeddings in separate files; L2-normalize
    # each half and concatenate into one feature matrix.
    Xf = utils.load_emd(path_emb + ".f", n, d / 2, n - 1)
    Xb = utils.load_emd(path_emb + ".b", n, d / 2, n - 1)
    Xf = preprocessing.normalize(Xf, norm='l2', axis=1)
    Xb = preprocessing.normalize(Xb, norm='l2', axis=1)
    X = np.hstack([Xf, Xb])
    print(X.shape)
else:
    X = utils.load_emd(path_emb, n, d, n - 1)
path_label = settings.DATA_INFO[args.data]['path'] + 'labels.txt'
maf1 = []
mif1 = []
if args.multi:
    # Multi-label mode: binarize label sets after project-specific filtering.
    # NOTE(review): `filter` and `eval` here shadow builtins — they must be
    # project helpers imported above this view.
    y = utils.load_label(path_label, n)
    X, y = filter(X, y)
    y = MultiLabelBinarizer(sparse_output=True).fit_transform(y)
else:
    y = utils.read_cluster(n, path_label)
# Evaluate with decreasing amounts of held-out (unlabelled) data.
for ratio in [0.9, 0.7, 0.5, 0.3, 0.1]:
    print("labelled data ratio:" + str(1 - ratio))
    macro_f1_avg, micro_f1_avg = eval(X, y, ratio, args.multi, 3)
    maf1.append(macro_f1_avg)
    mif1.append(micro_f1_avg)
    # FIX: print() does not do %-interpolation; the original passed the
    # format string and the values as separate arguments, printing the
    # literal "%f" placeholders.
    print("macro-f1=%f, micro-f1=%f" % (macro_f1_avg, micro_f1_avg))
print(maf1)
print(mif1)
def get_tags_vec(df, train_or_test='train'):
    """Multi-hot encode the 'tags_values' column of *df*.

    Returns the fitted MultiLabelBinarizer together with a one-column
    DataFrame whose cells each wrap a single indicator row.
    NOTE(review): ``train_or_test`` is currently unused — fit_transform is
    applied regardless; confirm whether a transform-only path was intended.
    """
    binarizer = MultiLabelBinarizer()
    indicator_rows = binarizer.fit_transform(df['tags_values'])
    wrapped_rows = [[row] for row in indicator_rows]
    return binarizer, pd.DataFrame(wrapped_rows)
def perform_five_fold(self, model, documents, annotations, doc_ids,
                      pipeline_parameters):
    """Run iterative-stratified 5-fold cross-validation and aggregate metrics.

    Parameters
    ----------
    model : the model passed through to self.perform_fold for each fold.
    documents : sequence of documents, index-aligned with `annotations`.
    annotations : sequence of per-document annotation lists; each annotation
        is indexable and its element [2] is the label name.
    doc_ids : sequence of document ids, index-aligned with `documents`.
    pipeline_parameters : forwarded unchanged to self.perform_fold.

    Returns
    -------
    (metrics, folds, average_metrics) where `metrics` is the list of per-fold
    metric dicts, `folds` lists the training doc ids used in each fold, and
    `average_metrics` maps each label to counts/precision/recall/f1/acc
    averaged over the five folds.
    """
    metrics = list()
    # store list of documents ids per fold
    folds = list()
    # turning into numpy arrays to be able to access values with index array
    documents_np_array = np.array(documents)
    annotations_np_array = np.array(annotations, dtype=object)
    doc_ids_np_array = np.array(doc_ids)
    ann_list = list()
    for ann in annotations_np_array:
        ann_list = ann_list + list([x[2] for x in ann])
    # getting unique label names in annotations
    # NOTE(review): set() ordering is unstable across runs, so the
    # label -> index mapping (and hence fold assignment) is not reproducible.
    unique_ann_list = list(set(ann_list))
    # array to store multilabel values
    multilabel_array = []
    for ann in annotations_np_array:
        multilabel_array.append([unique_ann_list.index(x[2]) for x in ann])
    # NOTE(review): despite its name this variable holds the binarized
    # indicator matrix, not the binarizer object.
    multilabel_binarizer = MultiLabelBinarizer().fit_transform(
        multilabel_array)
    skf = IterativeStratification(n_splits=5, order=1)
    total_metrics = {}
    for train_index, test_index in skf.split(documents_np_array,
                                             multilabel_binarizer):
        # get annotations train and test datasets
        train_annotations = annotations_np_array[train_index]
        test_annotations = annotations_np_array[test_index]
        # get documents train and test datasets
        train_documents = documents_np_array[train_index]
        test_documents = documents_np_array[test_index]
        fold_metrics = self.perform_fold(
            model, [train_documents.tolist(),
                    train_annotations.tolist()],
            [test_documents.tolist(),
             test_annotations.tolist()], pipeline_parameters)
        # saving docs used to train fold
        fold_doc_ids = doc_ids_np_array[train_index]
        folds.append(fold_doc_ids.tolist())
        # saving fold metrics
        metrics.append(fold_metrics)
        # Accumulate raw confusion counts per label across folds.
        for key in fold_metrics.keys():
            if key not in total_metrics:
                total_metrics[key] = {
                    "FN": 0,
                    "FP": 0,
                    "TP": 0,
                    "TN": 0,
                    "f1": 0,
                    "precision": 0,
                    "recall": 0,
                    "acc": 0
                }
            total_metrics[key][
                "FN"] = total_metrics[key]["FN"] + fold_metrics[key]["FN"]
            total_metrics[key][
                "FP"] = total_metrics[key]["FP"] + fold_metrics[key]["FP"]
            total_metrics[key][
                "TP"] = total_metrics[key]["TP"] + fold_metrics[key]["TP"]
            total_metrics[key][
                "TN"] = total_metrics[key]["TN"] + fold_metrics[key]["TN"]
    average_metrics = {}
    for label in total_metrics.keys():
        avg_metric = {}
        # NOTE(review): the divisor is hard-coded to match n_splits=5 above.
        avg_metric["FN"] = total_metrics[label]["FN"] / 5
        avg_metric["FP"] = total_metrics[label]["FP"] / 5
        avg_metric["TP"] = total_metrics[label]["TP"] / 5
        avg_metric["TN"] = total_metrics[label]["TN"] / 5
        # Derive precision/recall/f1/accuracy from the averaged counts,
        # guarding each division against empty denominators.
        if (avg_metric["TP"] + avg_metric["FN"]) != 0:
            avg_metric["recall"] = avg_metric["TP"] / (avg_metric["TP"] +
                                                       avg_metric["FN"])
        else:
            avg_metric["recall"] = 1.0
        if (avg_metric["TP"] + avg_metric["FP"]) != 0:
            avg_metric["precision"] = avg_metric["TP"] / (
                avg_metric["TP"] + avg_metric["FP"])
        else:
            avg_metric["precision"] = 0.0
        if (avg_metric["precision"] + avg_metric["recall"]) != 0:
            avg_metric["f1"] = 2 * (
                avg_metric["precision"] * avg_metric["recall"]) / (
                    avg_metric["precision"] + avg_metric["recall"])
        else:
            avg_metric["f1"] = 0
        avg_metric["acc"] = (avg_metric["TP"] + avg_metric["TN"]) / (
            avg_metric["TP"] + avg_metric["TN"] + avg_metric["FP"] +
            avg_metric["FN"])
        average_metrics[label] = avg_metric
    return metrics, folds, average_metrics
# NOTE(review): this chunk opens mid-function — the enclosing
# change_country_name(lst) definition and its loop header sit above this view.
        lst[i] = 'other'
    if len(lst) <= 0:
        lst = ['other']
    return lst


data['country'] = data['country'].apply(change_country_name)
# data['genre'] = data['listed_in'].apply(split_comma)
# De-duplicate genre entries per row.
data['genre'] = data['genre'].apply(lambda row: list(set(row)))
print("nontext features processing finished")

# ----------- Convert to One-Hot representation
from sklearn.preprocessing import MultiLabelBinarizer
# The same binarizer is refitted for each column, so classes_ reflects the
# most recently encoded column after each join.
mlb = MultiLabelBinarizer(sparse_output=True)
data2 = data[['type', 'title', 'country', 'rating', 'words', 'genre']]
data2 = data2.join(
    pd.DataFrame.sparse.from_spmatrix(mlb.fit_transform(data.pop('genre')),
                                      index=data.index,
                                      columns=mlb.classes_))
# NOTE(review): the next two joins pop from data2 but reuse data.index —
# verify both frames still share the same index at this point.
data2 = data2.join(
    pd.DataFrame.sparse.from_spmatrix(mlb.fit_transform(data2.pop('country')),
                                      index=data.index,
                                      columns=mlb.classes_))
data2 = data2.join(
    pd.DataFrame.sparse.from_spmatrix(mlb.fit_transform(data2.pop('rating')),
                                      index=data.index,
                                      columns=mlb.classes_))
import pandas as pd import preprocess import tensorflow as tf from sklearn.preprocessing import MultiLabelBinarizer from sklearn.utils import shuffle data = pd.read_csv("myFP_217_D2.csv", header=None) D2 = preprocess.get_data(data) X = preprocess.get_X(D2) # y = pd.DataFrame(preprocess.get_target(D2)) value = preprocess.get_target(D2) value = MultiLabelBinarizer().fit_transform(value) y = pd.DataFrame(value) X, y = shuffle(X, y, random_state=0) X_train, X_test = X[:int((0.8 * len(X)))], X[int((0.8 * len(X))):] y_train, y_test = y[:int((0.8 * len(X)))], y[int((0.8 * len(X))):] def cnn1(): def weight_variable(shape): initial = tf.truncated_normal(shape, stddev=0.1) return tf.Variable(initial) def bias_variable(shape): initial = tf.constant(0.1, shape=shape) return tf.Variable(initial)
del psg_list, all_psg # load datasets from cPickle training_ds = cPickle.load(open(Pickle_Dir + "TrainingDataset.pickle", "r")) testing_ds = cPickle.load(open(Pickle_Dir + "TestingDataset.pickle", "r")) if False: error_analysis_ds = cPickle.load( open(Pickle_Dir + "ErrorAnalysisDataset.pickle", "r")) print training_ds.size, testing_ds.size # Decision Tree Classification clf = MultiLabelDecisionTreeClassifier(fv_dimension=len( training_ds.fv_nparray[0, :]), min_split_entropy_threshold=0.0) # Preprocessing - MultiLabelBinarizer mlb = MultiLabelBinarizer(classes=training_ds.classes_, sparse_output=False) training_ds.cl_indicator_matrix = mlb.fit_transform( training_ds.class_label_lists) print training_ds.size, testing_ds.size, len(training_ds.classes_) # Model Training t0 = time() clf.fit(training_ds.fv_nparray, training_ds.cl_indicator_matrix) training_time = time() - t0 print "[main.py] Offline cost for training from %d samples is %.04f seconds." \ % (len(training_ds.fv_nparray), training_time) # # Accuracy & Efficient Measurement # metrics = MultiLabelClassifierMetricsCalculator() # t0 = time() # for i in xrange(len(testing_ds.fv_nparray)): # prediction = clf.predict(testing_ds.fv_nparray[i, :])
results.append(result) return pd.DataFrame(results, index=labels) if __name__ == '__main__': # if True: simple model has given equal parameters for all OvR estimators # if False: grid search used to find best parameters per estimator simple_model = False # load data DATA_PATH = Path(__file__).parent / 'data' X_train, train_labels = load_data(DATA_PATH / 'train.txt') X_test, test_labels = load_data(DATA_PATH / 'test.txt') # binarize target labels multi_label_binarizer = MultiLabelBinarizer() y_train = multi_label_binarizer.fit_transform(train_labels) y_test = multi_label_binarizer.transform(test_labels) labels = multi_label_binarizer.classes_ # set initial seed SEED = 23249425 random.seed(SEED) # set 10 seeds for randomization of models local_seeds = {run: random.randint(1, 2**32 - 1) for run in range(10)} # get best parameters for OvR Model ovr_best_params = None if not simple_model: ovr_best_params = ovr_hyperparameter_optimization( X_train, y_train, labels, SEED)
stitch2se, se2name_mono = load_mono_se() mono_se_dict = { val: i for i, val in enumerate(sorted(se2name_mono.keys(), reverse=False)) } # create lists with pairs and se of each pair --------------------- labels = list() pairs = list() for combo in sorted(combo2se.keys()): labels.append(list(combo2se[combo])) pairs.append(list(combo2stitch[combo])) # one-hot-encode the target mlb_y = MultiLabelBinarizer() y = mlb_y.fit_transform(labels) # y_sparse = sparse.csr_matrix(y) del labels, combo2stitch, combo2se, se2name_mono # transform the dataset ------------------------------------ x = list() for pair in pairs: x.append([stitch2se.get(item, item) for item in pair]) left = [list(x[i][0]) for i in range(len(x))] right = [list(x[i][1]) for i in range(len(x))] del x, pairs, pair mlb = MultiLabelBinarizer()
(train['TicketPrefix'].unique(), test['TicketPrefix'].unique()), axis=0) ticket_prefixes = np.unique(ticket_prefixes) ticket_prefixes # Then, we binarize nominal features and impute missing values. The final feature vector is shown below. # In[74]: from sklearn_pandas import DataFrameMapper from sklearn.preprocessing import LabelEncoder, Normalizer, MultiLabelBinarizer, Imputer train['Embarked'].fillna('n/a', inplace=True) test['Embarked'].fillna('n/a', inplace=True) mapper = DataFrameMapper([('Sex', LabelEncoder()), (['Pclass'], MultiLabelBinarizer()), (['Age'], [Imputer(), Normalizer()]), ('SibSp', None), ('Parch', None), (['Fare'], [Imputer(), Normalizer()]), (['Cabin'], MultiLabelBinarizer()), (['Title'], MultiLabelBinarizer(classes=all_titles)), (['Embarked'], MultiLabelBinarizer()), (['TicketPrefix'], MultiLabelBinarizer(classes=ticket_prefixes))]) training_instances = mapper.fit_transform(train) training_labels = np.array(train['Survived']) print("X dimensions:") print(mapper.transformed_names_) # # Evaluating Classifiers
def eval(self, model, return_preds_and_labels=False):
    """
    Performs evaluation on a given model.

    :param model: The model on which to perform evaluation
    :type model: AdaptiveModel
    :param return_preds_and_labels: Whether to add preds and labels in the returned dicts
    :type return_preds_and_labels: bool
    :return all_results: A list of dictionaries, one for each prediction head. Each dictionary
                         contains the metrics and reports generated during evaluation.
    :rtype all_results: list of dicts
    """
    model.eval()

    # init empty lists per prediction head
    loss_all = [0 for _ in model.prediction_heads]
    preds_all = [[] for _ in model.prediction_heads]
    probs_all = [[] for _ in model.prediction_heads]
    label_all = [[] for _ in model.prediction_heads]
    ids_all = [[] for _ in model.prediction_heads]
    passage_start_t_all = [[] for _ in model.prediction_heads]

    for batch in tqdm(self.data_loader, desc="Evaluating", mininterval=10):
        batch = {key: batch[key].to(self.device) for key in batch}

        with torch.no_grad():
            logits = model.forward(**batch)
            losses_per_head = model.logits_to_loss_per_head(logits=logits, **batch)
            preds = model.logits_to_preds(logits=logits, **batch)
            probs = model.logits_to_probs(logits=logits, **batch)
            labels = model.prepare_labels(**batch)

        # stack results of all batches per prediction head
        for head_num, head in enumerate(model.prediction_heads):
            loss_all[head_num] += np.sum(to_numpy(losses_per_head[head_num]))
            preds_all[head_num] += list(to_numpy(preds[head_num]))
            probs_all[head_num] += list(to_numpy(probs[head_num]))
            label_all[head_num] += list(to_numpy(labels[head_num]))
            if head.model_type == "span_classification":
                # span heads need the sample ids and passage offsets to aggregate later
                ids_all[head_num] += list(to_numpy(batch["id"]))
                passage_start_t_all[head_num] += list(to_numpy(batch["passage_start_t"]))

    # Evaluate per prediction head
    all_results = []
    for head_num, head in enumerate(model.prediction_heads):
        multilabel = head.model_type == "multilabel_text_classification"
        if multilabel:
            # converting from string preds back to multi-hot encoding
            from sklearn.preprocessing import MultiLabelBinarizer
            mlb = MultiLabelBinarizer(classes=head.label_list)
            # TODO check why .fit() should be called on predictions, rather than on labels
            preds_all[head_num] = mlb.fit_transform(preds_all[head_num])
            label_all[head_num] = mlb.transform(label_all[head_num])
        if hasattr(head, 'aggregate_preds'):
            # Needed to convert NQ ids from np arrays to strings
            ids_all_str = [x.astype(str) for x in ids_all[head_num]]
            ids_all_list = [list(x) for x in ids_all_str]
            head_ids = ["-".join(x) for x in ids_all_list]
            preds_all[head_num], label_all[head_num] = head.aggregate_preds(
                preds=preds_all[head_num],
                labels=label_all[head_num],
                passage_start_t=passage_start_t_all[head_num],
                ids=head_ids)

        result = {"loss": loss_all[head_num] / len(self.data_loader.dataset),
                  "task_name": head.task_name}
        result.update(
            compute_metrics(metric=head.metric,
                            preds=preds_all[head_num],
                            probs=probs_all[head_num],
                            labels=label_all[head_num],
                            multilabel=multilabel)
        )

        # Select type of report depending on prediction head output type
        if self.report:
            try:
                result["report"] = compute_report_metrics(head, preds_all[head_num], label_all[head_num])
            # Fixed: was a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit during long evaluations.
            except Exception:
                logger.error(f"Couldn't create eval report for head {head_num} with following preds and labels:"
                             f"\n Preds: {preds_all[head_num]} \n Labels: {label_all[head_num]}")
                result["report"] = "Error"

        if return_preds_and_labels:
            result["preds"] = preds_all[head_num]
            result["labels"] = label_all[head_num]
            result["probs"] = probs_all[head_num]

        all_results.append(result)
    return all_results
def evalEmbCls(args):
    """Evaluate a graph/network embedding via multi-label classification.

    args  - parsed arguments (mode, network, embedding, metric, solver, kernel,
            dims/rdims, weighting, binarization and output options)
    """
    assert args, 'Valid args are expected'
    tstart = time.clock()
    tstampt = time.gmtime()
    rootdims = None  # Indices of the root dimensions
    dimrds = None  # Dimension density ratios relative to the possibly indirect super cluster (dimension), typically >= 1
    dimrws = None  # Dimension density ratios relative to the possibly indirect super cluster (dimension), typically <= 1
    dimwsim = None  # Dimension weights (significance ratios)
    dimwdis = None  # Dimension weights for the dissimilarity
    dimnds = None  # Dimensions members (nodes) number
    if args.mode == 'eval':
        # 1.1 Load labels
        mat = loadmat(args.network)  # Compressed Sparse Column format
        # A = mat[args.adj_matrix_name]
        # graph = sparse2graph(A)
        labels_matrix = mat[args.label_matrix_name]  # csc_matrix
        labels_count = labels_matrix.shape[1]
        mlb = MultiLabelBinarizer(range(labels_count))
        lbnds = labels_matrix.shape[0]  # The number of labeled nodes
    else:
        lbnds = None
    # 1.2 Load Embedding
    # model = KeyedVectors.load_word2vec_format(args.embedding, binary=False)
    dimweighted = False
    dis_features_matrix = None  # Dissimilarity features matrix
    embext = os.path.splitext(args.embedding)[1].lower()
    if embext == '.nvc':
        tld0 = time.clock()
        features_matrix, rootdims, dimrds, dimrws, dimwsim, dimwdis, dimnds = loadNvc(args.embedding)
        tldf = time.clock()
        print('Feature matrix loaded on {} sec'.format(int(tldf - tld0)))
        # Cut loaded data to rootdims if required
        if args.rdims:
            if args.dims is None:
                args.dims = 1  # or anything else <= len(rootdims)
            else:
                raise ValueError('Exclusive options --dimensions and --root-dims are specified')
        if args.dims is not None:  #rdims
            # Clamp the requested dimensions into [rootdims.size, total dims]
            if args.dims < rootdims.size:
                args.dims = rootdims.size
            if args.dims > features_matrix.shape[1]:
                args.dims = features_matrix.shape[1]
            print('Reduction to the {} dimensions E [{}, {}] started at {} sec'
                .format(args.dims, rootdims.size, features_matrix.shape[1], int(tldf)))
            # Cut the features_matrix to args.dims E rootdims .. totaldims
            fm = dok_matrix((features_matrix.shape[0], args.dims), dtype=features_matrix.dtype)
            # First, fill the root dims
            for j, idim in enumerate(rootdims):
                fm[:, j] = features_matrix.getcol(idim)
                # print('> colmat type: {}, shape: {}, attrs: {}'.format(type(colmat), colmat.shape, dir(colmat)))
            resdims = np.empty(args.dims, np.uint16)  # rootdims
            resdims[:rootdims.size] = rootdims
            if args.dims > rootdims.size:
                # Fill remained dimensions with the ones having max density step and not belongning to the root
                if dimnds is not None:
                    drds = [(i, d, dimnds[i]) for i, d in enumerate(dimrds)]
                    # Sort by increasing density step and then number of nodes
                    rdmin = min(dimrds)
                    drds.sort(key=lambda x: x[1] + x[2] / features_matrix.shape[0] * rdmin)
                else:
                    drds = [(i, d) for i, d in enumerate(dimrds)]
                    drds.sort(key=lambda x: x[1])
                # print('drds: ', drds[:5], '..', drds[-5:])
                # print('rootdims: ', [(i, dimnds[i]) for i in rootdims[:5]], '..', [(i, dimnds[i]) for i in rootdims[-5:]])
                droot = set(rootdims)
                # Take the highest-ranked non-root dimensions until args.dims is reached
                for j in range(rootdims.size, args.dims):
                    idim = drds.pop()[0]
                    while idim in droot:
                        idim = drds.pop()[0]
                    resdims[j] = idim
                    fm[:, j] = features_matrix.getcol(idim)
            rootdims = None
            features_matrix = fm
            del fm
            trd1 = time.clock()
            print(' features_matrix reduction completed within {} sec'.format(int(trd1 - tldf)))
            # Cut the accessory arrays to rootdims
            arrs = [dimrds, dimrws, dimwsim, dimwdis]
            for ia, arr in enumerate(arrs):
                # Omit None arrays
                if arr is None:
                    continue
                tarr = np.empty(resdims.size, arr.dtype)
                for i, ir in enumerate(resdims):
                    tarr[i] = arr[ir]
                arrs[ia] = tarr
            resdims = None
            print(' reduction of the accessory loaded data completed on {} sec'.format(int(time.clock() - trd1)))
        allnds = features_matrix.shape[0]
        if lbnds and allnds > lbnds and adjustRows(lbnds, features_matrix, True):
            print('WARNING, embedding matrices are reduced to the number of nodes in the labels matrix: {} -> {}'
                .format(allnds, lbnds), file=sys.stderr)
        # Omit dissimilarity weighting if required
        if args.no_dissim:
            dimwdis = None
        dimweighted = args.weighted_dims and dimwsim is not None
        if dimweighted:
            print('Node vectors are corrected with the dimension weights')
            if dimwdis is not None:
                dis_features_matrix = features_matrix.copy()
            w0 = 1E-8  # Zero weight placeholder
            for (i, j), v in features_matrix.items():
                # Note: Weights cutting must be applied before the dimensions significance consideration
                # w0 is used because 0 assignement does not work in the cycle affecting the dictionary size
                features_matrix[i, j] = v * dimwsim[j] if not args.dim_vmin or v >= args.dim_vmin else w0
            if dis_features_matrix is not None:
                for (i, j), v in dis_features_matrix.items():
                    dis_features_matrix[i, j] = v * dimwdis[j] if not args.dim_vmin or v >= args.dim_vmin else w0
                dis_features_matrix = dis_features_matrix.toarray()  #.todense() # order='C'
                if OPTIMIZED:
                    sm.quantify(dis_features_matrix, sm.CMP_LE, w0, 0)
                else:
                    # NOTE(review): np.where result is discarded here, so the w0
                    # placeholders are not actually zeroed in this branch — verify intent
                    np.where(dis_features_matrix > w0, dis_features_matrix, 0)
        features_matrix = features_matrix.toarray()  #.todense() # order='C'
        if dimweighted:
            if OPTIMIZED:
                sm.quantify(features_matrix, sm.CMP_LE, w0, 0)
            else:
                # NOTE(review): result discarded, see note above
                np.where(features_matrix > w0, features_matrix, 0)
    else:
        features_matrix = None
        if embext == '.mat':
            mat = loadmat(args.embedding)
            # Map nodes to their features
            features_matrix = np.array(mat['embs'], dtype=np.float32, order='C')
            del mat
        elif embext == '.csv':
            features_matrix = np.loadtxt(args.embedding, dtype=np.float32, delimiter=',')
        else:  # ssv
            # Try to parse the file as space separated values
            features_matrix = np.loadtxt(args.embedding, dtype=np.float32)
            #raise ValueError('Embedding in the unknown format is specified: ' + args.embedding)
        allnds = features_matrix.shape[0]
        # Ensure that the adday can be resized if required (owns it's data ranther than a view)
        if lbnds and allnds > lbnds:
            if isinstance(features_matrix, np.ndarray) and not features_matrix.flags['OWNDATA']:
                features_matrix = features_matrix[:lbnds, ...]
            else:
                reduced = adjustRows(lbnds, features_matrix, True)
                assert reduced, 'features_matrix is expected to be reduced from {} to {} items'.format(allnds, lbnds)
            embname = os.path.splitext(args.embedding)[0]
            # COnsider that .nvc embeddings support multiple options on loading and should be retained
            if embext != '.nvc':
                embrds = embname + '.mat'
                embdir, namext = os.path.split(args.embedding)
                move(args.embedding, ''.join((embdir, '/', 'full_', namext)))
            else:
                embrds = ''.join((embname, '_rds', str(lbnds), '.mat'))
            print('WARNING, features matrix is reduced to the number of nodes in the labels matrix: {} -> {}.'
                ' Saving the reduced features to the {}...'
                .format(allnds, lbnds, embrds), file=sys.stderr)
            savemat(embrds, mdict={'embs': features_matrix})
    # Cut weights lower dim_vmin if required
    if args.dim_vmin and not dimweighted:
        if OPTIMIZED:
            sm.quantify(features_matrix, sm.CMP_LT, args.dim_vmin, 0)
        else:
            # NOTE(review): np.where result discarded here as well — verify intent
            np.where(features_matrix >= args.dim_vmin, features_matrix, 0)
    # Binarize if required in case of hamming distance evaluation
    if args.binarize:
        medbin = args.metric == 'hamming'  # Binarize to the median instead of reducing mean square error
        sm.binarize(features_matrix, medbin)
        if dis_features_matrix is not None:
            sm.binarize(dis_features_matrix, medbin)
    assert args.metric != 'jacnop' or (features_matrix.max() <= 1 and features_matrix.min() >= -1), (
        'Jacnop should be applied only to the features matrix normalized to 1, i.e. max(abs(mat)) = 1')

    # Generate Gram (nodes similarity) matrix only -----------------------------
    if args.mode == 'gram':
        # Note: metric here is distance metric = 1 - sim_metric
        if OPTIMIZED:
            gram = np.empty((features_matrix.shape[0], features_matrix.shape[0]), dtype=ValT)
            metid = sm.sim_id(args.metric)
        else:
            metric = args.metric
            # Explicitly assign jaccard distance because other metrics are implicitly used as distance metrics
            if metric == 'jaccard':
                metric = dist_jaccard
            elif metric == 'jacnop':
                metric = dist_jacnop
                # metric = lambda u, v: 1 - sm.sim_jaccard(u, v)
        if dis_features_matrix is None:
            if OPTIMIZED:
                # Note: pdist takes too much time with custom dist funciton: 1m46 sec for cosine, 40 sec for jaccard vs 8 sec for "cosine"
                sm.pairsim(gram, features_matrix, metid)
                # gram2 = squareform(ValT(1) - pdist(X_train, metric))  # cosine, jaccard, hamming
                # print('Gram:\n', gram, '\nOrig Gram:\n', gram2)
            else:
                gram = squareform(ValT(1) - pdist(features_matrix, metric))  # cosine, jaccard, hamming
        else:
            if OPTIMIZED:
                sm.pairsimdis(gram, features_matrix, dis_features_matrix, metid)
            else:
                if metric == 'cosine':
                    metric = dist_cosine
                elif metric == 'hamming':
                    metric = dist_hamming
                # NOTE(review): this nested OPTIMIZED check sits inside the
                # non-OPTIMIZED branch, so dis_metric may be undefined below — verify
                if OPTIMIZED:
                    dis_metric = sm.dissim
                # else:
                #     dis_metric = metric  # Note: 1-sim metric performs less accurate than the custom dissimilarity metric
                gram = pairsimdis(features_matrix, dis_features_matrix, metric, dis_metric)
        # Save resulting Gram (network nodes similarity) matrix
        savemat(args.output, mdict={'gram': gram})
        return

    # Evaluate Embedding ------------------------------------------------------
    # Map nodes to their features (note: assumes nodes are labeled as integers 1:N)
    # features_matrix = np.asarray([model[str(node)] for node in range(len(graph))])

    # 2. Shuffle, to create train/test groups
    assert labels_matrix.shape[0] == features_matrix.shape[0], 'All evaluating nodes are expected to be labeled'
    shuffles = []
    for x in range(args.num_shuffles):
        if dis_features_matrix is not None:
            shuffles.append(skshuffle(features_matrix, dis_features_matrix, labels_matrix))
        else:
            shuffles.append(skshuffle(features_matrix, labels_matrix))

    # 3. to score each train/test group
    # all_results = defaultdict(list)
    if args.all:
        training_percents = np.asarray(range(1, 10)) * .1
    else:
        training_percents = _trainperc_dfl
    averages = ["micro", "macro"]
    # res[shuffle, train_percent, average] holds the F1 scores; NaN until filled
    res = np.full([args.num_shuffles, len(training_percents), len(averages)], np.nan, dtype=ValT)
    # for train_percent in training_percents:
    #     for shuf in shuffles:
    Xdis = None
    Xdis_train = None
    res_ave = None  # Average results
    ii = 0
    jj = 0
    try:
        for ii, train_percent in enumerate(training_percents):
            training_size = int(train_percent * features_matrix.shape[0])
            if OPTIMIZED:
                gram = np.empty((training_size, training_size), dtype=ValT)
                gram_test = np.empty((features_matrix.shape[0] - training_size, training_size), dtype=ValT)
            for jj, shuf in enumerate(shuffles):
                print('Training set #{} ({:.1%}), shuffle #{}'.format(ii, train_percent, jj))
                if dis_features_matrix is not None:
                    X, Xdis, y = shuf
                    #assert len(X) == len(Xdis), 'Feature matrix partitions validation failed'
                else:
                    X, y = shuf
                # training_size = int(train_percent * X.shape[0])
                X_train = X[:training_size]
                if dis_features_matrix is not None:
                    Xdis_train = Xdis[:training_size]
                y_train_ = y[:training_size]
                X_test = X[training_size:]
                if dis_features_matrix is not None:
                    Xdis_test = Xdis[training_size:]
                if OPTIMIZED:
                    y_test = sm.colindicesnz(y[training_size:].tocoo())
                else:
                    # Collect, per test node, the column indices of its labels
                    cy = y[training_size:].tocoo()
                    y_test = [[] for _ in range(cy.shape[0])]
                    for i, j in zip(cy.row, cy.col):
                        y_test[i].append(j)
                    cy = None
                # find out how many labels should be predicted
                top_k_list = [len(l) for l in y_test]
                # Classification strategy and similarity matrices
                # clf = TopKRanker(SVC(kernel=args.kernel, cache_size=4096, probability=True), 1)  # TopKRanker(LogisticRegression())
                clf = None
                clweight = 'balanced' if args.balance_classes else None
                if args.solver is None:
                    clf = TopKRanker(SVC(kernel=args.kernel, cache_size=4096, probability=True,
                        class_weight=clweight, gamma='scale'))  # TopKRanker(LogisticRegression())
                else:
                    clf = TopKRanker(LogisticRegression(solver=args.solver, class_weight=clweight, max_iter=512))
                if args.solver is None and args.kernel == 'precomputed':
                    # Note: metric here is distance metric = 1 - sim_metric
                    if OPTIMIZED:
                        metid = sm.sim_id(args.metric)
                    else:
                        metric = args.metric
                        # Explicitly assign jaccard distance because other metrics are implicitly used as distance metrics
                        if metric == 'jaccard':
                            metric = dist_jaccard
                        elif metric == 'jacnop':
                            metric = dist_jacnop
                            # metric = lambda u, v: 1 - sm.sim_jaccard(u, v)
                    if dis_features_matrix is None:
                        if OPTIMIZED:
                            # Note: pdist takes too much time with custom dist funciton: 1m46 sec for cosine, 40 sec for jaccard vs 8 sec for "cosine"
                            sm.pairsim(gram, X_train, metid)
                            # gram2 = squareform(ValT(1) - pdist(X_train, metric))  # cosine, jaccard, hamming
                            # print('Gram:\n', gram, '\nOrig Gram:\n', gram2)
                            sm.pairsim2(gram_test, X_test, X_train, metid)
                            # gram_test2 = ValT(1) - cdist(X_test, X_train, metric);
                            # print('\n\nGram test:\n', gram_test, '\nOrig Gram test:\n', gram_test2)
                        else:
                            gram = squareform(ValT(1) - pdist(X_train, metric))  # cosine, jaccard, hamming
                            gram_test = ValT(1) - cdist(X_test, X_train, metric);
                    else:
                        if OPTIMIZED:
                            sm.pairsimdis(gram, X_train, Xdis_train, metid)
                            sm.pairsimdis2(gram_test, X_test, X_train, Xdis_test, Xdis_train, metid)
                        else:
                            if metric == 'cosine':
                                metric = dist_cosine
                            elif metric == 'hamming':
                                metric = dist_hamming
                            # NOTE(review): nested OPTIMIZED check inside the
                            # non-OPTIMIZED branch; dis_metric may be undefined — verify
                            if OPTIMIZED:
                                dis_metric = sm.dissim
                            # else:
                            #     dis_metric = metric  # Note: 1-sim metric performs less accurate than the custom dissimilarity metric
                            gram = pairsimdis(X_train, Xdis_train, metric, dis_metric)
                            # gram_test = 1 - cdist(X_test, X_train, metric);
                            #gram_test = np.empty((len(X_test), training_size), dtype=ValT)
                            for i in range(len(X_test)):
                                for j in range(training_size):
                                    # gram_test[i, j] = ValT(1) - metric(X_test[i], X_train[j]) - dis_metric(Xdis_test[i], Xdis_train[j])
                                    # Note: positive gram matrix yields abit more accurate resutls
                                    gram_test[i, j] = ValT(1) - (metric(X_test[i], X_train[j])
                                        + dis_metric(Xdis_test[i], Xdis_train[j])) / ValT(2)
                    clf.fit(gram, y_train_)
                    preds = clf.predict(gram_test, top_k_list)
                else:
                    clf.fit(X_train, y_train_)
                    preds = clf.predict(X_test, top_k_list)
                # results = {}
                #
                # for average in averages:
                #     results[average] = f1_score(mlb.fit_transform(y_test), mlb.fit_transform(preds), average=average)
                #
                # all_results[train_percent].append(res)
                for kk,average in enumerate(averages):
                    res[jj,ii,kk] = f1_score(mlb.fit_transform(y_test), mlb.fit_transform(preds), average=average)
    finally:
        # Report and persist whatever results were accumulated, even on interruption
        res_ave = np.nanmean(res, 0)
        res_std = np.nanstd(res, 0)
        print("F1 [micro macro]:")
        print(res_ave)
        if len(res_ave) >= 2:
            finres = np.nanmean(res_ave, 0)
            finstd = np.nanmean(res_std, 0)
            print("Average: {:.4F} ({:.4F}), {:.4F}".format(finres[0], finstd[0], finres[1]))
        else:
            finres = res_ave
            finstd = res_std
        if args.output and ii + jj >= 1:  # Output only non-empty results; np.nansum(res_ave, 0) != 0
            hbrief = np.uint16(0)
            if args.accuracy_detailed:
                # Evaluate 2-byte hash of the input args
                hf = md5()
                hf.update(' '.join(sys.argv).encode())
                for i, b in enumerate(hf.digest()):
                    hbrief = hbrief ^ b << (8 if i%2 else 0)
                # Output detailed accuracy results
                dname, fname = os.path.split(args.embedding)
                acrname = ''.join((dname, '/acr_', os.path.splitext(fname)[0], '_', str(hbrief), '.mat'))
                print('The detailed accuracy results are saved to: ', acrname)
                try:
                    savemat(acrname, mdict={'res': res})
                except IOError as err:
                    print('WARNING, detailed accuracy results saving falied to {}: {}'
                        .format(acrname, err), file=sys.stderr)
            with open(args.output, 'a') as fres:
                # Output the Header if required
                if not fres.tell():
                    fres.write('Dims\tWgh\tBin\tMetric \tNDs\tDVmin\t F1mic\tF1miSD\t F1mac\t Solver'
                        '\tBCl\t ExecTime\t Folds\t StartTime \tInpHash\tEmbeds\n')
                # File name of the embedding and Dimensions number
                print('{: >4}\t{: >3d}\t{: >3d}\t'.format(features_matrix.shape[1], args.weighted_dims, args.binarize)
                    , file=fres, end='')
                # Similarity Metric, weighting, no-dissim and dim-val-min
                if args.solver is None and args.kernel == 'precomputed':
                    print('{: <7}\t{: >3d}\t'.format(args.metric[:7]
                        , args.no_dissim), file=fres, end='')
                else:
                    print('{: <7}\t{: >3}\t'.format('-', '-'), file=fres, end='')
                # F1 micro and macro (average value)
                print('{:<.4F}\t {:<.4F}\t{:<.4F}\t {:<.4F}\t '.format(
                    args.dim_vmin, finres[0], finstd[0], finres[1]), file=fres, end='')
                # Solver and execution time
                print('{: >6}\t{: >3}\t {: >8d}\t'.format(
                    (args.kernel if args.solver is None else args.solver)[:6]
                    , int(args.balance_classes), int(time.clock() - tstart)), file=fres, end='')
                # Folds and the timestamp
                # Correct folds to show counts instead of indices
                jj += 1
                if jj == args.num_shuffles:
                    ii += 1
                print('{: >2}.{:0>2}/{: >2}.{:0>2}\t {}\t'.format(ii, jj, res.shape[1], res.shape[0]
                    , time.strftime('%y-%m-%d_%H:%M:%S', tstampt)), file=fres, end='')
                print('{: >7}\t{}\n'.format(str(hbrief) if hbrief else '-'
                    , os.path.split(args.embedding)[1]), file=fres, end='')
print("Number of unique questions in this dataset " + str(len(unique_ids)) ) #this is the length of bit vector (number of unique qual_ids) # generate vectors to give to fit_transform in multilabelbinarizer to further generate unique 1-hot encoding transform_ids = [] for i in unique_ids: transform_ids.append([i]) transform_labels = [] for i in unique_labels: transform_labels.append([i]) # In[5]: # generate dictionary that maps labels and qual_ids to their respective 1-hot encoding enc = MultiLabelBinarizer() qual_ids_1hot = (enc.fit_transform(transform_ids)).astype(float) qual_ids_classes = enc.classes_ qual_ids_dict = dict(zip(unique_ids, qual_ids_1hot)) labels_1hot = enc.fit_transform(transform_labels).astype(float) labels_classes = enc.classes_ labels_dict = dict(zip(unique_labels, labels_1hot)) # In[6]: # generate final encoding final_encoding = [] second_try_flag = False for i in student_vectors: #loop over all the students interactions_vector = [] for j in student_vectors[
#print(one_hot.fit_transform(feature)) #print(one_hot.classes_) #Reverse one-Hot Encoding #print(one_hot.inverse_transform(one_hot.transform(feature))) #print(pd.get_dummies(feature[:,0])) #Multiclass One-Hot encoding multiclass_feature = [("Texas", "Florida"), ("California", "Alabama"), ("Texas", "Florida"), ("Delware", "Florida"), ("Texas", "Alabama")] one_hot_multiclass = MultiLabelBinarizer() #print(one_hot_multiclass.fit_transform(multiclass_feature)) #print(one_hot_multiclass.classes_) #Encoding Ordinal Categories Features dataframe = pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium", "High"]}) scale_mapper = {"Low": 1, "Medium": 2, "High": 3} #print(dataframe['Score'].replace(scale_mapper)) dataframe = pd.DataFrame({ "Score":
def select_data(XX, YY, ctype, min_samples, outputfolder):
    """Select samples for one PTB-XL-style classification task and multi-hot encode its labels.

    Parameters
    ----------
    XX : array-like of samples, row-aligned with YY
    YY : pandas DataFrame whose task columns (e.g. 'diagnostic', 'form', 'all_scp')
         hold a list of label names per record; for 'diagnostic' a precomputed
         'diagnostic_len' column is expected to exist already
    ctype : one of 'diagnostic', 'subdiagnostic', 'superdiagnostic', 'form',
            'rhythm', 'all'
    min_samples : for every task except 'diagnostic', drop labels occurring
                  `min_samples` times or fewer before selecting records
    outputfolder : directory prefix where 'mlb.pkl' is written

    Returns
    -------
    (X, Y, y, mlb): selected samples, selected label rows, the multi-hot label
    matrix, and the fitted MultiLabelBinarizer (also pickled to disk).

    Raises
    ------
    ValueError for an unknown `ctype` (previously this fell through `else: pass`,
    pickled an unfit binarizer and then crashed with NameError at the return).

    Note: like the original, this mutates YY in place for the filtered tasks
    (rewrites the label column and adds/overwrites the '<col>_len' column).
    """
    mlb = MultiLabelBinarizer()

    # label column holding the per-record class list, for each supported task
    label_cols = {
        'diagnostic': 'diagnostic',
        'subdiagnostic': 'subdiagnostic',
        'superdiagnostic': 'superdiagnostic',
        'form': 'form',
        'rhythm': 'rhythm',
        'all': 'all_scp',
    }
    if ctype not in label_cols:
        raise ValueError('Unknown ctype: {!r}'.format(ctype))
    col = label_cols[ctype]

    if ctype != 'diagnostic':
        # filter: keep only labels observed strictly more than min_samples times
        counts = pd.Series(np.concatenate(YY[col].values)).value_counts()
        counts = counts[counts > min_samples]
        YY[col] = YY[col].apply(
            lambda x: list(set(x).intersection(set(counts.index.values))))
        YY[col + '_len'] = YY[col].apply(lambda x: len(x))

    # select records that kept at least one label, then fit/encode
    mask = YY[col + '_len'] > 0
    X = XX[mask]
    Y = YY[mask]
    mlb.fit(Y[col].values)
    y = mlb.transform(Y[col].values)

    # save LabelBinarizer
    with open(outputfolder + 'mlb.pkl', 'wb') as tokenizer:
        pickle.dump(mlb, tokenizer)

    return X, Y, y, mlb
def transform(self, X): return MultiLabelBinarizer(classes=self.class_labels).fit_transform(X)