def evaluate_solution(users, urecovered, observed_index, xs=None, E=None, hidden_edges=None):
    """Evaluate the quality of the recovered user profile.

    Returns a tuple (mse, f1):
      mse -- mean squared error between true and recovered profiles on the
             observed rows only.
      f1  -- sample-averaged F1 of predicted edge directions on the hidden
             edges, or None when no hidden edges were provided.

    NOTE(review): assumes `xs` is a 2-D array whose columns are "directions"
    and `E` maps an edge key to an iterable of direction indices — confirm
    against callers.
    """
    # Profile recovery error is measured only where ground truth was observed.
    mse = mean_squared_error(users[observed_index, :], urecovered[observed_index, :])
    if hidden_edges is None or len(hidden_edges) < 1:
        return mse, None
    # Multi-hot encode the gold directions of every hidden edge, with a fixed
    # class set so gold and pred matrices align column-wise.
    labeler = MultiLabelBinarizer(classes=np.arange(xs.shape[1]))
    gold = labeler.fit_transform([E[e] for e in sorted(hidden_edges)])
    # gold = np.array([E[e] for e in sorted(hidden_edges)])
    eh = sorted(hidden_edges)
    heads, tails = zip(*eh)
    # Cr[u, d]: affinity of user u with direction d; Dr measures how much the
    # two endpoints of each hidden edge disagree per direction.
    Cr = np.dot(urecovered, xs.T)
    Dr = np.abs(Cr[heads, :] - Cr[tails, :])
    # TODO prediction here could be better: instead of predict the k best
    # directions all the time, look at revealed edge to compute threshold of
    # similarity (i.e replace 0.05)
    best_dirs = np.argsort(Dr, 1).astype(int)[:, :2]
    pred = []
    for all_dir, suggestion in zip(Dr, best_dirs):
        # Always keep the closest direction; add the runner-up only when the
        # endpoints' disagreement on it is below the (hard-coded) threshold.
        my_pred = [suggestion[0]]
        if all_dir[suggestion[1]] < 0.05:
            my_pred.append(suggestion[1])
        pred.append(my_pred)
    pred = labeler.fit_transform(pred)
    return mse, f1_score(gold, pred, average='samples')
def run_classifier(sentences, labels, test_doc_list, output_file_path_list):
    """Train a one-vs-rest linear SVM on TF-IDF features and, for each test
    document, write an XML file tagging every sentence with its predicted
    classes."""
    import numpy as np
    train_matrix, tfidf = tf_idf_fit_transform(sentences)

    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    label_matrix = mlb.fit_transform(labels)

    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC
    classifier = OneVsRestClassifier(LinearSVC(), n_jobs=-1)
    classifier.fit(train_matrix, label_matrix)

    for test_doc, output_file_path in zip(test_doc_list, output_file_path_list):
        test_sentences = doc2sentences([test_doc])
        sentence_matrix = tfidf.transform(test_sentences)
        print("Shape of sentence matrix : ", sentence_matrix.shape)
        predictions = classifier.predict(sentence_matrix)

        from lxml import etree
        document = etree.Element('doc')
        doc_tree = etree.ElementTree(document)
        for row, sentence in enumerate(test_sentences):
            assigned = [mlb.classes_[col]
                        for col in range(predictions.shape[1])
                        if predictions[row][col] == 1]
            etree.SubElement(document, "Sent",
                             classes=", ".join(assigned)).text = sentence
        doc_tree.write(output_file_path)
def generateTrainFeatures(L):
    """
    This function generates the training data features and its target labels.
    Input: L : The number of training data
    Output: trainX -> a (L * 2000) numpy matrix representing the 2000 features
                      for each of the L training samples
            trainY -> (L * 185) numpy matrix representing the target class of
                      the training samples
    Logic:
    The input text is read, preprocessed to remove stop words, and is appended
    to a list. Similarly, each of the target class values are read into a list.
    Sklearn package TFIDF vectorizer is used for generating TFIDF matrix for
    the 2000 frequent words. The multi-label classification algorithms require
    a target Y variable of the form, (nsamples * nclasses), multilabel
    binarizer is used for converting the list of classes to a matrix form.

    NOTE(review): Python 2 code (`raw_input`); relies on module-level
    `vectorizer`, `readInput`, and the `classOrder` global.
    """
    global classOrder
    X = []
    Y = []
    # read the input: each sample is one line of class ids followed by the text
    for i in range(L):
        categories = raw_input()
        target = [int(y) for y in categories.split(" ")]
        # first token is not a class id (presumably a count or sample id —
        # TODO confirm), so drop it
        del target[0]
        meaningfulWords = readInput()
        Y.append(target)
        X.append(meaningfulWords)
    # construct TF-IDF matrix representing the features
    trainX = vectorizer.fit_transform(X).toarray()
    # convert the target label list to a suitable matrix form
    mlb = MultiLabelBinarizer()
    trainY = mlb.fit_transform(Y)
    # for representing the order of the classes
    classOrder = mlb.classes_
    return (trainX, trainY)
def read_all_data(p):
    """Join the pickled product dataframe with the image directory and return
    (image array, description array, multi-hot category matrix) for every
    product that has an image."""
    df = pd.read_pickle("frame_no_stem.pkl")
    images = __read_all_images("images/")
    print("Finished reading images")

    x_images, x_desc, y_category = [], [], []
    all_categories = set()
    for asin in df.index.values:
        if asin not in images:
            continue
        x_images.append(images[asin])
        row = df.loc[asin]
        x_desc.append(row.description)
        y_category.append(row.categories)
        all_categories.update(row.categories)
    print("Finished reading dataframe")

    mlb = MultiLabelBinarizer()
    y_total = mlb.fit_transform(y_category)
    return np.array(x_images), np.array(x_desc), y_total
def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    vectorizer = TfidfVectorizer(stop_words=stopwords.words("english"))
    mlb = MultiLabelBinarizer()

    # Split the corpus file ids by their directory prefix.
    all_ids = reuters.fileids()
    train_ids = [doc for doc in all_ids if doc.startswith('training/')]
    test_ids = [doc for doc in all_ids if doc.startswith('test/')]

    train_raw = [reuters.raw(doc_id) for doc_id in train_ids]
    test_raw = [reuters.raw(doc_id) for doc_id in test_ids]

    # Fit on the training split only; the test split is transformed with the
    # same vocabulary / label classes.
    x_train = vectorizer.fit_transform(train_raw).toarray()
    x_test = vectorizer.transform(test_raw).toarray()
    y_train = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train_ids])
    y_test = mlb.transform([reuters.categories(doc_id) for doc_id in test_ids])

    return {'x_train': x_train,
            'y_train': y_train,
            'x_test': x_test,
            'y_test': y_test,
            'labels': globals()["labels"]}
def get_training_data(window_size_ms, train_time_sec=30):
    """Interactively record labelled spectral training data.

    For each key (0, 1, ...) the user presses <enter> and spectral windows are
    captured for `train_time_sec` seconds; entering any non-empty string quits.
    Returns (X, y) where X is the array of spectra and y the multi-hot label
    matrix. Python 2 code (`print` statement, `raw_input`).
    """
    #loop until empty input is detected
    X = []
    y = []
    print "Training time for each key is {} seconds".format(train_time_sec)
    i = 0
    while True:
        s = raw_input('Press <enter> to begin training key {} or q-<enter> to quit'.format(i))
        if s:
            break
        j = 0
        # Each read covers window_size_ms, so advance the elapsed-time counter
        # by that much per capture until the training budget is spent.
        while j < train_time_sec:
            j += (window_size_ms / float(1000))
            freq_spect = read_spectral_data_for_time(window_size_ms)
            X.append(freq_spect)
            y.append([i])
        #increment key counter
        i += 1
    # Labels are single-element lists so the binarizer yields one column per key.
    mb = MultiLabelBinarizer()
    y = mb.fit_transform(y)
    X = np.asarray(X)
    y = np.asarray(y)
    return X, y
def main():
    """Vectorize the reviews corpus and pickle the TF-IDF features, the
    binarized targets, and the fitted binarizer. Python 2 code
    (`print` statement, text-mode pickling)."""
    #Explore the data for how many class labels
    reviewsDict = {}
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/reviewUsefulDict.pickle") as f:
        reviewsDict = pickle.load(f)
    print "Reviews Dictionary loaded .. "
    '''
    usefulCountDict = {}
    for key, value in reviewsDict.iteritems():
        if value not in usefulCountDict:
            usefulCountDict[value] = 1
        else:
            usefulCountDict[value] = usefulCountDict[value]+1
    pprint(usefulCountDict)
    '''
    # corpus: review texts; target: per-review label collections
    corpus, target = DictToList(reviewsDict)
    vectorizer = TfidfVectorizer(stop_words="english", max_df=0.5, sublinear_tf=True)
    XAll = vectorizer.fit_transform(corpus)
    mlb = MultiLabelBinarizer()
    yAll = mlb.fit_transform(target)
    # Persist features, targets, and the binarizer (needed later to decode
    # predictions back to label names).
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.fv", 'w') as f:
        pickle.dump(XAll, f)
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.target2", 'w') as f:
        pickle.dump(yAll, f)
    with open("/Users/huzefa/Workspace/College-Fall-2015/Search/Dataset/Task2/Onlyreviews.mlb", 'w') as f:
        pickle.dump(mlb, f)
    print "Dumped featrue vectors .... "
def run_classifier(sentences, labels, test_docs):
    """Train a one-vs-rest linear SVM on TF-IDF features of `sentences` and
    write (sentence, predicted-labels) rows for every sentence of the test
    documents to classified.csv."""
    import numpy as np
    train_matrix, tfidf = tf_idf_fit_transform(sentences)

    test_sentences = doc2sentences(test_docs)
    sentence_matrix = tfidf.transform(test_sentences)
    print("Shape of sentence matrix : ", sentence_matrix.shape)

    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    label_matrix = mlb.fit_transform(labels)

    from sklearn.multiclass import OneVsRestClassifier
    # BUG FIX: the class is `LinearSVC`, not `linearSVC` — the original import
    # raised ImportError at runtime.
    from sklearn.svm import LinearSVC
    # estimator = SVC(kernel='linear')
    estimator = LinearSVC()
    classifier = OneVsRestClassifier(estimator, n_jobs=-1)
    classifier.fit(train_matrix, label_matrix)
    predictions = classifier.predict(sentence_matrix)

    import csv
    with open("classified.csv", "w") as fl:
        writer = csv.writer(fl)
        for i in range(len(test_sentences)):
            curr_pred = [mlb.classes_[x]
                         for x in range(predictions.shape[1])
                         if predictions[i][x] == 1]
            writer.writerow((test_sentences[i], curr_pred))
def perform_train_test_split(db_name=ds.DEFAULT_DB_NAME, train_size=ds.DEFAULT_TRAININGSET_SIZE):
    """
    Get all document_ids of given database and split's it according to given
    train_size. The tricky part is that we n
    :param db_name: Name of database to split documents (default DEFAULT_DB_NAME)
    :param train_size: Size in percentage [0,1] of the training set.
    :return splitted_dataset - List of lists
        [[DEFAULT_DATASET_LIST_INDEX_TRAINING], [DEFAULT_DATASET_LIST_INDEX_TEST]]
    """
    database = db.couch_database(db_name)
    all_docs = database.getAllDocumentsFromDatabase()
    doc_ids_list = []
    all_tag_list = []
    i = 0
    for row in all_docs.rows:
        document = row.doc
        #append the document id to doc_ids_list
        doc_ids_list.append(document[cp.COUCHDB_DOCUMENT_FIELD_ID])
        tag_list = []
        #if document has tags than split and add them
        if pp.STACKEXCHANGE_TAGS_COLUM in document.keys():
            document_tags = document[pp.STACKEXCHANGE_TAGS_COLUM]
            tags_list = document_tags.split(sep=dtm_provider.TAG_SPLIT_separator)
            for tag in tags_list:
                #remove the closing tag (last item)
                tag_list.append(tag[:-1])
        #append the list of document tags to all_tag_list
        all_tag_list.append(tag_list)
        i += 1
        # safety cap: only consider the first ~10000 documents
        if i > 10000:
            break
    mlb = MultiLabelBinarizer()
    tags_encoded = mlb.fit_transform(all_tag_list)
    print(len(doc_ids_list))
    # BUG FIX: pass the caller's train_size through instead of a hard-coded
    # 0.8, and actually return the split (the original computed it and
    # returned None despite the docstring).
    splitted_dataset = cross_validation.train_test_split(doc_ids_list, tags_encoded,
                                                         train_size=train_size,
                                                         random_state=42,
                                                         stratify=tags_encoded)
    return splitted_dataset
class VectorizedData:
    """ Simple container that holds the input dataset
    in a sklearn-friendly form, with X, y numpy vectors.

    TODO: we ignore # of matches for each fbpath """
    def __init__(self, data, Xdict=None, Ydict=None):
        # Convert each question to a feature dict / label set via helpers.
        fdict = [q_to_fdict(q) for q in data]
        lset = [q_to_lset(q) for q in data]

        # Fit new vectorizers when none are supplied (training set);
        # otherwise reuse the fitted ones (test set).
        if Xdict is None:
            self.Xdict = DictVectorizer()
            self.X = self.Xdict.fit_transform(fdict)
        else:
            self.Xdict = Xdict
            self.X = self.Xdict.transform(fdict)

        if Ydict is None:
            self.Ydict = MultiLabelBinarizer()
            self.Y = self.Ydict.fit_transform(lset)
        else:
            self.Ydict = Ydict
            # Filter out data with unknown labels, MultiLabelBinarizer() cannot
            # handle this
            known_lset = [set([label for label in ls if label in self.Ydict.classes_]) for ls in lset]
            lset_n = sum([len(ls) for ls in lset])
            known_lset_n = sum([len(ls) for ls in known_lset])
            if known_lset_n < lset_n:
                print('dropped %d out of %d labels (not in training set)' % (lset_n - known_lset_n, lset_n), file=sys.stderr)
            self.Y = self.Ydict.transform(known_lset)

    def cfier_score(self, cfier, scorer):
        """ Measure cfier performance on this dataset.

        scorer -> lambda cfier, X: cfier.predict_proba(X)
        (or decision_function when probabilities not predicted)

        Returns a dict of metrics: sklearn's own score, question-level
        recall (all / any correct path recalled), path-level precision,
        and MRR of the scores.
        """
        skl_score = cfier.score(self.X.toarray(), self.Y)

        # XXX: Matched paths might/could be weighted by their nMatches too...

        # Measure prediction performance
        Ypred = cfier.predict(self.X.toarray())
        n_q = float(np.size(self.Y, axis=0))
        # number of questions where all correct paths have been recalled
        recall_all = np.sum(np.sum(self.Y, axis=1) == np.sum(Ypred * self.Y, axis=1)) / n_q
        # number of questions where at least one correct path has been recalled
        recall_any = np.sum((np.sum(self.Y, axis=1) != 0) == (np.sum(Ypred * self.Y, axis=1) != 0)) / n_q
        # number of *PATHS* (not q.) that were correct
        precision = np.sum((Ypred + self.Y) == 2) / float(np.sum(Ypred))

        # Measure scoring performance
        Yscores = scorer(cfier, self.X.toarray())
        # MRR of first correct path
        mrr = mrr_by_score(self.Y, Yscores)
        # number of questions where at least one correct path has been recalled in top N paths
        # TODO

        return {'sklScore': skl_score, 'qRecallAll': recall_all, 'qRecallAny': recall_any, 'pPrec': precision, 'qScoreMRR': mrr}
def createDataMatrix(ngram_features, character_gram_features,tweetText, pos, pos_features, different_pos_tags, pos_text, voca_clusters, categories):
    """Assemble the full sparse feature matrix for sentiment classification.

    Stacks n-gram/char-gram features with hand-made tweet features, POS
    features, lexicon scores and word-cluster memberships, and maps the
    string `categories` to numeric y in {-1, 0, 1}. Python 2 code
    (`print i` statement). Returns (ffeatures, y).
    """
    tokenizer_case_preserve = Tokenizer(preserve_case=True)
    tokenizer = Tokenizer(preserve_case=False)
    handmade_features, cll, cll2 = [], [], []
    for tweet in tweetText:
        # Surface-level hand-made features per tweet.
        feat = []
        feat.append(exclamations(tweet))
        feat.append(questions(tweet))
        feat.append(questions_and_exclamation(tweet))
        feat.append(emoticon_negative(tweet))
        feat.append(emoticon_positive(tweet))
        words = tokenizer_case_preserve.tokenize(tweet) #preserving casing
        feat.append(allCaps(words))
        feat.append(elongated(words))
        feat.append(questions_and_exclamation(words[-1]))
        handmade_features.append(np.array(feat))
        # Re-tokenize lowercased and strip negation suffix before cluster lookup.
        words = tokenizer.tokenize(tweet)
        words = [word.strip("_NEG") for word in words]
        cll.append(getClusters(voca_clusters, words))
        #cll2.append(getClusters(voca_handmade, words))
    # Lexicon-based feature blocks, each converted to a sparse matrix.
    bl = csr_matrix(bing_lius(tweetText, pos, different_pos_tags, pos_text))
    nrc_emo = csr_matrix(nrc_emotion(tweetText, pos, different_pos_tags, pos_text ))
    mpqa_feat = csr_matrix(mpqa(tweetText,pos, different_pos_tags, pos_text))
    handmade_features = np.array(handmade_features)
    # Fixed class list keeps the cluster-membership columns stable across calls.
    mlb = MultiLabelBinarizer(sparse_output=True, classes = list(set(voca_clusters.values())))
    cluster_memberships_binarized = csr_matrix(mlb.fit_transform(cll))
    #mlb = MultiLabelBinarizer(sparse_output=True, classes = list(set(voca_handmade.values())))
    #cluster_memberships_binarized_2 = csr_matrix(mlb.fit_transform(cll2))
    hasht = csr_matrix(sent140aff(tweetText, pos, different_pos_tags, pos_text, '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-unigrams.txt'))
    # sent140aff_data = csr_matrix(sent140aff(tweetText, pos, different_pos_tags, pos_text, '../../lexicons/Sentiment140AffLexNegLex/S140-AFFLEX-NEGLEX-unigrams.txt'))
    hasht_bigrams=csr_matrix(sent140aff_bigrams(tweetText, pos, different_pos_tags, pos_text, '../lexicons/HashtagSentimentAffLexNegLex/HS-AFFLEX-NEGLEX-bigrams.txt'))
    # sent140affBigrams=csr_matrix(sent140aff_bigrams(tweetText, pos, different_pos_tags, pos_text, '../../lexicons/Sentiment140AffLexNegLex/S140-AFFLEX-NEGLEX-bigrams.txt'))
    sentQ = csr_matrix(get_sentiwordnet(pos_text, pos))
    pos_features = csr_matrix(pos_features)
    handmade_features = csr_matrix(handmade_features)
    # ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, cluster_memberships_binarized, handmade_features, pos_features,
    #                                  sent140affBigrams, hasht_bigrams, hasht, sent140aff_data, bl, mpqa_feat, nrc_emo), dtype=float)
    # ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, cluster_memberships_binarized, handmade_features, pos_features, sent140affBigrams, hasht_bigrams, hasht, sent140aff_data, bl, mpqa_feat, nrc_emo), dtype=float)
    ffeatures = scipy.sparse.hstack((ngram_features, character_gram_features, sentQ, handmade_features, pos_features, cluster_memberships_binarized, bl, mpqa_feat, nrc_emo, hasht, hasht_bigrams ), dtype=float)
    # print ngram_features.shape, character_gram_features.shape, cluster_memberships_binarized.shape, handmade_features.shape, pos_features.shape,
    # sent140affBigrams.shape, hasht_bigrams, hasht.shape, sent140aff_data.shape, bl.shape, mpqa_feat.shape, nrc_emo.shape
    # Map string polarity to numeric target; anything unexpected is printed.
    y=[]
    for i in categories:
        if i=='positive':
            y.append(1)
        elif i == 'negative':
            y.append(-1)
        elif i == 'UNKNOWN':
            y.append(0)
        else:
            print i
    ffeatures = normalize(ffeatures)
    # ffeatures, y = shuffle(ffeatures,y)
    return ffeatures, y
def xval(clf, x, y, train_index, test_index):
    """Run a single cross-validation fold.

    Fits `clf` on the train split and returns (mse, acc, evals): MSE of the
    predicted class probabilities against the one-hot truth, accuracy of the
    argmax prediction, and the classifier's reported evaluation count.
    """
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(x_train, y_train)
    mlb = MultiLabelBinarizer()
    y_pred = clf.predict_proba(x_test)
    # NOTE(review): fitting a MultiLabelBinarizer on the output of
    # label_binarize looks redundant (label_binarize already yields an
    # indicator matrix) — confirm intent before simplifying.
    mse = mean_squared_error(mlb.fit_transform(label_binarize(y_test, clf.classes_)), y_pred)
    acc = accuracy_score(y_test, y_pred.argmax(axis=1))
    evals = clf.get_num_evals()
    return mse, acc, evals
def test_BRKnna_no_labels_take_closest(self):
    """BRKnn mode 'a', k=2, threshold 0.6: expects only the first label
    column set for a query identical to the first training point."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0, 1]])
    label_lists = [['lid0', 'lid1'], ['lid2', 'lid3'],
                   ['lid2', 'lid3'], ['lid0', 'lid5']]
    binarizer = MultiLabelBinarizer(sparse_output=True)
    targets = binarizer.fit_transform(label_lists)
    classifier = BRKNeighborsClassifier(n_neighbors=2, threshold=0.6, mode='a')
    classifier.fit(features, targets)
    prediction = classifier.predict(csr.csr_matrix([[0, 1]])).todense()
    print(prediction)
    np.testing.assert_array_equal([[1, 0, 0, 0, 0]], prediction)
def test_BRKnna_predict_dense(self):
    """BRKnn mode 'a', k=3, threshold 0.5 with a dense label matrix:
    expects labels at columns 3 and 4 for a query near the last points."""
    features = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
    label_lists = [['lid0', 'lid1'], ['lid2', 'lid3'],
                   ['lid4', 'lid3'], ['lid4', 'lid5']]
    binarizer = MultiLabelBinarizer()
    targets = binarizer.fit_transform(label_lists)
    classifier = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
    classifier.fit(features, targets)
    prediction = classifier.predict(csr.csr_matrix([[1.1, 1.1]])).todense()
    np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], prediction)
def test_BRKnnb_predict_two_samples(self):
    """BRKnn mode 'b', k=3: two queries at once, each expected to take the
    label pair of its nearest training cluster."""
    features = csr.csr_matrix([[0, 1], [1, 1.1], [1, 1], [0.5, 1]])
    label_lists = [['lid0', 'lid1'], ['lid0', 'lid1'],
                   ['lid4', 'lid5'], ['lid4', 'lid5']]
    binarizer = MultiLabelBinarizer(sparse_output=True)
    targets = binarizer.fit_transform(label_lists)
    classifier = BRKNeighborsClassifier(mode='b', n_neighbors=3)
    classifier.fit(features, targets)
    prediction = classifier.predict(csr.csr_matrix([[0, 1], [2, 2]])).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0], [0, 0, 1, 1]], prediction)
def main():
    """Train a one-vs-rest SVM tag classifier on a tag-selected document set,
    report classification metrics on the held-out set, then compute a
    per-class decision-function threshold keeping the top 30% of test docs
    and hand everything to print_predictions."""
    #sets = select_by_trait(10,2,tags=["Comedy","Human","Sad","Dark"])
    sets = select_sets_by_tag(20,4,tag_names)
    #sets = random_select_sets(30,6)
    train_tags = fetch_tags(sets["train"])
    train_texts = id_to_filename(sets["train"])#txt_to_list(sets["train"])
    #vectorize
    count_vect = CountVectorizer(stop_words='english', encoding="utf-16", input="filename")
    X_train_counts = count_vect.fit_transform(train_texts)
    #tf-idf transformation
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    #process tags
    mlb = MultiLabelBinarizer()
    processed_train_tags = mlb.fit_transform(train_tags)
    #rint(processed_train_tags)
    #classifier
    #clf = OneVsRestClassifier(MultinomialNB())
    clf = OneVsRestClassifier(LinearSVC())
    clf.fit(X_train_tfidf,processed_train_tags)
    print("classes:{}".format(clf.classes_))
    #process test set
    test_texts = id_to_filename(sets["test"])#txt_to_list(sets["test"])
    X_test_counts = count_vect.transform(test_texts)
    #print("X_test_counts inverse transformed: {}".format(count_vect.inverse_transform(X_test_counts)))
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)
    predicted_tags = clf.predict(X_test_tfidf)
    predicted_tags_readable = mlb.inverse_transform(predicted_tags)
    test_tags_actual = fetch_tags(sets["test"])
    predicted_probs = clf.decision_function(X_test_tfidf)
    #predicted_probs = clf.get_params(X_test_tfidf)
    class_list = mlb.classes_
    report = metrics.classification_report(mlb.transform(test_tags_actual),predicted_tags,target_names=class_list)
    print(report)
    #retrieve top 30% for each class
    top_percentage = 30
    threshold_index = int( len(sets["test"]) *(top_percentage/100.0) )
    threshold_vals_dic = {}
    threshold_vals = []
    num_classes = len(class_list)
    for i in range(num_classes):
        # Sort this class's decision scores descending; the score at
        # threshold_index becomes the cut-off for the class.
        z = [ predicted_probs[j,i] for j in range(len(sets["test"]))]
        z.sort(reverse=True)
        threshold_vals_dic[class_list[i]]= z[threshold_index]
        threshold_vals.append(z[threshold_index])
    print(threshold_vals_dic)
    print_predictions(sets["test"],predicted_tags_readable,class_list, class_probablities=predicted_probs,threshold_vals=threshold_vals)
def get_data(train_file, test_file):
    """Load train/test corpora, extract the text column, turn the
    underscore-joined label strings into binarized label matrices, and
    return (X_train, X_test, Y_train, Y_test, classes)."""
    def _texts(lines):
        # Each line is tab-separated; the text lives in the second column.
        return [ln.split('\t')[1] for ln in lines]

    def _label_sets(raw_labels):
        # Labels are underscore-joined; 'None' is a placeholder, not a label.
        return [set(s.split('_')) - {'None'} for s in raw_labels]

    X_train, Y_train = load_data(train_file)
    X_test, Y_test = load_data(test_file)
    X_train = _texts(X_train)
    X_test = _texts(X_test)

    mlb = MultiLabelBinarizer()
    Y_train = mlb.fit_transform(_label_sets(Y_train))
    Y_test = mlb.transform(_label_sets(Y_test))
    return X_train, X_test, Y_train, Y_test, mlb.classes_
def test_BRKnnb_auto_optimize_k(self):
    """BRKnn mode 'b' with auto_optimize_k: a stubbed split forces the
    optimizer to pick k=3, then both queries get the majority label pair."""
    features = csr.csr_matrix([[0, 1], [1, 1], [0, 1.1], [1.1, 1]])
    label_lists = [['lid0', 'lid1'], ['lid0', 'lid1'],
                   ['lid2', 'lid3'], ['lid0', 'lid1']]
    binarizer = MultiLabelBinarizer()
    targets = binarizer.fit_transform(label_lists)
    classifier = BRKNeighborsClassifier(mode='b', n_neighbor_candidates=[1, 3],
                                        auto_optimize_k=True)

    # noinspection PyUnusedLocal
    def fake_split(s, X, y_):
        # Deterministic train/validation split: last three points train,
        # first point validates.
        return features[[1, 2, 3]], features[[0]], targets[[1, 2, 3]], targets[[0]]

    BRKNeighborsClassifier._get_split = fake_split
    classifier.fit(features, targets)
    self.assertEquals(3, classifier.n_neighbors)
    prediction = classifier.predict(csr.csr_matrix([[0.1, 1], [2, 2]])).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0], [1, 1, 0, 0]], prediction)
def get_all_data():
    ''' Get data for educational subjects classifier

    Collects cleaned page text and label-id sets for every Czech page,
    fits and persists the MultiLabelBinarizer, and returns
    (data_train, labels_train). Python 2 code (`print` statements);
    relies on the Page ORM model, makepath and DEFAULT_FILENAMES.
    '''
    print "Gathering data"
    data_train = []
    labels_train_normal = []
    for page in [x for x in Page.objects if x.cze]:
        data_train.append(page.cleannostops.encode('utf-8'))
        labs = [x.id for x in page.labels_all]
        # de-duplicate label ids per page
        labs = set(labs)
        labels_train_normal.append(labs)
    mlb = MultiLabelBinarizer()
    labels_train = mlb.fit_transform(labels_train_normal)
    print "Saving MLB"
    # Persist the binarizer so predictions can be decoded later.
    path = os.path.join(makepath('model'), DEFAULT_FILENAMES['MultiLabelBinarizer'] + '.pkl')
    joblib.dump(mlb, path)
    print "Saved"
    return (data_train, labels_train)
def _get_coursera_corpus(self):
    """collect coursera course text and metadata

    Reads the Coursera JSON dump, keeps English courses, and returns
    (course_list, course_text_list, course_id_to_index). When
    self.categorizer is set, also binarizes course categories, keeps only
    categories with more than 40 courses, and stores the resulting matrix
    and index->category-id mapping on self.
    """
    with open('./data/coursera/coursera_courses.json') as c_file:
        coursera_courses = json.load(c_file)
    course_id_to_index = {} # dict to allow reverse searching from id
    course_text_list = []
    course_list = []
    course_categories = []
    i = 0
    for course in coursera_courses['elements']:
        if course['language'] == 'en':
            course_id_to_index[course['id']] = i
            course_text_list.append(self.concatenate_coursera_text_data(course))
            course_list.append(course)
            if self.categorizer:
                # [-1] is the placeholder for "no category"
                course_categories.append(course['links'].get('categories', [-1]))
            i += 1
    if self.categorizer:
        # get category list
        cat_info_list = coursera_courses['linked']['categories']
        self.cat_id_to_name = {cat['id']: {'name':cat['name'], 'shortName':cat['shortName']} for cat in cat_info_list}
        # binarize labels and discard low-count categories
        mlb = MultiLabelBinarizer()
        course_cats_binarized = mlb.fit_transform(course_categories)
        # filter to only tags with > 40 courses
        mask = course_cats_binarized.sum(axis=0) > 40
        course_cats_binarized = course_cats_binarized[:, mask]
        self.course_cats_binarized = course_cats_binarized
        # create dict to get back from masked index, to index, to id
        label_arr_to_cat_id = {}
        for i, k in enumerate(mask.nonzero()[0].tolist()):
            label_arr_to_cat_id[i] = mlb.classes_[k]
        self.label_arr_to_cat_id = label_arr_to_cat_id
    return course_list, course_text_list, course_id_to_index
def load_movie_data():
    """Parse labels_summary.txt ('--'-separated fields: ... labels, summary)
    into (x_data, y_data, mlb, lb), skipping lines whose labels are not in
    the known label set.

    NOTE(review): each per-line `labels` is already transformed by `lb`
    before being fed to mlb.fit_transform — double binarization looks
    suspicious; confirm what createLB() returns before relying on y_data's
    shape.
    """
    fr = open("labels_summary.txt","r")
    x_data, y_data = [], []
    lb = createLB()
    mlb = MultiLabelBinarizer()
    label_set = get_labels_set()
    for line in fr.readlines():
        line = line.rstrip()
        line_datas = line.split("--")
        summary = line_datas[-1]
        labels = line_datas[-2].split(' ')
        # drop labels outside the known vocabulary; skip unlabeled lines
        labels = [item for item in labels if item in label_set]
        if len(labels) == 0:
            continue
        labels = lb.transform(labels)
        x_data.append(summary)
        y_data.append(labels)
    y_data = mlb.fit_transform(y_data)
    return x_data, y_data, mlb, lb
def create_model(key, answers, tags):
    """Fit a CountVectorizer -> TF-IDF -> one-vs-rest LinearSVC pipeline on
    the given answers/tags and pickle (binarizer, pipeline) under
    <folder>/<key>.pickle."""
    filename = '%s/%s.pickle' % (folder, key)

    mlb = MultiLabelBinarizer()
    Y = mlb.fit_transform(tags)

    classifier = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LinearSVC()))])
    classifier.fit(np.array(answers), Y)

    # Serialize both the pipeline and binarizer to disk.
    with open(filename, 'wb') as f:
        pickle.dump((mlb, classifier), f)
class predictor():
    # Multi-label movie-genre predictor: a fixed train/test split of pickled
    # movie files, a color+subtitle FeatureUnion pipeline around an
    # SGDClassifier, wrapped in OneVsRestClassifier.
    # In this perhaps pass in customizable parameters so
    # __init__(self, loss="hinge", penalty="l2")
    # This way, we can try out different loss functions easily
    def __init__(self):
        # Hard-coded training file list; test set is everything else in the
        # movie dataset.
        self.trainExamples = ['exodus_gods_and_kings.p', 'how_to_train_your_dragon_2.p', 'bears.p', 'see_no_evil_2.p', 'addicted.p', "the_internet's_own_boy_the_story_of_aaron_swartz.p", 'the_salt_of_the_earth.p', 'the_other_woman.p', 'project_almanac.p', 'edge_of_tomorrow.p', 'maya_the_bee_movie.p', 'cowspiracy_the_sustainability_secret.p', "let's_be_cops.p", "winter's_tale.p", 'the_trip_to_italy.p', 'yellowbird.p', 'alexander_and_the_terrible_horrible_no_good_very_bad_day.p', 'rosewater.p', 'the_hero_of_color_city.p', 'endless_love.p', 'dracula_untold.p', 'dumb_and_dumber_to.p', 'tomorrowland.p', 'the_hunger_games_mockingjay_part_1.p', 'tammy.p', 'hot_tub_time_machine_2.p', 'lucy.p', 'the_lego_movie.p', 'the_judge.p', 'cake.p', 'st_vincent.p', 'black_or_white.p', 'american_sniper.p', 'mr_peabody_&_sherman.p', 'this_is_where_i_leave_you.p', 'x-men_days_of_future_past.p', 'non-stop.p', 'get_on_up.p', 'the_fault_in_our_stars.p', 'song_one.p', 'robocop.p', 'into_the_storm.p', 'a_most_wanted_man.p', 'the_good_lie.p', 'wild.p', 'the_maze_runner.p', 'beyond_the_lights.p', 'divergent.p', 'spring.p', 'as_above_so_below.p', 'noble.p', 'hercules.p', 'i-lived&y=2015.p', 'night_at_the_museum_secret_of_the_tomb.p', 'planes:fire_&_rescue.p', 'old_fashioned.p', 'the_identical.p', 'dawn_of_the_planet_of_the_apes.p', 'cabin_fever_patient_zero.p', 'ride_along.p', 'dear_white_people.p', 'if_i_stay.p', 'red_army.p', 'the_boxtrolls.p', 'captain_america_the_winter_soldier.p', 'virunga.p', 'the_interview.p', 'earth_to_echo.p', 'a_walk_among_the_tombstones.p', 'persecuted.p', 'the_book_of_life.p', 'unbroken.p', 'the_drop.p', 'need_for_speed.p', 'brick_mansions.p', 'maleficent.p', 'blended.p', "devil's_due.p", 'jessabelle.p', 'fear_clinic.p', 'gone_girl.p', 'birdman_or_the_unexpected_virtue_of_ignorance.p', 'kill_the_messenger.p', 'my_little_pony_equestria_girls.p', 'rio_2.p', 'big_hero_6.p', 'guardians_of_the_galaxy.p', 'noah.p', 'the_hobbit_the_battle_of_the_five_armies.p', 'i_frankenstein.p', 'the_november_man.p', 'the_pyramid.p', 'and_so_it_goes.p', 'birdman_or_the_unexpected_virtue_of_ignorance.p', 'inherent_vice.p', 'merchants_of_doubt.p', 'iris.p', 'lambert,_stamp.p']
        self.testExamples = [x for x in util2.getMovieDataset() if x not in self.trainExamples]
        # Standard DictVectorizer fitted with all colors as the features.
        self.dVec = DictVectorizer(sparse=False)
        self.dVec.fit([dict((feature,0) for feature in util2.getColors())])
        # Standard MultiLabelBinarizer with all genre names
        self.mlb = MultiLabelBinarizer()
        # Feature pipeline: color histogram features and subtitle TF-IDF,
        # equally weighted, feeding an SGD perceptron-loss classifier.
        self.pipeline = Pipeline([
            ('organizeData', Movie_Data_Aggregator()),
            ('union', FeatureUnion(
                transformer_list = [
                    ('colors', Pipeline([
                        ('selector', Data_Selector(key='colors')),
                        ('dVec', self.dVec),
                    ])),
                    ('subs', Pipeline([
                        ('selector', Data_Selector(key='subs')),
                        ('tfidf', TfidfVectorizer(strip_accents='ascii', max_features=15)),
                    ])),
                ],
                transformer_weights={
                    'colors': 0.5,
                    'subs': 0.5,
                },
            )),
            ('sgd', SGDClassifier(alpha= 1e-06, loss="perceptron", n_iter= 150, penalty="l2")),
        ])
        # OneVsRestClassifier used for prediction
        self.classif = OneVsRestClassifier(self.pipeline)

    def learnPredictor(self, numbers=False):
        # Fit on the training files against their binarized genre sets and
        # return the binarized training targets.
        train_genres = self.mlb.fit_transform(util2.getCorrectGenres(self.trainExamples))
        self.classif.fit(self.trainExamples, train_genres)
        return train_genres

    def predict(self, numbers=False):
        # Predict genres for the held-out test files.
        return self.classif.predict(self.testExamples)
class MultiLabelDataset(Dataset):
    """Dataset of (RGB image, multi-hot tag vector) pairs backed by a CSV
    with `image_name` and space-separated `tags` columns."""

    def __init__(self, csv_path, img_path, transform=None):
        frame = pd.read_csv(csv_path)
        self.mlb = MultiLabelBinarizer()
        self.img_path = img_path
        self.transform = transform
        self.X_train = frame['image_name']
        # Binarize the per-row tag lists into a float32 multi-hot matrix.
        self.y_train = self.mlb.fit_transform(
            frame['tags'].str.split()).astype(np.float32)

    def __getitem__(self, index):
        image = Image.open(os.path.join(self.img_path, self.X_train[index]))
        image = image.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        target = torch.from_numpy(self.y_train[index])
        return image, target

    def __len__(self):
        return len(self.X_train.index)
def run_classifierAccuracy(terms, labels, testSentences, testLabels):
    """Score a cosine-similarity-based multi-label classifier: build TF-IDF
    vectors for the class term lists, mark classes whose similarity to a
    test sentence passes binary_rel's cut, then print micro/macro/per-class
    precision, recall and F1 against the true test labels.

    NOTE(review): the `labels` parameter is immediately overwritten by the
    hard-coded class list below — the argument is effectively unused.
    """
    labels = ["Drought", "Earthquake", "Flood", "Epidemic", "Hurricane", \
    "Rebellion", "Terrorism", "Tornado", "Tsunami", "displaced_people_and_evacuations", \
    "donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage", \
    "injured_or_dead_people", "missing_trapped_or_found_people"]
    import numpy as np
    class_terms_matrix, tfidf = tf_idf_fit_transform(terms)
    sentence_matrix = tfidf.transform(testSentences)
    print("Shape of sentence matrix : ", sentence_matrix.shape)
    # print("Original order of lables:")
    # print(labels)
    from sklearn.metrics.pairwise import cosine_similarity
    similarity_matrix = cosine_similarity(sentence_matrix, class_terms_matrix)
    # binary_rel turns the similarity scores into a 0/1 relevance matrix
    similarity_matrix = binary_rel(similarity_matrix)
    predictions = []
    for i in range(len(testSentences)):
        predictions.append([labels[x] for x in range(similarity_matrix.shape[1]) if similarity_matrix[i][x]==1])
    from sklearn.preprocessing import MultiLabelBinarizer
    # Fixed class order keeps prediction and truth columns aligned.
    mlb = MultiLabelBinarizer(classes=labels)
    # mlb = MultiLabelBinarizer()
    test_label_matrix = mlb.fit_transform(testLabels)
    predictions = mlb.transform(predictions)
    print("Shape of label matrix : ", test_label_matrix.shape)
    print("Labels : ", mlb.classes_)
    from sklearn.metrics import f1_score, precision_score, recall_score
    print("Micro-Precision", precision_score(test_label_matrix, predictions, average='micro'))
    print("Micro-Recall", recall_score(test_label_matrix, predictions, average='micro'))
    print("Micro-F1", f1_score(test_label_matrix, predictions, average='micro'))
    print("Macro-Precision", precision_score(test_label_matrix, predictions, average='macro'))
    print("Macro-Recall", recall_score(test_label_matrix, predictions, average='macro'))
    print("Macro-F1", f1_score(test_label_matrix, predictions, average='macro'))
    # average=None reports the per-class scores
    print("Macro-Precision", precision_score(test_label_matrix, predictions, average=None))
    print("Macro-Recall", recall_score(test_label_matrix, predictions, average=None))
    print("Macro-F1", f1_score(test_label_matrix, predictions, average=None))
def _create_classifier():
    """Build and fit the label classifier.

    Reads the downloaded training file (text ... <whitespace> label-string
    per line), binarizes the underscore-joined labels (dropping the 'None'
    placeholder), and grid-searches a TF-IDF + LinearSVC pipeline inside a
    OneVsRestClassifier. Returns (fitted classifier, binarizer).
    """
    # rsplit(None, 1): split each line on the last whitespace into (text, label)
    data_train = [ln.rsplit(None, 1) for ln in open(_download())]
    X_train, Y_train = zip(*data_train)
    del data_train
    mlb = MultiLabelBinarizer()
    Y_train = [set(s.split('_')) - {'None'} for s in Y_train]
    Y_train = mlb.fit_transform(Y_train)
    clf = make_pipeline(TfidfVectorizer(sublinear_tf=True, use_idf=False),
                        LinearSVC(dual=False))
    # XXX class_weight="auto" causes a lot of deprecation warnings, but it
    # still fares better than the new class_weight="balanced" heuristic.
    # n_jobs=-1 causes nosetests to hang so that is disabled for now.
    params = {'tfidfvectorizer__use_idf': [True, False],
              'tfidfvectorizer__sublinear_tf': [True, False],
              'linearsvc__class_weight': ["auto", None],
              'linearsvc__C': [.01, .1, 1, 10, 100, 1000],
              'linearsvc__penalty': ['l1', 'l2'],
              }
    clf = OneVsRestClassifier(_GridSearch(clf, params, scoring='f1', verbose=1, cv=5))
    return clf.fit(X_train, Y_train), mlb
def train(window_size_ms, train_time_sec=30, clf = OneVsRestClassifier(DecisionTreeClassifier()), n_keys=2): #loop until empty input is detected X = [] y = [] labels = [(i,) for i in range(n_keys+1)] mb = MultiLabelBinarizer() labels = mb.fit_transform(labels) print "Training time for each key is {} seconds".format(train_time_sec) for label_num, label in enumerate(labels): raw_input('Press <enter> to begin training key {}'.format(label_num)) i = 0 while i < train_time_sec: i += (window_size_ms / float(1000)) freq_spect = read_spectral_data_for_time(window_size_ms) X.append(freq_spect) y.append(label) X = np.asarray(X) y = np.asarray(y) clf.fit(X, y) return (clf, mb)
def get_labels(csv_path):
    """Load the business->labels CSV (id, space-separated label ids) and
    return a boolean DataFrame indexed by business id with one named column
    per attribute.

    NOTE(review): dtype=np.dtype('string') is Python 2 numpy; on Python 3
    this would need 'str'/'U' — confirm target interpreter.
    """
    mlb = MultiLabelBinarizer()
    labels_file = os.path.join(csv_path)
    l = np.loadtxt(labels_file, dtype=np.dtype('string'), delimiter=",",skiprows=1)
    # business id -> list of label ids
    biz2labels = dict([(x[0],x[1].split()) for x in l])
    bin_labels = mlb.fit_transform(biz2labels.values())
    # Re-associate each binarized row with its business id; relies on
    # .values()/.keys() iterating in the same order.
    new_dict = dict()
    for i, bid in enumerate(biz2labels.keys()):
        new_dict[bid] = bin_labels[i]
    biz_labels_df = pandas.DataFrame.from_dict(new_dict, orient='index', dtype='bool')
    # Column order must match the binarizer's sorted label-id classes.
    biz_labels_df.columns = ['good_for_lunch','good_for_dinner'
                            ,'takes_reservations','outdoor_seating'
                            ,'restaurant_is_expensive','has_alcohol'
                            ,'has_table_service','ambience_is_classy'
                            ,'good_for_kids']
    return biz_labels_df
def get_classify(): X_train, Y_train = load_data() # 定义分类器 classifier = Pipeline([ ('counter', CountVectorizer(tokenizer=jieba_tokenizer)), # 标记和计数,提取特征用 向量化 ('tfidf', TfidfTransformer()), # IF-IDF 权重 ('clf', OneVsRestClassifier(LinearSVC())), # 1-rest 多分类(多标签) ]) mlb = MultiLabelBinarizer() Y_train = mlb.fit_transform(Y_train) # 分类号数值化 classifier.fit(X_train, Y_train) # X_test = ["数据分析"] # 把所有的测试文本存到一个list中 test_list = [] test_name = [] filelist2 = os.listdir(base_path + "data_test/") for files in filelist2: # print (files) test_name.append(files) f = open(base_path + "data_test/" + files, 'r') test_list.append(f.read()) prediction = classifier.predict(test_list) result = mlb.inverse_transform(prediction) f = open('result2.txt', 'w') for i in range(len(test_name)): f.write(str(test_name[i]) + ' ' + str(result[i]) + '\n') print (result, len(result)) num_dict = Counter(result) print (len(num_dict)) print ((num_dict[('1',)] + num_dict[('2',)] + num_dict[('3',)]) / float(len(result))) # 整数除整数为0,应把其中一个改为浮点数。
def featurize():
    """Load the corpus, multi-hot encode the labels, build padded token
    sequences, split into hold-in/hold-out sets, and persist everything
    via util.write_features. Returns the feature_data dict."""
    instances, labels = datahandler.load_corpus(settings.DATA_DIR)
    # when code testing limit the instances
    if settings.TEST:
        instances, labels = instances[:settings.N_TEST_INSTANCES], labels[:settings.N_TEST_INSTANCES]
    all_stats = datahandler.get_label_stats(labels)
    # Print how many instances carry more than one label (keys with ",").
    print(
        sum([v for k, v in all_stats["multilabel_distr"].items() if "," in k]))
    print(
        sum([v for k, v in all_stats["multilabel_count"].items() if "," in k]))
    # Encode labels: keep the raw labels around for reporting, map the
    # -1 "no label" marker to an empty label set before binarizing.
    labels_orig = np.array(labels, dtype=object)
    labels = [
        l if l != -1 else [] for l in labels
    ]  # TODO test with negative label instances as a label in learning
    label_encoder = MultiLabelBinarizer()  # makes multihot label encodings
    y = label_encoder.fit_transform(labels)
    # Make sequence data from text
    #
    # Load predetermined holdout split
    with open(settings.EXPERIMENT_DATA, "rt") as exp_in_test:
        experiment = json.load(exp_in_test)
    # when code testing limit the instances by using the slicing done by
    # predetermined holdout split indices
    if settings.TEST:
        idc_in, idc_out = train_test_split(np.arange(
            settings.N_TEST_INSTANCES), test_size=0.2)
    else:
        idc_in, idc_out = experiment["meta_holdin_indices"], experiment[
            "meta_holdout_indices"]
    x, word_index, max_sequence_length = make_sequences(instances)
    # Slice features/targets/raw text into hold-in and hold-out splits.
    x_in = x[idc_in]
    x_out = x[idc_out]
    y_in = y[idc_in]
    y_out = y[idc_out]
    instances_in = np.array(instances)[idc_in]
    instances_out = np.array(instances)[idc_out]
    labels_in = labels_orig[idc_in]
    labels_out = labels_orig[idc_out]
    logging.info("Train class category counts: \n{}\n---------\n"
                 "Test class category counts: \n{}.".format(
                     datahandler.get_label_info(labels_in),
                     datahandler.get_label_info(labels_out)))
    # +1 reserves index 0 (padding) in the embedding input dimension.
    emb_input_dim = len(word_index) + 1
    output_units = len(label_encoder.classes_)
    # write the featurized data
    feature_data = {
        "x_in": x_in.tolist(),
        "x_out": x_out.tolist(),
        "y_in": y_in.tolist(),
        "y_out": y_out.tolist(),
        "instances_in": instances_in.tolist(),
        "instances_out": instances_out.tolist(),
        "labels_in": labels_in.tolist(),
        "labels_out": labels_out.tolist(),
        "max_sequence_length": max_sequence_length,
        "emb_input_dim": emb_input_dim,
        "output_units": output_units,
        "word_index": word_index,
        "all_stats": all_stats,
        "classes": label_encoder.classes_.tolist(),
    }
    util.write_features(feature_data)
    return feature_data
def eval(self, model, return_preds_and_labels=False, calibrate_conf_scores=False):
    """
    Performs evaluation on a given model.

    :param model: The model on which to perform evaluation
    :type model: AdaptiveModel
    :param return_preds_and_labels: Whether to add preds and labels in the returned dicts of the
    :type return_preds_and_labels: bool
    :param calibrate_conf_scores: Whether to calibrate the temperature for temperature scaling of the confidence scores
    :type calibrate_conf_scores: bool
    :return all_results: A list of dictionaries, one for each prediction head. Each dictionary contains the metrics
        and reports generated during evaluation.
    :rtype all_results: list of dicts
    """
    model.eval()
    # init empty lists per prediction head
    loss_all = [0 for _ in model.prediction_heads]
    preds_all = [[] for _ in model.prediction_heads]
    label_all = [[] for _ in model.prediction_heads]
    ids_all = [[] for _ in model.prediction_heads]
    passage_start_t_all = [[] for _ in model.prediction_heads]
    logits_all = [[] for _ in model.prediction_heads]
    for step, batch in enumerate(
            tqdm(self.data_loader, desc="Evaluating", mininterval=10)):
        # Move every tensor of the batch onto the evaluation device.
        batch = {key: batch[key].to(self.device) for key in batch}
        with torch.no_grad():
            logits = model.forward(**batch)
            losses_per_head = model.logits_to_loss_per_head(logits=logits,
                                                            **batch)
            preds = model.logits_to_preds(logits=logits, **batch)
            labels = model.prepare_labels(**batch)
        # stack results of all batches per prediction head
        for head_num, head in enumerate(model.prediction_heads):
            loss_all[head_num] += np.sum(
                to_numpy(losses_per_head[head_num]))
            preds_all[head_num] += list(to_numpy(preds[head_num]))
            label_all[head_num] += list(to_numpy(labels[head_num]))
            if head.model_type == "span_classification":
                ids_all[head_num] += list(to_numpy(batch["id"]))
                passage_start_t_all[head_num] += list(
                    to_numpy(batch["passage_start_t"]))
            # NOTE(review): nesting reconstructed from a collapsed source;
            # logits are only collected here when calibration is requested.
            if calibrate_conf_scores:
                logits_all[head_num] += list(to_numpy(logits))
    # Evaluate per prediction head
    all_results = []
    for head_num, head in enumerate(model.prediction_heads):
        if head.model_type == "multilabel_text_classification":
            # converting from string preds back to multi-hot encoding
            from sklearn.preprocessing import MultiLabelBinarizer
            mlb = MultiLabelBinarizer(classes=head.label_list)
            # TODO check why .fit() should be called on predictions, rather than on labels
            preds_all[head_num] = mlb.fit_transform(preds_all[head_num])
            label_all[head_num] = mlb.transform(label_all[head_num])
        if head.model_type == "span_classification" and calibrate_conf_scores:
            temperature_previous = head.temperature_for_confidence.item()
            logger.info(
                f"temperature used for confidence scores before calibration: {temperature_previous}"
            )
            head.calibrate_conf(logits_all[head_num], label_all[head_num])
            temperature_current = head.temperature_for_confidence.item()
            logger.info(
                f"temperature used for confidence scores after calibration: {temperature_current}"
            )
            # Warn when calibration moved the temperature by more than 50%.
            temperature_change = (
                abs(temperature_current - temperature_previous) /
                temperature_previous) * 100.0
            if temperature_change > 50:
                logger.warning(
                    f"temperature used for calibration of confidence scores changed by more than {temperature_change} percent"
                )
        if hasattr(head, 'aggregate_preds'):
            # Needed to convert NQ ids from np arrays to strings
            ids_all_str = [x.astype(str) for x in ids_all[head_num]]
            ids_all_list = [list(x) for x in ids_all_str]
            head_ids = ["-".join(x) for x in ids_all_list]
            preds_all[head_num], label_all[
                head_num] = head.aggregate_preds(
                    preds=preds_all[head_num],
                    labels=label_all[head_num],
                    passage_start_t=passage_start_t_all[head_num],
                    ids=head_ids)
        result = {
            "loss": loss_all[head_num] / len(self.data_loader.dataset),
            "task_name": head.task_name
        }
        result.update(
            compute_metrics(metric=head.metric,
                            preds=preds_all[head_num],
                            labels=label_all[head_num]))
        # Select type of report depending on prediction head output type
        if self.report:
            try:
                result["report"] = compute_report_metrics(
                    head, preds_all[head_num], label_all[head_num])
            # NOTE(review): bare except silently maps ANY report failure to
            # the string "Error" (error logging is commented out below).
            except:
                # logger.error(f"Couldn't create eval report for head {head_num} with following preds and labels:"
                #              f"\n Preds: {preds_all[head_num]} \n Labels: {label_all[head_num]}")
                result["report"] = "Error"
        if return_preds_and_labels:
            result["preds"] = preds_all[head_num]
            result["labels"] = label_all[head_num]
        all_results.append(result)
    return all_results
def binarize_dataset(df, classes):
    """Multi-hot encode an iterable of label collections.

    Uses the fixed, caller-supplied `classes` ordering for the columns and
    returns a dense indicator matrix (one row per entry of `df`).
    """
    encoder = MultiLabelBinarizer(classes=classes, sparse_output=False)
    encoded = encoder.fit_transform(df)
    return encoded
# Parse the stringified genre lists back into real Python lists.
train['genre_list'] = train['genre_list'].apply(lambda x: ast.literal_eval(x))
test['genre_list'] = test['genre_list'].apply(lambda x: ast.literal_eval(x))
val['genre_list'] = val['genre_list'].apply(lambda x: ast.literal_eval(x))
# Histogram: how many titles in the test set carry k genres.
labels = {}
for genre in test['genre_list']:
    if len(genre) in labels:
        labels[len(genre)] += 1
    else:
        labels[len(genre)] = 1
mlb = MultiLabelBinarizer()
mlb.fit(dataset['genre_list'].tolist())
# NOTE(review): fit_transform refits the just-fitted binarizer; harmless
# here since it is the same full dataset, but transform() would suffice.
transformed_labels = mlb.fit_transform(dataset['genre_list'].tolist())
train_labels = mlb.transform(train['genre_list'].tolist())
test_labels = mlb.transform(test['genre_list'].tolist())
val_labels = mlb.transform(val['genre_list'].tolist())
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    # Strip punctuation, lowercase, then rebuild the text word by word
    # (the comprehension below is truncated in this chunk).
    text = text.translate(str.maketrans('', '', punctuation))
    text = text.lower().strip()
    text = ' '.join([
import time
from code.lib.projectlib import make_train_set, metriccalculation, scorer, sort_by_frequency, last_index_of_freq
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib
from skmultilearn.problem_transform import ClassifierChain

# Build the training matrix and raw label sets from the feature dump.
x_train, y_train = make_train_set("../../data", "training_dict_f_91.json",
                                  "labels.csv", second_features_videos=None,
                                  weighted=False)
# Multi-hot encode the label sets, then reorder the label columns from
# most frequent to least frequent.
mlb = MultiLabelBinarizer(sparse_output=False)
original_binlabels = mlb.fit_transform(y_train[:])
original_classes = list(mlb.classes_)
binlabels, classes, class_frequencies = sort_by_frequency(original_binlabels,
                                                          original_classes)
# If a label have more than 5 occurences, it is considered
number_of_labels = last_index_of_freq(class_frequencies, 5)
y_train = binlabels[:, :number_of_labels]
# create classifier: a classifier chain over bagged decision trees
algorithm = DecisionTreeClassifier()
ensemble = BaggingClassifier(algorithm, random_state=10)
classifier = ClassifierChain(ensemble)
# run cross validation to evaluate the classifier (timed)
start_ex = time.time()
m = cross_val_score(classifier, x_train, y_train,
                    scoring=scorer(metric=metriccalculation), cv=10)
end_ex = time.time()
train_time = end_ex - start_ex
def setup(train_files, test_files, specific):
    """Featurize AcousticBrainz tracks, train a multi-output random forest,
    persist the fitted artifacts, and write genre predictions for the
    test set."""
    scalar = StandardScaler()
    mlb = MultiLabelBinarizer()
    train_labels = []
    train_data = []
    train_keys = []
    for f in train_files.keys():
        # Tracks are sharded under two-character prefix directories.
        path = constants.path + 'acousticbrainz-mediaeval-train/' + f[:2] + '/' + f + '.json'
        song = readjson(path)
        feat = getFeature(song)
        # Skip tracks whose feature vector is not the expected length.
        if len(feat) != 391:
            continue
        train_keys.append(f)
        train_data.append(feat)
        train_labels.append(train_files[f])
    print('finished train')
    train_labels = mlb.fit_transform(train_labels)
    train_data = scalar.fit_transform(train_data)
    print('finished transforming')
    # Persist the fitted transformers and the featurized training data.
    path = constants.path + specific + '_mlb.pkl'
    dump(mlb, path)
    path = constants.path + specific + '_scalar.pkl'
    dump(scalar, path)
    path = constants.path + specific + '_train.pkl'
    data = dict()
    data['features'] = train_data
    data['labels'] = train_labels
    data['keys'] = train_keys
    dump(data, path)
    print('finished dumping')
    #classifier = MultiOutputClassifier(LinearSVC(C=10, class_weight='balanced', dual=True), n_jobs = 4)
    classifier = MultiOutputClassifier(RandomForestClassifier(
        n_estimators=20, class_weight='balanced'), n_jobs=4)
    classifier.fit(train_data, train_labels)
    print('finished fitting')
    # Drop references to the training structures before featurizing the
    # test set to keep peak memory down.
    data = 0
    train_data = 0
    train_labels = 0
    train_keys = 0
    gc.collect()
    #test_labels = []
    test_data = []
    test_keys = list(test_files.keys())
    mean = scalar.mean_
    for f in test_keys:
        path = constants.path + 'acousticbrainz-mediaeval-train/' + f[:2] + '/' + f + '.json'
        song = readjson(path)
        feat = getFeature(song)
        # Pad short feature vectors with the training feature means.
        if len(feat) < 391:
            length = len(feat)
            for m in mean[length:]:
                feat += [m]
        test_data.append(feat)
    test_data = scalar.transform(test_data)
    predictions = classifier.predict(test_data)
    print('finished predictions')
    genre_predictions = mlb.inverse_transform(predictions)
    write(genre_predictions, test_keys, specific)
    print('finished writing predictions')
# tempX = pd.DataFrame(U[0]) # tempY = pd.DataFrame(U[1]) # In[47]: # Importing sklearn required libraries from sklearn.preprocessing import MultiLabelBinarizer from sklearn.feature_extraction.text import TfidfVectorizer from scipy.sparse import coo_matrix, hstack from sklearn.svm import LinearSVC # In[48]: multilabel_binarizer = MultiLabelBinarizer() y_bin = multilabel_binarizer.fit_transform(U_code[1].values) # In[49]: def evaluation_metrics(y_actual, y_predicted): accuracy = accuracy_score(y_actual, y_predicted) print("Accuracy:", accuracy) recall = recall_score(y_actual, y_predicted, average='micro') print("Recall:", recall) precision = precision_score(y_actual, y_predicted, average='micro') print("Precision:", precision) f1 = f1_score(y_actual, y_predicted, average='micro') print("F1-Score:", f1) hamming = hamming_loss(y_actual, y_predicted)
])
title_pd['from'] = title_pd['from'].astype(float)
# Normalize favorites by member count.
title_pd['favorites'] = title_pd['favorites'] / title_pd['members']
# Column groups for the different encoders below.
cols_to_multihot = ['producers', 'studios', 'genres']
temp = [['episodes', 'duration', 'scored_by', 'members', 'favorites'],
        list(title_pd.loc[:, 'opening_themes':'Spin-off'].columns)]
cols_to_num = [item for elem in temp for item in elem]
cols_to_onehot = ['type', 'source', 'rating', 'season']
one_hot = OneHotEncoder(sparse=False)
# One binarizer per multi-valued column so each keeps its own classes_;
# suffixes '_p' / '_g' disambiguate identically named classes.
mlb1 = MultiLabelBinarizer(sparse_output=False)
mlb2 = MultiLabelBinarizer(sparse_output=False)
mlb3 = MultiLabelBinarizer(sparse_output=False)
scaler = StandardScaler()
mlb_pd = pd.concat([
    pd.DataFrame(title_pd['index']),
    pd.DataFrame(mlb1.fit_transform(title_pd['producers']),
                 columns=mlb1.classes_ + '_p'),
    pd.DataFrame(mlb2.fit_transform(title_pd['studios']),
                 columns=mlb2.classes_),
    pd.DataFrame(mlb3.fit_transform(title_pd['genres']),
                 columns=mlb3.classes_ + '_g')
], axis=1, sort=False)
train, test = train_test_split(title_pd, test_size=0.2, random_state=228)
#test is transformed implicitly to avoid copying the code below by setting train=test
train = train.sort_values(by=['index'])
# Keep only the multi-hot rows belonging to the training split.
mlb_pd = mlb_pd[mlb_pd['index'].isin(train['index'])]
trans_train = pd.concat([
    pd.DataFrame(train['index'], columns=['index']),
    pd.DataFrame(scaler.fit_transform(train[cols_to_num]),
# Label is derived from the parent directory name, split on underscores
# (multi-label per image).
label = imagePath.split(os.path.sep)[-2].split("_")
labels.append(label)
data = np.array(data, dtype="float")
labels = np.array(labels)
Labels_verbal = labels
print("[INFO] Private data images loaded!")
print("Reshaping data!")
print("Data Reshaped to feed into models channels last")
from sklearn.preprocessing import MultiLabelBinarizer
print("Labels formatting")
# Multi-hot encode the per-image label lists.
lb = MultiLabelBinarizer()
labels = lb.fit_transform(labels)
print("Labels ok!")
#%%
time1 = time.time()  # initiate time counter
# NOTE(review): original comment said "10fold cross validation" but
# n_split is 5 -- confirm which was intended.
n_split = 5
scores = []  # here every fold accuracy will be kept
predictions_all = np.empty(0)  # here, every fold predictions will be kept
test_labels = np.empty(0)  # here, every fold labels are kept
omega = 1
for train_index, test_index in KFold(n_split).split(data):
    trainX, testX = data[train_index], data[test_index]
    trainY, testY = labels[train_index], labels[test_index]
def main():
    """Score node embeddings: train TopKRanker logistic regression on
    shuffled train/test splits of a labelled graph and report micro/macro
    F1 per training percentage."""
    parser = ArgumentParser("scoring",
                            formatter_class=ArgumentDefaultsHelpFormatter,
                            conflict_handler='resolve')
    parser.add_argument("--emb", required=True, help='Embeddings file')
    parser.add_argument("--network", required=True,
                        help='A .mat file containing the adjacency matrix and node labels of the input network.')
    parser.add_argument("--adj-matrix-name", default='network',
                        help='Variable name of the adjacency matrix inside the .mat file.')
    parser.add_argument("--label-matrix-name", default='group',
                        help='Variable name of the labels matrix inside the .mat file.')
    parser.add_argument("--num-shuffles", default=2, type=int,
                        help='Number of shuffles.')
    parser.add_argument("--all", default=False, action='store_true',
                        help='The embeddings are evaluated on all training percents from 10 to 90 when this flag is set to true. '
                             'By default, only training percents of 10, 50 and 90 are used.')
    args = parser.parse_args()
    # 0. Files
    embeddings_file = args.emb
    matfile = args.network
    # 1. Load Embeddings
    model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)
    # 2. Load labels
    mat = loadmat(matfile)
    A = mat[args.adj_matrix_name]
    graph = sparse2graph(A)
    labels_matrix = mat[args.label_matrix_name]
    labels_count = labels_matrix.shape[1]
    # Binarizer with a fixed class range so columns line up across calls.
    mlb = MultiLabelBinarizer(range(labels_count))
    # Map nodes to their features (note: assumes nodes are labeled as integers 1:N)
    features_matrix = numpy.asarray([model[str(node)]
                                     for node in range(len(graph))])
    # 2. Shuffle, to create train/test groups
    shuffles = []
    for x in range(args.num_shuffles):
        shuffles.append(skshuffle(features_matrix, labels_matrix))
    # 3. to score each train/test group
    all_results = defaultdict(list)
    if args.all:
        training_percents = numpy.asarray(range(1, 10)) * .1
    else:
        training_percents = [0.1, 0.5, 0.9]
    for train_percent in training_percents:
        for shuf in shuffles:
            X, y = shuf
            training_size = int(train_percent * X.shape[0])
            X_train = csr_matrix(X[:training_size, :])
            y_train_ = csr_matrix(y[:training_size])
            # Rebuild y_train as per-row lists of label indices from the
            # sparse label matrix.
            y_train = [[] for x in range(y_train_.shape[0])]
            cy = y_train_.tocoo()
            for i, j in zip(cy.row, cy.col):
                y_train[i].append(j)
            assert sum(len(l) for l in y_train) == y_train_.nnz
            X_test = csr_matrix(X[training_size:, :])
            y_test_ = csr_matrix(y[training_size:])
            y_test = [[] for _ in range(y_test_.shape[0])]
            cy = y_test_.tocoo()
            for i, j in zip(cy.row, cy.col):
                y_test[i].append(j)
            clf = TopKRanker(LogisticRegression())
            clf.fit(X_train, y_train_)
            # find out how many labels should be predicted
            top_k_list = [len(l) for l in y_test]
            preds = clf.predict(X_test, top_k_list)
            results = {}
            averages = ["micro", "macro"]
            for average in averages:
                results[average] = f1_score(mlb.fit_transform(y_test),
                                            mlb.fit_transform(preds),
                                            average=average)
            all_results[train_percent].append(results)
    print('Results, using embeddings of dimensionality', X.shape[1])
    print('-------------------')
    for train_percent in sorted(all_results.keys()):
        print('Train percent:', train_percent)
        for index, result in enumerate(all_results[train_percent]):
            print('Shuffle #%d: ' % (index + 1), result)
        # Average each metric over the shuffles of this training percent.
        avg_score = defaultdict(float)
        for score_dict in all_results[train_percent]:
            for metric, score in iteritems(score_dict):
                avg_score[metric] += score
        for metric in avg_score:
            avg_score[metric] /= len(all_results[train_percent])
        print('Average score:', dict(avg_score))
        print('-------------------')
    # Tail of loadData (head truncated in this chunk): take the first 300
    # rows, skipping the header row, as features/targets.
    y = data.iloc[:, 2:]
    print(repr(data))
    x = np.array([point[0] for point in data[:300]
                  if point[0] != 'comment_text'])
    y = np.array([[point[i] for i in range(2, len(point))]
                  for point in data[:300] if point[0] != 'comment_text'])
    print(len(x))
    print(len(y))
    X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(x, y, test_size=.2)
    return X_TRAIN, X_TEST, Y_TRAIN, Y_TEST


train_data, test_data, train_labels, test_labels = loadData('training.csv')
# Multi-hot encode labels: fit on train, reuse the mapping for test.
mlb = MultiLabelBinarizer()
binary_train_labels = mlb.fit_transform(train_labels)
binary_test_labels = mlb.transform(test_labels)
vectorizer = TfidfVectorizer()
vectorised_train_data = vectorizer.fit_transform(train_data)
vectorised_test_data = vectorizer.transform(test_data)
print(vectorised_test_data)
print(vectorised_train_data)
print(train_labels)
print(test_labels)
print("ok1")
classifier = BinaryRelevance(GaussianNB())
print("ok2")
# NOTE(review): fit is given the raw train_labels, not the
# binary_train_labels computed above -- confirm which was intended.
classifier.fit(vectorised_train_data.todense(), train_labels)
    parser.add_argument('--max-df', default=1.0, type=float)
    parser.add_argument('--min-df', default=1, type=int)
    return parser


if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()
    stop_list = stopwords.words('english')
    all_questions = []
    all_labels = []
    # Each input line is one JSON document; the feature text is
    # Title + Body, the labels are its space-separated tags.
    with open(args.questions, 'r') as f:
        for line in f:
            doc = json.loads(line)
            question = "{} {}".format(doc['Title'], doc['Body'])
            all_questions.append(question)
            all_labels.append(doc['Tags'].split(' '))
    vectorizer = TfidfVectorizer(min_df=args.min_df, stop_words=stop_list,
                                 max_df=args.max_df)
    X = vectorizer.fit_transform(all_questions)
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(all_labels)
    classifier = OneVsRestClassifier(LinearSVC())
    # 10-fold cross-validated micro-averaged F1.
    scores = cross_val_score(classifier, X, y=y, cv=10, scoring='f1_micro')
    print("Average F1: {}".format(np.mean(scores)))
      one_hot.inverse_transform(one_hot.transform(feature)))
# Import library
import pandas as pd
# Create dummy variables from feature
print("Matrix encoding with pandas: \n", pd.get_dummies(feature[:, 0]))
# Create multiclass feature: each sample carries two state labels
multiclass_feature = [("Texas", "Florida"), ("California", "Alabama"),
                      ("Texas", "Florida"), ("Delware", "Florida"),
                      ("Texas", "Alabama")]
# Create multiclass one-hot encoder
one_hot_multiclass = MultiLabelBinarizer()
# One-hot encode multiclass feature
print("Multi label binarizer: \n",
      one_hot_multiclass.fit_transform(multiclass_feature))
print("Feature classes: \n", one_hot_multiclass.classes_)
print("\n")
print("\n")
print("\n")

# 5.2 Encoding Ordinal Categorical Features
# Load library
import pandas as pd
# Create features
dataframe = pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium",
                                    "High"]})
# Create mapper: explicit ranks encode the ordinal scale
scale_mapper = {"Low": 1, "Medium": 2, "High": 3}
def mycode(train_files, test_files, specific, indicies):
    """Featurize a shuffled subset of AcousticBrainz tracks (selected
    feature indices only), train a multi-output random forest, persist
    all artifacts, and write genre predictions for the test set."""
    indicies = np.array(indicies)
    #indicies = indicies[:len(indicies)//4]
    scalar = StandardScaler()
    mlb = MultiLabelBinarizer()
    train_labels = []
    train_data = []
    train_keys = []
    keys = list(train_files.keys())
    random.shuffle(keys)
    subset = 150000  # len(keys)
    count = 0
    for f in keys[:subset]:
        count += 1
        # Tracks are sharded under two-character prefix directories.
        path = constants.path + 'acousticbrainz-mediaeval-train/' + f[:2] + '/' + f + '.json'
        song = readjson(path)
        feat = getAllFeatures(song)
        # Skip tracks whose raw feature vector is not the expected length.
        if len(feat) != 2647:
            continue
        feat = np.array(feat)
        feat = feat[indicies]  # keep only the selected feature indices
        train_keys.append(f)
        train_data.append(feat)
        train_labels.append(train_files[f])
        if count % 10000 == 0:
            print("on ", count, "length of keys: ", len(train_keys))
    print('finished train')
    train_labels = mlb.fit_transform(train_labels)
    train_data = scalar.fit_transform(train_data)
    print('finished transforming')
    # Persist the fitted transformers and the featurized training data.
    path = constants.path + specific + '_all2_mlb.pkl'
    dump(mlb, path)
    path = constants.path + specific + '_all2_scalar.pkl'
    dump(scalar, path)
    print(np.shape(train_data))
    path = constants.path + specific + '_all2_train.pkl'
    data = dict()
    data['features'] = train_data
    data['labels'] = train_labels
    data['keys'] = train_keys
    dump(data, path)
    print('finished dumping')
    #classifier = MultiOutputClassifier(LinearSVC(C=10, class_weight='balanced', dual=True), n_jobs = 4)
    classifier = MultiOutputClassifier(RandomForestClassifier(
        n_estimators=32, class_weight='balanced'), n_jobs=4)
    # Drop references before the memory-hungry fit.
    data = 0
    train_files = 0
    train_keys = 0
    keys = 0
    gc.collect()
    classifier.fit(train_data, train_labels)
    print('finished fitting')
    path = constants.path + specific + '_all2_classifier.pkl'
    dump(classifier, path)
    """
    with open(constants.path + specific + '_all2_scalar.pkl', 'rb') as data_file:
        scalar = pickle.load(data_file)
    with open(constants.path + specific + '_all2_mlb.pkl', 'rb') as data_file:
        mlb = pickle.load(data_file)
    with open(constants.path + specific + '_all2_classifier.pkl', 'rb') as data_file:
        classifier = pickle.load(data_file)
    """
    # Release training structures before featurizing the test set.
    data = 0
    train_data = 0
    train_labels = 0
    train_keys = 0
    gc.collect()
    #test_labels = []
    test_data = []
    test_keys = list(test_files.keys())
    mean = scalar.mean_
    for f in test_keys:
        path = constants.path + 'acousticbrainz-mediaeval-train/' + f[:2] + '/' + f + '.json'
        song = readjson(path)
        feat = getAllFeatures(song)
        """
        if len(feat) < 2647:
            length = len(feat)
            for m in mean[length:]:
                feat += [m]
        """
        #feat = np.array(feat)
        # Pad short vectors with random values up to the expected length.
        if len(feat) < 2647:
            length = len(feat)
            print('Before: ', length)
            for m in range(2647 - length):
                feat += [np.random.rand()]
            #m = mean[indicies.index(2647)]
            #feat += [m]
            print('After: ', len(feat))
        feat = np.array(feat)
        feat = feat[indicies]
        test_data.append(feat)
    test_data = scalar.transform(test_data)
    predictions = classifier.predict(test_data)
    print('finished predictions')
    genre_predictions = mlb.inverse_transform(predictions)
    write(genre_predictions, test_keys, specific)
    print('finished writing predictions')
# Load libraries
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# Example multi-label data: each sample is a tuple of state labels.
y = [('Texas', 'Florida'),
     ('California', 'Alabama'),
     ('Texas', 'Florida'),
     ('Delware', 'Florida'),
     ('Texas', 'Alabama')]

# Fit the binarizer and produce the multi-hot indicator matrix.
one_hot = MultiLabelBinarizer()
one_hot.fit_transform(y)

# Inspect the learned class ordering (one column per class above).
one_hot.classes_
def run(n_splits=5):
    """
    Prepare the Brexit Blog Corpus data set for analysis

    :param n_splits: int, the number of train/test splits to generate, default=5
    :return:
    """
    print('Reading and processing the xlsx file...', end='')
    brexit_blog_corpus = pd.read_excel('brexit_blog_corpus.xlsx')
    # fix up some typos
    brexit_blog_corpus.replace('concession/contrarines', np.nan, inplace=True)
    brexit_blog_corpus.replace('hypotheticallity', 'hypotheticality',
                               inplace=True)
    # unfortunately, quite a few utterances are duplicates :(
    clean_dataset = brexit_blog_corpus.drop_duplicates(subset='Utterance')
    stance_columns = [
        'Stance category', 'second stance category', 'third', 'fourth', 'fifth'
    ]
    clean_dataset = clean_dataset[['Utterance ID No', 'Utterance'] +
                                  stance_columns].set_index('Utterance ID No')
    # extract the stance categories and do some cleaning
    stance_categories = set(clean_dataset[stance_columns].values.flatten())
    stance_categories.discard(np.nan)
    stance_categories = sorted(list(stance_categories))
    # normalise category names for use as column labels
    stance_categories = [
        w.replace(' ', '-').replace('/', '-') for w in stance_categories
    ]
    # one-hot encode the assigned stance labels (NaN entries dropped per row)
    mlb = MultiLabelBinarizer()
    k_hot_encoded_stances = mlb.fit_transform(
        [x[~pd.isnull(x)] for x in clean_dataset[stance_columns].values])
    k_hot_encoded_stances = pd.DataFrame(index=clean_dataset.index,
                                         data=k_hot_encoded_stances,
                                         columns=list(mlb.classes_))
    k_hot_encoded_stances.columns = stance_categories
    # join the one-hot encoded labels and utterances back together again
    clean_dataset_one_hot = clean_dataset[['Utterance', 'Stance category']] \
        .join(k_hot_encoded_stances)
    print('done.')
    print('Tokenising the utterances...', end='')
    # tokenize the Utterance
    tokenizer = NISTTokenizer()
    clean_dataset_one_hot.Utterance = clean_dataset_one_hot.Utterance.apply(
        lambda x: ' '.join(tokenizer.tokenize(x, lowercase=True)))
    print('done.')
    print('Constructing train/test split and saving to disk...', end='')
    # split the data into train and test sets in the ratio 80:20
    stance_columns = set(clean_dataset_one_hot.columns).difference(
        ['Utterance', 'Stance category'])
    stance_columns = sorted(list(stance_columns))
    # first split the data in two to get train and test sets, stratified
    # on the primary stance category
    reset_seeds()
    X_train, X_test, y_train, y_test = \
        train_test_split(clean_dataset_one_hot['Utterance'],
                         clean_dataset_one_hot[stance_columns],
                         test_size=0.2,
                         stratify=clean_dataset_one_hot['Stance category'])
    y_train['set'] = 'train'
    y_test['set'] = 'test'
    dataset = pd.concat([
        pd.DataFrame(data={
            'Utterance': X_train
        }).join(y_train),
        pd.DataFrame(data={
            'Utterance': X_test
        }).join(y_test)
    ], axis=0)
    dataset.to_csv('bbc_dataset.csv')
    print('done.')
    print('Constructing the cv folds and saving to disk...', end='')
    # mark each training row 'train'/'test' per stratified fold
    X_train_folds = pd.DataFrame(
        index=X_train.index,
        columns=['fold_{}'.format(i) for i in range(1, n_splits + 1)])
    skf = StratifiedKFold(n_splits=n_splits)
    y = clean_dataset_one_hot.loc[y_train.index, 'Stance category']
    for i, (train_idx,
            test_idx) in enumerate(skf.split(np.zeros(X_train.shape[0]), y)):
        X_train_folds.iloc[train_idx, i] = 'train'
        X_train_folds.iloc[test_idx, i] = 'test'
    X_train_folds.to_csv('bbc_dataset_folds.csv')
    print('done.')
    print('Pre-computing the ELMO embeddings and saving to disk...', end='')
    elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=False)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        elmo_train_embeddings = session.run(
            elmo(tf.squeeze(tf.cast(X_train.values, tf.string)),
                 signature='default', as_dict=True)['default'])
        elmo_train_embeddings = pd.DataFrame(index=X_train.index,
                                             data=elmo_train_embeddings)
        elmo_train_embeddings.to_csv('bbc_elmo_train_embeddings.csv')
        elmo_test_embeddings = session.run(
            elmo(tf.squeeze(tf.cast(X_test.values, tf.string)),
                 signature='default', as_dict=True)['default'])
        elmo_test_embeddings = pd.DataFrame(index=X_test.index,
                                            data=elmo_test_embeddings)
        elmo_test_embeddings.to_csv('bbc_elmo_test_embeddings.csv')
    print('done.')
df = pd.read_csv(data_path / "scraped" / "bgg_GameItem.csv",
                 index_col="bgg_id")
df.shape

# %%
df.sample(5, random_state=SEED).T

# %%
df.num_votes.sum()

# %%
# Multi-hot encode the comma-separated game_type column; missing values
# (floats, i.e. NaN-typed cells) and empty strings become empty label sets.
mlb = MultiLabelBinarizer()
values = mlb.fit_transform(
    df.game_type.apply(
        lambda x: [str(x)]
        if isinstance(x, float) and pd.notna(x)
        else x.split(",") if isinstance(x, str) and x else []
    )
)
values.shape

# %%
# One indicator column per game type, aligned to the game index.
gt_df = pd.DataFrame(data=values, columns=mlb.classes_, index=df.index)
gt_df.shape

# %%
# Keep only the curated game_types columns, renamed via the mapping.
games = df.join(gt_df[list(game_types)].rename(columns=game_types))
games.shape

# %%
i += 1
# First train_portion of the valid rows go to train, the rest to test.
if i <= total_valid_data * train_portion:
    train_data.append(content)
    train_target.append(labels)
else:
    test_data.append(content)
    test_target.append(labels)

print "训练样本量", len(train_data), len(train_target)
print "测试样本量", len(test_data), len(test_target)
print("Loading newsgroups training set... ")
print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
X_train = np.array((train_data))
print "eee", type(X_train)
print("done in %fs" % (time() - t0))
y_train = mlb.fit_transform(train_target)
print("An ordering for the training class labels:")
print list(mlb.classes_)
print("Loading newsgroups test set... ")
print("Extracting features from the dataset using the same vectorizer")
t0 = time()
X_test = np.array((test_data))
# NOTE(review): fit_transform here REFITS the binarizer on the test
# labels; if the test set lacks some training label the columns no longer
# line up with y_train -- mlb.transform(test_target) is likely intended.
y_test = mlb.fit_transform(test_target)
print("An ordering for the test class labels:")
print list(mlb.classes_)
classifier = Pipeline([('vectorizer', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('clf', OneVsRestClassifier(LinearSVC()))])
print("done in %fs" % (time() - t0))
classifier.fit(X_train, y_train)
 X_valid] = parser('./CleanData/crypto_light.csv')
# Truncate each split to a fixed size.
[y_train, y_test, y_valid, X_train, X_test, X_valid] = [
    y_train[:165], y_test[:20], y_valid[:15],
    X_train[:165], X_test[:20],
    # NOTE(review): X_valid[15] selects a single row; the surrounding
    # slices suggest X_valid[:15] was intended -- confirm.
    X_valid[15]
]
# Multilabel Binarizer
mlb = MultiLabelBinarizer()
# print "===========Training Data=============="
# print X_train
# print y_train
# print type(y_train[0])
# print y_train[1]
# print y_train[2]
# y_train = [['New York'],['New York'],['New York'],['New York'],['New York'],['New York'],['London'],['London'], ['London'],['London'],['London'],['London'],['New York', 'London'],['New York', 'London'] ]
y_train = mlb.fit_transform(y_train)
# print y_train
print "classes", list(mlb.classes_)
print len(list(mlb.classes_))
print len(X_train)
# print "-----Binarize y_train----------"
# print y_train
# Pipeline(vectorization, tfid weighting and classifier)
# ppl = Pipeline([
#     ('vectorizer', HashingVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', OneVsRestClassifier(LinearSVC()))])
ppl = Pipeline([('vectorizer', HashingVectorizer()),
                ('clf', OneVsRestClassifier(LinearSVC()))])
data = pd.read_csv(args.train_data_path)
# Strip newlines and internal whitespace from the content column.
data['content'] = data.content.map(
    lambda x: ''.join(x.strip().split()))
# Concatenate subject and sentiment into one label: 10 subjects x 3
# sentiment values -> a single 30-way multi-label problem instead of two
# separate classification tasks.
data['label'] = data['subject'] + data['sentiment_value'].astype(str)
subj_lst = list(filter(lambda x: x is not np.nan, list(set(data.label))))
subj_lst_dict = {value: key for key, value in enumerate(subj_lst)}
data['label'] = data['label'].apply(lambda x: subj_lst_dict.get(x))
# Group duplicate contents into per-document multi-label sets.
data_tmp = data.groupby('content').agg({
    'label': lambda x: set(x)
}).reset_index()
mlb = MultiLabelBinarizer()
# Multi-hot (one-hot-like) target matrix.
data_tmp['hh'] = mlb.fit_transform(data_tmp.label).tolist()
y_train = np.array(data_tmp.hh.tolist())
# Build the bag-of-words vocabulary; documents are padded/truncated to a
# fixed length of 100 tokens.
bow = BOW(data_tmp.content.apply(jieba.lcut).tolist(), min_count=1,
          maxlen=100)
vocab_size = len(bow.word2idx)
# print(vocab_size)  # 19887
# print(bow.word_count)  # per-word occurrence counts
embedding_matrix = np.zeros((vocab_size + 1, 300))
# Fill the embedding matrix with pretrained vectors for known words.
embedding = load_embedding()
for key, value in bow.word2idx.items():
    if key in embedding.keys():
        embedding_matrix[value] = embedding[key]
]
df = pd.read_table('../data/meddra_all_se.tsv', names=columns)
# Drop "lowest level term" rows, keeping preferred terms only.
df.drop(df[df.meddra_type == "LLT"].index, inplace=True)
print(df.info())
# One row per compound with the list of its side-effect names.
df = df.groupby('stitch_id_flat').side_effect_name.apply(list).reset_index()
df['pubchem_id'] = df.stitch_id_flat.map(stitch_to_pubchem)
print(df.head())
d2 = pd.read_excel("../data/2d_prop.xlsx")
d3 = pd.read_excel("../data/3d_prop.xlsx")
print(d2.shape, d3.shape)
# Keep numeric property columns only.
d2 = d2.select_dtypes(include=['int64', 'float64'])
d3 = d3.select_dtypes(include=['float64'])
# NOTE(review): `sedf` and `mlb` are defined outside this chunk --
# presumably the grouped side-effect frame and a MultiLabelBinarizer;
# confirm upstream.
y = mlb.fit_transform(sedf['side_effect_name'])
print(y.shape)
# NOTE(review): '..data/sub_sys.xlsx' looks like it is missing a '/'
# ('../data/sub_sys.xlsx') -- confirm the intended path.
se = pd.read_excel('..data/sub_sys.xlsx')
#se.info()
se.info()
print(sorted(se.count().values, reverse=True))
test_cols = list(se.columns)
test_cols_update = test_cols[:]
df = se[test_cols_update]
print(list(df.columns))  # .index('blood')
print(len(df.columns))
def multilabel2(total):
    """Run 10-fold cross-validated multi-label text classification.

    Args:
        total: DataFrame whose column 0 is the text ("frase") and whose
            columns 1-2 ("clase1", "clase2") hold up to two class labels
            per row.

    Returns:
        Tuple ``(recall, precision, f1_score, accuracy)`` — macro-averaged
        scores of the LAST fold (unchanged from the original behaviour).
        Per-fold results from ``evaluacion`` are accumulated in ``scores``
        internally but not returned.
    """
    # Bag-of-words -> tf-idf -> one-vs-rest SGD (modified Huber loss).
    pipeline = Pipeline([
        ('vectorize', CountVectorizer()),
        ('tf_idf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(SGDClassifier(loss='modified_huber')))
    ])
    mlb = MultiLabelBinarizer()
    scores = []
    kf = KFold(n_splits=10, random_state=0, shuffle=True)
    for train_idx, test_idx in kf.split(total):
        # Column 0 is the raw text; flatten the single-column frame to a
        # plain list of strings for the vectorizer.
        X_train = [row[0] for row in np.array(total.iloc[train_idx, [0]])]
        X_test = [row[0] for row in np.array(total.iloc[test_idx, [0]])]
        y_train = np.array(total.iloc[train_idx, [1, 2]])
        y_test = np.array(total.iloc[test_idx, [1, 2]])
        # NOTE(review): the binarizer is refit on every fold; a test fold
        # containing a label unseen in its training fold would raise in
        # transform() — confirm the label space is stable across folds.
        y_train = mlb.fit_transform(y_train)
        y_test = mlb.transform(y_test)
        pipeline.fit(X_train, y_train)
        predicted = pipeline.predict(X_test)
        scores.append(evaluacion(y_test, predicted))
    # Metrics are computed on the final fold only, matching the values the
    # original implementation returned.
    recall = metrics.recall_score(y_test, predicted, average='macro')
    print("Recall: %f" % recall)
    precision = metrics.precision_score(y_test, predicted, average='macro')
    print("Precision: %f" % precision)
    f1_score = metrics.f1_score(y_test, predicted, average='macro')
    print("F1-score: %f" % f1_score)
    accuracy = metrics.accuracy_score(y_test, predicted)
    print("accuracy: %f" % accuracy)
    return recall, precision, f1_score, accuracy
# https://medium.com/@michaeldelsole/what-is-one-hot-encoding-and-how-to-do-it-f0ae272f1179 from sklearn.preprocessing import LabelEncoder, OneHotEncoder from sklearn.preprocessing import MultiLabelBinarizer import numpy as np import pandas as pd dataset = pd.read_csv('/home/ubuntu/keras/enver/dmlvh2/data2.csv') Y = dataset.iloc[:,1].values mlb = MultiLabelBinarizer() YY = mlb.fit_transform(Y) print(YY) np.savetxt('Y2.csv',YY, fmt='%d')
def main():
    """Train/evaluate a multiclass or multi-label classifier on saved doc codes.

    Command-line driven (Python 2): loads JSON doc-code and doc-label files,
    binarizes labels, then either does a single train/val/test run or
    k-fold-style ShuffleSplit cross validation.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('train_doc_codes', type=str, help='path to the train doc codes file')
    parser.add_argument('train_doc_labels', type=str, help='path to the train doc labels file')
    parser.add_argument('test_doc_codes', type=str, help='path to the test doc codes file')
    parser.add_argument('test_doc_labels', type=str, help='path to the test doc labels file')
    parser.add_argument('-nv', '--n_val', type=int, default=1000, help='size of validation set (default 1000)')
    parser.add_argument('-ne', '--n_epoch', type=int, default=100, help='num of epoches (default 100)')
    parser.add_argument('-bs', '--batch_size', type=int, default=100, help='batch size (default 100)')
    parser.add_argument('-cv', '--cross_validation', type=int, help='k-fold cross validation')
    parser.add_argument('-mlc', '--multilabel_clf', action='store_true', help='multilabel classification flag')
    args = parser.parse_args()

    # autoencoder codes: JSON dicts mapping doc id -> code vector / label.
    train_doc_codes = load_json(args.train_doc_codes)
    train_doc_labels = load_json(args.train_doc_labels)
    test_doc_codes = load_json(args.test_doc_codes)
    test_doc_labels = load_json(args.test_doc_labels)
    # Stack code vectors row-wise; labels aligned by iterating the same dict.
    X_train = np.r_[train_doc_codes.values()]
    Y_train = [train_doc_labels[i] for i in train_doc_codes]
    X_test = np.r_[test_doc_codes.values()]
    Y_test = [test_doc_labels[i] for i in test_doc_codes]

    # # DBN
    # X_train = np.array(load_pickle(args.train_doc_codes))
    # Y_train = load_pickle(args.train_doc_labels)
    # X_test = np.array(load_pickle(args.test_doc_codes))
    # Y_test = load_pickle(args.test_doc_labels)
    # import pdb;pdb.set_trace()

    if args.multilabel_clf:
        # Fit the binarizer on the union of train+test label sets so both
        # splits share one class ordering.
        encoder = MultiLabelBinarizer()
        encoder.fit(Y_train + Y_test)
        Y_train = encoder.transform(Y_train)
        Y_test = encoder.transform(Y_test)
    else:
        # Single-label: integer-encode then one-hot via np_utils.
        Y = Y_train + Y_test
        n_train = len(Y_train)
        n_test = len(Y_test)
        encoder = LabelEncoder()
        Y = np_utils.to_categorical(encoder.fit_transform(Y))
        Y_train = Y[:n_train]
        Y_test = Y[-n_test:]

    seed = 7
    np.random.seed(seed)
    if not args.cross_validation:
        # Carve a random validation subset out of the training data.
        val_idx = np.random.choice(range(X_train.shape[0]), args.n_val, replace=False)
        train_idx = list(set(range(X_train.shape[0])) - set(val_idx))
        X_new_train = X_train[train_idx]
        Y_new_train = Y_train[train_idx]
        X_new_val = X_train[val_idx]
        Y_new_val = Y_train[val_idx]
        print 'train: %s, val: %s, test: %s' % (
            X_new_train.shape[0], X_new_val.shape[0], X_test.shape[0])
        if args.multilabel_clf:
            results = multilabel_classifier(X_new_train, Y_new_train, X_new_val, Y_new_val,
                X_test, Y_test, nb_epoch=args.n_epoch, batch_size=args.batch_size, seed=seed)
            print 'f1 score on test set: macro_f1: %s, micro_f1: %s' % tuple(
                results)
        else:
            results = multiclass_classifier(X_new_train, Y_new_train, X_new_val, Y_new_val,
                X_test, Y_test, nb_epoch=args.n_epoch, batch_size=args.batch_size, seed=seed)
            print 'acc on test set: %s' % results
    else:
        # Cross validation over the pooled train+test data.
        X = np.concatenate((X_train, X_test), axis=0)
        Y = np.concatenate((Y_train, Y_test), axis=0)
        ss = ShuffleSplit(n_splits=int(args.cross_validation),
                          test_size=X_test.shape[0],
                          random_state=seed)
        results = []
        for train_idx, test_idx in ss.split(X):
            val_idx = np.random.choice(train_idx, args.n_val, replace=False)
            new_train_idx = list(set(train_idx) - set(val_idx))
            X_new_train = X[new_train_idx]
            Y_new_train = Y[new_train_idx]
            X_new_val = X[val_idx]
            Y_new_val = Y[val_idx]
            if args.multilabel_clf:
                # NOTE(review): this branch evaluates on the fixed X_test/Y_test
                # while the multiclass branch below uses the fold's X[test_idx]
                # — confirm the asymmetry is intentional.
                results.append(multilabel_classifier(X_new_train, Y_new_train, X_new_val, Y_new_val,
                    X_test, Y_test, nb_epoch=args.n_epoch, batch_size=args.batch_size, seed=seed))
            else:
                results.append(multiclass_classifier(X_new_train, Y_new_train, X_new_val, Y_new_val,
                    X[test_idx], Y[test_idx], nb_epoch=args.n_epoch, batch_size=args.batch_size, seed=seed))
        if args.multilabel_clf:
            # Aggregate per-fold (macro_f1, micro_f1) pairs.
            macro_f1, micro_f1 = zip(*results)
            macro_mean = np.mean(macro_f1)
            macro_std = np.std(macro_f1)
            micro_mean = np.mean(micro_f1)
            micro_std = np.std(micro_f1)
            print 'f1 score on %s-fold cross validation: macro_f1: %s (%s), micro_f1: %s (%s)' \
                % (int(args.cross_validation), macro_mean, macro_std,
                   micro_mean, micro_std)
        else:
            mean = np.mean(results)
            std = np.std(results)
            print 'acc on %s-fold cross validation: %s (%s)' % (int(
                args.cross_validation), mean, std)
    # Deliberate breakpoint at the end of the run (debugging aid).
    import pdb
    pdb.set_trace()
class Dataset(object):
    def __init__(self, inputs, labels, test_indices=None, **kwargs):
        """Encapsulates all pieces of data to run an experiment. This is basically a
        bag of items that makes it easy to serialize and deserialize everything as a unit.

        Args:
            inputs: The raw model inputs. This can be set to None if you dont want
                to serialize this value when you save the dataset.
            labels: The raw output labels.
            test_indices: The optional test indices to use. Ideally, this should be generated one time
                and reused across experiments to make results comparable. `generate_test_indices`
                can be used generate first time indices.
            **kwargs: Additional key value items to store.
        """
        self.X = np.array(inputs)
        self.y = np.array(labels)
        # Attach any extra experiment metadata as instance attributes.
        for key, value in kwargs.items():
            setattr(self, key, value)

        self._test_indices = None
        self._train_indices = None
        self.test_indices = test_indices  # routed through the property setter below

        # Multi-label iff each label entry is itself a collection.
        self.is_multi_label = isinstance(labels[0], (set, list, tuple))
        self.label_encoder = MultiLabelBinarizer(
        ) if self.is_multi_label else LabelBinarizer()
        # NOTE(review): .flatten() collapses the encoder's 2-D indicator output
        # to 1-D; for multi-label data this discards the (sample, class)
        # structure — confirm downstream code expects the flat form.
        self.y = self.label_encoder.fit_transform(self.y).flatten()

    def update_test_indices(self, test_size=0.1):
        """Updates `test_indices` property with indices of `test_size` proportion.

        Args:
            test_size: The test proportion in [0, 1] (Default value: 0.1)
        """
        if self.is_multi_label:
            # Stratification for multi-label is delegated to a project helper.
            self._train_indices, self._test_indices = sampling.multi_label_train_test_split(
                self.y, test_size)
        else:
            sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size)
            self._train_indices, self._test_indices = next(
                sss.split(self.X, self.y))

    def save(self, file_path):
        """Serializes this dataset to a file.

        Args:
            file_path: The file path to use.
        """
        utils.dump(self, file_path)

    def train_val_split(self, split_ratio=0.1):
        """Generates train and validation sets from the training indices.

        Args:
            split_ratio: The split proportion in [0, 1] (Default value: 0.1)

        Returns:
            The stratified train and val subsets. Multi-label outputs are handled as well.
        """
        if self.is_multi_label:
            train_indices, val_indices = sampling.multi_label_train_test_split(
                self.y, split_ratio)
        else:
            sss = StratifiedShuffleSplit(n_splits=1, test_size=split_ratio)
            train_indices, val_indices = next(sss.split(self.X, self.y))
        return self.X[train_indices], self.X[val_indices], self.y[
            train_indices], self.y[val_indices]

    @staticmethod
    def load(file_path):
        """Loads the dataset from a file.

        Args:
            file_path: The file path to use.

        Returns:
            The `Dataset` instance.
        """
        return utils.load(file_path)

    @property
    def test_indices(self):
        return self._test_indices

    @test_indices.setter
    def test_indices(self, test_indices):
        # With no explicit test set, every row is a training row;
        # _test_indices keeps its prior value (None at construction time).
        if test_indices is None:
            self._train_indices = np.arange(0, len(self.y))
        else:
            self._test_indices = test_indices
            self._train_indices = np.setdiff1d(np.arange(0, len(self.y)),
                                               self.test_indices)

    @property
    def train_indices(self):
        return self._train_indices

    @property
    def labels(self):
        # Class labels in the order the encoder assigned to columns.
        return self.label_encoder.classes_

    @property
    def num_classes(self):
        # A flat 1-D y is treated as a single output column.
        if len(self.y.shape) == 1:
            return 1
        else:
            return len(self.labels)
"""If you can't find it, we need to understand how did it happen that we lost them? It happened during the built-in tokenization of TfidfVectorizer. Luckily, we can influence on this process. Get back to the function above and use '(\S+)' regexp as a *token_pattern* in the constructor of the vectorizer. Now, use this transormation for the data and check again. """ tfidf_reversed_vocab[1976] ######### YOUR CODE HERE ############# """### MultiLabel classifier As we have noticed before, in this task each example can have multiple tags. To deal with such kind of prediction, we need to transform labels in a binary form and the prediction will be a mask of 0s and 1s. For this purpose it is convenient to use [MultiLabelBinarizer](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html) from *sklearn*. """ from sklearn.preprocessing import MultiLabelBinarizer mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys())) y_train = mlb.fit_transform(y_train) y_val = mlb.fit_transform(y_val) """Implement the function *train_classifier* for training a classifier. In this task we suggest to use One-vs-Rest approach, which is implemented in [OneVsRestClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html) class. In this approach *k* classifiers (= number of tags) are trained. As a basic classifier, use [LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html). It is one of the simplest methods, but often it performs good enough in text classification tasks. 
It might take some time, because a number of classifiers to train is large.""" from sklearn.multiclass import OneVsRestClassifier from sklearn.linear_model import LogisticRegression, RidgeClassifier from sklearn.svm import SVC from sklearn.naive_bayes import GaussianNB from sklearn.neural_network import MLPClassifier from sklearn.decomposition.nmf import NMF from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import AdaBoostClassifier def train_classifier(X_train, y_train): """
def prepare_data_frame(visu=True, load_raw=False, filename="predict_movie_categories_dataframe.csv", save=True):
    """Build (or reload) the movie-genre training dataframe and its labels.

    Args:
        visu: If True, print summaries and plot genre/word occurrence charts.
        load_raw: If True, rebuild the dataframe from the raw movie data;
            otherwise reload the previously saved CSV.
        filename: CSV file name under `up.data_processed_dir`.
        save: When rebuilding, also persist the result to CSV.

    Returns:
        Tuple ``(df, mb, labels)``: the cleaned dataframe, the fitted
        MultiLabelBinarizer, and the binary genre-label matrix.
    """
    df = None
    labels = None
    mb = None
    print("start prepare data")
    start = timeit.default_timer()
    genres_with_occurences = {}
    words_with_occurences = {}
    if load_raw:
        # Clean the raw dump and keep only the text-bearing columns.
        df = processing.clean(processing.raw())
        df = df[['id', 'belongs_to_collection', 'genres', 'original_title', 'overview', 'tagline', 'title']]
        # Collection is a dict; keep just its name (empty string if absent).
        df['belongs_to_collection'] = df['belongs_to_collection'] \
            .apply(lambda x: x['name'] if 'name' in x.keys() else '')
        # Genres are dicts; keep just the genre names.
        df['genres'] = df['genres'].apply(lambda x: [e['name'] for e in x])
        keywords = keywords_processing.get_films_with_keywords(keywords_processing.raw())
        df = pd.merge(df, keywords, how='left', on=['id'])
        # Keep only the 10 most frequent genres; drop films with none left.
        genres_with_occurences = {}
        df['genres'].apply(lambda x: collect_occurences(x, genres_with_occurences))
        genres_with_occurences = {x: y for x, y in
                                  sorted(genres_with_occurences.items(), key=lambda e: e[1], reverse=True)[:10]
                                  }
        print(genres_with_occurences)
        df['genres'] = df['genres'].apply(lambda x: delIfNotKnown(x, genres_with_occurences.keys()))
        df = df[df['genres'].apply(lambda x: len(x) > 0)]
        # Prepare labels (as a 2D binary array)
        mb = MultiLabelBinarizer()
        labels = mb.fit_transform(df['genres'])
        # Normalize all free-text columns; keep an unmodified title for display.
        df['keywords'] = df['keywords'].apply(clean_text)
        df['overview'] = df['overview'].apply(clean_text)
        df['tagline'] = df['tagline'].apply(clean_text)
        df['title_not_modified'] = df['title']
        df['title'] = df['title'].apply(clean_text)
        df['original_title'] = df['original_title'].apply(clean_text)
        df['belongs_to_collection'] = df['belongs_to_collection'].apply(lambda x: x.replace(' ', ''))
        # Word frequency over overviews (used only for the visualisation below).
        words_with_occurences = {}
        df['overview'].apply(lambda x: collect_occurences(x, words_with_occurences))
        words_with_occurences = {x: y for x, y in
                                 sorted(words_with_occurences.items(), key=lambda e: e[1], reverse=True)
                                 }
        # Concatenate the text columns into the single model input column.
        df['clean_x'] = df.apply(join, axis=1)
        df = df[['title_not_modified', 'clean_x', 'genres']]
        if save:
            df.to_csv(up.data_processed_dir + filename)
    else:
        # Reload the cached CSV; genres were serialized as a stringified list.
        df = pd.read_csv(up.data_processed_dir + filename)
        df['genres'] = df['genres'].apply(lambda x: ast.literal_eval(x))
        mb = MultiLabelBinarizer()
        labels = mb.fit_transform(df['genres'])
    end = timeit.default_timer()
    print("End prepare data, time : ", end - start)
    if visu:
        print(df.head())
        print(df.columns)
        print(df.values[1])
        if genres_with_occurences != {}:
            print('10 most present genres with their occurences :', genres_with_occurences)
            genres_df = pd.DataFrame({'Genre': list(genres_with_occurences.keys()),
                                      'Occurences': list(genres_with_occurences.values())}) \
                .set_index('Genre').rename_axis(None)
            genres_df = genres_df.sort_values('Occurences', ascending=True)
            genres_df.plot.barh()
            plt.show()
        if words_with_occurences != {}:
            print('Words with their occurences : ', words_with_occurences)
            words_df = pd.DataFrame({'Word': list(words_with_occurences.keys())[:100],
                                     'Occurences': list(words_with_occurences.values())[:100]}) \
                .set_index('Word').rename_axis(None)
            words_df = words_df.sort_values('Occurences', ascending=True)
            words_df.plot.barh(figsize=(15, 20))
            plt.show()
        print("Dataframe shape : ", df.shape)
        print(df.info())
        print("Labels")
        print(labels)
        print("Labels shape : ", labels.shape)
        print("For example, ", labels[0], "Stands for", mb.inverse_transform(labels)[0])
    return df, mb, labels
def train(self, max_iterations=10000, learning_rate=5e-4, units=32, hold_prob=1, gpu_id='/cpu:0'):
    """Train the LSTM graph (TF1 session API) with patience-based early stopping.

    Args:
        max_iterations: Iterations per training round (rounds may extend below).
        learning_rate: Optimizer learning rate passed to `build_graph`.
        units: LSTM units passed to `build_graph`.
        hold_prob: Dropout keep probability passed to `build_graph`.
        gpu_id: TF device string for placing the graph.

    Returns:
        Tuple ``(best combined accuracy, self.scale_lstm, model_dict)`` where
        `model_dict` carries hyper-parameters, best weights and metrics.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
        [str(n) for n in range(self.static_data['ngpus'])])
    print('lstm strart')
    H_train, H_val, H_test = self.create_inputs(self.X_train, self.X_val, self.X_test)
    if not self.probabilistic:
        y_val = self.y_val
        y_test = self.y_test
        y_train = self.y_train
    else:
        # Probabilistic mode: bucket continuous targets into fixed bins and
        # binarize the bin memberships (one column per bin).
        classes = np.arange(0.1, 20, 0.2)
        y_val = np.digitize(self.y_val, classes, right=True)
        y_test = np.digitize(self.y_test, classes, right=True)
        y_train = np.digitize(self.y_train, classes, right=True)
        binarizer = MultiLabelBinarizer(classes=classes)
        y_train = binarizer.fit_transform(y_train)
        y_val = binarizer.transform(y_val)
        y_test = binarizer.transform(y_test)
    # Mini-batch size capped at 100 or a fifth of the training set.
    batch_size = np.min([100, int(self.N / 5)])
    tf.compat.v1.reset_default_graph()
    graph_lstm = tf.Graph()
    with graph_lstm.as_default():
        with tf.device(gpu_id):
            x1 = tf.compat.v1.placeholder(
                'float', shape=[None, H_train.shape[1], H_train.shape[2]],
                name='input_data')
            y_pred_ = tf.compat.v1.placeholder(
                tf.float32, shape=[None, y_train.shape[1]],
                name='target_lstm')
        with tf.device(gpu_id):
            train_lstm, cost_lstm, accuracy_lstm, sse_lstm, rse_lstm, weights = self.build_graph(
                x1, y_pred_, learning_rate, units, hold_prob)

    # Early-stopping bookkeeping: four tracked objectives (acc, mse, sse, rse).
    obj_old = np.inf * np.ones(4)
    obj_max = np.inf * np.ones(4)
    obj_min = np.inf * np.ones(4)
    # Pre-sample one random index batch per iteration.
    batches = [
        np.random.choice(self.N, batch_size, replace=False)
        for _ in range(max_iterations + 1)
    ]
    path_group = self.static_data['path_group']
    # NOTE(review): cpu_status is loaded but not referenced below — confirm
    # whether the load is still needed.
    cpu_status = joblib.load(os.path.join(path_group, 'cpu_status.pickle'))
    if sys.platform != 'linux':
        config = tf.compat.v1.ConfigProto(
            allow_soft_placement=True,
            intra_op_parallelism_threads=self.static_data['intra_op'],
            inter_op_parallelism_threads=1)
        config.gpu_options.allow_growth = True
    else:
        config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
    res = dict()
    self.best_weights = dict()
    best_iteration = 0
    best_glob_iterations = 0
    ext_iterations = max_iterations
    train_flag = True
    patience = 8000
    wait = 0
    loops = 0
    with tf.compat.v1.Session(graph=graph_lstm, config=config) as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        while train_flag:
            for i in tqdm(range(max_iterations)):
                if i % 500 == 0:
                    # Every 500 steps: one train step, then evaluate a blended
                    # objective (40% validation / 60% test) for early stopping.
                    sess.run([train_lstm],
                             feed_dict={
                                 x1: H_train[batches[i]],
                                 y_pred_: y_train[batches[i]]
                             })
                    acc_new_v, mse_new_v, sse_new_v, rse_new_v, weights_lstm = sess.run(
                        [
                            accuracy_lstm, cost_lstm, sse_lstm, rse_lstm,
                            weights
                        ],
                        feed_dict={
                            x1: H_val,
                            y_pred_: y_val,
                        })
                    acc_new_t, mse_new_t, sse_new_t, rse_new_t = sess.run(
                        [accuracy_lstm, cost_lstm, sse_lstm, rse_lstm],
                        feed_dict={
                            x1: H_test,
                            y_pred_: y_test
                        })
                    acc_new = 0.4 * acc_new_v + 0.6 * acc_new_t
                    mse_new = 0.4 * mse_new_v + 0.6 * mse_new_t
                    sse_new = 0.4 * sse_new_v + 0.6 * sse_new_t
                    rse_new = 0.4 * rse_new_v + 0.6 * rse_new_t
                    obj_new = np.array(
                        [acc_new, mse_new, sse_new, rse_new])
                    # self.distance decides whether the new objectives improve.
                    flag, obj_old, obj_max, obj_min = self.distance(
                        obj_new, obj_old, obj_max, obj_min)
                    if flag:
                        # Improvement: snapshot trainable weights by name.
                        variables_names = [
                            v.name for v in tf.compat.v1.trainable_variables()
                        ]
                        for k, v in zip(variables_names, weights_lstm):
                            self.best_weights[k] = v
                        res[str(i)] = obj_old
                        print(acc_new)
                        best_iteration = i
                        wait = 0
                    else:
                        wait += 1
                    if wait > patience:
                        train_flag = False
                        break
                else:
                    # Plain training step between evaluation checkpoints.
                    sess.run(train_lstm,
                             feed_dict={
                                 x1: H_train[batches[i]],
                                 y_pred_: y_train[batches[i]]
                             })
                    wait += 1
            best_glob_iterations = ext_iterations + best_iteration
            # If the best iteration was near the end of this round, extend
            # training by another 8000-iteration round (at most 4 extensions).
            if (max_iterations - best_iteration) <= 5000 and max_iterations > 2000:
                if loops > 3:
                    best_glob_iterations = ext_iterations + best_iteration
                    train_flag = False
                else:
                    ext_iterations += 8000
                    max_iterations = 8000
                    best_iteration = 0
                    loops += 1
            else:
                best_glob_iterations = ext_iterations + best_iteration
                train_flag = False
        sess.close()

    # Package everything the caller needs to rebuild/inspect the model.
    model_dict = dict()
    model_dict['units'] = units
    model_dict['hold_prob'] = hold_prob
    model_dict['best_weights'] = self.best_weights
    model_dict['static_data'] = self.static_data
    model_dict['n_vars'] = self.D1
    model_dict['depth'] = self.D2
    model_dict['best_iteration'] = best_glob_iterations
    model_dict['metrics'] = obj_old
    model_dict['error_func'] = res
    print("Total accuracy lstm-3d: %s" % obj_old[0])
    return obj_old[0], self.scale_lstm, model_dict
# Build the evaluation set from the raw rows:
# row[0] = text, row[1] = ';'-separated labels, row[2] = primary key.
for row in traindata:
    content = str(remove_stopwords(row[0].encode('utf-8')))
    content_pk[content] = row[2]
    try:
        labels = set(row[1].split(';'))
    except:
        # NOTE(review): bare except — a row whose label field fails to split
        # silently reuses the PREVIOUS row's `labels` in the append below.
        # Confirm this carry-over is intended.
        print 'error'
    test_data.append(content)
    test_target.append(labels)
path = 'E://model.pkl'
# Prints the evaluation sample sizes (Chinese label text: "test sample size").
print "测试样本量", len(test_data), len(test_target)
loaded_model = joblib.load(path)
X_test = np.array((test_data))
# NOTE(review): the binarizer is fit on the TEST labels here; this assumes
# its class ordering matches the one used at training time — verify.
y_test = mlb.fit_transform(test_target)
print("An ordering for the test class labels:")
print list(mlb.classes_)
predicted = loaded_model.predict(X_test)
# Disabled write-back of predicted labels to the database:
# all_labels = mlb.inverse_transform(predicted)
# for item, labels in zip(X_test, all_labels):
#     pk_main_news = content_pk[item]
#     # print('{0} => {1}'.format(item, ', '.join(labels)))
#     sqlcursor.execute("update cctv_news_content set tname_by_classifier = %s where pk=%s", ((';'.join(labels)),pk_main_news))
#     sqlConn.commit()
print("Classification report on test set for classifier:")
print(classification_report(y_test, predicted))
# exit(0)
# # file produce
# # sqlcursor.execute("""SELECT 'MID', 'key_words', 'related_news','tname_by_classifier'
return [int(x) for x in str_label if len(x)>0] def convert_feature_to_vector(str_feature): str_feature = str_feature[1:-1] str_feature = str_feature.split(',') return [float(x) for x in str_feature] y_train = np.array([convert_label_to_array(y) for y in train_df['label']]) X_train = np.array([convert_feature_to_vector(x) for x in train_df['feature vector']]) X_test = np.array([convert_feature_to_vector(x) for x in test_df['feature vector']]) t=time.time() mlb = MultiLabelBinarizer() train_set_labels= mlb.fit_transform(y_train) #Convert list of labels to binary matrix random_state = np.random.RandomState(0) train_set_features, test_set_features, train_set_labels, test_set_true_labels = train_test_split(X_train, train_set_labels, test_size=.2, random_state=random_state) classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True)) classifier.fit(train_set_features, train_set_labels) predicted_labels = classifier.predict(test_set_features) print "Time passed: ", "{0:.1f}".format(time.time()-t), "sec" print "F1 score: ", f1_score(test_set_true_labels, predicted_labels, average='micro')
def eval_classification(self,
                        truth_file,
                        pred_files,
                        eval_list,
                        plot_list,
                        over_all=False,
                        ensemble_voting="soft",
                        ensemble_weights=None,
                        class_names=None,
                        convert_binary=False,
                        binary_threshold=None):
    """
    This function calculates the evaluation measures required by the user
    for classification problems.
    Args:
        truth_file: A csv file containing ids and truth annotations.
        pred_files: A list of csv files containing ids and prediction annotations.
        eval_list: A list of evaluation measures.
        plot_list: A list of evaluation plots.
        over_all: if class wise results are required or overall result, in
            case of multiclass classification, default false.
        ensemble_voting: Type of voting in case of multiple prediction(soft/hard)
            default soft.
        ensemble_weights: Weights for each class in case of ensemble, default None.
        class_names: An array containing class names, default None.
        convert_binary: If multiclass predictions should be evaluated as binary
            problem (normal vs abnormal)first value should represent probability
            of normal class.
        binary_threshold: threshold to be used in case of multiclass to binary
            conversion.
    Returns:
        A dictionary containing evaluation result.
    Raises:
        "Invalid evaluation term" if a term is not present in supported list.
    """
    self.eval_result = {}
    self.classes = None
    self.eval_plots = []
    self.read_data(truth_file, pred_files)
    eval_list = [element.strip().lower() for element in eval_list]
    if self.ensemble:
        self.eval_ensemble(ensemble_voting, ensemble_weights)
    if self.multilabel:
        # Multilabel annotations arrive as stringified lists; parse them.
        self.truth = np.array([literal_eval(p) for p in self.truth])
        self.pred = np.array([literal_eval(p) for p in self.pred])
    classes = self.classes
    if class_names:
        classes = class_names
    # String truth values are mapped to their class index.
    if isinstance(self.truth[0], type("str")):
        self.truth = np.array([classes.index(tval) for tval in self.truth])
    if not self.multilabel:
        if len(self.pred.shape
               ) > 1 and convert_binary and binary_threshold:
            # Collapse multiclass probabilities to binary: column 0 is the
            # "normal" class; everything else becomes the positive class.
            self.pred_max = np.array([
                0 if prd_n[0] >= binary_threshold else 1
                for prd_n in self.pred
            ])
            self.truth = np.array(
                [1 if truth_n != 0 else truth_n for truth_n in self.truth])
            classes = [self.classes[0], '!' + self.classes[0]]
        elif len(self.pred.shape) > 1:
            self.pred_max = np.argmax(self.pred, axis=1)
        else:
            self.pred_max = self.pred
        # Per-class TP/FP/FN/TN from the confusion matrix.
        conf_matrix = metrics.confusion_matrix(self.truth, self.pred_max)
        true_pos = [0] * len(classes)
        false_pos = [0] * len(classes)
        false_neg = [0] * len(classes)
        true_neg = [0] * len(classes)
        col_sum = np.sum(conf_matrix, axis=0)
        row_sum = np.sum(conf_matrix, axis=1)
        cum_sum = np.sum(conf_matrix)
        for k in range(0, len(classes)):
            true_pos[k] += conf_matrix[k, k]
            false_pos[k] += col_sum[k] - true_pos[k]
            false_neg[k] += row_sum[k] - true_pos[k]
            true_neg[k] += cum_sum - true_pos[k] - \
                false_pos[k] - false_neg[k]
    else:
        mlb = MultiLabelBinarizer()
        self.truth = mlb.fit_transform(self.truth)
        self.pred = mlb.transform(self.pred)
        classes = mlb.classes_
        # Encode truth*2 + pred per cell: 3 = TP, 2 = FN, 1 = FP, 0 = TN.
        self.truth = self.truth * 2
        np_sum = np.add(self.truth, self.pred)
        true_pos = [0] * len(classes)
        false_pos = [0] * len(classes)
        false_neg = [0] * len(classes)
        true_neg = [0] * len(classes)
        for i in range(0, len(classes)):
            true_pos[i] = np.sum(np_sum[:, i] == 3)
            true_neg[i] = np.sum(np_sum[:, i] == 0)
            false_pos[i] = np.sum(np_sum[:, i] == 1)
            false_neg[i] = np.sum(np_sum[:, i] == 2)
    # class wise evaluation
    for cls in classes:
        self.eval_result[cls] = {}
    for element in eval_list:
        if element in ['recall', 'true positive rate', 'sensitivity']:
            for i, cls in enumerate(classes):
                self.eval_result[cls]['recall'] = calc_recall(
                    true_pos[i], false_neg[i])
        elif element in ['specificity', 'true negative rate']:
            for i, cls in enumerate(classes):
                self.eval_result[cls]['specificity'] = calc_specificity(
                    true_neg[i], false_pos[i])
        elif element == 'accuracy':
            for i, cls in enumerate(classes):
                self.eval_result[cls]['accuracy'] = calc_acc(
                    true_pos[i], true_neg[i], false_pos[i], false_neg[i])
        elif element in ['f1_score', 'f1score', 'fscore']:
            for i, cls in enumerate(classes):
                self.eval_result[cls]['f1score'] = calc_f1score(
                    true_pos[i], false_pos[i], false_neg[i])
        elif element in ['precision', 'positive predictive value', 'ppv']:
            for i, cls in enumerate(classes):
                self.eval_result[cls]['precision'] = calc_precision(
                    true_pos[i], false_pos[i])
        elif element in ['negative predictive value', 'npv']:
            for i, cls in enumerate(classes):
                self.eval_result[cls]['npv'] = calc_npv(
                    true_neg[i], false_neg[i])
        elif element in ['false negative rate', 'fnr']:
            for i, cls in enumerate(classes):
                self.eval_result[cls]['fnr'] = calc_fnr(
                    false_neg[i], true_pos[i])
        elif element in ['false positive rate', 'fpr']:
            for i, cls in enumerate(classes):
                self.eval_result[cls]['fpr'] = calc_fpr(
                    false_pos[i], true_neg[i])
        elif element in ['false discovery rate', 'fdr']:
            for i, cls in enumerate(classes):
                self.eval_result[cls]['fdr'] = calc_fdr(
                    true_pos[i], false_pos[i])
        elif element in ['false omission rate', 'for']:
            for i, cls in enumerate(classes):
                self.eval_result[cls]['for'] = calc_for(
                    false_neg[i], true_neg[i])
        elif element in ['matthews correlatin coefficient', 'mcc']:
            for i, cls in enumerate(classes):
                self.eval_result[cls]['mcc'] = calc_mcc(
                    true_pos[i], true_neg[i], false_pos[i], false_neg[i])
        elif element in ['kappa']:
            for i, cls in enumerate(classes):
                # NOTE(review): self.pred_max is only assigned in the
                # non-multilabel branch above — requesting 'kappa' on a
                # multilabel problem would hit an unset attribute. Confirm.
                self.eval_result[cls]['kappa'] = calc_kappa(
                    self.truth, self.pred_max)
        else:
            raise ValueError("invalid Evaluation Term")
    if plot_list:
        self.eval_plot_classification(plot_list, classes, true_pos,
                                      false_pos, false_neg, true_neg)
    if over_all:
        self.calc_overall()
    return self.eval_result, self.eval_plots