def __init__(self, inter_filePath = "inter/technology_companies_of_the_united_states/"): # [[cat,cat...]...] self.m = Word2Vec.load_word2vec_format("vectors/technology_companies_of_the_united_states/cat_train_neg5size400min_count5", binary=True) self.dim = 400 (correct_categories_train, context_categories_train) = self.load_category_page(inter_filePath + "category_page.txt") (correct_categories_test, context_categories_test) = self.load_category_page(inter_filePath + "category_page_test.txt") ## ---- By mean --- Xvectors = np.array(self.predict_vector_by_mean(context_categories_train)) Xvectors_test = np.array(self.predict_vector_by_mean(context_categories_test)) ## ---- By mean --- * ## ---- By SVM --- corpus_train = [" ".join(i) for i in context_categories_train] corpus_test = [" ".join(i) for i in context_categories_test] cv = CountVectorizer(min_df = 1) X = cv.fit_transform(corpus_train) ##TFIDF transformer = TfidfTransformer() X_tfidf = transformer.fit_transform(X) #Labels mlb = MultiLabelBinarizer() mlb.fit(correct_categories_train + correct_categories_test) Y = mlb.transform(correct_categories_train) ###Transform to multilabel indicator #predict test labels X_test = cv.transform(corpus_test) Y_test = mlb.transform(correct_categories_test) #Y_predict_ovr = self.ovrSVM(X, Y, X_test) Y_predict_ovr = self.ovrSVM(Xvectors, Y, Xvectors_test) #Y_predict_ovo = self.ovoSVM(X, Y, X_test) print "---One versus rest---" print "Macro F-1:", f1_score(Y_test, Y_predict_ovr, average='macro') print "Micro F-1:", f1_score(Y_test, Y_predict_ovr, average='micro')
def print_report(name_classificator, testing_problems, testing_tags, predicted_problems, predicted_tags):
    """Print a classification report and the label-ranking average precision.

    The predictions are first re-ordered with ``make_right_order`` against
    ``testing_problems``; both tag collections are then binarized over their
    joint tag vocabulary before scoring.
    """
    predicted_problems, predicted_tags = make_right_order(testing_problems, predicted_problems, predicted_tags)

    # Fit on both sides so a tag seen only in the predictions gets a column too.
    binarizer = MultiLabelBinarizer()
    binarizer.fit(testing_tags + predicted_tags)
    truth = binarizer.transform(testing_tags)
    guess = binarizer.transform(predicted_tags)

    print(name_classificator)
    report = classification_report(truth, guess, target_names=binarizer.classes_)
    print(report)
    lrap = label_ranking_average_precision_score(truth, guess)
    print('label ranking average precision score =', lrap)
    print('\n', ('#'*100), '\n')
def run_classifierAccuracy(trainSentences, trainLabels, testSentences, testLabels):
    """Train a one-vs-rest random forest on TF-IDF features and print scores.

    Labels outside the active label subset are dropped from every sample
    before binarization. Micro, macro and per-class precision / recall / F1
    on the test set are printed; nothing is returned.

    Parameters
    ----------
    trainSentences, testSentences
        Inputs handed to ``tf_idf_fit_transform`` / the fitted vectorizer.
    trainLabels, testLabels
        One iterable of label names per sentence.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, precision_score, recall_score
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.preprocessing import MultiLabelBinarizer

    all_labels = ["Drought", "Earthquake", "Flood", "Epidemic", "Hurricane",
                  "Rebellion", "Terrorism", "Tornado", "Tsunami", "displaced_people_and_evacuations",
                  "donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage",
                  "injured_or_dead_people", "missing_trapped_or_found_people"]
    # Alternative label subsets; assign one of them to curr_labels to
    # restrict the experiment.
    disaster_labels = ["Drought", "Earthquake", "Flood", "Hurricane",
                       "Tornado", "Tsunami", "displaced_people_and_evacuations",
                       "donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage",
                       "injured_or_dead_people", "missing_trapped_or_found_people"]
    health_labels = ["Epidemic", "displaced_people_and_evacuations",
                     "donation_needs_or_offers_or_volunteering_services",
                     "injured_or_dead_people"]
    conflict_labels = ["Rebellion", "Terrorism", "displaced_people_and_evacuations",
                       "infrastructure_and_utilities_damage",
                       "injured_or_dead_people", "missing_trapped_or_found_people"]

    curr_labels = all_labels
    trainLabels = [list(set(l).intersection(curr_labels)) for l in trainLabels]
    testLabels = [list(set(l).intersection(curr_labels)) for l in testLabels]

    # The class list is fixed up front so the column order follows curr_labels.
    mlb = MultiLabelBinarizer(classes=curr_labels)
    mlb.fit(trainLabels)
    print("Labels : ", mlb.classes_)
    train_label_matrix = mlb.transform(trainLabels)
    test_label_matrix = mlb.transform(testLabels)
    print("Shape of label matrix : ", test_label_matrix.shape)

    train_matrix, tfidf = tf_idf_fit_transform(trainSentences)
    test_matrix = tfidf.transform(testSentences)
    print("Shape of sentence matrix : ", test_matrix.shape)

    # A linear SVM (sklearn.svm.LinearSVC) is a drop-in alternative estimator.
    estimator = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2,
                                       random_state=0, n_jobs=-1)
    classifier = OneVsRestClassifier(estimator, n_jobs=-1)
    classifier.fit(train_matrix, train_label_matrix)
    predictions = classifier.predict(test_matrix)

    print("Micro-Precision", precision_score(test_label_matrix, predictions, average='micro'))
    print("Micro-Recall", recall_score(test_label_matrix, predictions, average='micro'))
    print("Micro-F1", f1_score(test_label_matrix, predictions, average='micro'))
    print("Macro-Precision", precision_score(test_label_matrix, predictions, average='macro'))
    print("Macro-Recall", recall_score(test_label_matrix, predictions, average='macro'))
    print("Macro-F1", f1_score(test_label_matrix, predictions, average='macro'))
    # average=None yields one score per class, so these are not macro averages.
    print("Per-class Precision", precision_score(test_label_matrix, predictions, average=None))
    print("Per-class Recall", recall_score(test_label_matrix, predictions, average=None))
    print("Per-class F1", f1_score(test_label_matrix, predictions, average=None))
class TimeSeriesLabelTransformer(BaseTaskTransformer):
    '''Encode interval label annotations as multi-hot targets over time.'''

    def __init__(self, namespace, name, labels=None):
        '''Initialize a time-series label transformer

        Parameters
        ----------
        namespace
            Passed to the base-class initializer (together with the
            constant ``0``).

        name : str
            Suffix of the ``output_<name>`` / ``mask_<name>`` keys produced
            by `transform`.

        labels : iterable
            The label vocabulary. The encoder is fit on ``[labels]``, so an
            iterable has to be supplied; the ``None`` default cannot be
            fit as-is.
        '''
        super(TimeSeriesLabelTransformer, self).__init__(namespace, 0)

        self.encoder = MultiLabelBinarizer()
        self.encoder.fit([labels])
        # Set copy of the vocabulary for constant-time membership tests.
        self._classes = set(self.encoder.classes_)
        self.name = name

    def transform(self, jam):
        '''Build the encoded target and mask for one JAMS object.

        Parameters
        ----------
        jam
            Object providing ``file_metadata.duration`` and an annotation
            that ``find_annotation`` can locate.

        Returns
        -------
        dict
            ``output_<name>``: the result of ``encode_intervals``;
            ``mask_<name>``: ``True`` when an annotation was found.
        '''
        ann = self.find_annotation(jam)
        duration = jam.file_metadata.duration

        # Start from one interval spanning the whole file with no label.
        intervals = np.asarray([[0.0, duration]])
        values = [None]
        mask = False

        if ann:
            ann_int, ann_val = ann.data.to_interval_values()
            intervals = np.vstack([intervals, ann_int])
            values.extend(ann_val)
            mask = True

        # Suppress all intervals not in the encoder: an unknown value is
        # encoded as the empty label set. One batch call replaces one
        # transform call per value.
        label_sets = [[v] if v in self._classes else [] for v in values]
        tags = np.asarray(self.encoder.transform(label_sets))

        target = self.encode_intervals(duration, intervals, tags)

        return {'output_{:s}'.format(self.name): target,
                'mask_{:s}'.format(self.name): mask}
def test_multilabel_classification_report():
    """Check classification_report against a fixed expected report.

    The report is compared for two representations of the same targets: the
    raw output of make_multilabel_classification and its MultiLabelBinarizer
    encoding.
    """
    n_classes = 4
    n_samples = 50
    make_ml = make_multilabel_classification
    # Different random_state values give distinct "true" and "predicted" sets.
    _, y_true_ll = make_ml(n_features=1, n_classes=n_classes, random_state=0, n_samples=n_samples)
    _, y_pred_ll = make_ml(n_features=1, n_classes=n_classes, random_state=1, n_samples=n_samples)
    # NOTE(review): the literal below is compared for exact equality, so its
    # whitespace and line breaks matter. In this view they look collapsed;
    # confirm the layout against the real classification_report output.
    expected_report = """\ precision recall f1-score support 0 0.50 0.67 0.57 24 1 0.51 0.74 0.61 27 2 0.29 0.08 0.12 26 3 0.52 0.56 0.54 27 avg / total 0.45 0.51 0.46 104 """
    lb = MultiLabelBinarizer()
    # NOTE(review): 4 presumably mirrors n_classes above -- keep them in sync.
    lb.fit([range(4)])
    y_true_bi = lb.transform(y_true_ll)
    y_pred_bi = lb.transform(y_pred_ll)
    # Both representations must produce the same report text.
    for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]:
        report = classification_report(y_true, y_pred)
        assert_equal(report, expected_report)
def load_data(config=None):
    """
    Load the Reuters dataset.

    Documents are split by their ``test/`` and ``training/`` file-id
    prefixes. Texts become dense TF-IDF arrays (vectorizer fit on the
    training documents only) and categories become a multi-label indicator
    matrix (binarizer fit on the training categories only).

    Parameters
    ----------
    config : dict, optional
        Currently unused. The default is ``None`` rather than a shared
        mutable ``{}``.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {
        'train': [reuters.raw(doc_id) for doc_id in train],
        'test': [reuters.raw(doc_id) for doc_id in test],
    }

    x_train = vectorizer.fit_transform(docs['train']).toarray()
    x_test = vectorizer.transform(docs['test']).toarray()

    y_train = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train])
    y_test = mlb.transform([reuters.categories(doc_id) for doc_id in test])

    data = {'x_train': x_train, 'y_train': y_train,
            'x_test': x_test, 'y_test': y_test,
            # 'labels' is read from a module-level name, not from mlb.classes_.
            'labels': globals()["labels"]}
    return data
def fit_images():
    """Fit a 5-nearest-neighbour model on stored responses and pickle it.

    Reads ``mapped_responses`` and ``labels_binary`` from the local
    ``image_annotation`` MongoDB database, fits a ``KNeighborsClassifier`` on
    each response's ``hist['0']`` features against its ``binary_results``,
    and writes the model to ``model.data`` in the build directory.
    """
    client = pymongo.MongoClient('localhost', 27017)
    try:
        db = client['image_annotation']
        responses = db['mapped_responses'].find()
        no_labels = db['labels_binary'].find()

        numbers = [set([int(entry["number"])]) for entry in no_labels]
        mlb = MultiLabelBinarizer()
        mlb.fit(numbers)

        train_data = []
        labels = []
        for index, instance in enumerate(responses):
            t_data = instance['hist']['0']
            # `indexes` is not defined in this function; presumably a
            # module-level mapping from row position to image number.
            indexes[index] = instance['image_no']
            train_data.append(t_data)

            label = instance['binary_results']
            new_labels = mlb.transform([set([int(value)]) for value in label])
            # NOTE(review): the binarized `new_labels` is never used; the raw
            # `label` is what the classifier is trained on. Confirm intent.
            labels.append(label)

        classifier = KNeighborsClassifier(n_neighbors=5, weights='uniform')
        classifier.fit(train_data, labels)

        build_dir = getBuildDir()
        # Pickle data is binary: open in 'wb' and close the handle
        # deterministically.
        with open(join(build_dir, 'model.data'), 'wb') as model_file:
            pickle.dump(classifier, model_file, protocol=1)
    finally:
        # Release the connection even if reading or fitting failed.
        client.close()
class ACMClassificator(BaseACMClassificator):
    """Tag classifier: token counts fed to one-vs-rest extra-tree classifiers."""

    def __init__(self):
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        base_tree = ExtraTreeClassifier(criterion="gini",
                                        max_depth=None,
                                        min_samples_split=2,
                                        min_samples_leaf=1,
                                        min_weight_fraction_leaf=0.,
                                        max_features="auto",
                                        max_leaf_nodes=None,
                                        class_weight=None)
        self.classificator = OneVsRestClassifier(base_tree, n_jobs=-1)

    def _prepare_problems(self, problems):
        """Vectorize the ``statement`` of every problem with the fitted vectorizer."""
        statements = [p.statement for p in problems]
        return self.vectorizer.transform(statements)

    def fit(self, problems, tags):
        """Fit the vectorizer, the tag binarizer and the classifier."""
        nltk.download('punkt', quiet=True)
        self.vectorizer.fit([p.statement for p in problems])
        features = self._prepare_problems(problems)
        self.mlb = self.mlb.fit(tags)
        targets = self.mlb.transform(tags)
        # The classifier is trained on the dense form of the count matrix.
        self.classificator.fit(features.toarray(), targets)

    def predict(self, problems):
        """Return the predicted tag collection of every problem."""
        features = self._prepare_problems(problems)
        indicator = self.classificator.predict(features.toarray())
        return self.mlb.inverse_transform(indicator)
class VectorizedData:
    """ Simple container that holds the input dataset
    in a sklearn-friendly form, with X, y numpy vectors.

    TODO: we ignore # of matches for each fbpath """

    def __init__(self, data, Xdict=None, Ydict=None):
        """Vectorize ``data``.

        When ``Xdict`` / ``Ydict`` are omitted, a new ``DictVectorizer`` /
        ``MultiLabelBinarizer`` is fit on this data; otherwise the supplied
        (already fitted) one is used to transform it.
        """
        fdict = [q_to_fdict(q) for q in data]
        lset = [q_to_lset(q) for q in data]

        if Xdict is None:
            self.Xdict = DictVectorizer()
            self.X = self.Xdict.fit_transform(fdict)
        else:
            self.Xdict = Xdict
            self.X = self.Xdict.transform(fdict)

        if Ydict is None:
            self.Ydict = MultiLabelBinarizer()
            self.Y = self.Ydict.fit_transform(lset)
        else:
            self.Ydict = Ydict

            # Filter out data with unknown labels, MultiLabelBinarizer() cannot
            # handle this. A set is built once so each membership test is a
            # hash lookup, not a scan of the classes_ array.
            known_labels = set(self.Ydict.classes_)
            known_lset = [set(label for label in ls if label in known_labels) for ls in lset]
            lset_n = sum(len(ls) for ls in lset)
            known_lset_n = sum(len(ls) for ls in known_lset)
            if known_lset_n < lset_n:
                print('dropped %d out of %d labels (not in training set)' % (lset_n - known_lset_n, lset_n), file=sys.stderr)

            self.Y = self.Ydict.transform(known_lset)

    def cfier_score(self, cfier, scorer):
        """ Measure cfier performance on this dataset.

        scorer -> lambda cfier, X: cfier.predict_proba(X)
        (or decision_function when probabilities not predicted)

        Returns a dict with keys 'sklScore', 'qRecallAll', 'qRecallAny',
        'pPrec' and 'qScoreMRR'. """
        # Densify once; the same array feeds score(), predict() and scorer().
        X_dense = self.X.toarray()

        skl_score = cfier.score(X_dense, self.Y)

        # XXX: Matched paths might/could be weighted by their nMatches too...

        # Measure prediction performance
        Ypred = cfier.predict(X_dense)
        n_q = float(np.size(self.Y, axis=0))
        gold_per_q = np.sum(self.Y, axis=1)
        hits_per_q = np.sum(Ypred * self.Y, axis=1)

        # fraction of questions where all correct paths have been recalled
        recall_all = np.sum(gold_per_q == hits_per_q) / n_q
        # fraction of questions where "has a correct path" agrees with
        # "at least one correct path has been recalled"
        recall_any = np.sum((gold_per_q != 0) == (hits_per_q != 0)) / n_q
        # fraction of predicted *PATHS* (not q.) that were correct
        precision = np.sum((Ypred + self.Y) == 2) / float(np.sum(Ypred))

        # Measure scoring performance
        Yscores = scorer(cfier, X_dense)
        # MRR of first correct path
        mrr = mrr_by_score(self.Y, Yscores)
        # number of questions where at least one correct path has been recalled in top N paths
        # TODO

        return {'sklScore': skl_score, 'qRecallAll': recall_all, 'qRecallAny': recall_any, 'pPrec': precision, 'qScoreMRR': mrr}
class ncClassifier(object):
    """Multi-label node classifier on top of pre-computed embeddings."""

    def __init__(self, emb_dict, clf):
        """Store the embedding lookup and wrap ``clf`` in a ``TopKRanker``."""
        self.embeddings = emb_dict
        self.clf = TopKRanker(clf)  # here clf is LR
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

    def split_train_evaluate(self, X, Y, train_precent, seed=None):
        """Shuffle, split, train on the first part and evaluate on the rest.

        Parameters
        ----------
        X, Y : sequences of equal length (items and their label lists).
        train_precent : fraction of the samples used for training.
        seed : seed for the shuffle; ``None`` reseeds non-deterministically.

        Returns
        -------
        The result of `evaluate` on the held-out part.

        The global NumPy RNG state seen by the caller is restored after
        training: the state is captured *before* seeding, so seeding here
        no longer leaks into the caller's random stream.
        """
        state = np.random.get_state()
        try:
            np.random.seed(seed=seed)
            training_size = int(train_precent * len(X))
            shuffle_indices = np.random.permutation(np.arange(len(X)))
            train_idx = shuffle_indices[:training_size]
            test_idx = shuffle_indices[training_size:]
            X_train = [X[i] for i in train_idx]
            Y_train = [Y[i] for i in train_idx]
            X_test = [X[i] for i in test_idx]
            Y_test = [Y[i] for i in test_idx]
            self.train(X_train, Y_train, Y)
        finally:
            np.random.set_state(state)
        return self.evaluate(X_test, Y_test)

    def train(self, X, Y, Y_all):
        """Fit the binarizer on ``Y_all`` and the classifier on ``(X, Y)``."""
        # to support multi-labels, fit means dict mapping {orig cat: binarized vec}
        self.binarizer.fit(Y_all)
        X_train = [self.embeddings[x] for x in X]
        # the binarizer was fitted on Y_all, so Y only needs transforming
        Y = self.binarizer.transform(Y)
        self.clf.fit(X_train, Y)

    def predict(self, X, top_k_list):
        """Predict labels for ``X``; ``top_k_list`` is forwarded to the ranker."""
        X_ = np.asarray([self.embeddings[x] for x in X])
        # see TopKRanker(OneVsRestClassifier)
        Y = self.clf.predict(X_, top_k_list=top_k_list)
        return Y

    def evaluate(self, X, Y):
        """Print and return micro / macro / samples / weighted F1 on ``(X, Y)``."""
        # multi-labels: each node asks for as many labels as it truly has
        top_k_list = [len(l) for l in Y]
        Y_ = self.predict(X, top_k_list)  # predicted values
        Y = self.binarizer.transform(Y)  # true values
        averages = ["micro", "macro", "samples", "weighted"]
        results = {}
        for average in averages:
            results[average] = f1_score(Y, Y_, average=average)
        print(results)
        return results
class Classifier(object):
    """Multi-label classifier evaluated on pre-computed embedding vectors."""

    def __init__(self, vectors, clf):
        self.embeddings = vectors
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

    def train(self, X, Y, Y_all):
        """Fit the binarizer on ``Y_all``, then the classifier on ``(X, Y)``."""
        self.binarizer.fit(Y_all)
        features = [self.embeddings[node] for node in X]
        targets = self.binarizer.transform(Y)
        self.clf.fit(features, targets)

    def evaluate(self, X, Y):
        """Print and return the F1 score under four averaging modes."""
        # Every sample requests as many labels as it has true labels.
        wanted = [len(true_labels) for true_labels in Y]
        predicted = self.predict(X, wanted)
        truth = self.binarizer.transform(Y)
        results = {
            mode: f1_score(truth, predicted, average=mode)
            for mode in ["micro", "macro", "samples", "weighted"]
        }
        print(results)
        return results

    def predict(self, X, top_k_list):
        """Return the ranker's predictions for the embeddings of ``X``."""
        features = numpy.asarray([self.embeddings[node] for node in X])
        return self.clf.predict(features, top_k_list=top_k_list)

    def split_train_evaluate(self, X, Y, train_precent, seed=0):
        """Shuffle with ``seed``, train on the first share, evaluate the rest.

        The global NumPy RNG state is captured before seeding and put back
        once training has finished.
        """
        saved_state = numpy.random.get_state()

        n_train = int(train_precent * len(X))
        numpy.random.seed(seed)
        order = numpy.random.permutation(numpy.arange(len(X)))
        train_part, test_part = order[:n_train], order[n_train:]

        X_train = [X[i] for i in train_part]
        Y_train = [Y[i] for i in train_part]
        X_test = [X[i] for i in test_part]
        Y_test = [Y[i] for i in test_part]

        self.train(X_train, Y_train, Y)
        numpy.random.set_state(saved_state)
        return self.evaluate(X_test, Y_test)
def load_data(train_set, multilabel=True): X_data = [] y_data = [] for c, (vector, target ) in enumerate(train_set): # load one vector into memory at a time X_data.append(vector) y_data.append(target) if c % 10000 == 0: print c print len(X_data), 'training examples' # Dictionary of classes. class_list = list(set([y for y_seq in y_data for y in y_seq])) nb_classes = len(class_list) print nb_classes, 'classes' class_dict = dict(zip(class_list, np.arange(len(class_list)))) with open('class_dict.json', 'w') as fp: json.dump(class_dict, fp) print 'Exported class dictionary' y_data_int = [] for y_seq in y_data: y_data_int.append([class_dict[y_seq[0]]]) # Tokenize and pad text. tokenizer = Tokenizer(num_words=MAX_NB_WORDS) tokenizer.fit_on_texts(X_data) X_data = tokenizer.texts_to_sequences(X_data) word_index = tokenizer.word_index print('Found %s unique tokens' % len(word_index)) with open('word_index.json', 'w') as fp: json.dump(word_index, fp) print 'Exported word dictionary' X_data = pad_sequences(X_data, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post', dtype='float32') print('Shape of data tensor:', X_data.shape) if multilabel: mlb = MultiLabelBinarizer() mlb.fit([class_dict.values()]) y_data = mlb.transform(y_data_int) else: y_data = to_categorical(y_data_int) y_h_data = to_categorical(y_h_data_int) print('Shape of label tensor:', y_data.shape) X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.1, random_state=42) return X_train, X_val, y_train, y_val, nb_classes, word_index
def one_hot_encoding(data):
    """Return an indicator matrix with one row per entry of ``data``.

    The encoder's vocabulary is the set of distinct values in ``data``; each
    entry is encoded as a single-element label set.
    """
    encoder = MultiLabelBinarizer()
    encoder.fit([[value] for value in data.unique().tolist()])
    # Wrap every value in its own list: the binarizer expects label collections.
    return encoder.transform([[value] for value in data.values])
def binarize_labels(class_list, train, val, test):
    """Binarize the label sets of the three splits with one shared encoder.

    The encoder is fit on ``train`` only; ``val`` and ``test`` are merely
    transformed, so all three matrices share the same columns even when
    ``class_list`` does not pin the classes.

    Parameters
    ----------
    class_list : classes handed to ``MultiLabelBinarizer(classes=...)``.
    train, val, test : iterables of label collections.

    Returns
    -------
    (train, val, test) as indicator matrices.
    """
    labelencoder = MultiLabelBinarizer(classes=class_list)
    train = labelencoder.fit_transform(train)
    val = labelencoder.transform(val)
    test = labelencoder.transform(test)
    # shape[1] is the column count and, unlike len(x[0]), works on empty splits.
    print(
        "\nTotal classes detected in each set: \n Train = {}, \n Val = {}, \n Test= {}"
        .format(train.shape[1], val.shape[1], test.shape[1]))
    return train, val, test
def full_jrc_(jrc_data_home, langs, train_years, test_years, outpath, cat_policy='all', most_common_cat=300):
    """Build the full JRC-Acquis multilingual dataset and save it.

    Parameters
    ----------
    jrc_data_home : data location passed to the fetch/inspect helpers.
    langs : languages to include.
    train_years, test_years : year selections for the two fetches.
    outpath : destination passed to ``MultilingualDataset.save``.
    cat_policy : ``select`` argument of ``inspect_eurovoc``.
    most_common_cat : ``most_frequent`` argument of the training fetch.

    Raises
    ------
    ValueError
        If a language has no training or test documents (an empty ``zip``
        cannot be unpacked into three sequences).
    """
    print('fetching the datasets')

    cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
    training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years,
                                                 cat_filter=cat_list, cat_threshold=1, parallel=None,
                                                 most_frequent=most_common_cat)
    # The test fetch is restricted to the labels kept for training.
    test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years,
                                   cat_filter=label_names, parallel='force')

    def _group_by_lang(doc_list, langs):
        return {lang: [d for d in doc_list if d.lang == lang] for lang in langs}

    training_docs = _group_by_lang(training_docs, langs)
    test_docs = _group_by_lang(test_docs, langs)

    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    dataset = MultilingualDataset()
    # Was `data.dataset_name`: `data` is undefined here and raised NameError.
    dataset.dataset_name = 'JRC-Acquis-full'
    for lang in langs:
        analyzer = CountVectorizer(strip_accents='unicode', min_df=3,
                                   stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer()

        # The per-language lists are already grouped, so no extra filter is needed.
        Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in training_docs[lang]])
        Xte, Yte, IDte = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in test_docs[lang]])
        Xtr = [' '.join(analyzer(d)) for d in Xtr]
        Xte = [' '.join(analyzer(d)) for d in Xte]
        Ytr = mlb.transform(Ytr)
        Yte = mlb.transform(Yte)
        dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)

    dataset.save(outpath)
def apply_multilabel_binarizer(data_frame):
    """Binarize the tokenized ``classification`` column of ``data_frame``.

    Returns a tuple ``(indicator_matrix, classes, number_of_classes)``.
    """
    print('### multi_label_binarizer ###')

    # classification: from text to a binary indicator matrix,
    # e.g. [[0, 1, 0], [1, 0, 1]]
    def _tokenize(row):
        return th.tokenize_complex_text_in_set(row['classification'])

    label_sets = data_frame.apply(_tokenize, axis=1).tolist()

    binarizer = MultiLabelBinarizer()
    binarizer.fit(label_sets)
    classes = list(binarizer.classes_)
    return binarizer.transform(label_sets), classes, len(classes)
def build_juxtaposed_matrices(dataset_name, langs, training_docs, test_docs, label_names, preprocess=True):
    """
    Builds the tf-idf weighted document-by-term matrices for each language.
    A single vectorizer is fit on the training texts of all languages stacked
    together, so every language is represented in the same juxtaposed
    feature space.

    :param dataset_name: the name of the dataset (str)
    :param langs: list of languages (str)
    :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
    :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
    :param label_names: list of names of labels (str)
    :param preprocess: whether or not to apply language-specific text preprocessing
        (delegated to preprocess_documents)
    :return: a MultilingualDataset holding, per language, the weighted matrices,
        the binarized labels and the document ids
    """
    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    multiling_dataset = MultilingualDataset()
    multiling_dataset.dataset_name = dataset_name
    multiling_dataset.set_labels(mlb.classes_)

    # First pass: (optionally) preprocess the texts, register them, and
    # collect all training texts for the shared vocabulary.
    stacked_training_texts = []
    for lang in langs:
        print("\nprocessing %d training and %d test for language <%s>" %
              (len(training_docs[lang]), len(test_docs[lang]), lang))

        train_texts, train_cats, train_ids = zip(*training_docs[lang])
        test_texts, test_cats, test_ids = zip(*test_docs[lang])

        if preprocess:
            train_texts = preprocess_documents(train_texts, lang)
            test_texts = preprocess_documents(test_texts, lang)

        stacked_training_texts.extend(train_texts)
        multiling_dataset.add(lang, train_texts, train_cats, test_texts, test_cats, train_ids, test_ids)

    tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True)
    tfidf.fit(stacked_training_texts)

    # Second pass: replace the registered texts and categories by their
    # weighted / binarized matrices.
    for lang in langs:
        print("\nweighting documents for language <%s>" % (lang))
        train_part, test_part = multiling_dataset[lang]
        train_texts, train_cats, train_ids = train_part
        test_texts, test_cats, test_ids = test_part

        Xtr = tfidf.transform(train_texts)
        Xte = tfidf.transform(test_texts)
        Ytr = mlb.transform(train_cats)
        Yte = mlb.transform(test_cats)
        multiling_dataset.add(lang, Xtr, Ytr, Xte, Yte, train_ids, test_ids)

    multiling_dataset.show_dimensions()
    return multiling_dataset
def loading_json():
    """Load the three JSON splits into DataFrames and binarize the labels.

    Reads ``train.json``, ``test.json`` and ``validation.json`` from the
    working directory, prints progress and shapes, and returns the
    ``(train, test, validation)`` DataFrames. The binarized label matrices
    are only used for the printed diagnostics.
    """
    script_start_time = time.time()

    def _stamp(template):
        # `template` carries one %0.2f slot for the elapsed minutes.
        print(template % ((time.time() - script_start_time) / 60))

    def _read(path):
        with open(path) as json_data:
            return json.load(json_data)

    def _images_with_annotations(payload):
        images = pd.DataFrame(payload['images'])
        annotations = pd.DataFrame(payload['annotations'])
        return pd.merge(images, annotations, on='imageId', how='inner')

    _stamp('%0.2f min: Start loading data')

    raw_train = _read('train.json')
    raw_test = _read('test.json')
    raw_validation = _read('validation.json')

    print('Train No. of images: %d'%(len(raw_train['images'])))
    print('Test No. of images: %d'%(len(raw_test['images'])))
    print('Validation No. of images: %d'%(len(raw_validation['images'])))

    # JSON TO PANDAS DATAFRAME (the test split has images only)
    train = _images_with_annotations(raw_train)
    test = pd.DataFrame(raw_test['images'])
    validation = _images_with_annotations(raw_validation)

    for frame in (train, test, validation):
        frame['imageId'] = frame['imageId'].astype(np.uint32)

    _stamp('%0.2f min: Finish loading data')
    print('='*50)
    _stamp('%0.2f min: Start converting label')

    mlb = MultiLabelBinarizer()
    train_label = mlb.fit_transform(train['labelId'])
    validation_label = mlb.transform(validation['labelId'])
    dummy_label_col = list(mlb.classes_)
    print(dummy_label_col)

    _stamp('%0.2f min: Finish converting label')

    for table in (validation_label, train_label, test):
        print(table.shape)

    return train, test, validation
def binarize(y_train, y_val, y_test):
    """Binarize three target collections with an encoder fit on ``y_train``.

    The learned labels are printed with their column index.

    Returns ``(y_train_bin, y_val_bin, y_test_bin, N_LABELS)``.
    """
    print("Labels:")
    encoder = MultiLabelBinarizer()
    encoder.fit(y_train)

    # List every label next to the column it occupies.
    N_LABELS = len(encoder.classes_)
    for position, label in enumerate(encoder.classes_):
        print("{}. {}".format(position, label))

    # All three splits are encoded with the training vocabulary.
    y_train_bin, y_val_bin, y_test_bin = (
        encoder.transform(split) for split in (y_train, y_val, y_test)
    )
    return (y_train_bin, y_val_bin, y_test_bin, N_LABELS)
def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years, test_years, cat_policy, most_common_cat, outpath):
    """Re-fetch the JRC documents listed in an existing dataset and save them.

    The document ids stored at ``datasetpath`` select which fetched
    documents are kept; the kept documents are analyzed per language,
    their categories binarized, and the result saved to ``outpath``.
    """
    tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath)
    assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te'
    langs = list(tr_ids.keys())

    print('fetching the datasets')

    cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
    training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years,
                                                 cat_filter=cat_list, cat_threshold=1, parallel=None,
                                                 most_frequent=most_common_cat)
    test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years,
                                   cat_filter=label_names, parallel='force')

    def _doc_key(doc):
        # Same "<parallel_id>__<id>" key that the stored id lists use.
        return doc.parallel_id + '__' + doc.id

    def filter_by_id(doclist, ids):
        wanted = frozenset(itertools.chain.from_iterable(ids.values()))
        return [doc for doc in doclist if _doc_key(doc) in wanted]

    def _columns_for(doclist, lang):
        rows = [(doc.text, doc.categories, _doc_key(doc)) for doc in doclist if doc.lang == lang]
        return zip(*rows)

    training_docs = filter_by_id(training_docs, tr_ids)
    test_docs = filter_by_id(test_docs, te_ids)

    print('jrc: {} train, {} test, {} categories'.format(
        len(training_docs), len(test_docs), len(label_names)))

    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    dataset = MultilingualDataset()
    for lang in langs:
        analyzer = CountVectorizer(strip_accents='unicode', min_df=3,
                                   stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer()

        train_texts, train_cats, train_keys = _columns_for(training_docs, lang)
        test_texts, test_cats, test_keys = _columns_for(test_docs, lang)

        train_texts = [' '.join(analyzer(text)) for text in train_texts]
        test_texts = [' '.join(analyzer(text)) for text in test_texts]
        train_targets = mlb.transform(train_cats)
        test_targets = mlb.transform(test_cats)
        dataset.add(lang, train_texts, train_targets, test_texts, test_targets, train_keys, test_keys)

    dataset.save(outpath)
def print_multilabel_results(resfile, outdir, args=None, n_strats=1):
    """
    Function that calculates performance statistics and prints them
    to a result file for multilabel tests

    Parameters
    ----------
    resfile : path of a pickle holding a sequence of ``(true, pred)`` pairs.
    outdir : directory that receives one ``<strategy index>.txt`` report per
        strategy; created if missing.
    args : optional value echoed into the report header.
    n_strats : number of strategies; result ``i`` belongs to strategy
        ``i % n_strats``.
    """
    # NOTE: unpickling can execute arbitrary code -- only pass result files
    # produced by a trusted run.
    with open(resfile, 'rb') as f:
        results = pickle.load(f)

    # results = [
    #     ([x, y, z],   <-- true
    #      [x, y, k]),  <-- pred
    #     ...
    # ]
    y_trues = [[] for _ in range(n_strats)]
    y_preds = [[] for _ in range(n_strats)]
    for idx, result in enumerate(results):
        y_trues[idx % n_strats] += result[0]
        y_preds[idx % n_strats] += result[1]

    for strat, (y_true, y_pred) in enumerate(zip(y_trues, y_preds)):
        # One fit over true + predicted label sets gives the union of labels.
        # This replaces the former fit / scan predictions / refit sequence.
        bnz = MultiLabelBinarizer()
        bnz.fit(y_true + y_pred)
        y_true = bnz.transform(y_true)
        y_pred = bnz.transform(y_pred)
        labels = bnz.classes_

        report = metrics.classification_report(y_true, y_pred, target_names=labels)
        f1w = metrics.f1_score(y_true, y_pred, average='weighted')
        f1i = metrics.f1_score(y_true, y_pred, average='micro')
        f1a = metrics.f1_score(y_true, y_pred, average='macro')
        pw = metrics.precision_score(y_true, y_pred, average='weighted')
        pi = metrics.precision_score(y_true, y_pred, average='micro')
        pa = metrics.precision_score(y_true, y_pred, average='macro')
        rw = metrics.recall_score(y_true, y_pred, average='weighted')
        ri = metrics.recall_score(y_true, y_pred, average='micro')
        ra = metrics.recall_score(y_true, y_pred, average='macro')

        file_header = (
            "# MULTILABEL EXPERIMENT REPORT\n" +
            time.strftime("# Generated %c\n#\n") +
            ('#\n# Args: {}\n#\n'.format(args) if args else '') +
            "# 3 FOLD CROSS VALIDATION WITH {} CHANGESETS\n".format(len(y_true)) +
            "# F1 SCORE : {:.3f} weighted, {:.3f} micro-avg'd, {:.3f} macro-avg'd\n".format(f1w, f1i, f1a) +
            "# PRECISION: {:.3f} weighted, {:.3f} micro-avg'd, {:.3f} macro-avg'd\n".format(pw, pi, pa) +
            "# RECALL : {:.3f} weighted, {:.3f} micro-avg'd, {:.3f} macro-avg'd\n#\n".format(rw, ri, ra) +
            "# {:-^55}\n#".format("CLASSIFICATION REPORT") +
            report.replace('\n', "\n#")
        )

        os.makedirs(str(outdir), exist_ok=True)
        # The report lives entirely in the header; the data array is empty.
        savetxt("{}/{}.txt".format(outdir, strat),
                np.array([]), fmt='%d', header=file_header, delimiter=',',
                comments='')
def data_loader(params, is_rebuild_dataset=False):
    """Return train/test arrays, the vocabulary and the label binarizer.

    When the cached training array exists and ``is_rebuild_dataset`` is
    false, everything is loaded from the paths in ``config``. Otherwise the
    raw CSV is tokenized, padded, split 80/20 (``random_state=42``) and the
    arrays plus the vocabulary are written back to those paths.

    ``params`` supplies ``'vocab_size'`` and ``'padding_size'``.

    Returns ``(x_train, x_test, y_train, y_test, vocab, mlb)``.
    """
    use_cache = os.path.exists(config.train_x_path) and not is_rebuild_dataset

    if use_cache:
        x_train = np.load(config.train_x_path)
        x_test = np.load(config.test_x_path)
        y_train = np.load(config.train_y_path)
        y_test = np.load(config.test_y_path)

        # The vocabulary file holds one "<token>\t<index>" pair per line.
        with open(config.vocab_save_path, 'r', encoding='utf-8') as f:
            pairs = (row.strip().split('\t') for row in f.readlines())
            vocab = {token: int(index) for token, index in pairs}

        # Multi-label encoding
        # NOTE(review): this branch fits on the raw 'label' column of the
        # label file, while the rebuild branch fits on whitespace-split
        # labels -- confirm both yield the same classes.
        label_df = pd.read_csv(config.data_label_path)
        mlb = MultiLabelBinarizer()
        mlb.fit([label_df['label']])

        return x_train, x_test, y_train, y_test, vocab, mlb

    frame = pd.read_csv(config.data_path, header=None)
    frame = frame.rename(columns={0: 'label', 1: 'content'})
    frame = parallelize(frame, proc)

    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=params['vocab_size'], oov_token="<UNK>")
    tokenizer.fit_on_texts(frame['content'])

    vocab = tokenizer.word_index
    with open(config.vocab_save_path, 'w', encoding='utf-8') as f:
        for k, v in vocab.items():
            f.write(f'{k}\t{str(v)}\n')

    sequences = tokenizer.texts_to_sequences(frame['content'])
    x = tf.keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=params['padding_size'], padding='post', truncating='post')

    mlb = MultiLabelBinarizer()
    # Every label cell is a whitespace-separated list of label names.
    frame['label'] = frame['label'].apply(lambda raw: raw.split())
    mlb.fit(frame['label'])
    y = mlb.transform(frame['label'])

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    for target_path, array in ((config.train_x_path, x_train),
                               (config.test_x_path, x_test),
                               (config.train_y_path, y_train),
                               (config.test_y_path, y_test)):
        np.save(target_path, array)

    return x_train, x_test, y_train, y_test, vocab, mlb
def main():
    """Train the model, predict on the test data, and log the macro F1.

    Predictions are always pickled. Scoring is skipped for submission runs,
    because ``predict`` returns no reference labels in that mode.

    :return: None
    """
    # Gets or creates a logger
    logging.config.fileConfig('logging.conf')
    logger = logging.getLogger(__name__)
    logger.info("********** NEW RUN **********")

    # Set train_on_full_dataset to True to train on "Data/train_data"
    # instead of "Data/reuters_train_data".
    train_on_full_dataset = False
    is_submission = False

    if train_on_full_dataset:
        train_data_dir = "Data/train_data"
    else:
        train_data_dir = "Data/reuters_train_data"
    test_data_dir = "Data/reuters_test_data"

    logger.info("Initiating training with data from '%s' directory", train_data_dir)
    knn_model = model.Model(train_data_dir)

    logger.info(
        "Predicting testing with data with countries from '%s' directory",
        test_data_dir)
    if is_submission:
        predictions = knn_model.predict(test_data_dir, is_submission=True)
    else:
        predictions, reference = knn_model.predict(test_data_dir)
    logger.info("Prediction complete")
    pickleHelper.save_to_pickle("predictions", predictions)

    if is_submission:
        # No reference labels exist in submission mode, so there is nothing
        # to score; previously this path hit an undefined `reference`.
        return

    mlb = MultiLabelBinarizer()
    r = mlb.fit_transform(reference)
    p = mlb.transform(predictions)

    try:
        score = sklearn.metrics.f1_score(y_true=r, y_pred=p, average='macro')
        print(score)
        logger.info("The f1 score is: %s", score)
    except ValueError as ex:
        logger.error("result value is invalid: %s", ex)
def calculate_multilabel_metrics_text(path, model, metric='Hamming', threshold=0.1, verbose=False):
    """Score ``model`` on a labelled text file.

    Parameters
    ----------
    path : file with one example per line. The number of labels on a line is
        the number of ``__label__`` occurrences; labels are read from the odd
        space-separated positions and the rest of the line is the text.
    model : object exposing ``labels`` and ``predict(text, k=-1)`` returning
        ``(labels, probabilities)``.
    metric : ``'Hamming'``, ``'MAP'`` or ``'Report'``.
    threshold : currently unused.
    verbose : print true and predicted labels for every line.

    Returns
    -------
    The Hamming score, the MAP value or the classification report text,
    depending on ``metric``; ``None`` for any other ``metric``.
    """
    mlb = MultiLabelBinarizer()
    mlb.fit([model.labels])
    # Column of every known label, so each probability is placed with one
    # dictionary lookup.
    column_of = {label: column for column, label in enumerate(mlb.classes_)}

    ground_truth = []
    predicted = []
    probabilities = []
    with open(path, 'r') as test_data:
        for line in test_data:
            parts = line.split(' ')
            label_count = line.count('__label__')
            labels = [parts[i] for i in range(1, label_count * 2 + 1, 2)]
            ground_truth.append(labels)

            text = ' '.join(parts[label_count * 2:])
            # Strip only the line terminator: slicing off the last character
            # loses real text when the final line has no trailing newline.
            predicted_labels, probability = model.predict(
                text.rstrip('\n'), k=-1)  # , threshold=threshold)

            ordered_probabilities = np.zeros(len(mlb.classes_))
            for label, label_probability in zip(predicted_labels, probability):
                column = column_of.get(label)
                if column is not None:
                    ordered_probabilities[column] = label_probability

            predicted_labels = get_best_labels(predicted_labels, probability)
            predicted.append(list(predicted_labels))
            probabilities.append(ordered_probabilities)
            if verbose:
                print(labels, "###", predicted_labels)

    predicted = mlb.transform(predicted)
    ground_truth = mlb.transform(ground_truth)

    if metric == 'Hamming':
        print('Hamming loss: {0}'.format(
            hamming_loss(ground_truth, predicted)))
        print('Hamming_score: {0}'.format(
            hamming_score(ground_truth, predicted)))
        return hamming_score(ground_truth, predicted)
    elif metric == "MAP":
        return MAP(ground_truth, probabilities)
    elif metric == "Report":
        return classification_report(ground_truth, predicted, target_names=mlb.classes_)
def fitClassifier(self, train_profiles, train_labels):
    """Fit a one-vs-rest logistic regression on binarized multilabel targets.

    :param train_profiles: feature matrix handed to the classifier's ``fit``.
    :param train_labels: iterable of label collections, one per sample.
    :return: ``(classifier, binarizer)`` - the fitted OneVsRestClassifier and
        the MultiLabelBinarizer that was fitted on ``train_labels``.
    """
    label_binarizer = MultiLabelBinarizer().fit(train_labels)
    label_matrix = label_binarizer.transform(train_labels)

    base_estimator = LogisticRegression(C=1.0, solver='lbfgs', max_iter=10000)
    one_vs_rest = OneVsRestClassifier(base_estimator)
    one_vs_rest.fit(train_profiles, label_matrix)
    return one_vs_rest, label_binarizer
def one_hot_encoded_multiclass(df, feature, default_value_name):
    """Binarize a comma-separated column and encode a default value.

    The column ``feature`` is split on ``','``, a MultiLabelBinarizer is
    fitted on the resulting lists and the indicator matrix is assigned back
    to ``df[feature]`` (``df`` is modified in place).

    NOTE(review): the indicator matrix has one column per distinct value, yet
    it is stored into a single DataFrame column - confirm this is intended.

    :param df: DataFrame holding the column to encode.
    :param feature: name of the comma-separated column.
    :param default_value_name: value encoded with the fitted binarizer after
        being reshaped to a column vector (each row is one sample).
    :return: ``(df, encoded_default)`` where ``encoded_default`` is the first
        row of the encoded default value.
    """
    binarizer = MultiLabelBinarizer()
    split_values = df[feature].str.split(',')
    df[feature] = binarizer.fit_transform(split_values)

    default_sample = np.array(default_value_name).reshape(-1, 1)
    encoded_default = binarizer.transform(default_sample)
    return df, encoded_default[0]
def add_pos_count_cols(df, tokenized_col):
    """Return ``df`` joined with one indicator column per ``pos_`` value.

    For every row, the ``pos_`` attribute of each element in
    ``df[tokenized_col]`` is collected and binarized with a
    MultiLabelBinarizer, so each new column holds 0/1 (presence of that tag
    in the row), not an occurrence count.

    The input frame is left untouched: the intermediate tag lists are kept in
    a local Series instead of a scratch ``'temp'`` column, which previously
    leaked into the caller's frame and overwrote any existing ``'temp'``.

    :param df: DataFrame containing the tokenized column.
    :param tokenized_col: name of the column whose cells are iterables of
        objects exposing ``pos_``.
    :return: a new DataFrame with the indicator columns appended.
    """
    pos_tags = df[tokenized_col].apply(lambda doc: [tok.pos_ for tok in doc])

    mlb = MultiLabelBinarizer()
    indicators = pd.DataFrame(mlb.fit_transform(pos_tags),
                              columns=mlb.classes_,
                              index=df.index)
    return df.join(indicators)
class MyMultiLabelBinarizer(TransformerMixin):
    """Adapter giving MultiLabelBinarizer a ``fit(x, y)`` / ``transform(x, y)`` signature.

    The wrapped encoder is available as ``self.encoder``; the ``y`` argument
    of both methods is accepted and ignored.
    """

    def __init__(self, *args, **kwargs):
        # All constructor arguments are forwarded to the wrapped encoder.
        self.encoder = MultiLabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped encoder on ``x`` and return ``self``."""
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        """Return the wrapped encoder's indicator matrix for ``x``."""
        encoded = self.encoder.transform(x)
        return encoded
def __init__( self, inter_filePath="inter/technology_companies_of_the_united_states/"): # [[cat,cat...]...] self.m = Word2Vec.load_word2vec_format( "vectors/technology_companies_of_the_united_states/cat_train_neg5size400min_count5", binary=True) self.dim = 400 (correct_categories_train, context_categories_train ) = self.load_category_page(inter_filePath + "category_page.txt") (correct_categories_test, context_categories_test ) = self.load_category_page(inter_filePath + "category_page_test.txt") ## ---- By mean --- Xvectors = np.array( self.predict_vector_by_mean(context_categories_train)) Xvectors_test = np.array( self.predict_vector_by_mean(context_categories_test)) ## ---- By mean --- * ## ---- By SVM --- corpus_train = [" ".join(i) for i in context_categories_train] corpus_test = [" ".join(i) for i in context_categories_test] cv = CountVectorizer(min_df=1) X = cv.fit_transform(corpus_train) ##TFIDF transformer = TfidfTransformer() X_tfidf = transformer.fit_transform(X) #Labels mlb = MultiLabelBinarizer() mlb.fit(correct_categories_train + correct_categories_test) Y = mlb.transform( correct_categories_train) ###Transform to multilabel indicator #predict test labels X_test = cv.transform(corpus_test) Y_test = mlb.transform(correct_categories_test) #Y_predict_ovr = self.ovrSVM(X, Y, X_test) Y_predict_ovr = self.ovrSVM(Xvectors, Y, Xvectors_test) #Y_predict_ovo = self.ovoSVM(X, Y, X_test) print "---One versus rest---" print "Macro F-1:", f1_score(Y_test, Y_predict_ovr, average='macro') print "Micro F-1:", f1_score(Y_test, Y_predict_ovr, average='micro')
def binarize_labels(pred_labels, true_labels):
    """Binarize predicted and true label sets into row-aligned matrices.

    The binarizer is fitted on every label collection found in either
    mapping.  Rows of both matrices follow the key order of ``pred_labels``,
    so row ``i`` of each matrix describes the same source id regardless of
    the iteration order of the two dicts.

    :param pred_labels: mapping ``srcid -> iterable of predicted labels``.
    :param true_labels: mapping ``srcid -> iterable of true labels``.
    :return: ``(pred_mat, true_mat)`` indicator matrices.
    :raises KeyError: if a key of ``pred_labels`` is missing from
        ``true_labels``.
    """
    srcids = list(pred_labels.keys())
    tot_labels = [
        list(labels)
        for labels in list(pred_labels.values()) + list(true_labels.values())
    ]
    mlb = MultiLabelBinarizer().fit(tot_labels)

    # Index both mappings by the same id sequence to keep rows aligned.
    pred_mat = mlb.transform([list(pred_labels[srcid]) for srcid in srcids])
    true_mat = mlb.transform([list(true_labels[srcid]) for srcid in srcids])
    return pred_mat, true_mat
def binarize_labels(true_labels, pred_labels, excluding_labels=None):
    """Binarize true and predicted label sets, dropping excluded labels.

    Rows of both matrices follow the key order of ``pred_labels`` so that
    row ``i`` of each matrix describes the same source id.  Excluded labels
    are removed before fitting *and* before transforming, so the binarizer
    never sees labels it was not fitted on.

    :param true_labels: mapping ``srcid -> iterable of true labels``.
    :param pred_labels: mapping ``srcid -> iterable of predicted labels``.
    :param excluding_labels: labels to ignore.  ``None`` or an empty list
        falls back to ``['building-ebu3b']``, the value that was previously
        always used regardless of the argument.
    :return: ``(true_mat, pred_mat)`` indicator matrices.
    :raises KeyError: if a key of ``pred_labels`` is missing from
        ``true_labels``.
    """
    if not excluding_labels:
        excluding_labels = ['building-ebu3b']
    excluded = set(excluding_labels)

    def _keep(labels):
        # Drop the excluded labels from one label collection.
        return [label for label in labels if label not in excluded]

    srcids = list(pred_labels.keys())
    tot_labels = [
        _keep(labels)
        for labels in list(pred_labels.values()) + list(true_labels.values())
    ]
    mlb = MultiLabelBinarizer().fit(tot_labels)

    pred_mat = mlb.transform([_keep(pred_labels[srcid]) for srcid in srcids])
    true_mat = mlb.transform([_keep(true_labels[srcid]) for srcid in srcids])
    return true_mat, pred_mat
class EncodeMultilabel(object):
    """Callable mapping ``(track_id, label names)`` to ``(track_id, multi-hot row)``.

    The binarizer is fitted once on the full list of label names given to
    the constructor.
    """

    # Binarizer fitted on the label vocabulary.
    mlb: MultiLabelBinarizer

    def __init__(self, label_names: list[str]):
        binarizer = MultiLabelBinarizer()
        binarizer.fit([label_names])
        self.mlb = binarizer

    def __call__(self, sample: tuple[int, list[str]]) -> tuple[int, np.ndarray]:
        track_id, label = sample
        # transform() works on a batch; wrap the sample and unwrap its row.
        encoded = self.mlb.transform([label])
        return track_id, encoded[0]
def main(argv):
    """Train a multilabel text classifier, evaluate on dev data, optionally save.

    :param argv: command-line arguments; ``argv[0]`` is skipped.
    :return: 0
    """
    options = argparser().parse_args(argv[1:])

    train_texts, train_labels = load_data(
        options.train, options.input_format, options.multiclass)
    dev_texts, dev_labels = load_data(
        options.dev, options.input_format, options.multiclass)
    num_train_examples = len(train_texts)

    # The label vocabulary comes from the training labels only.
    label_encoder = MultiLabelBinarizer()
    label_encoder.fit(train_labels)
    train_Y = label_encoder.transform(train_labels)
    dev_Y = label_encoder.transform(dev_labels)
    num_labels = len(label_encoder.classes_)

    classifier, tokenizer, optimizer, config = prepare_classifier(
        num_train_examples, num_labels, options)
    config.multiclass = options.multiclass

    tokenize = make_tokenization_function(tokenizer, options.seq_len)
    train_X = tokenize(train_texts)
    dev_X = tokenize(dev_texts)

    classifier.fit(
        train_X,
        train_Y,
        epochs=options.epochs,
        batch_size=options.batch_size,
        validation_data=(dev_X, dev_Y),
    )

    metrics_values = classifier.evaluate(
        dev_X, dev_Y, batch_size=options.batch_size)
    for name, value in zip(classifier.metrics_names, metrics_values):
        print(f'{name}\t{value}')

    if options.save_model is not None:
        save_trained_model(options.save_model, classifier, tokenizer,
                           label_encoder.classes_, config)
    return 0
def fungo_test_wrapper(name='cellcycle_FUN'):
    """Load a dataset via ``read_fungo`` and return features with binarized labels.

    Labels are the ``'class_idx'`` entries of ``tree.id2doc_ancestors`` for
    each document id, binarized over ``tree.class_idx``.

    :param name: dataset name passed to ``read_fungo``.
    :return: ``(X_train, Y_train, X_test, Y_test)``.
    """
    X_train, X_test, train_ids, test_ids, id2doc, nodes = read_fungo(name)
    X_train = np.array(X_train)
    X_test = np.array(X_test)

    args = conf()
    # The same mapping is used for both id2doc arguments (no ancestor filtering).
    tree = Tree(args, train_ids, test_ids, id2doc=id2doc, id2doc_a=id2doc,
                nodes=nodes, rootname='Top')

    def _class_rows(doc_ids):
        # One list of class indices per document id.
        return [tree.id2doc_ancestors[docid]['class_idx'] for docid in doc_ids]

    mlb = MultiLabelBinarizer(classes=tree.class_idx)
    Y_train = mlb.fit_transform(_class_rows(train_ids))
    Y_test = mlb.transform(_class_rows(test_ids))
    return X_train, Y_train, X_test, Y_test
def classification(X, y, testSize=0.2):
    """Evaluate one-vs-rest logistic regression with top-k label selection.

    The data is split with ``train_test_split``; for every test sample the
    ``k`` most probable classes are predicted, where ``k`` is the number of
    true labels of that sample.

    :param X: feature matrix.
    :param y: sequence of label collections, one per sample.
    :param testSize: fraction (or count) passed to ``train_test_split``.
    :return: dict with the ``'micro'`` and ``'macro'`` F1 scores.
    """
    clf = OneVsRestClassifier(LogisticRegression(max_iter=10000))
    binarizer = MultiLabelBinarizer()
    trainX, testX, trainY, testY = train_test_split(
        X, y, test_size=testSize, shuffle=True)
    # Fit on all labels so train and test share the same columns.
    binarizer.fit(y)
    clf.fit(trainX, binarizer.transform(trainY))

    probs = np.asarray(clf.predict_proba(np.asarray(testX)))
    n_classes = probs.shape[1]
    for i, true_labels in enumerate(testY):
        k = len(true_labels)
        ranked = probs[i, :].argsort()
        # Slice from an explicit start index: ``ranked[-k:]`` would select
        # *every* class when k == 0, predicting all labels for an unlabeled
        # sample.
        top_k = clf.classes_[ranked[max(n_classes - k, 0):]]
        probs[i, :] = 0
        probs[i, top_k] = 1

    testY = binarizer.transform(testY)
    return {
        'micro': f1_score(testY, probs, average='micro'),
        'macro': f1_score(testY, probs, average='macro')
    }
def computeF1(cleanedFigure, predictions):
    """Return the support-weighted average of per-item micro F1 scores.

    Each pair ``(true, pred)`` is binarized with its own MultiLabelBinarizer
    fitted on ``true + pred``; the pair's weight is ``len(true)``.

    :param cleanedFigure: sequence of true label-list collections.
    :param predictions: sequence of predicted label-list collections,
        parallel to ``cleanedFigure``.
    :return: weighted average F1.
    """
    weights = []
    scores = []
    for true, pred in zip(cleanedFigure, predictions):
        weights.append(len(true))
        pair_binarizer = MultiLabelBinarizer().fit(true + pred)
        scores.append(
            f1_score(pair_binarizer.transform(true),
                     pair_binarizer.transform(pred),
                     average='micro'))
    return np.average(scores, weights=weights)
def main():
    """Train and evaluate a TF-IDF one-vs-rest classifier on the Reuters corpus.

    Documents whose id starts with ``"train"`` form the training set, all
    others the test set.  Per-category scores are passed to ``log_results``.
    """
    collection_stats()

    print("Staring classifier ..")
    X_train, X_test = [], []
    y_train, y_test = [], []

    print("Reading training and testing data ..")
    for doc_id in reuters.fileids():
        if doc_id.startswith("train"):
            X_train.append(reuters.raw(doc_id))
            y_train.append(reuters.categories(doc_id))
        else:
            X_test.append(reuters.raw(doc_id))
            y_test.append(reuters.categories(doc_id))

    # The label lists are ragged, so they stay plain Python lists: building a
    # numpy array from them is rejected by current NumPy releases and the
    # binarizer accepts lists directly.
    binarizer = MultiLabelBinarizer(classes=reuters.categories())

    classifier = Pipeline([
        # min_df=1 is equivalent to the previous 0 (every vocabulary term
        # occurs in at least one document) and passes sklearn's parameter
        # validation.
        ('vectorizer', TfidfVectorizer(tokenizer=tokenize, min_df=1,
                                       max_df=0.90, max_features=3000,
                                       use_idf=True, sublinear_tf=True)),
        # ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

    print("Training classifier ..")
    classifier.fit(X_train, binarizer.fit_transform(y_train))

    print("Testing classifier ..")
    # Classes are fixed in the constructor, so one transform of the test
    # labels can be shared by every metric.
    y_test_bin = binarizer.transform(y_test)
    res = classifier.predict(X_test)

    hard_precision = classifier.score(X_test, y_test_bin)
    # sklearn metrics take (y_true, y_pred); the arguments used to be swapped,
    # which made ``recall`` actually report precision.
    precision = average_precision_score(y_test_bin, res, average=None)
    recall = recall_score(y_test_bin, res, average=None)
    f1score = f1_score(y_test_bin, res, average=None)

    print("Hard precision: " + str(hard_precision))
    log_results(reuters.categories(), precision, recall, f1score)
def test_normalize_option_multilabel_classification():
    """Check ``normalize=True`` equals ``normalize=False`` / n_samples (multilabel case)."""
    n_classes = 4
    n_samples = 100

    # using sequence of sequences is deprecated, but still tested
    make_ml = ignore_warnings(make_multilabel_classification)
    generator_args = dict(n_features=1, n_classes=n_classes,
                          n_samples=n_samples)
    _, y_true = make_ml(random_state=0, **generator_args)
    _, y_pred = make_ml(random_state=1, **generator_args)

    # Be sure to have at least one empty label
    y_true += ([], )
    y_pred += ([], )
    n_samples += 1

    lb = MultiLabelBinarizer().fit([range(n_classes)])
    y_true_binary_indicator = lb.transform(y_true)
    y_pred_binary_indicator = lb.transform(y_pred)

    for name in METRICS_WITH_NORMALIZE_OPTION:
        metrics = ALL_METRICS[name]
        failure = "Failed with %s" % name

        # List of list of labels
        measure = assert_warns(DeprecationWarning, metrics, y_true, y_pred,
                               normalize=True)
        assert_greater(measure, 0,
                       msg="We failed to test correctly the normalize option")
        unnormalized = ignore_warnings(metrics)(y_true, y_pred,
                                                normalize=False)
        assert_almost_equal(unnormalized / n_samples, measure,
                            err_msg=failure)

        # Indicator matrix format
        measure = metrics(y_true_binary_indicator, y_pred_binary_indicator,
                          normalize=True)
        assert_greater(measure, 0,
                       msg="We failed to test correctly the normalize option")
        unnormalized = metrics(y_true_binary_indicator,
                               y_pred_binary_indicator, normalize=False)
        assert_almost_equal(unnormalized / n_samples, measure,
                            err_msg=failure)
def build_independent_matrices(dataset_name, langs, training_docs, test_docs, label_names, wiki_docs=[], preprocess=True):
    """
    Builds the document-by-term weighted matrices for each language. Representations are independent of each other,
    i.e., each language-specific matrix lies in a dedicated feature space.

    :param dataset_name: the name of the dataset (str)
    :param langs: list of languages (str)
    :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
    :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
    :param label_names: list of names of labels (str)
    :param wiki_docs: optional collection indexed by language; if non-empty, the wiki docs of each language are
        projected into the feature space built for that language
    :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming)
    :return: a MultilingualDataset. If wiki_docs has been specified, a dictionary lW is also returned, which indexes
        by language the processed wikipedia documents in their respective language-specific feature spaces
    """
    # One shared label space for every language.
    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    multilingual_dataset = MultilingualDataset()
    multilingual_dataset.dataset_name = dataset_name
    multilingual_dataset.set_labels(mlb.classes_)

    lW = {}
    for lang in langs:
        n_wiki = len(wiki_docs[lang]) if wiki_docs else 0
        print("\nprocessing %d training, %d test, %d wiki for language <%s>" %
              (len(training_docs[lang]), len(test_docs[lang]), n_wiki, lang))

        tr_data, tr_labels, IDtr = zip(*training_docs[lang])
        te_data, te_labels, IDte = zip(*test_docs[lang])

        # The vectorizer is fitted per language on that language's training text.
        vectorizer_args = dict(strip_accents='unicode', min_df=3, sublinear_tf=True)
        if preprocess:
            vectorizer_args['tokenizer'] = NLTKStemTokenizer(lang, verbose=True)
            vectorizer_args['stop_words'] = stopwords.words(NLTK_LANGMAP[lang])
        tfidf = TfidfVectorizer(**vectorizer_args)

        Xtr = tfidf.fit_transform(tr_data)
        Xte = tfidf.transform(te_data)
        if wiki_docs:
            lW[lang] = tfidf.transform(wiki_docs[lang])

        Ytr = mlb.transform(tr_labels)
        Yte = mlb.transform(te_labels)

        multilingual_dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)

    multilingual_dataset.show_dimensions()
    multilingual_dataset.show_category_prevalences()

    if wiki_docs:
        return multilingual_dataset, lW
    return multilingual_dataset
def main():
    """Train a one-vs-rest LinearSVC tag classifier and print test predictions.

    Train/test ids come from ``select_sets_by_tag``; texts are read from the
    files returned by ``id_to_filename``.  After printing a classification
    report, a per-class decision threshold is taken at the top 30% of the
    test decision values and handed to ``print_predictions``.
    """
    sets = select_sets_by_tag(20, 4, tag_names)
    train_ids = sets["train"]
    test_ids = sets["test"]

    train_tags = fetch_tags(train_ids)
    train_texts = id_to_filename(train_ids)

    # Bag of words, then TF-IDF weighting.
    count_vect = CountVectorizer(stop_words='english', encoding="utf-16", input="filename")
    X_train_counts = count_vect.fit_transform(train_texts)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # Tags to indicator matrix.
    mlb = MultiLabelBinarizer()
    processed_train_tags = mlb.fit_transform(train_tags)

    clf = OneVsRestClassifier(LinearSVC())
    clf.fit(X_train_tfidf, processed_train_tags)
    print("classes:{}".format(clf.classes_))

    # Push the test set through the fitted transformers.
    test_texts = id_to_filename(test_ids)
    X_test_counts = count_vect.transform(test_texts)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)

    predicted_tags = clf.predict(X_test_tfidf)
    predicted_tags_readable = mlb.inverse_transform(predicted_tags)
    test_tags_actual = fetch_tags(test_ids)
    predicted_probs = clf.decision_function(X_test_tfidf)

    class_list = mlb.classes_
    report = metrics.classification_report(mlb.transform(test_tags_actual), predicted_tags, target_names=class_list)
    print(report)

    # retrieve top 30% for each class
    top_percentage = 30
    n_test = len(test_ids)
    threshold_index = int(n_test * (top_percentage / 100.0))
    threshold_vals_dic = {}
    threshold_vals = []
    for i in range(len(class_list)):
        ranked = sorted((predicted_probs[j, i] for j in range(n_test)), reverse=True)
        cutoff = ranked[threshold_index]
        threshold_vals_dic[class_list[i]] = cutoff
        threshold_vals.append(cutoff)
    print(threshold_vals_dic)

    print_predictions(test_ids, predicted_tags_readable, class_list,
                      class_probablities=predicted_probs, threshold_vals=threshold_vals)
def get_data(train_file, test_file):
    """Load train/test files and return texts plus binarized label matrices.

    The text of each sample is the second tab-separated field of its line.
    Label strings are split on ``'_'`` and the literal ``'None'`` is dropped;
    the binarizer is fitted on the training labels only.

    :return: ``(X_train, X_test, Y_train, Y_test, classes)``.
    """
    def _second_field(lines):
        # Keep the second tab-separated field of every line.
        return [ln.split('\t')[1] for ln in lines]

    def _label_sets(raw_labels):
        # '_'-joined labels -> set, without the 'None' placeholder.
        return [set(s.split('_')) - {'None'} for s in raw_labels]

    X_train, Y_train = load_data(train_file)
    X_train = _second_field(X_train)
    X_test, Y_test = load_data(test_file)
    X_test = _second_field(X_test)

    mlb = MultiLabelBinarizer()
    Y_train = mlb.fit_transform(_label_sets(Y_train))
    Y_test = mlb.transform(_label_sets(Y_test))
    return X_train, X_test, Y_train, Y_test, mlb.classes_
def test_multilabelbinarizer_vs_sklearn():
    """Check both binarizers agree on the first trajectory."""
    # Compare msmbuilder.preprocessing.MultiLabelBinarizer
    # with sklearn.preprocessing.MultiLabelBinarizer
    reference = MultiLabelBinarizerR()
    reference.fit(np.concatenate(trajs))

    candidate = MultiLabelBinarizer()
    candidate.fit(trajs)

    # The reference works on a single trajectory, the candidate on the list.
    expected_first = reference.transform(trajs[0])
    actual_first = candidate.transform(trajs)[0]

    np.testing.assert_array_almost_equal(expected_first, actual_first)
def test_normalize_option_multilabel_classification():
    """Check ``normalize=True`` equals ``normalize=False`` / n_samples (multilabel case)."""
    n_classes = 4
    n_samples = 100

    # using sequence of sequences is deprecated, but still tested
    make_ml = ignore_warnings(make_multilabel_classification)
    generator_args = dict(n_features=1, n_classes=n_classes,
                          n_samples=n_samples)
    _, y_true = make_ml(random_state=0, **generator_args)
    _, y_pred = make_ml(random_state=1, **generator_args)

    # Be sure to have at least one empty label
    y_true += ([], )
    y_pred += ([], )
    n_samples += 1

    lb = MultiLabelBinarizer().fit([range(n_classes)])
    y_true_binary_indicator = lb.transform(y_true)
    y_pred_binary_indicator = lb.transform(y_pred)

    for name in METRICS_WITH_NORMALIZE_OPTION:
        metrics = ALL_METRICS[name]
        failure = "Failed with %s" % name

        # List of list of labels
        measure = assert_warns(DeprecationWarning, metrics, y_true, y_pred,
                               normalize=True)
        assert_greater(measure, 0,
                       msg="We failed to test correctly the normalize option")
        unnormalized = ignore_warnings(metrics)(y_true, y_pred,
                                                normalize=False)
        assert_almost_equal(unnormalized / n_samples, measure,
                            err_msg=failure)

        # Indicator matrix format
        measure = metrics(y_true_binary_indicator, y_pred_binary_indicator,
                          normalize=True)
        assert_greater(measure, 0,
                       msg="We failed to test correctly the normalize option")
        unnormalized = metrics(y_true_binary_indicator,
                               y_pred_binary_indicator, normalize=False)
        assert_almost_equal(unnormalized / n_samples, measure,
                            err_msg=failure)
class ACMClassificator(BaseACMClassificator):
    """Tag classifier: bag-of-words over problem statements + one-vs-rest SVC."""

    def __init__(self):
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        self.classificator = OneVsRestClassifier(SVC(), n_jobs=-1)

    def _prepare_problems(self, problems):
        """Vectorize the ``statement`` of every problem with the fitted vectorizer."""
        statements = [p.statement for p in problems]
        return self.vectorizer.transform(statements)

    def fit(self, problems, tags):
        """Fit vectorizer, label binarizer and classifier on the given problems."""
        # Fetch the 'punkt' NLTK resource before vectorizing.
        nltk.download('punkt', quiet=True)
        self.vectorizer.fit([p.statement for p in problems])
        features = self._prepare_problems(problems)
        self.mlb = self.mlb.fit(tags)
        targets = self.mlb.transform(tags)
        # The classifier is fed a dense matrix.
        self.classificator.fit(features.toarray(), targets)

    def predict(self, problems):
        """Return the predicted tags for each problem as label tuples."""
        features = self._prepare_problems(problems)
        indicator = self.classificator.predict(features.toarray())
        return self.mlb.inverse_transform(indicator)
class GlobalLabelTransformer(BaseTaskTransformer):
    """Encode the values of one annotation as a single multi-hot vector."""

    def __init__(self, namespace, name, labels=None):
        '''Initialize a global label transformer

        Parameters
        ----------
        namespace
            Passed, together with ``0``, to ``BaseTaskTransformer.__init__``

        name : str
            Suffix of the ``output_<name>`` / ``mask_<name>`` keys produced
            by `transform`

        labels : list
            Label vocabulary the encoder is fitted on.
            NOTE(review): the default ``None`` is handed to the encoder as
            ``[None]``; callers are expected to pass a real list.
        '''
        super(GlobalLabelTransformer, self).__init__(namespace, 0)

        self.encoder = MultiLabelBinarizer()
        self.encoder.fit([labels])
        self._classes = set(self.encoder.classes_)
        self.name = name

    def transform(self, jam):
        '''Encode the annotation found in ``jam``

        Parameters
        ----------
        jam
            Object handed to ``self.find_annotation``

        Returns
        -------
        dict
            ``output_<name>`` : multi-hot vector over the encoder's classes
            (all zeros when no known label is present)

            ``mask_<name>`` : True if an annotation was found, else False
        '''
        ann = self.find_annotation(jam)

        values = [None]
        mask = False
        if ann:
            values = list(ann.data.value)
            mask = True

        # Suppress all values not in the encoder
        tags = [v for v in values if v in self._classes]

        if tags:
            target = self.encoder.transform([tags]).max(axis=0)
        else:
            # ``np.int`` no longer exists in current NumPy; the builtin
            # ``int`` is the same dtype.
            target = np.zeros(len(self._classes), dtype=int)

        return {'output_{:s}'.format(self.name): target,
                'mask_{:s}'.format(self.name): mask}
def run_classifierAccuracy(terms, labels, testSentences, testLabels):
    """Score a TF-IDF cosine-similarity labeler against ``testLabels``.

    A TF-IDF space is fitted on ``terms`` (one entry per class), test
    sentences are projected into it and their cosine similarities to the
    class vectors are binarized by ``binary_rel``; every class whose entry
    is 1 is predicted.  Micro, macro and per-class precision/recall/F1 are
    printed.

    NOTE(review): the ``labels`` argument is replaced by the fixed list
    below, whose order is presumably expected to match ``terms`` - confirm
    with the callers.
    """
    labels = ["Drought", "Earthquake", "Flood", "Epidemic", "Hurricane",
              "Rebellion", "Terrorism", "Tornado", "Tsunami",
              "displaced_people_and_evacuations",
              "donation_needs_or_offers_or_volunteering_services",
              "infrastructure_and_utilities_damage",
              "injured_or_dead_people", "missing_trapped_or_found_people"]

    class_terms_matrix, tfidf = tf_idf_fit_transform(terms)
    sentence_matrix = tfidf.transform(testSentences)
    print("Shape of sentence matrix : ", sentence_matrix.shape)

    from sklearn.metrics.pairwise import cosine_similarity
    similarity_matrix = cosine_similarity(sentence_matrix, class_terms_matrix)
    similarity_matrix = binary_rel(similarity_matrix)

    # Predicted labels per sentence: classes whose binarized similarity is 1.
    predictions = [
        [labels[x] for x in range(similarity_matrix.shape[1])
         if similarity_matrix[i][x] == 1]
        for i in range(len(testSentences))
    ]

    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer(classes=labels)
    test_label_matrix = mlb.fit_transform(testLabels)
    predictions = mlb.transform(predictions)
    print("Shape of label matrix : ", test_label_matrix.shape)
    print("Labels : ", mlb.classes_)

    from sklearn.metrics import f1_score, precision_score, recall_score
    print("Micro-Precision", precision_score(test_label_matrix, predictions, average='micro'))
    print("Micro-Recall", recall_score(test_label_matrix, predictions, average='micro'))
    print("Micro-F1", f1_score(test_label_matrix, predictions, average='micro'))

    print("Macro-Precision", precision_score(test_label_matrix, predictions, average='macro'))
    print("Macro-Recall", recall_score(test_label_matrix, predictions, average='macro'))
    print("Macro-F1", f1_score(test_label_matrix, predictions, average='macro'))

    # average=None yields one score per class (in the order printed under
    # "Labels"), so these lines must not reuse the "Macro-" prefix.
    print("All-Precision", precision_score(test_label_matrix, predictions, average=None))
    print("All-Recall", recall_score(test_label_matrix, predictions, average=None))
    print("All-F1", f1_score(test_label_matrix, predictions, average=None))
# # where images are a 1x784 flatt array and labels are an integer between 0 and 9. # mnist = input_data.read_data_sets("MNIST_data/") x_train = mnist.train.images x_test = mnist.test.images y_train = mnist.train.labels y_train = [[i] for i in y_train] y_test = mnist.test.labels y_test = [[i] for i in y_test] # One-hot encode labels one_hot = MultiLabelBinarizer() y_train = one_hot.fit_transform(y_train) y_test = one_hot.transform(y_test) # Example 1: Fully connected neural network model # We start with a 'sequential' model type (connecting layers together) model = keras.Sequential() # Adds a densely-connected layer with 32 units to the model, followed by an ReLU activation. model.add(keras.layers.Dense(32, activation='relu')) # Adds a densely-connected layer with 16 units to the model, followed by an ReLU activation. model.add(keras.layers.Dense(16, activation='relu')) # Add a softmax layer with 10 output units: model.add(keras.layers.Dense(10, activation='softmax')) # Train the model: model.compile(optimizer=tf.train.AdamOptimizer(0.001), loss='categorical_crossentropy', metrics=['accuracy'])
def test_multilabel_representation_invariance():
    """Check multilabel metrics do not depend on the label representation."""
    # Generate some data
    n_classes = 4
    n_samples = 50

    # using sequence of sequences is deprecated, but still tested
    make_ml = ignore_warnings(make_multilabel_classification)
    generator_args = dict(n_features=1, n_classes=n_classes,
                          n_samples=n_samples)
    _, y1 = make_ml(random_state=0, **generator_args)
    _, y2 = make_ml(random_state=1, **generator_args)

    # Be sure to have at least one empty label
    y1 += ([], )
    y2 += ([], )

    # NOTE: The "sorted" trick is necessary to shuffle labels, because it
    # allows to return the shuffled tuple.
    rng = check_random_state(42)

    def _random_key(*args):
        # A fresh random sort key per element, ignoring the element itself.
        return rng.rand()

    def shuffled(labels):
        return sorted(labels, key=_random_key)

    # The order of the next three statements fixes the RNG stream.
    y1_shuffle = [shuffled(x) for x in y1]
    y2_shuffle = [shuffled(x) for x in y2]

    # Let's have redundant labels
    y2_redundant = [x * rng.randint(1, 4) for x in y2]

    # Binary indicator matrix format
    lb = MultiLabelBinarizer().fit([range(n_classes)])
    y1_binary_indicator = lb.transform(y1)
    y2_binary_indicator = lb.transform(y2)

    y1_sparse_indicator = sp.coo_matrix(y1_binary_indicator)
    y2_sparse_indicator = sp.coo_matrix(y2_binary_indicator)

    y1_shuffle_binary_indicator = lb.transform(y1_shuffle)
    y2_shuffle_binary_indicator = lb.transform(y2_shuffle)

    for name in MULTILABELS_METRICS:
        metric = ALL_METRICS[name]

        # XXX cruel hack to work with partial functions
        if isinstance(metric, partial):
            metric.__module__ = 'tmp'
            metric.__name__ = name

        measure = metric(y1_binary_indicator, y2_binary_indicator)

        # Check representation invariance
        sparse_measure = metric(y1_sparse_indicator, y2_sparse_indicator)
        assert_almost_equal(sparse_measure,
                            measure,
                            err_msg="%s failed representation invariance  "
                                    "between dense and sparse indicator "
                                    "formats." % name)

        # Check shuffling invariance with dense binary indicator matrix
        shuffled_measure = metric(y1_shuffle_binary_indicator,
                                  y2_shuffle_binary_indicator)
        assert_almost_equal(shuffled_measure,
                            measure,
                            err_msg="%s failed shuffling invariance "
                                    " with dense binary indicator format."
                                    % name)

        # Check deprecation warnings related to sequence of sequences
        deprecated_metric = partial(assert_warns, DeprecationWarning, metric)

        # Check representation invariance
        assert_almost_equal(deprecated_metric(y1, y2),
                            measure,
                            err_msg="%s failed representation invariance  "
                                    "between list of list of labels "
                                    "format and dense binary indicator "
                                    "format." % name)

        # Check invariance with redundant labels with list of labels
        assert_almost_equal(deprecated_metric(y1, y2_redundant),
                            measure,
                            err_msg="%s failed rendundant label invariance"
                                    % name)

        # Check shuffling invariance with list of labels
        assert_almost_equal(deprecated_metric(y1_shuffle, y2_shuffle),
                            measure,
                            err_msg="%s failed shuffling invariance "
                                    "with list of list of labels format."
                                    % name)

        # Check raises error with mix input representation
        assert_raises(ValueError, deprecated_metric, y1, y2_binary_indicator)
        assert_raises(ValueError, deprecated_metric, y1_binary_indicator, y2)
# Report timing and shape of the training matrix built just before this point.
print("done in %fs" % (duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print("")

print("Extracting features from the test data using the vectorizer")
t0 = time()
# Reuse the already fitted vectorizer: transform only, no refit on test data.
X_test = vectorizer.transform(questions_test)
duration = time() - t0
print("done in %fs" % (duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print("")

feature_names = vectorizer.get_feature_names()
if feature_names:
    feature_names = np.asarray(feature_names)

# Binarize the tag lists with the binarizer ``mlb`` defined earlier.
y_train = mlb.transform(tags_train)
y_test = mlb.transform(tags_test)
tags = list(mlb.classes_)
print("n_unique_tags = %d" % len(tags))
print("")

# chi2 can be used to reduce the number of features to the top k most relevant
# if opts.select_chi2:
#     print("Extracting %d best features by a chi-squared test" %
#           opts.select_chi2)
#     t0 = time()
#     ch2 = SelectKBest(chi2, k=opts.select_chi2)
#     X_train = ch2.fit_transform(X_train, y_train)
#     X_test = ch2.transform(X_test)
# Factorize building_id, display_address, manager_id, street_address for col in ('building_id', 'display_address', 'manager_id', 'street_address'): X_train, X_test = factorize(X_train, X_test, col) # Create binarized features fmt = lambda feat: [s.replace("\u00a0", "").strip().lower().replace(" ", "_") for s in feat] # format features X_train["features"] = X_train["features"].apply(fmt) X_test["features"] = X_test["features"].apply(fmt) features = [f for f_list in list(X_train["features"]) + list(X_test["features"]) for f in f_list] ps = pd.Series(features) grouped = ps.groupby(ps).agg(len) features = grouped[grouped >= 10].index.sort_values().values # limit to features with >=10 observations mlb = MultiLabelBinarizer().fit([features]) columns = ['feature_' + s for s in mlb.classes_] flt = lambda l: [i for i in l if i in mlb.classes_] # filter out features not present in MultiLabelBinarizer X_train = X_train.join(pd.DataFrame(data=mlb.transform(X_train["features"].apply(flt)), columns=columns, index=X_train.index)) X_test = X_test.join(pd.DataFrame(data=mlb.transform(X_test["features"].apply(flt)), columns=columns, index=X_test.index)) # Save ##X_train = X_train.sort_index(axis=1).sort_values(by="listing_id") ##X_test = X_test.sort_index(axis=1).sort_values(by="listing_id") ##columns_to_drop = ["photos", "pred_0","pred_1", "pred_2", "description", "features", "created"] ##X_train.drop([c for c in X_train.columns if c in columns_to_drop], axis=1).\ ## to_csv("../data_prepared/train_ManStatsListFCFQ_leak.csv", index=False, encoding='utf-8')
# Prefer MultiLabelBinarizer; fall back to LabelBinarizer when the installed
# scikit-learn does not provide it.
try:
    from sklearn.preprocessing import MultiLabelBinarizer
    lb = MultiLabelBinarizer()
except ImportError, e:
    from sklearn.preprocessing import LabelBinarizer
    lb = LabelBinarizer()

# Number of samples to keep; currently all of them (the "/ 10" subsampling
# is disabled).
TRIM_SAMPLES = len(tags)  # / 10
tags = tags[:TRIM_SAMPLES]
learn_data = learn_data[:TRIM_SAMPLES]

lb.fit(tags)
labels = lb.transform(tags)

print "using\t", TRIM_SAMPLES, "samples"
print "\t", len(keywords), "keywords"
print "\t", len(lb.classes_), "tags"
# Row sums of learn_data, reported below as words per document.
metadata = learn_data.sum(axis=1)
print "\t", metadata.mean(), "avg words in document"
print "\t", metadata.max(), "biggest document"
print "\t", metadata.min(), "smallest document"

# plt.figure(figsize=(8, 6))
# plot_subfigure(learn_data, labels, 1, "With unlabeled samples + CCA", "cca")
# plot_subfigure(learn_data, labels, 2, "With unlabeled samples + PCA", "pca")
# plt.subplots_adjust(.04, .02, .97, .94, .09, .2)
# Category names; a category's position in this list is its integer id.
classes = ['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
           'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING',
           'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON',
           'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY',
           'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY',
           'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']

category = train_frame['Category']
mapping = {clazz: num for (num, clazz) in enumerate(classes)}
# Baseline: always predict the single most frequent training category.
most_freq_class = Counter(category).most_common()[0][0]
predicted = category.apply(lambda cat: mapping[most_freq_class])
expected = category.apply(lambda cat: mapping[cat])

# The binarizer is fitted on the expected ids and reused for the predictions.
mlb = MultiLabelBinarizer()
expected_b = mlb.fit_transform(to_singleton(expected))
predicted_b = mlb.transform(to_singleton(predicted))

for (clazz, count) in Counter(category).most_common():
    print("{}\t{}".format(clazz, count))

# todo: use validation.py
print("Accuracy on training: {}".format(accuracy_score(expected_b, predicted_b)))
print("Log los on training: {}".format(log_loss(expected_b, predicted_b)))

# Every submission row repeats the first (constant) prediction vector.
test_prediction = np.full((submission_size, len(predicted_b[0])), predicted_b[0])
create_submission(test_prediction, 'baseline_sub.csv')
else: continue idx_test_cur = [ind for ind in range(0,len(y_predict_cate)) if cate_cur in y_predict_cate[ind]] idx_train_cur = [ind for ind in range(0,len(y_train_cate)) if cate_cur in y_train_cate[ind]] x_train_cur = x_train[idx_train_cur] x_test_cur = x_test[idx_test_cur] y_train_code_cur = [] y_test_code_cur = [] for category_predict_tuple in y_test_code[idx_test_cur]: codes = [] if len(category_predict_tuple) == 0: codes.append(defaultcode[cate_cur]) else: codes.extend([v for v in category_predict_tuple if v.startswith(cate_cur)]) y_test_code_cur.append(codes) y_test_code_cur_map = ml.transform(y_test_code_cur) for y_train_code_tuple in y_train_code[idx_train_cur]: codes = [] if len(y_train_code_tuple) == 0: codes.append(defaultcode[cate_cur]) else: codes.extend([v for v in y_train_code_tuple if v.startswith(cate_cur)]) y_train_code_cur.append(codes) y_train_code_cur_map = ml.transform(y_train_code_cur) model_code = DecisionTreeClassifier() model_code.fit(x_train_cur,y_train_code_cur_map) y_predict_code_map = model_code.predict(x_test_cur) y_predict_code_map_prob = model_code.predict_proba(x_test_cur) y_text_new,y_predict_new = transfer_multilabel(y_predict_code_map,y_test_code_cur_map,ml,y_predict_code_map_prob,cate_cur) report_y_predict.extend(y_predict_new)
# We want to convert the labels into vectors. For example, if we have:
# keywords = [
#     ['solar', 'physics', 'astronomy'],
#     ['physics', 'lasers'],
#     ['astronomy']
# ]
# this would become (columns are the sorted classes:
# astronomy, lasers, physics, solar):
# keywords_binarised = [
#     [1, 0, 1, 1],
#     [0, 1, 1, 0],
#     [1, 0, 0, 0]
# ]
mlb = MultiLabelBinarizer()
mlb.fit(keywords)
keywords_vector = mlb.transform(keywords)

# We generate a transform from words -> vector space. This is very similar
# to the above conversion of the keywords. In this scenario, the entire
# corpus from our training set is converted into an id -> word sparse-
# matrix.
# The vectorizer must be fitted on the iterable of documents itself: a single
# joined string is rejected by CountVectorizer, and ``text`` is used as that
# iterable by the transform below.
bow_transform = CountVectorizer(analyzer=text_to_vector).fit(text)

# We transform our corpus into the unique vector space
bow_vector = bow_transform.transform(text)

# We convert the vector into a term frequency - inverse document frequency
# Term frequency: f_t (number of times in a document term t exists)
# Inverse document frequency: log(N/n_t) (number of documents divided by
#                                         the number of documents that
#                                         contain term t)
mlb.fit(train_ids['business_id'].tolist()) # X_train=np.array([imread('train_photos/train244/'+str(f_)+".jpg") for f_ in train_ids['photo_id'].tolist()]).astype(np.float32) # X_test=np.array([imread('train_photos/val244/'+str(f_)+".jpg") for f_ in val_ids['photo_id'].tolist()]).astype(np.float32) return train_ids,mlb def load_train(train_list): return(np.array([imread('train_photos/train244/'+str(f_)+".jpg") for f_ in train_list]).astype(np.float32)/255.0) train_ids,mlb=load_data() labels=pd.read_csv("train.csv") labels=labels[pd.isnull(labels['labels'])==False].reset_index(drop=True) labels['assignment']=np.random.uniform(size=(labels.shape[0],1)) MLB=MultiLabelBinarizer() train_ids=train_ids.merge(labels[['business_id','assignment']],on='business_id',how='left') MLB.fit(train_ids['labels'].tolist()) labels['labels']=labels['labels'].map(lambda x:[int(i) for i in x.split(" ")]) BETA=MLB.transform(labels.sort('business_id')['labels']) val_ids=train_ids[train_ids['assignment']>=.9].reset_index(drop=True) val_Y=MLB.transform(val_ids['labels']) train_ids=train_ids[train_ids['assignment']<.9].reset_index(drop=True) Y_test=mlb.transform(val_ids['business_id'].tolist()) print Y_test.shape np.random.seed(42) #train_ids=train_ids.sort('business_id').reset_index(drop=True) train_ids.reindex(np.random.permutation(train_ids.index)) val_ids.reindex(np.random.permutation(val_ids.index)) validate=np.array([imread('train_photos/train244/'+str(f_)+".jpg") for f_ in val_ids['photo_id'].tolist()[0:10000]]).astype(np.float32)/255.0 datagen = ImageDataGenerator( featurewise_center=True, featurewise_std_normalization=True, rotation_range=20,
def run_classifierAccuracy(trainSentences, trainLabels, testSentences, testLabels):
    """Train a one-vs-rest LinearSVC on TF-IDF features and print its scores.

    Labels outside ``all_labels`` are dropped from the training annotations,
    and test annotations are further restricted to labels that actually occur
    in the filtered training set. Precision, recall and F1 are printed
    per-class ("All-"), micro-averaged and macro-averaged.

    Args:
        trainSentences: training texts, passed to ``tf_idf_fit_transform``.
        trainLabels: one iterable of label strings per training text.
        testSentences: test texts, passed to the fitted TF-IDF transformer.
        testLabels: one iterable of label strings per test text.

    Returns:
        None. All results are written to stdout.
    """
    from sklearn.metrics import f1_score, precision_score, recall_score
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.preprocessing import MultiLabelBinarizer
    from sklearn.svm import LinearSVC

    all_labels = ['tsunami', 'heat_wave', 'cold_wave', 'forest_fire', 'limnic_erruptions',
                  'storm', 'avalanches', 'blizzard', 'earthquake', 'floods', 'hurricane',
                  'drought', 'volcano', 'fire', 'cyclone', 'hail_storms', 'land_slide',
                  'intensity', 'epicentre', 'temperature', 'depth', 'speed', 'magnitude',
                  'terrorist_attack', 'suicide_attack', 'normal_bombing', 'shoot_out',
                  'aviation_hazard', 'train_collision', 'industrial_accident',
                  'vehicular_collision', 'surgical_strikes', 'transport_hazards', 'riots',
                  'epidemic', 'famine', 'time', 'place', 'type', 'reason', 'after_effects',
                  'casualties', 'name', 'participant']

    # Per-domain subsets of all_labels. They are not used by the evaluation
    # below; presumably kept so one of them can be swapped in for all_labels
    # to restrict the run to a single domain -- confirm with the author.
    disaster_labels = ['tsunami', 'heat_wave', 'cold_wave', 'forest_fire', 'limnic_erruptions',
                       'storm', 'avalanches', 'blizzard', 'earthquake', 'floods', 'hurricane',
                       'drought', 'volcano', 'fire', 'cyclone', 'hail_storms', 'land_slide',
                       'intensity', 'epicentre', 'temperature', 'depth', 'speed', 'magnitude',
                       'time', 'place', 'type', 'reason', 'after_effects',
                       'casualties', 'name', 'participant']
    health_labels = ['epidemic', 'famine', 'time', 'place', 'type', 'reason', 'after_effects',
                     'casualties', 'name', 'participant']
    conflict_labels = ['terrorist_attack', 'suicide_attack', 'normal_bombing', 'shoot_out',
                       'aviation_hazard', 'train_collision', 'industrial_accident',
                       'vehicular_collision', 'surgical_strikes', 'transport_hazards', 'riots',
                       'time', 'place', 'type', 'reason', 'after_effects',
                       'casualties', 'name', 'participant']

    # Keep only known labels in the training annotations.
    curr_labels = set(all_labels)
    trainLabels = [list(set(l).intersection(curr_labels)) for l in trainLabels]

    # Narrow the label space to what training actually contains, so the test
    # annotations never reference a class the binarizer has no column for.
    curr_labels = set()
    for l in trainLabels:
        curr_labels.update(l)
    testLabels = [list(set(l).intersection(curr_labels)) for l in testLabels]

    # Sort the classes: iterating a set of strings gives an order that can
    # change between interpreter runs, which would shuffle the label columns
    # and the per-class scores printed below.
    mlb = MultiLabelBinarizer(classes=sorted(curr_labels))
    mlb.fit(trainLabels)
    print("Labels : ", mlb.classes_)
    train_label_matrix = mlb.transform(trainLabels)
    test_label_matrix = mlb.transform(testLabels)
    print("Shape of label matrix : ", test_label_matrix.shape)

    train_matrix, tfidf = tf_idf_fit_transform(trainSentences)
    test_matrix = tfidf.transform(testSentences)
    print("Shape of sentence matrix : ", test_matrix.shape)

    # One binary LinearSVC per label; any other scikit-learn estimator
    # (e.g. a random forest) can be substituted here.
    estimator = LinearSVC()
    classifier = OneVsRestClassifier(estimator, n_jobs=-1)
    classifier.fit(train_matrix, train_label_matrix)

    predictions = classifier.predict(test_matrix)

    # average=None reports one score per class, in mlb.classes_ order.
    for prefix, average in (("All", None), ("Micro", 'micro'), ("Macro", 'macro')):
        print(prefix + "-Precision", precision_score(test_label_matrix, predictions, average=average))
        print(prefix + "-Recall", recall_score(test_label_matrix, predictions, average=average))
        print(prefix + "-F1", f1_score(test_label_matrix, predictions, average=average))
# NOTE(review): this fragment was stored with its line breaks and indentation
# collapsed; the layout below is the most plausible reconstruction. In
# particular the `if epoch==0:` guard is assumed to cover both held-out
# assignments (images and labels) -- confirm against the original script.
#
# Naming caveat: the variable names are inverted relative to the usual
# convention. X_train = training images, X_test = training label matrix,
# Y_train = held-out images, Y_test = held-out label matrix.
X_train=loadimages(tphotos['photo_id'])
for epoch in xrange(0,epochs):
    # Re-draw the photos of every training business for this epoch.
    tphotos=train.groupby('business_id').apply(extract_images).reset_index()
    # reindex() returns a new frame; the result has to be assigned or the
    # shuffle is silently discarded.
    tphotos=tphotos.reindex(np.random.permutation(tphotos.index))
    tphotos.columns=['business_id','photo_id']
    tphotos=tphotos.merge(labels,on='business_id',how='left')
    # Labels are stored as a space-separated string of integer ids.
    tphotos['labels']=tphotos['labels'].map(lambda x:[int(i) for i in x.split(" ")])

    # Same preparation for the held-out businesses.
    tstphotos=test.groupby('business_id').apply(extract_images).reset_index()
    tstphotos=tstphotos.reindex(np.random.permutation(tstphotos.index))
    tstphotos.columns=['business_id','photo_id']
    tstphotos=tstphotos.merge(labels,on='business_id',how='left')
    tstphotos['labels']=tstphotos['labels'].map(lambda x:[int(i) for i in x.split(" ")])

    #Y_train=mlb.transform(tphotos['labels'])
    if epoch==0:
        # Held-out images and labels are loaded once, from the same frame, so
        # their rows stay aligned for every later epoch.
        Y_train=loadimages(tstphotos['photo_id'])
        Y_test=mlb.transform(tstphotos['labels'])
    X_test=mlb.transform(tphotos['labels'])
    X_train=loadimages(tphotos['photo_id'])
    # X_train=np.random.uniform(size=(X_test.shape[0],3,224,224))

    # (A leftover pdb.set_trace() breakpoint was removed here: it halted the
    # run at every epoch.)

    # One named graph input per image slot (axis 1 of the image array), plus
    # the label matrix as the single output.
    inputkeys={"input"+str(i):X_train[:,i,:,:,:] for i in xrange(0,n_images)}
    inputkeys['output1']=X_test
    graph.fit(inputkeys,nb_epoch=1,batch_size=16)
    # graph.fit({"input1":X_train,"input2":X_train,'output1':X_test},nb_epoch=2)
    # model.fit(X_train,X_test,batch_size=128,nb_epoch=1,verbose=0)

    # Score the held-out images and threshold the outputs at 0.5 by rounding.
    inputkeys={"input"+str(i):Y_train[:,i,:,:,:] for i in xrange(0,n_images)}
    prob=graph.predict(inputkeys)['output1']
    pred=np.round(prob)
    # probs=graph.predict_proba({"input1":Y_train[:,0,:,:,:],"input2":Y_train[:,1,:,:,:]})
    # print prob.mean(axis=0)
    # print prob.max(axis=0)
# Gather the complete (NaN-free) rows of every chunk, then stitch the pieces
# into a single test frame and release the intermediate list.
for chunk in reader:
    chunks.append(chunk.dropna())
test = pd.concat(chunks)
del chunks

# Each 'Tags' cell is one whitespace-separated string; break it into a list.
train_labels = train['Tags'].map(lambda tag_string: tag_string.split())
test_labels = test['Tags'].map(lambda tag_string: tag_string.split())

# Learn the tag vocabulary from both splits so every tag gets a column, then
# encode the training tags as a binary indicator matrix.
mlb = MultiLabelBinarizer().fit(pd.concat([train_labels, test_labels]))
labels = mlb.transform(train_labels)

# Bag-of-words vectorizer for the post text.
vect = CountVectorizer(
    preprocessor=preprocess,  # get text from html
    tokenizer=tokenize,       # turn the text into tokens
    ngram_range=(1, 2),       # unigrams and bigrams
    max_df=0.5,               # drop tokens found in more than half the documents
    min_df=0.001,             # drop tokens found in fewer than 0.1% of documents
)