def __init__(self, inter_filePath="inter/technology_companies_of_the_united_states/"):
    # [[cat,cat...]...]
    self.m = Word2Vec.load_word2vec_format("vectors/technology_companies_of_the_united_states/cat_train_neg5size400min_count5", binary=True)
    self.dim = 400
    (correct_categories_train, context_categories_train) = self.load_category_page(inter_filePath + "category_page.txt")
    (correct_categories_test, context_categories_test) = self.load_category_page(inter_filePath + "category_page_test.txt")

    ## ---- By mean ---
    Xvectors = np.array(self.predict_vector_by_mean(context_categories_train))
    Xvectors_test = np.array(self.predict_vector_by_mean(context_categories_test))

    ## ---- By SVM ---
    corpus_train = [" ".join(i) for i in context_categories_train]
    corpus_test = [" ".join(i) for i in context_categories_test]
    cv = CountVectorizer(min_df=1)
    X = cv.fit_transform(corpus_train)

    ## TF-IDF
    transformer = TfidfTransformer()
    X_tfidf = transformer.fit_transform(X)

    # Labels
    mlb = MultiLabelBinarizer()
    mlb.fit(correct_categories_train + correct_categories_test)
    Y = mlb.transform(correct_categories_train)  ### Transform to multilabel indicator

    # predict test labels
    X_test = cv.transform(corpus_test)
    Y_test = mlb.transform(correct_categories_test)

    # Y_predict_ovr = self.ovrSVM(X, Y, X_test)
    Y_predict_ovr = self.ovrSVM(Xvectors, Y, Xvectors_test)
    # Y_predict_ovo = self.ovoSVM(X, Y, X_test)

    print "---One versus rest---"
    print "Macro F-1:", f1_score(Y_test, Y_predict_ovr, average='macro')
    print "Micro F-1:", f1_score(Y_test, Y_predict_ovr, average='micro')
def fit_images():
    client = pymongo.MongoClient('localhost', 27017)
    db = client['image_annotation']
    responses = db['mapped_responses'].find()
    no_labels = db['labels_binary'].find()
    numbers = []
    for i in no_labels:
        numbers.append(set([int(i["number"])]))
    train_data = []
    labels = []
    indexes = {}  # image_no lookup per training row
    mlb = MultiLabelBinarizer()
    mlb.fit(numbers)
    for index, instance in enumerate(responses):
        t_data = instance['hist']['0']
        indexes[index] = instance['image_no']
        train_data.append(t_data)
        label = instance['binary_results']
        new_labels = []
        for key, value in enumerate(label):
            value1 = int(value)
            new_labels.append(set([value1]))
        new_labels = mlb.transform(new_labels)
        labels.append(label)
    classifier = KNeighborsClassifier(n_neighbors=5, weights='uniform')
    classifier.fit(train_data, labels)
    build_dir = getBuildDir()
    pickle.dump(classifier, open(join(build_dir, 'model.data'), 'wb'), protocol=1)
    client.close()
def test_multilabel_classification_report(): n_classes = 4 n_samples = 50 make_ml = make_multilabel_classification _, y_true_ll = make_ml(n_features=1, n_classes=n_classes, random_state=0, n_samples=n_samples) _, y_pred_ll = make_ml(n_features=1, n_classes=n_classes, random_state=1, n_samples=n_samples) expected_report = """\ precision recall f1-score support 0 0.50 0.67 0.57 24 1 0.51 0.74 0.61 27 2 0.29 0.08 0.12 26 3 0.52 0.56 0.54 27 avg / total 0.45 0.51 0.46 104 """ lb = MultiLabelBinarizer() lb.fit([range(4)]) y_true_bi = lb.transform(y_true_ll) y_pred_bi = lb.transform(y_pred_ll) for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]: report = classification_report(y_true, y_pred) assert_equal(report, expected_report)
class TimeSeriesLabelTransformer(BaseTaskTransformer):

    def __init__(self, namespace, name, labels=None):
        '''Initialize a time-series label transformer

        Parameters
        ----------
        namespace : str
            The JAMS namespace for this task

        name : str
            The name of this transformer object

        labels : list of str [optional]
            The list of labels for this task
        '''
        super(TimeSeriesLabelTransformer, self).__init__(namespace, 0)

        self.encoder = MultiLabelBinarizer()
        self.encoder.fit([labels])
        self._classes = set(self.encoder.classes_)
        self.name = name

    def transform(self, jam):
        ann = self.find_annotation(jam)

        intervals = np.asarray([[0.0, jam.file_metadata.duration]])
        values = [None]
        mask = False

        if ann:
            ann_int, ann_val = ann.data.to_interval_values()
            intervals = np.vstack([intervals, ann_int])
            values.extend(ann_val)
            mask = True

        # Suppress all intervals not in the encoder
        tags = []
        for v in values:
            if v in self._classes:
                tags.extend(self.encoder.transform([[v]]))
            else:
                tags.extend(self.encoder.transform([[]]))

        tags = np.asarray(tags)
        target = self.encode_intervals(jam.file_metadata.duration, intervals, tags)

        return {'output_{:s}'.format(self.name): target,
                'mask_{:s}'.format(self.name): mask}
def test_multilabelbinarizer_vs_sklearn(): # Compare msmbuilder.preprocessing.MultiLabelBinarizer # with sklearn.preprocessing.MultiLabelBinarizer multilabelbinarizerr = MultiLabelBinarizerR() multilabelbinarizerr.fit(np.concatenate(trajs)) multilabelbinarizer = MultiLabelBinarizer() multilabelbinarizer.fit(trajs) y_ref1 = multilabelbinarizerr.transform(trajs[0]) y1 = multilabelbinarizer.transform(trajs)[0] np.testing.assert_array_almost_equal(y_ref1, y1)
def load_data(): labels=pd.read_csv("train.csv") bismatch=pd.read_csv("train_photo_to_biz_ids.csv") labels=bismatch.merge(labels,how='left',on='business_id') labels=labels[pd.isnull(labels['labels'])==False] labels['labels']=labels['labels'].map(lambda x:[int(i) for i in x.split(" ")]) training_=os.listdir("train_photos/train244") train_ids=pd.DataFrame({"photo_id":[int(i.split(".")[0]) for i in training_]}) train_ids=train_ids.merge(labels,on='photo_id',how='inner') # val_ids=val_ids.merge(labels,on='photo_id',how='inner') mlb=MultiLabelBinarizer() mlb.fit(train_ids['labels'].tolist()) # X_train=np.array([imread('train_photos/train244/'+str(f_)+".jpg") for f_ in train_ids['photo_id'].tolist()]).astype(np.float32) # X_test=np.array([imread('train_photos/val244/'+str(f_)+".jpg") for f_ in val_ids['photo_id'].tolist()]).astype(np.float32) return train_ids,mlb
class ACMClassificator(BaseACMClassificator): def __init__(self): self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize) self.mlb = MultiLabelBinarizer() self.classificator = OneVsRestClassifier(ExtraTreeClassifier(criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, class_weight=None), n_jobs=-1 ) def _prepare_problems(self, problems): return self.vectorizer.transform([p.statement for p in problems]) def fit(self, problems, tags): nltk.download('punkt', quiet=True) self.vectorizer.fit([p.statement for p in problems]) mat = self._prepare_problems(problems) self.mlb = self.mlb.fit(tags) self.classificator.fit(mat.toarray(), self.mlb.transform(tags)) def predict(self, problems): mat = self._prepare_problems(problems) predicted = self.classificator.predict(mat.toarray()) return self.mlb.inverse_transform(predicted)
def prepVect(min_df=2, max_features=50000, n_captions=5, n_sbu=None, multilabel=False): print "prepping the Word Tokenizer..." _0, _1, trY, _3 = coco(mode='full', n_captions=n_captions) if n_sbu: _4, sbuY, _5 = sbuXYFilenames(n_sbu) trY.extend(sbuY) vect = Tokenizer(min_df=min_df, max_features=max_features) captions = sampleCaptions(trY, n_captions) vect.fit(captions) if multilabel: mlb = MultiLabelBinarizer() mlb.fit(vect.transform(captions)) return vect, mlb # if not multilabel: return vect
def run_classifierAccuracy(trainSentences, trainLabels, testSentences, testLabels):
    all_labels = ["Drought", "Earthquake", "Flood", "Epidemic", "Hurricane", \
        "Rebellion", "Terrorism", "Tornado", "Tsunami", "displaced_people_and_evacuations", \
        "donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage", \
        "injured_or_dead_people", "missing_trapped_or_found_people"]
    disaster_labels = ["Drought", "Earthquake", "Flood", "Hurricane", \
        "Tornado", "Tsunami", "displaced_people_and_evacuations", \
        "donation_needs_or_offers_or_volunteering_services", "infrastructure_and_utilities_damage", \
        "injured_or_dead_people", "missing_trapped_or_found_people"]
    health_labels = ["Epidemic", "displaced_people_and_evacuations", \
        "donation_needs_or_offers_or_volunteering_services", \
        "injured_or_dead_people"]
    conflict_labels = ["Rebellion", "Terrorism", "displaced_people_and_evacuations", \
        "infrastructure_and_utilities_damage", \
        "injured_or_dead_people", "missing_trapped_or_found_people"]

    import numpy as np
    curr_labels = all_labels
    trainLabels = [list(set(l).intersection(curr_labels)) for l in trainLabels]
    testLabels = [list(set(l).intersection(curr_labels)) for l in testLabels]

    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer(classes=curr_labels)
    mlb.fit(trainLabels)
    print("Labels : ", mlb.classes_)
    train_label_matrix = mlb.transform(trainLabels)
    test_label_matrix = mlb.transform(testLabels)
    print("Shape of label matrix : ", test_label_matrix.shape)

    train_matrix, tfidf = tf_idf_fit_transform(trainSentences)
    test_matrix = tfidf.transform(testSentences)
    print("Shape of sentence matrix : ", test_matrix.shape)

    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC
    from sklearn.ensemble import RandomForestClassifier
    # estimator = LinearSVC()
    estimator = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0, n_jobs=-1)
    classifier = OneVsRestClassifier(estimator, n_jobs=-1)
    classifier.fit(train_matrix, train_label_matrix)
    predictions = classifier.predict(test_matrix)

    from sklearn.metrics import f1_score, precision_score, recall_score
    print("Micro-Precision", precision_score(test_label_matrix, predictions, average='micro'))
    print("Micro-Recall", recall_score(test_label_matrix, predictions, average='micro'))
    print("Micro-F1", f1_score(test_label_matrix, predictions, average='micro'))
    print("Macro-Precision", precision_score(test_label_matrix, predictions, average='macro'))
    print("Macro-Recall", recall_score(test_label_matrix, predictions, average='macro'))
    print("Macro-F1", f1_score(test_label_matrix, predictions, average='macro'))
    print("All-Precision", precision_score(test_label_matrix, predictions, average=None))
    print("All-Recall", recall_score(test_label_matrix, predictions, average=None))
    print("All-F1", f1_score(test_label_matrix, predictions, average=None))
class GlobalLabelTransformer(BaseTaskTransformer):

    def __init__(self, namespace, name, labels=None):
        '''Initialize a global label transformer

        Parameters
        ----------
        namespace : str
            The JAMS namespace for this task

        name : str
            The name of this transformer object

        labels : list of str [optional]
            The list of labels for this task
        '''
        super(GlobalLabelTransformer, self).__init__(namespace, 0)

        self.encoder = MultiLabelBinarizer()
        self.encoder.fit([labels])
        self._classes = set(self.encoder.classes_)
        self.name = name

    def transform(self, jam):
        ann = self.find_annotation(jam)

        intervals = np.asarray([[0, 1]])
        values = [None]
        mask = False

        if ann:
            values = list(ann.data.value)
            intervals = np.tile(intervals, [len(values), 1])
            mask = True

        # Suppress all intervals not in the encoder
        tags = [v for v in values if v in self._classes]

        if len(tags):
            target = self.encoder.transform([tags]).max(axis=0)
        else:
            target = np.zeros(len(self._classes), dtype=np.int)

        return {'output_{:s}'.format(self.name): target,
                'mask_{:s}'.format(self.name): mask}
class ACMClassificator(BaseACMClassificator): def __init__(self): self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize) self.mlb = MultiLabelBinarizer() self.classificator = OneVsRestClassifier(SVC(), n_jobs=-1) def _prepare_problems(self, problems): return self.vectorizer.transform([p.statement for p in problems]) def fit(self, problems, tags): nltk.download('punkt', quiet=True) self.vectorizer.fit([p.statement for p in problems]) mat = self._prepare_problems(problems) self.mlb = self.mlb.fit(tags) self.classificator.fit(mat.toarray(), self.mlb.transform(tags)) def predict(self, problems): mat = self._prepare_problems(problems) predicted = self.classificator.predict(mat.toarray()) return self.mlb.inverse_transform(predicted)
datasets_single, queries_single = zip(*preprocessed_data_list_single) q_fasttext = preprocessing.fast_text_embeddings(queries) documentation_file_parameteropt.write("fasttext Evaluation \n") documentation_file_modelopt.write("fasttext Evaluation \n") #split data in training and test data d_train, d_test, q_train, q_test = train_test_split(datasets, q_fasttext, test_size=0.2) #encode labels, for abstracts with MultiLabelBinarizer as one sample can have multiple labels, for #citation contexts use LabelEncoder label_encoder = MultiLabelBinarizer() #label_encoder = LabelEncoder() label_encoder.fit(datasets) d_train_encoded = label_encoder.transform(d_train) pickle.dump(label_encoder, open('label_encoder_fasttext.sav', 'wb')) #Linear SVM: optimizing parameters with grid search print("SVM model evaluation") svm_dict = dict(estimator__C=[1, 2, 5, 10, 50, 100]) classifier_svm = RandomizedSearchCV(estimator=OneVsRestClassifier( svm.LinearSVC()), param_distributions=svm_dict, n_iter=5, n_jobs=1) classifier_svm.fit(np.asarray(q_train), np.asarray(d_train_encoded)) documentation_file_parameteropt.write( "Linear SVM: Best parameters {}, reached score: {} \n".format( classifier_svm.best_params_, classifier_svm.best_score_))
train_json = json.load(open('train.json'))
#filenames=["./imaterial_train/"+str(i)+".jpg" for i in range(1,201)]
#print(filenames)
y_train = []
for i in range(1, 201):
    labels = train_json['annotations'][i]['labelId']
    labels = np.array(list(map(int, labels)))
    y_train.append(labels)
y_train = np.array(y_train)
all_labels = []
for i in range(1, 229):
    all_labels.append([i])
mlb = MultiLabelBinarizer()
mlb.fit(all_labels)  # fitting multilabelbinarizer to all labels
y_train = mlb.transform(y_train)
#print('smallest vertical:',min(i.shape[0] for i in X_train))
#print('smallest horizontal:',min(i.shape[1] for i in X_train))
#All images resized to the smallest dimensions
X_train_resized = [transform.resize(img, (200, 128, 3)) for img in X_train]
X_train_flat = np.array([img.flatten() for img in X_train_resized])
os.chdir('./imaterial_validation')
X_test = np.array([io.imread(str(i) + '.jpg') for i in range(1, 201)])
os.chdir('..')
X_test_resized = [transform.resize(img, (200, 128, 3)) for img in X_test]
X_test_flat = np.array([img.flatten() for img in X_test_resized])
# print(firstlast) def tags_for_question(question_id): return df_tags[df_tags['Id'] == question_id].Tag.values def add_tags_column(row): row['Tags'] = tags_for_question(row['Id']) return row df_questions = df_text.apply(add_tags_column, axis=1) # print(df_questions[['Id', 'Text', 'Tags']].head()) multilabel_binarizer = MultiLabelBinarizer() multilabel_binarizer.fit(df_questions.Tags) Y = multilabel_binarizer.transform(df_questions.Tags) count_vect = CountVectorizer() X_counts = count_vect.fit_transform(df_questions.Text.values.astype('U')) tfidf_transformer = TfidfTransformer() X_tfidf = tfidf_transformer.fit_transform(X_counts) ros = RandomOverSampler(random_state=9000) X_tfidf_resampled, Y_tfidf_resampled = ros.fit_sample(X_tfidf, Y) x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf_resampled, Y_tfidf_resampled, test_size=0.2, random_state=9000)
>>> x=pd.DataFrame([["I have 12345678 tomatoes"],["12345678"]) SyntaxError: invalid syntax >>> x=pd.DataFrame([["I have 12345678 tomatoes"],["12345678"]]) >>> x 0 0 I have 12345678 tomatoes 1 12345678 >>> x.T 0 1 0 I have 12345678 tomatoes 12345678 >>> x=pd.DataFrame([["I have 12345678 tomatoes","12345678"]]) >>> x 0 1 0 I have 12345678 tomatoes 12345678 >>> binarizer = MultiLabelBinarizer.fit(x) Traceback (most recent call last): File "<pyshell#13>", line 1, in <module> binarizer = MultiLabelBinarizer.fit(x) TypeError: fit() missing 1 required positional argument: 'y' >>> binarizer = MultiLabelBinarizer.fit(x[0]) Traceback (most recent call last): File "<pyshell#14>", line 1, in <module> binarizer = MultiLabelBinarizer.fit(x[0]) TypeError: fit() missing 1 required positional argument: 'y' >>> binarizer = MultiLabelBinarizer.fit(x[0],x[1]) Traceback (most recent call last): File "<pyshell#15>", line 1, in <module> binarizer = MultiLabelBinarizer.fit(x[0],x[1]) File "/Users/montana/miniconda3/lib/python3.6/site-packages/sklearn/preprocessing/label.py", line 696, in fit if self.classes is None:
def get_or_make_label_encoder(params, problem, mode, label_list=None, zero_class=None):
    """Simple function to create or load existing label encoder
    If mode is train, always create a new label_encoder

    Arguments:
        problem {str} -- problem name
        mode {mode} -- mode

    Keyword Arguments:
        label_list {list} -- label list to fit the encoder (default: {None})
        zero_class {str} -- what to assign as 0 (default: {'O'})

    Returns:
        LabelEncoder -- label encoder
    """
    if label_list is None:
        return None
    problem_path = params.ckpt_dir
    create_path(problem_path)
    le_path = os.path.join(problem_path, '%s_label_encoder.pkl' % problem)

    is_seq2seq_text = params.problem_type[problem] == 'seq2seq_text'
    is_multi_cls = params.problem_type[problem] == 'multi_cls'
    is_seq2seq_tag = params.problem_type[problem] == 'seq2seq_tag'

    if mode == 'train' and not os.path.exists(le_path):
        if is_seq2seq_text:
            label_encoder = load_transformer_tokenizer(
                params.transformer_tokenizer_name)
            pickle.dump(label_encoder, open(le_path, 'wb'))
        elif is_multi_cls:
            label_encoder = MultiLabelBinarizer()
            label_encoder.fit(label_list)
            pickle.dump(label_encoder, open(le_path, 'wb'))
        else:
            if isinstance(label_list[0], list):
                label_list = [
                    item for sublist in label_list for item in sublist
                ]
                if is_seq2seq_tag:
                    label_list.extend([BOS_TOKEN, EOS_TOKEN])
            label_encoder = LabelEncoder()
            label_encoder.fit(label_list, zero_class=zero_class)
            label_encoder.dump(le_path)
    else:
        if is_seq2seq_text or is_multi_cls:
            label_encoder = pickle.load(open(le_path, 'rb'))
        else:
            label_encoder = LabelEncoder()
            label_encoder.load(le_path)

    if not is_seq2seq_text:
        if is_multi_cls:
            params.num_classes[problem] = label_encoder.classes_.shape[0]
        else:
            params.num_classes[problem] = len(label_encoder.encode_dict)
            if EOS_TOKEN in label_encoder.encode_dict:
                params.eos_id[problem] = int(
                    label_encoder.transform([EOS_TOKEN])[0])
    else:
        params.num_classes[problem] = len(label_encoder.vocab)
        params.eos_id[problem] = label_encoder.convert_tokens_to_ids(
            [EOS_TOKEN])

    return label_encoder
class HumanDataset(Dataset): def __init__(self, images_df, base_path, augument=True, mode="train"): if not isinstance(base_path, pathlib.Path): base_path = pathlib.Path(base_path) self.images_df = images_df.copy() self.augument = augument self.images_df.Id = self.images_df.Id.apply(lambda x: base_path / x) self.mlb = MultiLabelBinarizer( classes=np.arange(0, config.num_classes)) self.mlb.fit(np.arange(0, config.num_classes)) self.mode = mode def __len__(self): return len(self.images_df) def __getitem__(self, index): X = self.read_images(index) if not self.mode == "test": labels = np.array( list(map(int, self.images_df.iloc[index].Target.split(' ')))) y = np.eye(config.num_classes, dtype=np.float)[labels].sum(axis=0) else: y = str(self.images_df.iloc[index].Id.absolute()) if self.augument: X = self.augumentor(X) #X = T.Compose([T.ToPILImage(),T.ToTensor(),T.Normalize([0.08069, 0.05258, 0.05487, 0.08282], [0.13704, 0.10145, 0.15313, 0.13814])])(X) X = T.Compose([T.ToPILImage(), T.ToTensor()])(X) return X.float(), y def read_images(self, index): row = self.images_df.iloc[index] filename = str(row.Id.absolute()) #use only rgb channels if config.channels == 4: images = np.zeros(shape=(512, 512, 4)) else: images = np.zeros(shape=(512, 512, 3)) r = np.array(Image.open(filename + "_red.png")) g = np.array(Image.open(filename + "_green.png")) b = np.array(Image.open(filename + "_blue.png")) y = np.array(Image.open(filename + "_yellow.png")) images[:, :, 0] = r.astype(np.uint8) images[:, :, 1] = g.astype(np.uint8) images[:, :, 2] = b.astype(np.uint8) if config.channels == 4: images[:, :, 3] = y.astype(np.uint8) images = images.astype(np.uint8) #images = np.stack(images,-1) if config.img_height == 512: return images else: return cv2.resize(images, (config.img_weight, config.img_height)) def augumentor(self, image): augment_img = iaa.Sequential([ iaa.OneOf([ iaa.Affine(rotate=90), iaa.Affine(rotate=180), iaa.Affine(rotate=270), iaa.Affine(shear=(-16, 16)), iaa.Fliplr(0.5), iaa.Flipud(0.5), ]) ], random_order=True) image_aug = augment_img.augment_image(image) return image_aug
# unique_tags = [] # with open("../logs/tags.txt") as top_tag_list: # for line in top_tag_list: # line = line.split('\n')[0] # if cnt[line] > 0: # unique_tags.append(line) # for key in data: # for tag in data[key]: # if tag not in unique_tags: # data[key].remove(tag) tags = data.values() mlb = MultiLabelBinarizer() mlb.fit(tags) print("Saving trained LabelBinarizer to disk") joblib.dump(mlb, '../dump/pkl/' + str(mlb)[:5] + '.pkl') print("") # Split corpus into training and test sets questions_train, questions_test, tags_train, tags_test = train_test_split(questions, tags, test_size=0.2, random_state = random.randint(1, 100)) print("Extracting features from the training data using the vectorizer") t0 = time() X_train = vectorizer.transform(questions_train) duration = time() - t0 print("done in %fs" % (duration)) print("n_samples: %d, n_features: %d" % X_train.shape) print("")
import os import pathlib import pandas as pd import numpy as np import matplotlib.image as mpimg from sklearn.preprocessing import MultiLabelBinarizer from PIL import Image import torch from torchvision import datasets, transforms, models from torch.utils import data CLASSES = np.arange(0, 28) multilabel_binarizer = MultiLabelBinarizer(CLASSES) multilabel_binarizer.fit(CLASSES) INPUT_DIR = '../input' TRAIN_IMAGES_DIR = pathlib.Path(INPUT_DIR, 'train').as_posix() TEST_IMAGES_DIR = pathlib.Path(INPUT_DIR, 'test').as_posix() TARGETS_COLUMN_NAME = 'Target' COLORS = ('red', 'green', 'blue', 'yellow') IMAGE_FILE_EXT = 'png' class HumanProteinAtlasDataset(data.Dataset): def __init__(self, images_description_df, transform=None, train_mode=True): self.images_description_df = images_description_df.copy() self.transform = transform self.train_mode = train_mode
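# Note on the binarizer above: when `classes` is passed to the constructor,
# MultiLabelBinarizer.fit keeps that fixed class list and does not derive
# classes from the data it is fitted on, which is why fitting directly on the
# bare CLASSES array is safe here. A minimal sketch of that behaviour:
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

check = MultiLabelBinarizer(classes=np.arange(0, 28))
check.fit([[0]])  # the fitted data does not define classes_ here
assert list(check.classes_) == list(np.arange(0, 28))
assert check.transform([[0, 27]]).sum() == 2  # one column per pinned class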
def binarizer_labels(data): multilabel_binarizer = MultiLabelBinarizer() multilabel_binarizer.fit(data) # transform target variable y = multilabel_binarizer.transform(data) return y, multilabel_binarizer
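# A small usage sketch for binarizer_labels above, assuming `data` is a list
# of label collections, one per sample. inverse_transform recovers the label
# sets from the indicator rows:
y, mlb = binarizer_labels([['drama', 'comedy'], ['comedy'], ['thriller']])
print(mlb.classes_)              # ['comedy' 'drama' 'thriller']
print(y)                         # [[1 1 0] [1 0 0] [0 0 1]]
print(mlb.inverse_transform(y))  # [('comedy', 'drama'), ('comedy',), ('thriller',)]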
dataset['genre_list'] = dataset['genre_list'].apply(
    lambda x: ast.literal_eval(x))
train['genre_list'] = train['genre_list'].apply(lambda x: ast.literal_eval(x))
test['genre_list'] = test['genre_list'].apply(lambda x: ast.literal_eval(x))
val['genre_list'] = val['genre_list'].apply(lambda x: ast.literal_eval(x))

labels = {}
for genre in test['genre_list']:
    if len(genre) in labels:
        labels[len(genre)] += 1
    else:
        labels[len(genre)] = 1

mlb = MultiLabelBinarizer()
mlb.fit(dataset['genre_list'].tolist())
transformed_labels = mlb.transform(dataset['genre_list'].tolist())
train_labels = mlb.transform(train['genre_list'].tolist())
test_labels = mlb.transform(test['genre_list'].tolist())
val_labels = mlb.transform(val['genre_list'].tolist())

stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.translate(str.maketrans('', '', punctuation))
def preprocess_data(args): label_counter = Counter([]) examples_per_file = Counter() print("Reading all files for labels.") for input_file in args.input_files: with xopen(input_file, "rt") as f: for example, labels in input_readers[args.task](f): examples_per_file[input_file] += 1 label_counter.update(labels) if args.top_n_labels > 0: mlb_full = MultiLabelBinarizer(sparse_output=True) mlb_full = mlb_full.fit(label_counter.keys()) label_counter = dict(label_counter.most_common(args.top_n_labels)) mlb = MultiLabelBinarizer(sparse_output=True) # Passing a list in a list because that's what the function wants. if args.labels_in: labels = json.load(open(args.labels_in)) mlb = mlb.fit([labels]) else: mlb = mlb.fit([[pair for pair in label_counter]]) # Save list of partial -> full mapping if doing top N labels. if args.top_n_labels > 0: label_mapping = np.where(np.in1d(mlb_full.classes_, mlb.classes_))[0].tolist() with xopen(args.label_mapping, "wt") as f: f.write(json.dumps(label_mapping)) # Also save the full labels. with xopen(args.full_labels, "wt") as f: f.write(json.dumps(list(mlb_full.classes_))) # Save list of labels. with xopen(args.labels_out, "wt") as f: f.write(json.dumps(list(mlb.classes_))) # Set parallel tokenization thread count. os.environ["RAYON_NUM_THREADS"] = str(args.processes) from tokenizers import Tokenizer, decoders, trainers from tokenizers.models import WordPiece from tokenizers.normalizers import BertNormalizer from tokenizers.pre_tokenizers import BertPreTokenizer from tokenizers.processors import BertProcessing if args.task == 'cafa': # Define our custom tokenizer. # It is exactly the same as the default BERT tokenizer, except for max_input_chars_per_word # being 20000 instead of 100. This tokenizer is very slow on the long protein sequences. tokenizer = WordPiece.from_files(args.vocab, unk_token="[UNK]", max_input_chars_per_word=20000) tokenizer = Tokenizer(tokenizer) tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"]) tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case) tokenizer.pre_tokenizer = BertPreTokenizer() tokenizer.post_processor = BertProcessing( ("[SEP]", tokenizer.token_to_id("[SEP]")), ("[CLS]", tokenizer.token_to_id("[CLS]"))) tokenizer.decoder = decoders.WordPiece(prefix='##') else: tokenizer = BertWordPieceTokenizer(args.vocab, lowercase=args.do_lower_case) tokenizer.enable_padding(max_length=args.seq_len) tokenizer.enable_truncation(max_length=args.seq_len) for input_file in args.input_files: with xopen(input_file, 'rt') as in_f: file_name = generate_out_filename(input_file, args) with xopen(file_name, "wt") as out_f: print("Processing to: ", file_name) # Write the shape as the first row, useful for the finetuning. 
if args.labels_in: n_labels = len(json.load(open(args.labels_in))) else: n_labels = len(label_counter) out_f.write( json.dumps((examples_per_file[input_file], n_labels)) + '\n') batch_size = min(examples_per_file[input_file], args.processes * 100) example_batch = [] labels_batch = [] doc_idx_batch = [] with ParallelGenerator(input_readers[args.task](in_f), max_lookahead=batch_size) as g: START_POS = int(args.window_start) / 100 for doc_idx, (example, labels) in enumerate(g): #example = ' '.join(example.split(' ')[-510:]) example_batch.append(example) labels_batch.append(labels) doc_idx_batch.append(doc_idx) if len(example_batch) == batch_size: example_batch = tokenizer.encode_batch( example_batch) labels_batch = mlb.transform(labels_batch) for example, labels, doc_idx in zip( example_batch, labels_batch, doc_idx_batch): # Convert sparse arrays to python lists for json dumping. # print(labels);input() labels = labels.nonzero()[1].tolist() """try: [][0] print("DOC_LEN:",len(example.overflowing)+1) mid = len(example.overflowing)//2 out_f.write(json.dumps( [example.overflowing[mid].ids, labels, len(example.overflowing)+1] ) + '\n') except IndexError: out_f.write(json.dumps( [example.ids, labels, len(example.overflowing)+1] ) + '\n')""" if args.all_blocks or args.n_blocks > 0: blocks = [example.ids] + [ blk.ids for blk in example.overflowing ] #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks]))) for b, block in enumerate(blocks, 2): if b > args.n_blocks and args.n_blocks > 0: break out_f.write( json.dumps( [block, labels, doc_idx]) + '\n') else: window = get_window(example, START_POS) assert len(window) == 512 assert all( [type(y) is int for y in window]) out_f.write( json.dumps([window, labels]) + '\n') example_batch = [] labels_batch = [] # Write out whatever is left in the last smaller batch. example_batch = tokenizer.encode_batch(example_batch) labels_batch = mlb.transform(labels_batch) for example, labels, doc_idx in zip( example_batch, labels_batch, doc_idx_batch): # Convert sparse arrays to python lists for json dumping. # print(labels);input() labels = labels.nonzero()[1].tolist() """try: [][0] print("DOC_LEN:",len(example.overflowing)+1) mid = len(example.overflowing)//2 out_f.write(json.dumps( [example.overflowing[mid].ids, labels, len(example.overflowing)+1] ) + '\n') except IndexError: out_f.write(json.dumps( [example.ids, labels, len(example.overflowing)+1] ) + '\n')""" if args.all_blocks or args.n_blocks > 0: blocks = [example.ids] + [ blk.ids for blk in example.overflowing ] #print("BLOCKS:%d,TOKENS:%d" % (len(list(blocks)), sum([len(list(tokens)) for tokens in blocks]))) for b, block in enumerate(blocks, 2): if b > args.n_blocks and args.n_blocks > 0: break out_f.write( json.dumps([block, labels, doc_idx]) + '\n') else: out_f.write( json.dumps( [get_window(example, START_POS), labels]) + '\n')
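# The preprocessing above relies on MultiLabelBinarizer(sparse_output=True):
# transform returns a scipy.sparse matrix, which is why each row is converted
# to a plain list of label indices via labels.nonzero()[1] before JSON
# dumping. A minimal sketch of that conversion, with a toy label set:
from sklearn.preprocessing import MultiLabelBinarizer

mlb_demo = MultiLabelBinarizer(sparse_output=True)
mlb_demo.fit([['GO:1', 'GO:2', 'GO:3']])
rows = mlb_demo.transform([['GO:1', 'GO:3'], ['GO:2']])
for i in range(rows.shape[0]):
    print(rows[i].nonzero()[1].tolist())  # [0, 2] then [1]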
class DynamicLabelTransformer(BaseTaskTransformer): '''Time-series label transformer. Attributes ---------- name : str The name of this transformer object namespace : str The JAMS namespace for this task labels : list of str [optional] The list of labels for this task. If not provided, it will attempt to infer the label set from the namespace definition. sr : number > 0 The audio sampling rate hop_length : int > 0 The hop length for annotation frames See Also -------- StaticLabelTransformer ''' def __init__(self, name, namespace, labels=None, sr=22050, hop_length=512): super(DynamicLabelTransformer, self).__init__(name=name, namespace=namespace, sr=sr, hop_length=hop_length) if labels is None: labels = jams.schema.values(namespace) self.encoder = MultiLabelBinarizer() self.encoder.fit([labels]) self._classes = set(self.encoder.classes_) self.register('tags', [None, len(self._classes)], np.bool) def empty(self, duration): '''Empty label annotations. Constructs a single observation with an empty value (None). Parameters ---------- duration : number > 0 The duration of the annotation ''' ann = super(DynamicLabelTransformer, self).empty(duration) ann.append(time=0, duration=duration, value=None) return ann def transform_annotation(self, ann, duration): '''Transform an annotation to dynamic label encoding. Parameters ---------- ann : jams.Annotation The annotation to convert duration : number > 0 The duration of the track Returns ------- data : dict data['tags'] : np.ndarray, shape=(n, n_labels) A time-varying binary encoding of the labels ''' intervals, values = ann.to_interval_values() # Suppress all intervals not in the encoder tags = [] for v in values: if v in self._classes: tags.extend(self.encoder.transform([[v]])) else: tags.extend(self.encoder.transform([[]])) tags = np.asarray(tags) target = self.encode_intervals(duration, intervals, tags) return {'tags': target} def inverse(self, encoded, duration=None): '''Inverse transformation''' ann = jams.Annotation(namespace=self.namespace, duration=duration) for start, end, value in self.decode_intervals(encoded, duration=duration): # Map start:end to frames f_start, f_end = time_to_frames([start, end], sr=self.sr, hop_length=self.hop_length) confidence = np.mean(encoded[f_start:f_end + 1, value]) value_dec = self.encoder.inverse_transform(np.atleast_2d(value))[0] for vd in value_dec: ann.append(time=start, duration=end - start, value=vd, confidence=confidence) return ann
class StaticLabelTransformer(BaseTaskTransformer): '''Static label transformer. Attributes ---------- name : str The name of this transformer object namespace : str The JAMS namespace for this task labels : list of str [optional] The list of labels for this task. If not provided, it will attempt to infer the label set from the namespace definition. See Also -------- DynamicLabelTransformer ''' def __init__(self, name, namespace, labels=None): super(StaticLabelTransformer, self).__init__(name=name, namespace=namespace, sr=1, hop_length=1) if labels is None: labels = jams.schema.values(namespace) self.encoder = MultiLabelBinarizer() self.encoder.fit([labels]) self._classes = set(self.encoder.classes_) self.register('tags', [len(self._classes)], np.bool) def transform_annotation(self, ann, duration): '''Transform an annotation to static label encoding. Parameters ---------- ann : jams.Annotation The annotation to convert duration : number > 0 The duration of the track Returns ------- data : dict data['tags'] : np.ndarray, shape=(n_labels,) A static binary encoding of the labels ''' intervals = np.asarray([[0, 1]]) values = list([obs.value for obs in ann]) intervals = np.tile(intervals, [len(values), 1]) # Suppress all intervals not in the encoder tags = [v for v in values if v in self._classes] if len(tags): target = self.encoder.transform([tags]).astype(np.bool).max(axis=0) else: target = np.zeros(len(self._classes), dtype=np.bool) return {'tags': target} def inverse(self, encoded, duration=None): '''Inverse static tag transformation''' ann = jams.Annotation(namespace=self.namespace, duration=duration) if np.isrealobj(encoded): detected = (encoded >= 0.5) else: detected = encoded for vd in self.encoder.inverse_transform(np.atleast_2d(detected))[0]: vid = np.flatnonzero(self.encoder.transform(np.atleast_2d(vd))) ann.append(time=0, duration=duration, value=vd, confidence=encoded[vid]) return ann
buki_data = pd.read_csv(f'{ORIG_DIR}/statink-weapon2.csv')
# Show the category before renaming, for comparison
display(buki_data.loc[buki_data['category2'] == 'maneuver'].head(3))
# Rename the category because it collides with a weapon name
buki_data.loc[buki_data['category2'] == 'maneuver', 'category2'] = 'maneuver_cat'
display(buki_data.loc[buki_data['category2'] == 'maneuver_cat'].head(3))

# +
#https://prob.space/competitions/game_winner
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
# Multi-hot encoding in multilabel format (1s can appear in several columns per row)
mlb.fit([set(train['A1-weapon'].unique())])

def trans_weapon(df, columns=['A1-weapon', 'A2-weapon', 'A3-weapon', 'A4-weapon']):
    """One-hot encode the given columns"""
    weapon = df.fillna('none')
    weapon_binarized = mlb.transform(weapon[columns].values)
    return pd.DataFrame(weapon_binarized, columns=mlb.classes_)

def make_input_output(df, with_y=False):
    """One-hot encode each weapon column and concatenate"""
    a_weapon = trans_weapon(
        df, ['A1-weapon', 'A2-weapon', 'A3-weapon', 'A4-weapon'])
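# Note on trans_weapon above: transform receives a 2-D array whose rows hold
# the four weapon columns, so each row of four weapons collapses into a single
# multi-hot vector. A minimal sketch with hypothetical weapon names:
from sklearn.preprocessing import MultiLabelBinarizer

demo = MultiLabelBinarizer()
demo.fit([{'bold_marker', 'splattershot', 'wakaba'}])
row = demo.transform([['splattershot', 'wakaba', 'wakaba', 'bold_marker']])
print(row)  # [[1 1 1]] -- a duplicate weapon within the row is counted once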
class SpamDataset(torch.utils.data.Dataset):
    '''
    maxlength = size of padding
    '''

    def __init__(self, maxlength=30, subset='all'):
        super(SpamDataset, self).__init__()
        self.maxlength = maxlength
        self.subset = subset
        self.samples, self.labels = self.load_data()
        self.samples_vectors = self.load_samples_vectors()

    def load_data(self):
        # do some preprocessing if preprocessed file does not exist
        if not os.path.isfile('SMSSpamCollection_normalized'):
            #import spacy; nlp=spacy.load('en')
            print('Applying spacy.')
            import en_core_web_sm
            nlp = en_core_web_sm.load()

            def normalize(message):
                doc = nlp(message)
                normalized = doc
                normalized = filter(lambda t: t.is_alpha and not t.is_stop, doc)
                normalized = map(lambda t: t.text, normalized)
                normalized = list(normalized)
                return normalized

            messages = pandas.read_csv(
                'SMSSpamCollection',
                sep='\t',
                quoting=csv.QUOTE_NONE,
                names=['label', 'message'],
                encoding='UTF-8')
            messages['normalized'] = messages.message.apply(normalize)
            messages.to_pickle('SMSSpamCollection_normalized')

        # load normalized messages
        messages = pandas.read_pickle('SMSSpamCollection_normalized')
        samples = messages.normalized.tolist()
        labels = messages.label.tolist()
        self.classes = list(set(labels))
        self.classes.sort()  # Otherwise it is inconsistent which class is 0 and which is 1
        self.oh_classes = MultiLabelBinarizer()
        self.oh_classes.fit([[label] for label in self.classes])
        self.classes = dict([(l, i) for (i, l) in enumerate(self.oh_classes.classes_)])
        # keep the original string labels; __getitem__ maps them to ints via self.classes
        cut_off = int(len(samples) * 0.2)
        if self.subset == 'train':
            samples = samples[cut_off:]
            labels = labels[cut_off:]
        elif self.subset == 'test':
            samples = samples[:cut_off]
            labels = labels[:cut_off]
        return samples, labels

    def __len__(self):
        return len(self.samples)

    """Returns (item as matrix of token vectors, label, index)
    instead of the normal (item, label). See constructor for more info"""
    def __getitem__(self, index):
        msg = self.samples[index]
        int_label = self.classes[self.labels[index]]  # Convert label to integer. Is this slow?
        #mat = np.zeros( (self.maxlength, self.ftmodel.numpy_normalized_vectors.shape[1]) )
        mat = np.zeros((self.maxlength, 300))  # Hard coded for the wikipedia data that has dim=300
        for i, token in enumerate(msg):
            if i >= self.maxlength:
                break
            v = emb.getVector(token)
            mat[i, :] = v
        oh_label = self.oh_classes.transform([[self.labels[index]]])
        oh_label = oh_label[0]
        #print(oh_label)
        return (torch.tensor(mat), int_label, oh_label, index)
from sklearn.metrics import confusion_matrix from sklearn.metrics import roc_auc_score from sklearn.metrics import f1_score import numpy as np from keywords import * path = Path.cwd() / "../../pdf-reports/" df = read_plaintext_with_keywords(path) df = add_chapter_fields(df) print(df['num_keywords'].value_counts()) df_keywords = df[df.num_keywords > 0] mlb = MultiLabelBinarizer() mlb.fit(df_keywords.keywords) print(len(mlb.classes_)) print(mlb.classes_) y = mlb.transform(df_keywords.keywords) print(y.shape) tv = TfidfVectorizer(ngram_range=(1, 2), preprocessor=preproc, stop_words='english') tv1 = TfidfVectorizer(ngram_range=(1, 1), preprocessor=preproc, stop_words='english') tv2 = TfidfVectorizer(ngram_range=(2, 2), preprocessor=preproc, stop_words='english')
try: from sklearn.preprocessing import MultiLabelBinarizer lb = MultiLabelBinarizer() except ImportError, e: from sklearn.preprocessing import LabelBinarizer lb = LabelBinarizer() TRIM_SAMPLES = len(tags) # / 10 tags = tags[:TRIM_SAMPLES] learn_data = learn_data[:TRIM_SAMPLES] lb.fit(tags) labels = lb.transform(tags) print "using\t", TRIM_SAMPLES, "samples" print "\t", len(keywords), "keywords" print "\t", len(lb.classes_), "tags" metadata = learn_data.sum(axis=1) print "\t", metadata.mean(), "avg words in document" print "\t", metadata.max(), "biggest document" print "\t", metadata.min(), "smallest document" # plt.figure(figsize=(8, 6)) # plot_subfigure(learn_data, labels, 1, "With unlabeled samples + CCA", "cca") # plot_subfigure(learn_data, labels, 2, "With unlabeled samples + PCA", "pca")
lambda x: len(x) == 10)]  # keep only rows whose tags list has exactly 10 entries
train = train[:5000]  # use a sample of 5,000 training rows

n_tags = 500  # keep only the 500 most frequent tags
n_titles = 500  # keep only the 500 most frequent title morphemes

undup_tags = np.array((pd.Series(np.concatenate(
    train['tags'].values)).value_counts()[:n_tags].index))
undup_gnr = np.unique(np.concatenate(meta['song_gn_gnr_basket']))
undup_dtl_gnr = np.unique(np.concatenate(meta['song_gn_dtl_gnr_basket']))
undup_title = np.array(
    (pd.Series(np.concatenate(
        decom_train['keywords'].values)).value_counts()[:n_titles].index))

enc = MultiLabelBinarizer()
enc_gnr = MultiLabelBinarizer()
enc_dtl_gnr = MultiLabelBinarizer()
enc_title = MultiLabelBinarizer()
enc.fit([undup_tags])
enc_gnr.fit([undup_gnr])
enc_dtl_gnr.fit([undup_dtl_gnr])
enc_title.fit([undup_title])


class MF():
    def __init__(self, rating_mat, dim_latent, l2, alpha, l_rate, n_epochs):
        self.r_mat = rating_mat
        self.n_users, self.n_items = rating_mat.shape
        self.dim_latent = dim_latent
        self.l2 = l2
        self.alpha = alpha
        self.l_rate = l_rate
        self.n_epochs = n_epochs
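# The enc.fit([undup_tags]) calls above wrap each vocabulary array in an outer
# list, so the whole array counts as one sample whose elements become the
# classes. A minimal sketch of why the wrapping matters, with toy tags:
from sklearn.preprocessing import MultiLabelBinarizer

demo = MultiLabelBinarizer()
demo.fit([['rock', 'pop', 'jazz']])  # one sample -> three classes
print(demo.classes_)  # ['jazz' 'pop' 'rock']

wrong = MultiLabelBinarizer()
wrong.fit(['rock', 'pop', 'jazz'])  # three samples, each a string of characters
print(wrong.classes_)  # individual letters, not whole tags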
# Main Code source_data_dir = '../train_data' test_data_dir = '../test_data' data_dir = 'copied' print('pre stage start') print('read data start') if os.path.exists(data_dir): shutil.rmtree(data_dir) copy_data(source_data_dir, data_dir) log_collection = read_data(data_dir) test_data = read_data(test_data_dir) print('read data end') print('prepare data start') multilabel_binarizer = MultiLabelBinarizer() multilabel_binarizer.fit(log_collection['labels']) y_train = multilabel_binarizer.transform(log_collection['labels']) y_test = multilabel_binarizer.transform(test_data['labels']) X_train = log_collection['clean_text'] X_test = test_data['clean_text'] id_train = np.arange(len(log_collection['clean_text'])) id_test = np.arange(len(test_data['clean_text'])) X_train, X_test, tfidf_vocab = prepare_data(X_train, X_test) tfidf_reversed_vocab = {i: word for word, i in tfidf_vocab.items()} print('prepare data end') # Training print('train model start') model = train(X_train, y_train)
# image_ids.append(path[start:end]) print('image_ids:', image_ids) data = pd.read_csv("MovieGenre.csv", encoding="ISO-8859-1") y = [] parsed_movies = [] classes = utils.list_genres(7) # classes = set(classes) print('classes:', classes) print(len(classes)) y = utils.list_movies(classes, image_ids) print('y:', y) mlb = MultiLabelBinarizer() #print('mlb:',mlb) mlb.fit(y) #print('mlb:',mlb) y = mlb.transform(y) print('y:', y) print('y.shape:', y.shape) x = [] x.append(utils.get_image(img_path)) x = np.asarray(x) print('x.shape:', x.shape) def predict(X): init = tf.global_variables_initializer() x, _ = utils.create_placeholders(150, 150, 3, 7)
count = 0
for i in range(0, len(training_all)):
    image_id = training_all[i][0]
    count += 1
    label_str = training_all[i][1]
    cur_label_list = label_str.split()
    cur_label_list = tuple([int(ff) for ff in cur_label_list])
    label_list.append(cur_label_list)
    image_id_list.append(image_id)

mlb = MultiLabelBinarizer()
mlb.fit(label_list)

X_train_name, X_test_name, y_train_label, y_test_label = train_test_split(
    image_id_list, label_list, test_size=0.015)
train_label = mlb.transform(y_train_label)
y_test = mlb.transform(y_test_label)

bb_image = []
for i_name in X_test_name:
    cur_image = load_image(basepath, i_name)
class DataGenerator(Sequence): def __init__(self, data_path, tokenize_func, options, bg_data_path=None, bg_sample_rate=1., max_chars=None, label_encoder=None): texts, labels = load_data(data_path, options, max_chars=max_chars) self.num_examples = len(texts) self.batch_size = options.batch_size #self.seq_len = options.seq_len self.X = tokenize_func(texts) if label_encoder is None: self.label_encoder = MultiLabelBinarizer() self.label_encoder.fit(labels) else: self.label_encoder = label_encoder self.Y = self.label_encoder.transform(labels) self.num_labels = len(self.label_encoder.classes_) if bg_data_path is not None: self.bg_sample_rate = bg_sample_rate self.bg_num_examples, self.bg_X, self.bg_Y = [], [], [] for path in bg_data_path.split(): bg_texts, bg_labels = load_data(path, options, max_chars=max_chars) self.bg_num_examples.append(len(bg_texts)) self.bg_X.append(tokenize_func(bg_texts)) self.bg_Y.append(self.label_encoder.transform(bg_labels)) #self.bg_num_labels = len(self.label_encoder.classes_) self.bg_num_corpora = len(self.bg_num_examples) else: self.bg_sample_rate = 0 self.bg_num_examples = [0] self.bg_num_corpora = 0 self.on_epoch_end() def on_epoch_end(self): self.indexes = np.arange(self.num_examples) np.random.shuffle(self.indexes) if self.bg_sample_rate > 0: if hasattr(self, "bg_indexes"): for i, bg_indexes in enumerate(self.bg_indexes): seen_bg_idxs = bg_indexes[:len(self.indexes)] unseen_bg_idxs = bg_indexes[len(self.indexes):] np.random.shuffle(seen_bg_idxs) self.bg_indexes[i] = np.concatenate([unseen_bg_idxs, seen_bg_idxs]) else: self.bg_indexes = [np.arange(x) for x in self.bg_num_examples] for i,_ in enumerate(self.bg_indexes): np.random.shuffle(self.bg_indexes[i]) self.index = 0 def __len__(self): return int((self.num_examples//self.batch_size)*(1+self.bg_sample_rate)) * self.bg_num_corpora def __getitem__(self, index): if np.random.random() <= 1/(self.bg_sample_rate+self.bg_num_corpora): batch_indexes = self.indexes[self.index*self.batch_size:(self.index+1)*self.batch_size] self.index += 1 X, Y = self.X, self.Y else: i = np.random.randint(0, self.bg_num_corpora) try: batch_indexes = self.bg_indexes[i][self.index*self.batch_size:(self.index+1)*self.batch_size] except IndexError: end = ((self.index+1)*self.batch_size) % len(self.bg_indexes[i]) if end < self.batch_size: end = self.batch_size beg = 0 else: beg = end-self.batch_size batch_indexes = self.bg_indexes[i][beg:end] X, Y = self.bg_X[i], self.bg_Y[i] batch_X = {} for key in self.X: batch_X[key] = np.empty((self.batch_size, *X[key].shape[1:])) for j, idx in enumerate(batch_indexes): batch_X[key][j] = X[key][idx] batch_y = np.empty((self.batch_size, *Y.shape[1:]), dtype=int) for j, idx in enumerate(batch_indexes): batch_y[j] = Y[idx] return batch_X, batch_y
def load_data(train_set): X_data = [] y_data = [] for c, (vector, target) in enumerate(train_set): X_data.append(vector) y_data.append(target) if c % 10000 == 0: print(c) print(len(X_data), 'training examples') class_freqs = Counter([y for y_seq in y_data for y in y_seq]).most_common() class_list = [y[0] for y in class_freqs] nb_classes = len(class_list) print(nb_classes, 'classes') class_dict = dict(zip(class_list, np.arange(len(class_list)))) with open('data_path_save/attention_blstm/class_dict.pkl', 'wb') as fp: pickle.dump(class_dict, fp) print('Exported class dictionary') y_data_int = [] for y_seq in y_data: y_data_int.append([class_dict[y] for y in y_seq]) X_data_flat = [] for raw_text in X_data: flat_text = [] for sent in raw_text: flat_text.extend(sent) X_data_flat.append(flat_text) tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token=1) tokenizer.fit_on_texts(X_data_flat) X_data_int = np.zeros((len(X_data), MAX_SEQ_LEN, MAX_SENT_LENGTH)) for idx, raw_text in enumerate(X_data): sents_batch = np.zeros((MAX_SEQ_LEN, MAX_SENT_LENGTH)) tokens = tokenizer.texts_to_sequences(raw_text) sents = pad_sequences(tokens, maxlen=MAX_SENT_LENGTH, padding='post', truncating='post', dtype='float32') for j, sent in enumerate(sents): if j >= MAX_SEQ_LEN: break sents_batch[j, :] = sent X_data_int[idx, :, :] = sents_batch X_data = X_data_int print('Shape of data tensor:', X_data.shape) word_index = tokenizer.word_index print('Found %s unique tokens' % len(word_index)) with open('data_path_save/attention_blstm/word_index.json', 'w') as fp: json.dump(word_index, fp) print('Exported word dictionary') mlb = MultiLabelBinarizer() mlb.fit([class_dict.values()]) y_data = mlb.transform(y_data_int) print('Shape of label tensor:', y_data.shape) X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, train_size=0.8, test_size=0.2, random_state=42) return X_train, X_val, y_train, y_val, nb_classes, word_index
def select_data(XX,YY, ctype, min_samples, outputfolder): # convert multilabel to multi-hot mlb = MultiLabelBinarizer() if ctype == 'diagnostic': X = XX[YY.diagnostic_len > 0] Y = YY[YY.diagnostic_len > 0] mlb.fit(Y.diagnostic.values) y = mlb.transform(Y.diagnostic.values) elif ctype == 'subdiagnostic': counts = pd.Series(np.concatenate(YY.subdiagnostic.values)).value_counts() counts = counts[counts > min_samples] YY.subdiagnostic = YY.subdiagnostic.apply(lambda x: list(set(x).intersection(set(counts.index.values)))) YY['subdiagnostic_len'] = YY.subdiagnostic.apply(lambda x: len(x)) X = XX[YY.subdiagnostic_len > 0] Y = YY[YY.subdiagnostic_len > 0] mlb.fit(Y.subdiagnostic.values) y = mlb.transform(Y.subdiagnostic.values) elif ctype == 'superdiagnostic': counts = pd.Series(np.concatenate(YY.superdiagnostic.values)).value_counts() counts = counts[counts > min_samples] YY.superdiagnostic = YY.superdiagnostic.apply(lambda x: list(set(x).intersection(set(counts.index.values)))) YY['superdiagnostic_len'] = YY.superdiagnostic.apply(lambda x: len(x)) X = XX[YY.superdiagnostic_len > 0] Y = YY[YY.superdiagnostic_len > 0] mlb.fit(Y.superdiagnostic.values) y = mlb.transform(Y.superdiagnostic.values) elif ctype == 'form': # filter counts = pd.Series(np.concatenate(YY.form.values)).value_counts() counts = counts[counts > min_samples] YY.form = YY.form.apply(lambda x: list(set(x).intersection(set(counts.index.values)))) YY['form_len'] = YY.form.apply(lambda x: len(x)) # select X = XX[YY.form_len > 0] Y = YY[YY.form_len > 0] mlb.fit(Y.form.values) y = mlb.transform(Y.form.values) elif ctype == 'rhythm': # filter counts = pd.Series(np.concatenate(YY.rhythm.values)).value_counts() counts = counts[counts > min_samples] YY.rhythm = YY.rhythm.apply(lambda x: list(set(x).intersection(set(counts.index.values)))) YY['rhythm_len'] = YY.rhythm.apply(lambda x: len(x)) # select X = XX[YY.rhythm_len > 0] Y = YY[YY.rhythm_len > 0] mlb.fit(Y.rhythm.values) y = mlb.transform(Y.rhythm.values) elif ctype == 'all': # filter counts = pd.Series(np.concatenate(YY.all_scp.values)).value_counts() counts = counts[counts > min_samples] YY.all_scp = YY.all_scp.apply(lambda x: list(set(x).intersection(set(counts.index.values)))) YY['all_scp_len'] = YY.all_scp.apply(lambda x: len(x)) # select X = XX[YY.all_scp_len > 0] Y = YY[YY.all_scp_len > 0] mlb.fit(Y.all_scp.values) y = mlb.transform(Y.all_scp.values) else: pass # save LabelBinarizer with open(outputfolder+'mlb.pkl', 'wb') as tokenizer: pickle.dump(mlb, tokenizer) return X, Y, y, mlb
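# select_data above pickles the fitted binarizer to outputfolder + 'mlb.pkl',
# so a later evaluation step can map indicator columns back to label names.
# A minimal reload sketch, assuming the same outputfolder path and a
# hypothetical thresholded prediction matrix y_pred_bin:
import pickle

with open(outputfolder + 'mlb.pkl', 'rb') as f:
    mlb_loaded = pickle.load(f)
print(mlb_loaded.classes_)  # label vocabulary in column order
# label_sets = mlb_loaded.inverse_transform(y_pred_bin)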
class HumanDataset(Dataset): def __init__(self, images_df, base_path, target_shape=(512, 512), augument=True, use_yellow=False, mode="train"): if not isinstance(base_path, pathlib.Path): base_path = pathlib.Path(base_path) self.images_df = images_df.copy() self.augument = augument self.images_df.Id = self.images_df.Id.apply(lambda x: base_path / x) self.mlb = MultiLabelBinarizer(classes=np.arange(0, cfg.num_classes)) self.mlb.fit(np.arange(0, cfg.num_classes)) self.mode = mode self.target_shape = target_shape self.use_yellow = use_yellow def __len__(self): return len(self.images_df) def __getitem__(self, index): X = self.read_images(index) if not self.mode == "test": labels = np.array( list(map(int, self.images_df.iloc[index].Target.split(' ')))) y = np.eye(cfg.num_classes, dtype=np.float)[labels].sum(axis=0) else: y = str(self.images_df.iloc[index].Id.absolute()) if self.augument: X = self.augumentor(X) X = T.Compose([T.ToPILImage(), T.ToTensor()])(X) return X.float(), y def read_images(self, index): row = self.images_df.iloc[index] filename = str(row.Id.absolute()) if 'ENSG' in filename: filename = os.path.split(filename)[-1] filename = os.path.join(cfg.extra_data, filename) images = np.array(Image.open(filename + ".png")) else: r = np.array(Image.open(filename + "_red.png")) g = np.array(Image.open(filename + "_green.png")) b = np.array(Image.open(filename + "_blue.png")) images = [r, g, b] if self.use_yellow: y = np.array(Image.open(filename + "_yellow.png")) images.append(y) images = np.stack(images, axis=-1) images = images.astype(np.uint8) if self.target_shape == (512, 512) and images.shape[:2] == (512, 512): return images else: return cv2.resize(images, self.target_shape) def augumentor(self, image): sometimes = lambda aug: iaa.Sometimes(0.8, aug) augment_img = iaa.Sequential([ iaa.Fliplr(0.5), iaa.Flipud(0.5), iaa.BilateralBlur(), iaa.Affine(rotate=90), iaa.ContrastNormalization((0.8, 1.3)), sometimes( iaa.Affine(scale={ "x": (0.8, 1.2), "y": (0.8, 1.2) }, translate_percent={ "x": (-0.1, 0.1), "y": (-0.1, 0.1) }, rotate=(-30, 30), shear=(-5, 5))) ], random_order=True) image_aug = augment_img.augment_image(image) return image_aug
keywords_test = keywords.pop(-1)

# We want to convert the labels into vectors. MultiLabelBinarizer sorts the
# classes alphabetically, so for example, if we have:
# keywords = [
#     ['solar', 'physics', 'astronomy'],
#     ['physics', 'lasers'],
#     ['astronomy']
# ]
# the classes become ['astronomy', 'lasers', 'physics', 'solar'] and this
# would become:
# keywords_binarised = [
#     [1, 0, 1, 1],
#     [0, 1, 1, 0],
#     [1, 0, 0, 0]
# ]
mlb = MultiLabelBinarizer()
mlb.fit(keywords)
keywords_vector = mlb.transform(keywords)

# We generate a transform from words -> vector space. This is very similar
# to the above conversion of the keywords. In this scenario, the entire
# corpus from our training set is converted into an id -> word sparse-
# matrix. Note that CountVectorizer.fit expects an iterable of documents,
# not a single joined string.
bow_transform = CountVectorizer(analyzer=text_to_vector).fit(text)

# We transform our corpus into the unique vector space
bow_vector = bow_transform.transform(text)

# We convert the vector into a term frequency - inverse document frequency
# Term frequency: f_t (number of times term t occurs in a document)
# Inverse document frequency: log(N/n_t) (number of documents divided by
# the number of documents that contain term t)
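# A quick self-contained check of the binarisation described above:
from sklearn.preprocessing import MultiLabelBinarizer

demo = MultiLabelBinarizer()
demo_y = demo.fit_transform([['solar', 'physics', 'astronomy'],
                             ['physics', 'lasers'],
                             ['astronomy']])
print(demo.classes_)  # ['astronomy' 'lasers' 'physics' 'solar']
print(demo_y)         # [[1 0 1 1] [0 1 1 0] [1 0 0 0]]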
# Save and close session
logger.info("Committing changes to database.")
try:
    db_session.commit()
    db_session.close()
except:
    db_session.rollback()
    raise

# Get the features into X, and multilabel y indicator format
# -------------------------------------------------------------------- #
logger.info("Preparing training and input interactions.")
X_train, y_train = interactions_to_Xy_format(training.all(), selection)
X_test, _ = interactions_to_Xy_format(testing, selection)
mlb = MultiLabelBinarizer(classes=sorted(labels))
mlb.fit(y_train)
y_train = mlb.transform(y_train)

logger.info("Computing class distributions.")
counter = {l: int(c) for l, c in zip(mlb.classes_, y_train.sum(axis=0))}
counter["n_samples"] = int(y_train.shape[0])
json.dump(counter,
          fp=open("{}/training_distribution.json".format(direc), 'w'),
          indent=4, sort_keys=True)

logger.info("Computing usable feature proportions in testing samples.")

def separate_features(row):
    features = row[0].upper().split(',')
    interpro = set(term for term in features if 'IPR' in term)
def run_classifierAccuracy(trainSentences, trainLabels, testSentences, testLabels): all_labels = ['tsunami', 'heat_wave', 'cold_wave', 'forest_fire', 'limnic_erruptions', \ 'storm', 'avalanches', 'blizzard', 'earthquake', 'floods', 'hurricane', \ 'drought', 'volcano', 'fire', 'cyclone', 'hail_storms', 'land_slide', \ 'intensity', 'epicentre', 'temperature', 'depth', 'speed', 'magnitude', \ 'terrorist_attack', 'suicide_attack', 'normal_bombing', 'shoot_out', \ 'aviation_hazard', 'train_collision', 'industrial_accident', \ 'vehicular_collision', 'surgical_strikes', 'transport_hazards', 'riots', \ 'epidemic', 'famine', 'time', 'place', 'type', 'reason', 'after_effects', \ 'casualties', 'name', 'participant'] disaster_labels = ['tsunami', 'heat_wave', 'cold_wave', 'forest_fire', 'limnic_erruptions', \ 'storm', 'avalanches', 'blizzard', 'earthquake', 'floods', 'hurricane', \ 'drought', 'volcano', 'fire', 'cyclone', 'hail_storms', 'land_slide', \ 'intensity', 'epicentre', 'temperature', 'depth', 'speed', 'magnitude', \ 'time', 'place', 'type', 'reason', 'after_effects', \ 'casualties', 'name', 'participant'] health_labels = ['epidemic', 'famine', 'time', 'place', 'type', 'reason', 'after_effects', \ 'casualties', 'name', 'participant'] conflict_labels = ['terrorist_attack', 'suicide_attack', 'normal_bombing', 'shoot_out', \ 'aviation_hazard', 'train_collision', 'industrial_accident', \ 'vehicular_collision', 'surgical_strikes', 'transport_hazards', 'riots', \ 'time', 'place', 'type', 'reason', 'after_effects', \ 'casualties', 'name', 'participant'] import numpy as np curr_labels = set(all_labels) trainLabels = [list(set(l).intersection(curr_labels)) for l in trainLabels] curr_labels = [] for l in trainLabels: curr_labels.extend(l) curr_labels = set(curr_labels) testLabels = [list(set(l).intersection(curr_labels))for l in testLabels] from sklearn.preprocessing import MultiLabelBinarizer mlb = MultiLabelBinarizer(classes=list(curr_labels)) train_label_matrix = mlb.fit(trainLabels) print("Labels : ", mlb.classes_) train_label_matrix = mlb.transform(trainLabels) test_label_matrix = mlb.transform(testLabels) print("Shape of label matrix : ", test_label_matrix.shape) train_matrix, tfidf = tf_idf_fit_transform(trainSentences) test_matrix = tfidf.transform(testSentences) print("Shape of sentence matrix : ", test_matrix.shape) from sklearn.multiclass import OneVsRestClassifier from sklearn.svm import LinearSVC from sklearn.ensemble import RandomForestClassifier estimator = LinearSVC() # estimator = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0, n_jobs = -1) classifier = OneVsRestClassifier(estimator, n_jobs=-1) classifier.fit(train_matrix, train_label_matrix) predictions = classifier.predict(test_matrix) from sklearn.metrics import f1_score, precision_score, recall_score print("All-Precision", precision_score(test_label_matrix, predictions, average=None)) print("All-Recall", recall_score(test_label_matrix, predictions, average=None)) print("All-F1", f1_score(test_label_matrix, predictions, average=None)) print("Micro-Precision", precision_score(test_label_matrix, predictions, average='micro')) print("Micro-Recall", recall_score(test_label_matrix, predictions, average='micro')) print("Micro-F1", f1_score(test_label_matrix, predictions, average='micro')) print("Macro-Precision", precision_score(test_label_matrix, predictions, average='macro')) print("Macro-Recall", recall_score(test_label_matrix, predictions, average='macro')) print("Macro-F1", 
f1_score(test_label_matrix, predictions, average='macro'))
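# For intuition on the micro vs macro averages printed above: micro-F1 pools
# true/false positives over all labels, while macro-F1 averages per-label F1
# scores with equal weight. A minimal sketch on a tiny indicator matrix:
import numpy as np
from sklearn.metrics import f1_score

y_true = np.array([[1, 0], [1, 1], [0, 1]])
y_pred = np.array([[1, 0], [1, 0], [0, 0]])
# label 0: precision 1.0, recall 1.0 -> F1 1.0; label 1: nothing predicted -> F1 0.0
print(f1_score(y_true, y_pred, average='macro'))  # (1.0 + 0.0) / 2 = 0.5
print(f1_score(y_true, y_pred, average='micro'))  # 2*TP/(2*TP+FP+FN) = 4/6 = 0.667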
#freq_words(train['clean_plot'], 50) stop_words = set(stopwords.words('english')) def remove_stopwords(text): no_stopword_text = [w for w in text.split() if not w in stop_words] return ' '.join(no_stopword_text) train['clean_plot'] = train['clean_plot'].apply(lambda x: remove_stopwords(x)) #freq_words(train['clean_plot'], 50) multilabel_binarizer = MultiLabelBinarizer() multilabel_binarizer.fit(train['Genre']) Y = multilabel_binarizer.transform(train['Genre']) X_train, X_test, y_train, y_test = train_test_split( train['clean_plot'], Y, test_size=0.2, shuffle=True, random_state=np.random.randint(1000)) #X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.33) # 0.25 x 0.8 = 0.2 X_train.shape y_train.shape #X_val.shape #y_val.shape
class NewsGroupDataset(torch.utils.data.Dataset):
    '''maxlength = size of padding'''

    def __init__(self, subset='all', maxlength=30):
        super(NewsGroupDataset, self).__init__()  # was missing the __init__() call
        self.maxlength = maxlength
        self.subset = subset
        self.file_name = "20_newsgroup_normalized_" + subset
        self.samples, self.labels = self.load_data()
        self.samples_vectors = self.load_samples_vectors()  # defined outside this excerpt

    def load_data(self):
        messages = fetch_20newsgroups(subset=self.subset,
                                      remove=('headers', 'footers', 'quotes'),
                                      shuffle=True, random_state=42)
        # Preprocess with spacy only if no cached file exists.
        if not os.path.isfile(self.file_name):
            print('Applying spacy.')
            import en_core_web_sm
            nlp = en_core_web_sm.load()

            def normalize(message):
                doc = nlp(message)
                normalized = filter(lambda t: t.is_alpha and not t.is_stop, doc)
                normalized = map(lambda t: t.text, normalized)
                return list(normalized)

            messages_normalized = list(map(normalize, messages.data))
            with open(self.file_name, 'wb') as f:
                pickle.dump(messages_normalized, f)
        with open(self.file_name, 'rb') as f:
            messages_normalized = pickle.load(f)
        samples = messages_normalized
        labels = messages.target
        self.classes = sorted(set(labels))  # otherwise it is inconsistent which class is 0 and which is 1
        self.oh_classes = MultiLabelBinarizer()
        self.oh_classes.fit([[label] for label in self.classes])
        self.classes = {l: i for i, l in enumerate(self.oh_classes.classes_)}
        return samples, labels

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        """Returns (item as matrix of token vectors, label, one-hot label, index)
        instead of the usual (item, label). See the constructor for more info."""
        msg = self.samples[index]
        label = self.classes[self.labels[index]]  # map the raw label to an integer class id
        oh_label = self.oh_classes.transform([[self.labels[index]]])[0]
        # mat = np.zeros((self.maxlength, self.ftmodel.numpy_normalized_vectors.shape[1]))
        mat = np.zeros((self.maxlength, 300))  # hard-coded for the wikipedia embeddings (dim=300)
        for i, token in enumerate(msg):
            if i >= self.maxlength:
                break
            if not emb.containsWord(token):
                continue
            mat[i, :] = emb.getVector(token)
        return (torch.tensor(mat), label, oh_label, index)

    def get_sample(self, index):
        return self.samples[index]
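# Minimal usage sketch (assumes the global `emb` embedding helper and the
# load_samples_vectors() method that this excerpt relies on are available):
from torch.utils.data import DataLoader

train_set = NewsGroupDataset(subset='train', maxlength=30)
loader = DataLoader(train_set, batch_size=32, shuffle=True)
for mat, label, oh_label, index in loader:
    print(mat.shape)       # torch.Size([32, 30, 300])
    print(oh_label.shape)  # torch.Size([32, 20]), one column per newsgroup
    break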
    # val_ids=val_ids.merge(labels,on='photo_id',how='inner')
    mlb = LabelEncoder()  # despite the name, this encoder maps business ids, not labels
    mlb.fit(train_ids['business_id'].tolist())
    # X_train=np.array([imread('train_photos/train244/'+str(f_)+".jpg") for f_ in train_ids['photo_id'].tolist()]).astype(np.float32)
    # X_test=np.array([imread('train_photos/val244/'+str(f_)+".jpg") for f_ in val_ids['photo_id'].tolist()]).astype(np.float32)
    return train_ids, mlb

def load_train(train_list):
    return np.array([imread('train_photos/train244/' + str(f_) + ".jpg")
                     for f_ in train_list]).astype(np.float32) / 255.0

train_ids, mlb = load_data()
labels = pd.read_csv("train.csv")
labels = labels[pd.isnull(labels['labels']) == False].reset_index(drop=True)
labels['assignment'] = np.random.uniform(size=(labels.shape[0], 1))
MLB = MultiLabelBinarizer()
train_ids = train_ids.merge(labels[['business_id', 'assignment']], on='business_id', how='left')
MLB.fit(train_ids['labels'].tolist())
labels['labels'] = labels['labels'].map(lambda x: [int(i) for i in x.split(" ")])
BETA = MLB.transform(labels.sort_values('business_id')['labels'])  # DataFrame.sort() was removed from pandas
val_ids = train_ids[train_ids['assignment'] >= .9].reset_index(drop=True)
val_Y = MLB.transform(val_ids['labels'])
train_ids = train_ids[train_ids['assignment'] < .9].reset_index(drop=True)
Y_test = mlb.transform(val_ids['business_id'].tolist())
print Y_test.shape
np.random.seed(42)
# train_ids=train_ids.sort_values('business_id').reset_index(drop=True)
train_ids = train_ids.reindex(np.random.permutation(train_ids.index))  # reindex is not in-place
val_ids = val_ids.reindex(np.random.permutation(val_ids.index))
validate = np.array([imread('train_photos/train244/' + str(f_) + ".jpg")
                     for f_ in val_ids['photo_id'].tolist()[0:10000]]).astype(np.float32) / 255.0
datagen = ImageDataGenerator(
    featurewise_center=True,
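# Hedged aside (not in the original): Yelp labels live at the business level,
# so per-photo predictions are usually mean-pooled per business before
# thresholding. A minimal pandas sketch:
import pandas as pd

def pool_photo_predictions(business_ids, photo_probs, threshold=0.5):
    """Mean-pool per-photo probabilities per business, then threshold."""
    df = pd.DataFrame(photo_probs)
    df['business_id'] = business_ids
    pooled = df.groupby('business_id').mean()
    return pooled.index.values, (pooled.values > threshold).astype(int)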
class Evaluator(object):
    def __init__(self, config, model):
        self.config = config
        if config.model_name.endswith("flat"):
            self.n_classes = config.fn_classes
        else:
            self.n_classes = config.hn_classes
        self.model = model
        self.loss = model.loss
        self.logits = model.logits
        self.mlb = MultiLabelBinarizer()
        self.summaries = []  # get_metric appends here; it was never initialized before
        if not self.config.model_name.endswith("flat"):
            self.preds = model.preds
            self.scores = model.scores
            # Non-flat (hierarchical) runs use the larger label-id sets.
            if config.data_from == "reuters":
                self.mlb.fit([list(range(21))])
            elif config.data_from == "20newsgroup":
                self.mlb.fit([list(range(29))])
            elif config.data_from == "ice":
                hcl_ids = list(range(self.config.EOS + 1))
                self.mlb.fit([hcl_ids])
        else:
            if config.data_from == "reuters":
                self.mlb.fit([list(range(18))])  # for transforming preds
            elif config.data_from == "20newsgroup":
                self.mlb.fit([list(range(20))])
            elif config.data_from == "ice":
                hcl_ids = list(range(self.config.EOS - 2))
                self.mlb.fit([hcl_ids])

    def get_metric(self, preds, labels, average=None, about="all", data_type="dev"):
        precisions, recalls, fscores, _ = precision_recall_fscore_support(
            labels, preds, average=average)
        if about == "all":
            print('%s average precision recall f1-score: %f %f %f'
                  % (average, precisions, recalls, fscores))
            f1_summary = tf.Summary(value=[
                tf.Summary.Value(tag='{}:{}:{}/f1'.format(data_type, about, average),
                                 simple_value=fscores)])
            self.summaries.append(f1_summary)

    def get_evaluation(self, sess, batch):
        batch_idx, batch_ds = batch
        feed_dict = self.model.get_feed_dict(batch, False)
        if self.config.model_name.endswith("flat"):
            test_size = batch_ds.get_data_size()
            logits, loss = sess.run([self.model.prob, self.loss], feed_dict=feed_dict)
            # Start from every class id per example, then keep only the ids
            # whose probability clears the threshold.
            preds = np.array([[i for i in range(self.n_classes)]
                              for _ in range(test_size)])
            preds = prediction_with_threshold(self.config, preds, logits,
                                              threshold=self.config.thred)
            print("preds:", preds[0:2])
            preds = self.mlb.transform(preds)
            if self.config.data_from == "20newsgroup":
                labels = batch_ds.data["y_f"]
            elif self.config.data_from == "reuters":
                labels = batch_ds.data["y_seqs"]
            elif self.config.data_from == "ice":
                labels = batch_ds.data["y_f"]
            if self.config.data_from != "ice":
                labels = self.mlb.transform(labels)
            return preds, labels
        else:
            preds, scores = sess.run([self.preds, self.scores], feed_dict=feed_dict)
            # Why isn't the test output deterministic? Because of keep_prob.
            preds = prediction_with_threshold(self.config, preds, scores,
                                              threshold=self.config.thred)
            preds_log = preds
            preds = self.mlb.transform(preds)
            if self.config.data_from == "ice":
                labels = batch_ds.data["y_h"]
                labels_log = labels
                labels = self.mlb.transform(labels)
            elif self.config.data_from == "20newsgroup":
                labels = batch_ds.data["y_h"]
                labels_log = labels
            else:
                labels = batch_ds.data["y_seqs"]
                labels_log = labels
            print("check eval:", "\n", preds_log[0:3], "\n", labels_log[0:3])
            return preds, labels

    def get_evaluation_from_batches(self, sess, batches):
        config = self.config
        elist = [self.get_evaluation(sess, batch) for batch in batches]
        preds = np.concatenate([elem[0] for elem in elist], axis=0)
        labels = np.concatenate([elem[1] for elem in elist], axis=0)
        return Evaluation(config, preds, labels)
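# prediction_with_threshold is called above but not shown in this excerpt. A
# plausible minimal sketch (an assumption, not the original implementation):
# keep every class whose score clears the threshold, falling back to the
# argmax so no example ends up with an empty label set.
import numpy as np

def prediction_with_threshold(config, preds, scores, threshold=0.5):
    out = []
    for class_ids, row_scores in zip(preds, scores):
        kept = [c for c, s in zip(class_ids, row_scores) if s >= threshold]
        if not kept:
            kept = [class_ids[int(np.argmax(row_scores))]]
        out.append(kept)
    return out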
chunks = []
for chunk in reader:
    chunk.dropna(inplace=True)
    chunks.append(chunk)
test = pd.concat(chunks)
del chunks

# Split the tags by spaces
train_labels = train['Tags'].map(lambda x: x.split())
test_labels = test['Tags'].map(lambda x: x.split())

# The label binarizer turns the tag lists into a big indicator matrix
# (pass sparse_output=True to MultiLabelBinarizer if memory is a concern)
mlb = MultiLabelBinarizer()
mlb.fit(pd.concat([train_labels, test_labels]))
labels = mlb.transform(train_labels)

# Turn the tokens into a sparse matrix
vect = CountVectorizer(
    # Get text from html
    preprocessor=preprocess,
    # Turn the text into tokens
    tokenizer=tokenize,
    # Generate ngrams
    ngram_range=(1, 2),
    # Remove extremely common tokens
    max_df=0.5,
    # Remove extremely uncommon tokens
    min_df=0.001
)
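# Hedged continuation (assumes the question text lives in a 'Body' column;
# adjust to the real column name): vectorize, fit a one-vs-rest linear SVM on
# the binarized tags, then decode predictions back to tag strings.
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

X = vect.fit_transform(train['Body'])
X_test = vect.transform(test['Body'])

clf = OneVsRestClassifier(LinearSVC())
clf.fit(X, labels)
predicted_tags = mlb.inverse_transform(clf.predict(X_test))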
# Split the comma-separated components column into distinct values via a
# SQL "numbers table" (the CROSS JOIN generates the indices 1..100).
query = '''
SELECT DISTINCT SUBSTRING_INDEX(SUBSTRING_INDEX(t.components, ',', n.n), ',', -1) value
FROM porru_dataset.''' + project + ''' t
CROSS JOIN (
    SELECT a.N + b.N * 10 + 1 n
    FROM (SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4
          UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9) a,
         (SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4
          UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9) b
    ORDER BY n
) n
WHERE n.n <= 1 + (LENGTH(t.components) - LENGTH(REPLACE(t.components, ',', '')))
ORDER BY value'''
cursor.execute(query)
componentList = numpy.array(cursor.fetchall())
mlb = MultiLabelBinarizer()
mlb.fit(componentList)
print 'No. of labels: ' + str(len(mlb.classes_))
componentLabel = [None] * len(data)
for i in range(len(data)):
    componentLabel[i] = data[i, 5].split(',')
componentBinary = mlb.transform(componentLabel)

# train MultiLabelBinarizer for Type
query = 'SELECT DISTINCT type FROM porru_dataset.' + project
cursor.execute(query)
typeList = numpy.array(cursor.fetchall())
mlb_t = MultiLabelBinarizer()
mlb_t.fit(typeList)
print 'No. of labels: ' + str(len(mlb_t.classes_))
typeLabel = [None] * len(data)
for i in range(len(data)):
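# The numbers-table trick above exists because SQL lacks a split function; a
# hedged plain-Python equivalent, assuming an iterable of comma-separated
# `components` strings, is simply:
def distinct_components(rows):
    values = set()
    for row in rows:
        values.update(part.strip() for part in row.split(','))
    return sorted(values)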
    a_ = [loadbusimage(im_) for im_ in x]
    return np.array(a_)

labels = pd.read_csv("train.csv")
labels = labels[pd.isnull(labels['labels']) == False]
bismatch = pd.read_csv("train_photo_to_biz_ids.csv")
photo_labels = bismatch.merge(labels, how='left', on='business_id')
photo_labels = photo_labels[pd.isnull(photo_labels['labels']) == False]
photo_labels['labels'] = photo_labels['labels'].map(lambda x: [int(i) for i in x.split(" ")])
np.random.seed(42)
labels['assignment'] = np.random.randint(0, 10, size=(labels.shape[0], 1))
photo_labels = photo_labels.merge(labels[['business_id', 'assignment']], on='business_id')
train = photo_labels[photo_labels['assignment'] <= 7].reset_index(drop=True)
test = photo_labels[photo_labels['assignment'] > 7].reset_index(drop=True)
mlb = MultiLabelBinarizer()
mlb.fit(train['labels'].tolist() + test['labels'].tolist())
#INSERT NORMALIZATION TRAINING HERE
n_images = 10
graph = Graph()
nfilters = 32
for i in xrange(0, n_images):
    graph.add_input(name="input" + str(i), input_shape=(3, size, size))
graph.add_shared_node(Convolution2D(nfilters, 3, 3, border_mode='same', activation='relu'),
                      name='conv1', inputs=["input" + str(i) for i in xrange(0, n_images)])
graph.add_shared_node(BatchNormalization(), name='batch1', inputs=['conv1'])
graph.add_shared_node(Convolution2D(nfilters, 3, 3, activation=LeakyReLU()), name='conv2', inputs=['batch1'])
graph.add_shared_node(BatchNormalization(), name='batch2', inputs=['conv2'])
graph.add_shared_node(Convolution2D(nfilters, 3, 3, activation=LeakyReLU()), name='conv3', inputs=['batch2'])
graph.add_shared_node(BatchNormalization(), name='batch3', inputs=['conv3'])
graph.add_shared_node(Convolution2D(nfilters, 3, 3, activation=LeakyReLU()), name='conv4', inputs=['batch3'])
graph.add_shared_node(BatchNormalization(), name='batch4', inputs=['conv4'])
graph.add_shared_node(Convolution2D(nfilters, 3, 3, activation=LeakyReLU()), name='conv5', inputs=['batch4'])
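# The Graph/add_shared_node API above is from a pre-1.0 Keras and no longer
# exists; a hedged sketch of the same weight-sharing idea in the modern
# functional API (reusing a layer object across inputs shares its weights):
from keras.layers import Input, Conv2D, BatchNormalization

size, n_images, nfilters = 224, 10, 32  # size is an assumption here
inputs = [Input(shape=(size, size, 3)) for _ in range(n_images)]
conv1 = Conv2D(nfilters, (3, 3), padding='same', activation='relu')
bn1 = BatchNormalization()
branches = [bn1(conv1(x)) for x in inputs]  # the same conv1/bn1 weights see every image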
import tokenization
from extract_features import InputExample, convert_examples_to_features
import numpy as np
import requests, os, json, time
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer
from config import Config

config = Config()

"""1: Prepare the multi-label tool that maps probabilities back to text labels."""
all_labels = open(config.class_path, encoding="utf-8").readlines()
all_labels = [label.strip() for label in all_labels]
mlb = MultiLabelBinarizer()
mlb.fit([[label] for label in all_labels])

"""2: Initialize the tokenizer for the text-to-id conversion."""
vocab_file = os.environ.get('vocab_file', './pretrained_model/roberta_zh_l12/vocab.txt')
max_token_len = int(os.environ.get('max_token_len', 400))  # environment values arrive as strings
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)

tf.app.flags.DEFINE_string('server', '0.0.0.0:8500', 'PredictionService host:port')
FLAGS = tf.app.flags.FLAGS

"""3: Convert samples into the model's expected input format."""
def preprocess(text):
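# Hedged sketch of the decoding step this service needs downstream: threshold
# the per-class sigmoid probabilities, then let the binarizer map indicator
# rows back to label strings. (probs shape: [batch, n_labels]; the 0.5
# threshold is an assumption.)
def probs_to_labels(probs, threshold=0.5):
    indicator = (np.asarray(probs) >= threshold).astype(int)
    return mlb.inverse_transform(indicator)  # e.g. [('sports', 'finance'), ()]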
class PCXRayDataset(Dataset):

    def __init__(self, datadir, csvpath, splitpath=None, transform=None, views=["PA", "L"],
                 dataset='train', pretrained=False, min_patients_per_label=50,
                 counter_examples=False, duplicate=False):
        """
        Data reader. Only selects labels that at least min_patients_per_label patients have.
        """
        super(PCXRayDataset, self).__init__()

        assert dataset in ['train', 'val', 'test']

        self.datadir = datadir
        self.transform = transform
        self.pretrained = pretrained
        self.threshold = min_patients_per_label
        self.views = views
        self.counter_examples = counter_examples
        self.duplicate = duplicate

        self.df = pd.read_csv(csvpath)
        self.total_samples = len(self.df.PatientID.unique())

        self._build_labels()
        self.mb = MultiLabelBinarizer(classes=self.labels)
        self.mb.fit(self.labels)

        # Split into train, validation or test
        if splitpath is not None:
            with open(splitpath, 'rb') as f:
                train_ids, val_ids, test_ids = pickle.load(f)
            if dataset == 'train':
                self.df = self.df[self.df.PatientID.isin(train_ids)]
            elif dataset == 'val':
                self.df = self.df[self.df.PatientID.isin(val_ids)]
            else:
                self.df = self.df[self.df.PatientID.isin(test_ids)]
            self.df = self.df.reset_index()

        self.nb_classes = len(self.labels)
        if self.duplicate:
            self.to_duplicate = np.random.choice(range(self.total_samples), self.total_samples // 2)
            self.nb_classes = self.nb_classes * 2

    def __len__(self):
        return self.total_samples

    def get_labels(self, idx):
        # Each patient has two rows (a PA and an L view), hence the idx * 2 stride.
        subset = self.df[self.df.PatientID == self.df.PatientID[idx * 2]]
        labels = eval(subset.Clean_Labels.tolist()[0])
        if set(labels).difference(self.labels):
            labels.append('other')
        labels = [l for l in labels if l in self.labels]
        return labels

    def __getitem__(self, idx):
        subset = self.df[self.df.PatientID == self.df.PatientID[idx * 2]]
        labels = self.get_labels(idx)
        encoded_labels = self.mb.transform([labels]).squeeze()
        sample = {}

        if "PA" in self.views:
            pa_path = subset[subset.Projection == 'PA'][['ImageID', 'ImageDir']]
            pa_path = join(self.datadir, pa_path['ImageID'].tolist()[0])
            # pa_path = join(self.datadir, '216840111366964012989926673512011108125227151_00-185-152.png')
            pa_img = np.array(Image.open(pa_path))[..., np.newaxis]
            if self.pretrained:
                pa_img = np.repeat(pa_img, 3, axis=-1)
            sample["PA"] = pa_img

        if "L" in self.views:
            l_path = subset[subset.Projection == 'L'][['ImageID', 'ImageDir']]
            l_path = join(self.datadir, l_path['ImageID'].tolist()[0])
            # l_path = './data/processed/0/46523715740384360192496023767246369337_veyewt.png'
            l_img = np.array(Image.open(l_path))[..., np.newaxis]
            if self.pretrained:
                l_img = np.repeat(l_img, 3, axis=-1)
            sample["L"] = l_img

        if self.transform is not None:
            sample = self.transform(sample)

        sample['labels'] = labels
        sample['encoded_labels'] = torch.from_numpy(encoded_labels.astype(np.float32))
        sample['sample_weight'] = torch.max(sample['encoded_labels'] * self.labels_weights)

        if self.counter_examples:
            if self.duplicate:
                new_encoded_labels = torch.zeros(self.nb_classes).long()
                # put the labels in the lower or the upper copy
                if idx in self.to_duplicate:
                    new_encoded_labels[:self.nb_classes // 2] = sample['encoded_labels']
                else:
                    new_encoded_labels[self.nb_classes // 2:] = sample['encoded_labels']
                sample['encoded_labels'] = new_encoded_labels
            # pick a query label
            topresent = np.random.choice(range(len(sample['encoded_labels'])), p=self.labels_weights_dup)
            sample['cond'] = torch.LongTensor([topresent]).squeeze()
            # is the query label present in this sample?
            sample['cond_target'] = sample['encoded_labels'][topresent].long().squeeze()
            sample['cond_weight'] = self.labels_weights[sample['cond'] // 2]
        # NOTE: the returned tuple uses the cond keys, so this dataset variant
        # expects counter_examples=True.
        return sample["PA"], sample['cond_target'], sample['cond'], sample['cond_weight']

    def _build_labels(self):
        labels_dict = {}
        for labels in self.df.Clean_Labels:
            for label in eval(labels):
                label = label.strip()
                if label not in labels_dict:
                    labels_dict[label] = 0
                labels_dict[label] += 1

        labels = []
        labels_count = []
        other_counts = []
        for k, v in labels_dict.items():
            # threshold * 2 because every patient contributes two rows (PA and L)
            if v > self.threshold * 2:
                labels.append(k)
                labels_count.append(v)
            else:
                other_counts.append(v)
        labels.append('other')
        labels_count.append(sum(other_counts))

        self.labels = labels
        self.labels_count = labels_count
        # Inverse-frequency label weights, normalized to [0, 1] and squared to
        # sharpen the gap between frequent and rare labels.
        self.labels_weights = torch.from_numpy(
            np.array([(len(self) / label) for label in labels_count], dtype=np.float32))
        self.labels_weights = self.labels_weights / self.labels_weights.max()
        self.labels_weights = self.labels_weights ** 2
        self.labels_weights_dup = torch.cat([self.labels_weights] * 2).numpy()
        self.labels_weights_dup /= self.labels_weights_dup.sum()
        # self.labels_weights = torch.clamp(self.labels_weights * 0.1, 1., 5.)
        self.nb_labels = len(self.labels)
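# Toy check (not from the original) of the inverse-frequency weighting used in
# _build_labels: rarer labels end up with larger weights, and squaring widens
# the gap between common and rare labels.
import numpy as np

counts = np.array([1000., 250., 50.])  # hypothetical per-label counts
w = 1500. / counts                     # inverse frequency (1500 = total rows)
w = w / w.max()
print(w ** 2)  # -> [0.0025 0.04 1.], the rarest label dominates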
def createMLB():
    labels_set = get_labels_set()
    mlb = MultiLabelBinarizer()
    mlb.fit(labels_set)
    return mlb
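# Minimal usage sketch (assumes get_labels_set() returns an iterable of label
# collections, e.g. [['a', 'b'], ['b', 'c']]):
mlb = createMLB()
print(mlb.classes_)                 # -> array(['a', 'b', 'c'], dtype=object)
print(mlb.transform([['a', 'c']]))  # -> [[1 0 1]]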