def main(): """Do something with the project code! Have fun :) """ preprocessor = Preprocessor() preprocessor.import_labeled_data("data/train.csv") X_total, t_total = preprocessor.encode_labels(use_new_encoder=True) X_train, X_test, t_train, t_test = preprocessor.train_test_split( X_total, t_total) # transform data and overwrite non-transformed data X_train_scaled = preprocessor.scale_data(X_train, use_new_scaler=True) X_test_scaled = preprocessor.scale_data(X_test, use_new_scaler=False) # apply PCA X_train = preprocessor.apply_pca(X_train_scaled, use_new_pca=True, n_components=27, whiten=False) X_test = preprocessor.apply_pca(X_test_scaled, use_new_pca=False) clf = NaiveBayes(X_train, t_train) clf.optimize_hyperparameters() clf.display_general_validation_results() clf.display_cv_results() print("Test accuracy : {:.03f}".format(clf.get_accuracy(X_test, t_test))) print("Test f1-score : {:.03f}".format(clf.get_f1_score(X_test, t_test))) label_predictions = make_new_predictions("data/test.csv", preprocessor, clf)
def main():
    # Directory where this script is located
    dirname = os.path.dirname(__file__)

    # List of terms to be ignored by the tokenizer
    ignore_terms = []

    # Collect the terms we want to ignore
    for ignore_file_name in IGNORE_TERMS_FILE_NAMES:
        with open(os.path.join(dirname, ignore_file_name)) as file:
            ignore_terms.extend(term.strip() for term in file)

    # Create our custom tokenizer, it receives the terms we want to ignore
    preprocessor = Preprocessor(word_chars='a-zA-Z0-9', inter_chars="'",
                                min_length=3, ignore=ignore_terms)

    for line in sys.stdin:
        bug_report = json.loads(line)
        old_title = bug_report['title']
        old_description = bug_report['description']
        bug_report['title'] = ' '.join(preprocessor.preprocess(old_title))
        bug_report['description'] = ' '.join(
            preprocessor.preprocess(old_description))
        print(json.dumps(bug_report))
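# Hedged usage sketch for the tokenizer above: the Preprocessor keyword
# arguments come from the snippet itself, while the ignore list and the sample
# bug-report title are made up for illustration.
preprocessor = Preprocessor(word_chars='a-zA-Z0-9', inter_chars="'",
                            min_length=3, ignore=["the", "and"])
tokens = preprocessor.preprocess("NullPointerException when saving the report")
print(' '.join(tokens))  # filtered, tokenized title, one token per word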
def predict(name, command):
    command = command.lower()
    label_path = path.join(path.dirname(path.realpath(__file__)),
                           "intents", "config", "labels",
                           "%s_labels.json" % name)
    with open(label_path, encoding="utf8") as f:
        labels = json.load(f)

    word_vocab = Vocabulary()
    word_vocab.load("%s_word_vocab.json" % name)

    # char embedding
    char_vocab = Vocabulary()
    char_vocab.load("%s_char_vocab.json" % name)

    idx2label = dict((idx, label) for idx, label in enumerate(labels))

    preprocessor = Preprocessor(word_vocab, None, char_vocab)
    model = BiLSTMCRF(labels, len(word_vocab), len(char_vocab))
    model.load_weights('intents/config/weights/%s.hdf5' % name)

    sentence = tokenize(command)
    features = preprocessor.transform([sentence])
    p = model.predict(features)

    predicted_labels = []
    for pred in p:
        predicted_labels.append(idx2label[pred])

    for word, label in zip(sentence, predicted_labels):
        print('%s: %s' % (word, label))
def __init__(self):
    '''
    This constructor is supposed to initialize data members.
    Use triple quotes for function documentation.
    '''
    self.num_train_samples = 0
    self.num_feat = 1
    self.num_labels = 1
    self.is_trained = False

    # A few classifiers we tried:
    #self.model = clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr')
    #self.model = clf = GaussianNB()
    #self.model = clf = KNeighborsClassifier()
    #self.model = clf = QuadraticDiscriminantAnalysis()
    #self.model = clf = RandomForestClassifier(n_estimators=80, max_depth=20, max_features='sqrt')
    #self.model = clf = Pipeline([('preprocessing', Preprocessor()), ('classification', MLPClassifier(hidden_layer_sizes=(200, 100, 50, 20), max_iter=1500, solver='adam', learning_rate='invscaling', activation='relu'))])
    #self.model = clf = Pipeline([('SelectKBest', Preprocessor2()), ('PCA', Preprocessor()), ('classification', MLPClassifier(hidden_layer_sizes=(200, 100, 50, 20), max_iter=1500, solver='adam', learning_rate='invscaling', activation='relu'))])

    # Here we put three classifiers in competition with each other.
    fancy_classifier1 = MLPClassifier(hidden_layer_sizes=(200, 100, 50, 20),
                                      max_iter=1500, solver='adam',
                                      learning_rate='invscaling',
                                      activation='relu')
    fancy_classifier2 = Pipeline([('preprocessing', Preprocessor()),
                                  ('classification',
                                   MLPClassifier(hidden_layer_sizes=(200, 100, 50, 20),
                                                 max_iter=1500, solver='adam',
                                                 learning_rate='invscaling',
                                                 activation='relu'))])
    fancy_classifier3 = Pipeline([('SelectKBest', Preprocessor2()),
                                  ('PCA', Preprocessor()),
                                  ('classification',
                                   MLPClassifier(hidden_layer_sizes=(200, 100, 50, 20),
                                                 max_iter=1500, solver='adam',
                                                 learning_rate='invscaling',
                                                 activation='relu'))])
    self.model = clf = VotingClassifier(
        estimators=[('Fancy Classifier1', fancy_classifier1),
                    ('Fancy Classifier2', fancy_classifier2),
                    ('Fancy Classifier3', fancy_classifier3)],
        voting='soft')
class Pipeline(BaseEstimator, TransformerMixin):
    """Feature generation followed by preprocessing, wrapped as one transformer."""

    def __init__(self, numeric, id=None, target=None, categorical=None, verbose=0):
        self.created_features = None
        self.id = id
        self.target = target
        self.categorical = categorical
        self.numeric = numeric
        self.verbose = verbose
        self.feature_generator = None
        self.preprocessor = None

    def fit_transform(self, df, y=None, **fit_params):
        with Timer('pipelines.Pipeline.fit_transform:', self.verbose):
            self.feature_generator = FeatureGenerator(
                id=self.id,
                numeric=self.numeric,
                categorical=self.categorical,
                target=self.target,
                verbose=self.verbose,
            )
            df_features = self.feature_generator.fit_transform(df)
            self.preprocessor = Preprocessor(
                id=self.id,
                numeric=self.numeric,
                categorical=self.categorical,
                target=self.target,
                verbose=self.verbose,
            )
            x = self.preprocessor.fit_transform(df_features)
            return x

    def transform(self, df):
        with Timer('pipelines.Pipeline.transform:', self.verbose):
            if self.feature_generator is None:
                raise NotFittedError(
                    f'feature_generator = {self.feature_generator}')
            if self.preprocessor is None:
                raise NotFittedError(f'preprocessor = {self.preprocessor}')
            df_features = self.feature_generator.transform(df)
            x = self.preprocessor.transform(df_features)
            return x

    def fit(self, x, y=None, **fit_params):
        return self

    def get_feature_names(self):
        return self.created_features
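# Hedged usage sketch for the Pipeline class above. The column names and the
# toy train/test DataFrames are illustrative assumptions; only the
# Pipeline/FeatureGenerator/Preprocessor API comes from the snippet.
import pandas as pd

train_df = pd.DataFrame({"id": [1, 2, 3], "amount": [10.0, 12.5, 7.3],
                         "segment": ["a", "b", "a"], "label": [0, 1, 0]})
test_df = pd.DataFrame({"id": [4], "amount": [9.1], "segment": ["b"]})

pipe = Pipeline(numeric=["amount"], id="id", target="label",
                categorical=["segment"], verbose=1)
X_train = pipe.fit_transform(train_df)   # fits FeatureGenerator + Preprocessor
X_test = pipe.transform(test_df)         # reuses the fitted components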
def test_preprocessing(self):
    """Test full preprocessing pipeline."""
    data_path = '../data'
    prepper = Preprocessor(data_path)
    prepper.execute()
    files = os.listdir(data_path + '/')
    assert 'm5.db' in files
def __init__(self, max_comments_per_subreddit=100000):
    self.getter = CommentGetter()
    self.preprocessor = Preprocessor()
    self.max_comments_per_subreddit = max_comments_per_subreddit

    # if load_from_files:
    #     self.dictionary = Dictionary(
    #         wordToIndexDict=pickler.loadData('wordToIndexDict'),
    #         indexToWordDict=pickler.loadData('indexToWordDict'))
    # else:
    #     self.dictionary = Dictionary()
    self.dictionary = Dictionary()
def preprocess_data(vocab_size, batch_size, num_workers=0,
                    max_input_len=400, max_target_len=100):
    p = Preprocessor(chosen_dataset)
    print('preprocessing started')
    train_set, test_set, validation_set = p.create_data_loaders(
        vocab_size, batch_size, num_workers=num_workers,
        max_input_len=max_input_len, max_target_len=max_target_len)
    print('preprocessing finished')
    return p, train_set, test_set, validation_set
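# Hedged usage sketch for preprocess_data above: chosen_dataset must already be
# defined at module level, as in the snippet. The vocabulary/batch sizes are
# arbitrary, and the assumption that the returned loaders are iterables yielding
# (input, target) batches is mine, not the snippet's.
p, train_set, test_set, validation_set = preprocess_data(
    vocab_size=20000, batch_size=32, num_workers=2)

for inputs, targets in train_set:
    # inspect a single batch, then stop
    print(type(inputs), type(targets))
    break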
def preprocess_signals(data: pd.DataFrame, args: Namespace) -> pd.DataFrame:
    with Cache.configure(args.currency_pair, args.tick_rate):
        signal_strategy = SignalStrategyFactory.get(
            'ma', **signal_strat_argument_parser(args))
        stop_strategy = StoppingStrategyFactory.get(
            args.stopping_strat, **stopping_strat_argument_parser(args))
        preprocessor = Preprocessor(signal_strategy, stop_strategy)
        if args.no_cache:
            return preprocessor.find_signals(data)
        else:
            return preprocessor.get_signals(data)
def test_get_emoji_score(self):
    prep5 = Preprocessor(corpus1, remove_short_tweets=False, verbose_emoji=False)
    self.assertEqual(prep5.get_emoji_score(0)["positive"], 2)
    self.assertEqual(prep5.get_emoji_score(0)["negative"], 0)
    self.assertEqual(prep5.get_emoji_score(1)["positive"], 0)
    self.assertEqual(prep5.get_emoji_score(1)["negative"], 1)
    self.assertEqual(prep5.get_emoji_score(2)["positive"], 0)
    self.assertEqual(prep5.get_emoji_score(2)["negative"], 0)

    prep5 = Preprocessor(corpus1, remove_short_tweets=False, verbose_emoji=True)
    self.assertEqual(prep5.get_emoji_score(0)["positive"], 2)
    self.assertEqual(prep5.get_emoji_score(0)["negative"], 0)
    self.assertEqual(prep5.get_emoji_score(1)["positive"], 0)
    self.assertEqual(prep5.get_emoji_score(1)["negative"], 1)
    self.assertEqual(prep5.get_emoji_score(2)["positive"], 0)
    self.assertEqual(prep5.get_emoji_score(2)["negative"], 0)
def test_keras_feature_extractor_extract_features():
    ext = KerasFeatureExtractor(TEST_NET_ID, ckpt_path=TEST_CKPT_PATH)

    ds = Dataset(name=TEST_NAME, prefix=TEST_PREFIX, batch_size=8)
    ds.initialize(fp=TEST_SOURCES)
    ds.load_images()
    imgs = [e.image for e in ds.elements]

    prepro = Preprocessor()
    imgs = prepro.preprocess_images(imgs)

    result = ext.extract_features(images=imgs)
    assert isinstance(result, np.ndarray)
    assert len(result) == ds.count
def main():
    args = parse()
    corpus = Preprocessor(args.directory).run()

    lengths: Dict[str, int] = {
        "<50": 0,
        "50-99": 0,
        "100-199": 0,
        "200-399": 0,
        "400+": 0
    }
    for blog, _ in corpus:
        word_count = len(blog)
        if word_count < 50:
            lengths["<50"] += 1
        elif word_count <= 99:
            lengths["50-99"] += 1
        elif word_count <= 199:
            lengths["100-199"] += 1
        elif word_count <= 399:
            lengths["200-399"] += 1
        else:
            lengths["400+"] += 1

    # min, max
    lengths["min"] = min(len(blog) for blog, _ in corpus)
    lengths["max"] = max(len(blog) for blog, _ in corpus)

    with open(os.path.join(args.results, "lengths.json"), "w") as file:
        json.dump(lengths, file, indent=4, ensure_ascii=True)
def __init__(self, classifier=RandomForestClassifier(n_estimators=180,
                                                     max_depth=None,
                                                     max_features='auto')):
    print("MODEL CONSTRUCTOR")
    self.classifierUsed = classifier
    self.preprocess = Preprocessor()
    #self.clf = classifier
    PipelineUse = Pipeline([('preprocessing', self.preprocess),
                            ('classification', self.classifierUsed)])
    self.clf = VotingClassifier(estimators=[
        ('Gradient Tree Boosting', ensemble.GradientBoostingClassifier()),
        ('Pipeline', PipelineUse),
        ('RandomForestClassifier', RandomForestClassifier(n_estimators=180,
                                                          max_depth=None,
                                                          max_features='auto'))
    ], voting='soft')
    self.num_train_samples = 0
    self.num_feat = 1
    self.num_labels = 1
    self.is_trained = False
def run(schema_path, name, sample_size, batch_size, epochs):
    dataset = Dataset(schema_path, name)
    labels, data = dataset.get_data()
    X = [x['words'] for x in data]
    y = [x['labels'] for x in data]

    word_vocab = Vocabulary()
    word_vocab.build_vocab([w for command in X for w in command])

    # char embedding
    char_vocab = Vocabulary()
    char_vocab.build_vocab([ch for w in word_vocab for ch in w])

    labels2idx = dict((label, idx) for idx, label in enumerate(labels))
    idx2label = dict((idx, label) for idx, label in enumerate(labels))

    preprocessor = Preprocessor(word_vocab, labels2idx, char_vocab)
    model = BiLSTMCRF(labels, len(word_vocab), len(char_vocab))

    trainer = Trainer(model, X, y, preprocessor.transform, split=[0.75, 0.95])
    trainer.train(batch_size, epochs)
    trainer.evaluate(idx2label)

    model.save_weights(name)
    dataset.save(X[:sample_size], labels)
    word_vocab.save("%s_word_vocab.json" % name)
    char_vocab.save("%s_char_vocab.json" % name)
def evaluate_decision(self, model_data):
    """
    Using the results from the model_decision, prompt the user and decide
    what to do next.
    """
    from preprocessing import Preprocessor
    from feature_selection import FeatureSelector
    from sklearn.pipeline import Pipeline

    preprocess = Preprocessor(self)
    feature_select = FeatureSelector(self)
    model_name, model, model_performance = model_data
    steps = [
        ('preprocess', preprocess),
        ('feature_select', feature_select),
        ('clf', model)
    ]
    # this is our classifier pipeline that transforms data and makes predictions.
    pipeline = Pipeline(steps)

    metric = self.parameters['final_min_performance_metric']
    model_performance = model_performance[metric]
    min_performance = self.parameters['final_min_performance']

    if model_performance > min_performance:
        print('Minimum performance required:', min_performance, metric)
        print('Model performance', model_performance, metric)
        print('The model meets minimum requirements!')
        deploy = input('Type "C" to cancel, or type anything else to save the model: ')
        if deploy.strip().lower() != 'c':
            file_name = input('Enter file name:')
            # save the model so it can be easily loaded next time and used to make predictions.
            self.file_handler.save_model(file_name, pipeline)
def predictFromModel(self):
    try:
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, 'Start of Prediction')
        data_getter = Data_Getter_Prediction(self.file_object, self.log_writer)
        data = data_getter.get_data()

        preprocessor = Preprocessor(self.file_object, self.log_writer)
        is_null_present = preprocessor.is_null_present(data)
        if is_null_present:
            data = preprocessor.impute_missing_values(data)
        cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(data)
        data = preprocessor.remove_columns(data, cols_to_drop)

        file_loader = File_operation(self.file_object, self.log_writer)
        model = file_loader.load_model('my_model')

        X, y = preprocessor.separate_label_feature(data, 'Calories')
        result = list(model.predict(X.values))
        result = pd.Series(result, name='Predictions')
        path = "Prediction_Output_File/Predictions.csv"
        result.to_csv("Prediction_Output_File/Predictions.csv", header=True, mode='a+')
        self.log_writer.log(self.file_object, 'End of Prediction')
    except Exception as ex:
        self.log_writer.log(
            self.file_object,
            'Error occurred while running the prediction!! Error:: %s' % ex)
        raise ex
    return path, result.head().to_json(orient="records")
def trainingModel(self):
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        data_getter = Data_Getter(self.file_object, self.log_writer)
        data = data_getter.get_data()

        preprocessor = Preprocessor(self.file_object, self.log_writer)
        X, Y = preprocessor.separate_label_feature(data, label_column_name='Calories')

        is_null_present = preprocessor.is_null_present(X)
        if is_null_present:
            X = preprocessor.impute_missing_values(X)
        cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(X)
        X = preprocessor.remove_columns(X, cols_to_drop)

        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, test_size=1 / 3, random_state=355)

        model_finder = Model_Finder(self.file_object, self.log_writer)
        best_model_name, best_model = model_finder.get_best_model(
            x_train, y_train, x_test, y_test)

        file_op = File_operation(self.file_object, self.log_writer)
        save_model = file_op.save_model(best_model, best_model_name)

        self.log_writer.log(self.file_object, 'Successful End of Training')
        self.file_object.close()
    except Exception:
        self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
        self.file_object.close()
        raise
def prepare_nn_patterns(lemmatizer: Preprocessor):
    try:
        with open('nn_patterns.pickle', 'rb') as handle:
            nn_patterns = pickle.load(handle)
    except Exception:
        # cache miss (or unreadable cache): rebuild and store the patterns
        nn_patterns = lemmatizer.lemmatize_all_patterns()
        with open('nn_patterns.pickle', 'wb') as handle:
            pickle.dump(nn_patterns, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return nn_patterns
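# Hedged usage sketch for the caching helper above: the snippet does not show
# how the lemmatizing Preprocessor is constructed, so the no-argument
# constructor here is an assumption. The point is that only the first call
# computes the patterns; later calls read nn_patterns.pickle.
lemmatizer = Preprocessor()                   # assumed constructor
patterns = prepare_nn_patterns(lemmatizer)    # first call: computes and caches
patterns = prepare_nn_patterns(lemmatizer)    # later calls: loaded from the pickle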
def __init__(self, preprocessing_chain_after_squaring,
             preprocessing_chain_after_flattening,
             preprocessing_options, training_set, **kwargs):
    self.preprocessor = Preprocessor(preprocessing_chain_after_squaring,
                                     preprocessing_chain_after_flattening,
                                     preprocessing_options, training_set)
    self.kwargs = kwargs
    self.__dict__.update(kwargs)
    if training_set is not None:
        self.fit(*training_set)
def stem(self, text, poetic_preprocessing=False, remove_tek=False, tek_string=None):
    preprocessor = Preprocessor()
    text = preprocessor.compulsory_preprocessing(text)
    if poetic_preprocessing:
        text = preprocessor.poetic_preprocessing(text, remove_tek=remove_tek,
                                                 tek_string=tek_string)
    l = SentenceTokenizer(text)
    if len(l) == 1:
        sentence = l[0]
        return self.stem_word(sentence)
    else:
        a = []
        for sentence in l:
            a.append(self.stem(sentence))
        return a
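# Hedged usage sketch for the stem method above: the owning class name Stemmer
# and the sample sentences are assumptions for illustration. Single-sentence
# input returns one stem_word result; multi-sentence input returns a list.
stemmer = Stemmer()                                        # hypothetical owning class
single = stemmer.stem("a single sentence")                 # -> stem_word(...) result
nested = stemmer.stem("first sentence. second sentence.")  # -> list of results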
def __init__(self):
    '''
    This constructor is supposed to initialize data members.
    Use triple quotes for function documentation.
    '''
    self.num_train_samples = 121499
    self.num_feat = 56       # attributes (features)
    self.num_labels = 1      # classes
    self.is_trained = False
    self.clf = Pipeline([('preprocessor', Preprocessor()),
                         ('class', SGDClassifier())])
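# Hedged usage sketch for the constructor above: the class name Model, the
# random data and the sklearn-style fit/predict calls are illustrative
# assumptions; only the Preprocessor + SGDClassifier pipeline comes from the
# snippet.
import numpy as np

model = Model()                                  # hypothetical owning class
X = np.random.rand(100, model.num_feat)
y = np.random.randint(0, 2, size=100)
model.clf.fit(X, y)                              # Preprocessor, then SGDClassifier
predictions = model.clf.predict(X)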
def test(self):
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    test_results = self.out_image.eval(
        feed_dict={
            self.x: mnist.test.images,
            self.y_: mnist.test.labels,
            self.keep_prob: 1.0
        })

    # Empty array of 'correct' dimensions for concatenation
    combined_images = np.zeros((0, 56))
    for i in range(10):
        test_image = np.array(test_results[i]).reshape((28, 28))
        test_image = self.post_process(test_image)
        actual_image = np.array(mnist.test.images[i]).reshape((28, 28)) * 255
        actual_image = np.rot90(actual_image)

        # Stack output image with actual horizontally, for comparison
        image_column = np.hstack((test_image, actual_image))
        combined_images = np.vstack((combined_images, image_column))

    Preprocessor.displayImage(combined_images)
def preprocess_data_split(split, output_folder):
    # 1. Create dataframes
    dataframes = _get_dataframes(split)

    # 2. Preprocessing pipeline
    processed = Preprocessor().preprocess(dataframes, WORD_FREQ_FILE_PATH)

    # 3. Convert to spacy
    spacy_format = Converter().to_spacy(processed)

    # 4. Save the properly formatted output
    output_file_path = os.path.join(output_folder, '{}_doc'.format(split))
    with open(output_file_path, 'w+') as f:
        json.dump(spacy_format, f)
def __init__(self, train_path, test_path):
    self.train_path = train_path
    self.test_path = test_path
    self.preprocessor = Preprocessor()
    self.trn = pd.DataFrame(columns=Classifier._COLS)        # Read data_frame
    self.tst = pd.DataFrame(columns=Classifier._COLS)        # Read data_frame
    self.trn_gs = pd.DataFrame(columns=Classifier._GS_COLS)  # Known labels
    self.tst_gs = pd.DataFrame(columns=Classifier._GS_COLS)  # Known labels
    self.tok_trn = []
    self.tok_tst = []
    self.feature_extractor = FeatureExtractor()
    self.jaccard = Jaccard()
    self.rfr = RFR()
    self.nn = MLPRegressor(hidden_layer_sizes=(100, 30, 30),
                           validation_fraction=0.3, alpha=0.3,
                           warm_start=False, max_iter=1000,
                           activation='logistic')
def generate_intervals():
    # Script to generate a given number of frequency intervals given
    # a frequent items output from Borgelt
    intervals = 30
    output_folders = []
    for interval, res in enumerate(
            Preprocessor.triple_intervals('../tmp/observed_frequent_items.out',
                                          intervals=intervals)):
        # Triple set of 1/intervals part of the data
        interval_id = 'interval_' + str(intervals) + '_' + str(interval)
        output_folder = cross_validate_disc_version(env.BORGELT_ALGORITHM,
                                                    env.AOL_MERGED_FILE,
                                                    sample_pct=-100000,
                                                    iterations=2,
                                                    restricted_triples=res,
                                                    extra_id=interval_id,
                                                    min_support=-30)
        output_folders.append(output_folder)
    print('output folders: ', output_folders)
def __init__(self, learning_rate=1e-4, dataset_name="offices"):
    self.x = tf.placeholder(tf.float32, shape=[None, 304, 228, 3])
    self.y_ = tf.placeholder(tf.float32, shape=[None, 74, 55])
    self.learning_rate = learning_rate
    self.weights = dict()
    self._create_network()

    print("Creating session and initializing weights... ")
    self.sess = tf.InteractiveSession()
    self._initialize_weights()
    self.saver = tf.train.Saver(self.weights)
    print("Done initializing session.")

    # Overwrite this when we load model with global step value
    # Increased to 0 when training, so first step is first image in batch
    self.step = -1

    # Load dataset into memory prior to training / testing
    print("Loading dataset and batching unit... ")
    self.pp = Preprocessor(dataset_name, greyscale=False)
    print("Done loading dataset.")
def test_search_lanes():
    # img_path = '../test_output_folder/bin_img.jpg'
    img_path = '../test_images/straight_lines1.jpg'
    img = cv2.imread(img_path)

    preprocessor = Preprocessor()
    pimg = preprocessor.preprocess_image(img)
    bimg, _ = color_n_edge_threshold(pimg)

    lanedetector = LaneDetector()
    # search_lanes(img,self.is_first_pass=False,l=l,r=r);
    # plt.subplot(121);
    # plt.title("Original Image")
    # plt.imshow(img[:,:,::-1]);
    #
    # preprocessed_img = preprocess_image(img);
    # plt.subplot(122);

    plt.title("Visualized Image")
    plt.imshow(bimg, cmap='gray')
    l, r, vis_img = lanedetector.search_lanes(bimg, is_first_pass=True)
def test_reduce_repeated_letters_corpus(self):
    prep6 = Preprocessor(["Teeest test"], remove_short_tweets=False,
                         reduce_chars=True)
    corpus = prep6.preprocessed_corpus()[0]
    self.assertEqual(corpus[0], ["test", "test"])

    prep6 = Preprocessor(["Teeest test"], remove_short_tweets=False,
                         reduce_chars=False)
    corpus = prep6.preprocessed_corpus()[0]
    self.assertEqual(corpus[0], ["teeest", "test"])
def _build(self, classifier_model, X, y=None):
    """
    Inner build function that builds a single model.
    """
    model = Pipeline([
        ('preprocessor', Preprocessor()),
        ('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None,
                                       lowercase=False, ngram_range=(1, 2))),
        ('classifier', classifier_model),
    ])

    model.fit(X, y)
    return model
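# Hedged usage sketch for _build above: the owning class name
# TextClassifierBuilder, the LogisticRegression choice and the toy corpus are
# assumptions; the snippet only fixes the Preprocessor -> TfidfVectorizer ->
# classifier pipeline shape. X is passed pre-tokenized here because the
# vectorizer uses tokenizer=identity.
from sklearn.linear_model import LogisticRegression

builder = TextClassifierBuilder()                  # hypothetical owning class
X = [["great", "movie"], ["terrible", "plot"]]     # already-tokenized documents
y = [1, 0]
model = builder._build(LogisticRegression(), X, y)
print(model.predict([["great", "plot"]]))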
def train(self):
    print(self.path)
    if self.path == 'data/bank-additional-full.csv':
        data = pd.read_csv('data/bank-additional-full.csv', sep=';')
        data['y'] = data['y'].map({'yes': 1, 'no': 0})
    else:
        data = pd.read_csv(self.path)

    process = Preprocessor(data)
    data, columns = process._divide_data(data, self.label)
    categorical, numerical = process._classify_data(columns, data)
    # 'scaler' is assumed to be defined at module scope (not shown in this snippet)
    transformed, y = process._preprocess_data(categorical, numerical,
                                              preprocessor=scaler)

    # dealing with our imbalanced data by oversampling the data set
    model = Modelling()
    x_train, x_test, y_train, y_test = model._splitdata(transformed, y,
                                                        size=self.validate)
    # 'sampler' is assumed to be defined at module scope (not shown in this snippet)
    X, y = DealwithSample(x_train, y_train, method=sampler)
    model = model.Prediction(X, x_test, y, y_test, method=self.model)
def main(): """ Invoke this module as a script. """ args = parse_args() _set_up_logging(args) random.seed(a=args.seed) preprocessor = Preprocessor(lowercase=args.lowercase, unknown_label_id=0, ngram_range=args.ngram_range) pooling_classifier = PoolingClassifier(max_epochs=args.max_epochs, validation_interval=args.validation_interval, validation_metric=args.validation_metric, early_stop=args.early_stop, early_stop_patience=args.early_stop_patience, early_stop_tol=args.early_stop_tol, word_embedding_size=args.word_embedding_size, pooling_method=args.pooling_method, average_dropout=args.average_dropout, random_state=args.seed, model_path=args.model_path) _, _, _, X, y = preprocessor.preprocess_file(file_path=args.train_file, with_labels=True, min_seq_length=args.min_seq_length, max_seq_length=args.max_seq_length) pooling_classifier.fit(X, y) logger.info("Preparing test file.") # prepare test file entries into X_test ids, X_texts, _, X, y = preprocessor.preprocess_file(file_path=args.test_file, with_labels=False, fit=False) logger.info("Predicting test file.") # make predictions for entries in test file y_probs = pooling_classifier.predict_proba(X) y_label_ids = pooling_classifier.predict(X) # converting label ids back to string labels y_labels = preprocessor.inverse_transform_labels(y_label_ids) preprocessor.write_file(file_path=args.model_path + ".test.predictions.csv", ids=ids, X_texts=X_texts, y_probs=y_probs, y_labels=y_labels, verbose=False)
    ax2.set_title(filter_name)
    ax2.axis('off')
    ax2.set_adjustable('box-forced')


if __name__ == "__main__":
    # Settings
    box_size = 80
    scale_factor = 0.8
    mask_scale = 0.2
    plot = False
    box_size *= scale_factor

    # Load Preprocessor
    print("Preprocessing")
    p = Preprocessor("../images/slum_image.jpg")
    p.scale_image(scale_factor)
    p.exposure_equalization(method="equal")
    p.convert_color("RGB", "HSV")
    p.save_current_as("structure")
    p.reset()
    p.scale_image(mask_scale)
    p.exposure_equalization(method="equal")
    p.convert_color("RGB", "HSV")
    p.save_current_as("mask")

    # Load images for mask and structure information
    img2 = p.get_version("mask")[:, :, 0]
    img = p.get_version("structure")[:, :, 2]
class Tagger(): def __init__(self, config_path=None, nb_encoding_layers = 1, nb_dense_dims = 30, batch_size = 100, nb_left_tokens = 2, nb_right_tokens = 2, nb_embedding_dims = 150, model_dir = 'new_model', postcorrect = True, include_token = True, include_context = True, include_lemma = True, include_pos = True, include_morph = True, include_dev = True, include_test = True, nb_filters = 100, filter_length = 3, focus_repr = 'recurrent', dropout_level = .1, load = False, nb_epochs = 15, min_token_freq_emb = 5, halve_lr_at = 10, max_token_len = None, min_lem_cnt = 1, ): if load: if model_dir: self.config_path = os.sep.join((model_dir, 'config.txt')) else: raise ValueError('To load a tagger you, must specify model_name!') else: self.config_path = config_path if not config_path and not load: self.nb_encoding_layers = int(nb_encoding_layers) self.nb_dense_dims = int(nb_dense_dims) self.batch_size = int(batch_size) self.nb_left_tokens = int(nb_left_tokens) self.nb_right_tokens = int(nb_right_tokens) self.nb_context_tokens = self.nb_left_tokens + self.nb_right_tokens self.nb_embedding_dims = int(nb_embedding_dims) self.model_dir = model_dir self.postcorrect = bool(postcorrect) self.nb_filters = int(nb_filters) self.filter_length = int(filter_length) self.focus_repr = focus_repr self.dropout_level = float(dropout_level) self.include_token = include_token self.include_context = include_context self.include_lemma = include_lemma self.include_pos = include_pos self.include_morph = include_morph self.include_dev = include_dev self.include_test = include_test self.min_token_freq_emb = min_token_freq_emb self.nb_epochs = int(nb_epochs) self.halve_lr_at = int(halve_lr_at) self.max_token_len = int(max_token_len) self.min_lem_cnt = int(min_lem_cnt) else: param_dict = utils.get_param_dict(self.config_path) print('Using params from config file: ', param_dict) self.nb_encoding_layers = int(param_dict['nb_encoding_layers']) self.nb_epochs = int(param_dict['nb_epochs']) self.nb_dense_dims = int(param_dict['nb_dense_dims']) self.batch_size = int(param_dict['batch_size']) self.nb_left_tokens = int(param_dict['nb_left_tokens']) self.nb_right_tokens = int(param_dict['nb_right_tokens']) self.nb_context_tokens = self.nb_left_tokens + self.nb_right_tokens self.nb_embedding_dims = int(param_dict['nb_embedding_dims']) self.model_dir = param_dict['model_dir'] self.postcorrect = bool(param_dict['postcorrect']) self.nb_filters = int(param_dict['nb_filters']) self.filter_length = int(param_dict['filter_length']) self.focus_repr = param_dict['focus_repr'] self.dropout_level = float(param_dict['dropout_level']) self.include_token = param_dict['include_token'] self.include_context = param_dict['include_context'] self.include_lemma = param_dict['include_lemma'] self.include_pos = param_dict['include_pos'] self.include_morph = param_dict['include_morph'] self.include_dev = param_dict['include_dev'] self.include_test = param_dict['include_test'] self.min_token_freq_emb = int(param_dict['min_token_freq_emb']) self.halve_lr_at = int(param_dict['halve_lr_at']) self.max_token_len = int(param_dict['max_token_len']) self.min_lem_cnt = int(param_dict['min_lem_cnt']) # create a models directory if it isn't there already: if not os.path.isdir(self.model_dir): os.mkdir(model_dir) # initialize: self.setup = False self.curr_nb_epochs = 0 self.train_tokens, self.dev_tokens, self.test_tokens = None, None, None self.train_lemmas, self.dev_lemmas, self.test_lemmas = None, None, None self.train_pos, self.dev_pos, self.test_pos = None, None, None 
self.train_morph, self.dev_morph, self.test_morph = None, None, None if load: self.load() def load(self): print('Re-loading preprocessor...') self.preprocessor = pickle.load(open(os.sep.join((self.model_dir, \ 'preprocessor.p')), 'rb')) print('Re-loading pretrainer...') self.pretrainer = pickle.load(open(os.sep.join((self.model_dir, \ 'pretrainer.p')), 'rb')) print('Re-building model...') self.model = model_from_json(open(os.sep.join((self.model_dir, 'model_architecture.json'))).read()) self.model.load_weights(os.sep.join((self.model_dir, 'model_weights.hdf5'))) loss_dict = {} idx_cnt = 0 if self.include_lemma: loss_dict['lemma_out'] = 'categorical_crossentropy' self.lemma_out_idx = idx_cnt idx_cnt += 1 print('Loading known lemmas...') self.known_lemmas = pickle.load(open(os.sep.join((self.model_dir, \ 'known_lemmas.p')), 'rb')) if self.include_pos: loss_dict['pos_out'] = 'categorical_crossentropy' self.pos_out_idx = idx_cnt idx_cnt += 1 if self.include_morph: self.morph_out_idx = idx_cnt idx_cnt += 1 if self.include_morph == 'label': loss_dict['morph_out'] = 'categorical_crossentropy' elif self.include_morph == 'multilabel': loss_dict['morph_out'] = 'binary_crossentropy' self.model.compile(optimizer='adadelta', loss=loss_dict) def setup_to_train(self, train_data=None, dev_data=None, test_data=None): # create a model directory: if os.path.isdir(self.model_dir): shutil.rmtree(self.model_dir) os.mkdir(self.model_dir) self.train_tokens = train_data['token'] if self.include_test: self.test_tokens = test_data['token'] if self.include_dev: self.dev_tokens = dev_data['token'] idx_cnt = 0 if self.include_lemma: self.lemma_out_idx = idx_cnt idx_cnt += 1 self.train_lemmas = train_data['lemma'] self.known_lemmas = set(self.train_lemmas) if self.include_dev: self.dev_lemmas = dev_data['lemma'] if self.include_test: self.test_lemmas = test_data['lemma'] if self.include_pos: self.pos_out_idx = idx_cnt idx_cnt += 1 self.train_pos = train_data['pos'] if self.include_dev: self.dev_pos = dev_data['pos'] if self.include_test: self.test_pos = test_data['pos'] if self.include_morph: self.morph_out_idx = idx_cnt self.train_morph = train_data['morph'] if self.include_dev: self.dev_morph = dev_data['morph'] if self.include_test: self.test_morph = test_data['morph'] self.preprocessor = Preprocessor().fit(tokens=self.train_tokens, lemmas=self.train_lemmas, pos=self.train_pos, morph=self.train_morph, include_lemma=self.include_lemma, include_morph=self.include_morph, max_token_len=self.max_token_len, focus_repr=self.focus_repr, min_lem_cnt=self.min_lem_cnt, ) self.pretrainer = Pretrainer(nb_left_tokens=self.nb_left_tokens, nb_right_tokens=self.nb_right_tokens, size=self.nb_embedding_dims, minimum_count=self.min_token_freq_emb) self.pretrainer.fit(tokens=self.train_tokens) train_transformed = self.preprocessor.transform(tokens=self.train_tokens, lemmas=self.train_lemmas, pos=self.train_pos, morph=self.train_morph) if self.include_dev: dev_transformed = self.preprocessor.transform(tokens=self.dev_tokens, lemmas=self.dev_lemmas, pos=self.dev_pos, morph=self.dev_morph) if self.include_test: test_transformed = self.preprocessor.transform(tokens=self.test_tokens, lemmas=self.test_lemmas, pos=self.test_pos, morph=self.test_morph) self.train_X_focus = train_transformed['X_focus'] if self.include_dev: self.dev_X_focus = dev_transformed['X_focus'] if self.include_test: self.test_X_focus = test_transformed['X_focus'] if self.include_lemma: self.train_X_lemma = train_transformed['X_lemma'] if self.include_dev: self.dev_X_lemma 
= dev_transformed['X_lemma'] if self.include_test: self.test_X_lemma = test_transformed['X_lemma'] if self.include_pos: self.train_X_pos = train_transformed['X_pos'] if self.include_dev: self.dev_X_pos = dev_transformed['X_pos'] if self.include_test: self.test_X_pos = test_transformed['X_pos'] if self.include_morph: self.train_X_morph = train_transformed['X_morph'] if self.include_dev: self.dev_X_morph = dev_transformed['X_morph'] if self.include_test: self.test_X_morph = test_transformed['X_morph'] self.train_contexts = self.pretrainer.transform(tokens=self.train_tokens) if self.include_dev: self.dev_contexts = self.pretrainer.transform(tokens=self.dev_tokens) if self.include_test: self.test_contexts = self.pretrainer.transform(tokens=self.test_tokens) print('Building model...') nb_tags = None try: nb_tags = len(self.preprocessor.pos_encoder.classes_) except AttributeError: pass nb_morph_cats = None try: nb_morph_cats = self.preprocessor.nb_morph_cats except AttributeError: pass max_token_len, token_char_dict = None, None try: max_token_len = self.preprocessor.max_token_len token_char_dict = self.preprocessor.token_char_dict except AttributeError: pass max_lemma_len, lemma_char_dict = None, None try: max_lemma_len = self.preprocessor.max_lemma_len lemma_char_dict = self.preprocessor.lemma_char_dict except AttributeError: pass nb_lemmas = None try: nb_lemmas = len(self.preprocessor.lemma_encoder.classes_) except AttributeError: pass self.model = build_model(token_len=max_token_len, token_char_vector_dict=token_char_dict, lemma_len=max_lemma_len, nb_tags=nb_tags, nb_morph_cats=nb_morph_cats, lemma_char_vector_dict=lemma_char_dict, nb_encoding_layers=self.nb_encoding_layers, nb_dense_dims=self.nb_dense_dims, nb_embedding_dims=self.nb_embedding_dims, nb_train_tokens=len(self.pretrainer.train_token_vocab), nb_context_tokens=self.nb_context_tokens, pretrained_embeddings=self.pretrainer.pretrained_embeddings, include_token=self.include_token, include_context=self.include_context, include_lemma=self.include_lemma, include_pos=self.include_pos, include_morph=self.include_morph, nb_filters = self.nb_filters, filter_length = self.filter_length, focus_repr = self.focus_repr, dropout_level = self.dropout_level, nb_lemmas = nb_lemmas, ) self.save() self.setup = True def train(self, nb_epochs=None): if nb_epochs: self.nb_epochs = nb_epochs for i in range(self.nb_epochs): scores = self.epoch() return scores def print_stats(self): print('Train stats:') utils.stats(tokens=self.train_tokens, lemmas=self.train_lemmas, known=self.preprocessor.known_tokens) print('Test stats:') utils.stats(tokens=self.test_tokens, lemmas=self.test_lemmas, known=self.preprocessor.known_tokens) def test(self, multilabel_threshold=0.5): if not self.include_test: raise ValueError('Please do not call .test() if no test data is available.') score_dict = {} # get test predictions: test_in = {} if self.include_token: test_in['focus_in'] = self.test_X_focus if self.include_context: test_in['context_in'] = self.test_contexts test_preds = self.model.predict(test_in, batch_size=self.batch_size) if isinstance(test_preds, np.ndarray): test_preds = [test_preds] if self.include_lemma: print('::: Test scores (lemmas) :::') pred_lemmas = self.preprocessor.inverse_transform_lemmas(predictions=test_preds[self.lemma_out_idx]) if self.postcorrect: for i in range(len(pred_lemmas)): if pred_lemmas[i] not in self.known_lemmas: pred_lemmas[i] = min(self.known_lemmas, key=lambda x: editdistance.eval(x, pred_lemmas[i])) score_dict['test_lemma'] = 
evaluation.single_label_accuracies(gold=self.test_lemmas, silver=pred_lemmas, test_tokens=self.test_tokens, known_tokens=self.preprocessor.known_tokens) if self.include_pos: print('::: Test scores (pos) :::') pred_pos = self.preprocessor.inverse_transform_pos(predictions=test_preds[self.pos_out_idx]) score_dict['test_pos'] = evaluation.single_label_accuracies(gold=self.test_pos, silver=pred_pos, test_tokens=self.test_tokens, known_tokens=self.preprocessor.known_tokens) if self.include_morph: print('::: Test scores (morph) :::') pred_morph = self.preprocessor.inverse_transform_morph(predictions=test_preds[self.morph_out_idx], threshold=multilabel_threshold) if self.include_morph == 'label': score_dict['test_morph'] = evaluation.single_label_accuracies(gold=self.test_morph, silver=pred_morph, test_tokens=self.test_tokens, known_tokens=self.preprocessor.known_tokens) elif self.include_morph == 'multilabel': score_dict['test_morph'] = evaluation.multilabel_accuracies(gold=self.test_morph, silver=pred_morph, test_tokens=self.test_tokens, known_tokens=self.preprocessor.known_tokens) return score_dict def save(self): # save architecture: json_string = self.model.to_json() with open(os.sep.join((self.model_dir, 'model_architecture.json')), 'wb') as f: f.write(json_string) # save weights: self.model.save_weights(os.sep.join((self.model_dir, 'model_weights.hdf5')), \ overwrite=True) # save preprocessor: with open(os.sep.join((self.model_dir, 'preprocessor.p')), 'wb') as f: pickle.dump(self.preprocessor, f) # save pretrainer: with open(os.sep.join((self.model_dir, 'pretrainer.p')), 'wb') as f: pickle.dump(self.pretrainer, f) if self.include_lemma: # save known lemmas: with open(os.sep.join((self.model_dir, 'known_lemmas.p')), 'wb') as f: pickle.dump(self.known_lemmas, f) # save config file: if self.config_path: # make sure that we can reproduce parametrization when reloading: if not self.config_path == os.sep.join((self.model_dir, 'config.txt')): shutil.copy(self.config_path, os.sep.join((self.model_dir, 'config.txt'))) else: with open(os.sep.join((self.model_dir, 'config.txt')), 'w') as F: F.write('# Parameter file\n\n[global]\n') F.write('nb_encoding_layers = '+str(self.nb_encoding_layers)+'\n') F.write('nb_dense_dims = '+str(self.nb_dense_dims)+'\n') F.write('batch_size = '+str(self.batch_size)+'\n') F.write('nb_left_tokens = '+str(self.nb_left_tokens)+'\n') F.write('nb_right_tokens = '+str(self.nb_right_tokens)+'\n') F.write('nb_embedding_dims = '+str(self.nb_embedding_dims)+'\n') F.write('model_dir = '+str(self.model_dir)+'\n') F.write('postcorrect = '+str(self.postcorrect)+'\n') F.write('nb_filters = '+str(self.nb_filters)+'\n') F.write('filter_length = '+str(self.filter_length)+'\n') F.write('focus_repr = '+str(self.focus_repr)+'\n') F.write('dropout_level = '+str(self.dropout_level)+'\n') F.write('include_token = '+str(self.include_context)+'\n') F.write('include_context = '+str(self.include_context)+'\n') F.write('include_lemma = '+str(self.include_lemma)+'\n') F.write('include_pos = '+str(self.include_pos)+'\n') F.write('include_morph = '+str(self.include_morph)+'\n') F.write('include_dev = '+str(self.include_dev)+'\n') F.write('include_test = '+str(self.include_test)+'\n') F.write('nb_epochs = '+str(self.nb_epochs)+'\n') F.write('halve_lr_at = '+str(self.halve_lr_at)+'\n') F.write('max_token_len = '+str(self.max_token_len)+'\n') F.write('min_token_freq_emb = '+str(self.min_token_freq_emb)+'\n') F.write('min_lem_cnt = '+str(self.min_lem_cnt)+'\n') # plot current embeddings: if 
self.include_context: layer_dict = dict([(layer.name, layer) for layer in self.model.layers]) weights = layer_dict['context_embedding'].get_weights()[0] X = np.array([weights[self.pretrainer.train_token_vocab.index(w), :] \ for w in self.pretrainer.mfi \ if w in self.pretrainer.train_token_vocab], dtype='float32') # dimension reduction: tsne = TSNE(n_components=2) coor = tsne.fit_transform(X) # unsparsify plt.clf(); sns.set_style('dark') sns.plt.rcParams['axes.linewidth'] = 0.4 fig, ax1 = sns.plt.subplots() labels = self.pretrainer.mfi # first plot slices: x1, x2 = coor[:,0], coor[:,1] ax1.scatter(x1, x2, 100, edgecolors='none', facecolors='none') # clustering on top (add some colouring): clustering = AgglomerativeClustering(linkage='ward', affinity='euclidean', n_clusters=8) clustering.fit(coor) # add names: for x, y, name, cluster_label in zip(x1, x2, labels, clustering.labels_): ax1.text(x, y, name, ha='center', va="center", color=plt.cm.spectral(cluster_label / 10.), fontdict={'family': 'Arial', 'size': 8}) # control aesthetics: ax1.set_xlabel(''); ax1.set_ylabel('') ax1.set_xticklabels([]); ax1.set_xticks([]) ax1.set_yticklabels([]); ax1.set_yticks([]) sns.plt.savefig(os.sep.join((self.model_dir, 'embed_after.pdf')), bbox_inches=0) def epoch(self, autosave=True): if not self.setup: raise ValueError('Not set up yet... Call Tagger.setup_() first.') # update nb of epochs ran so far: self.curr_nb_epochs += 1 print("-> epoch ", self.curr_nb_epochs, "...") if self.curr_nb_epochs and self.halve_lr_at: # update learning rate at specific points: if self.curr_nb_epochs % self.halve_lr_at == 0: old_lr = self.model.optimizer.lr.get_value() new_lr = np.float32(old_lr * 0.5) self.model.optimizer.lr.set_value(new_lr) print('\t- Lowering learning rate > was:', old_lr, ', now:', new_lr) # get inputs and outputs straight: train_in, train_out = {}, {} if self.include_token: train_in['focus_in'] = self.train_X_focus if self.include_context: train_in['context_in'] = self.train_contexts if self.include_lemma: train_out['lemma_out'] = self.train_X_lemma if self.include_pos: train_out['pos_out'] = self.train_X_pos if self.include_morph: train_out['morph_out'] = self.train_X_morph self.model.fit(train_in, train_out, nb_epoch = 1, shuffle = True, batch_size = self.batch_size) # get train preds: train_preds = self.model.predict(train_in, batch_size=self.batch_size) if isinstance(train_preds, np.ndarray): train_preds = [train_preds] if self.include_dev: dev_in = {} if self.include_token: dev_in['focus_in'] = self.dev_X_focus if self.include_context: dev_in['context_in'] = self.dev_contexts dev_preds = self.model.predict(dev_in, batch_size=self.batch_size) if isinstance(dev_preds, np.ndarray): dev_preds = [dev_preds] score_dict = {} if self.include_lemma: print('::: Train scores (lemmas) :::') pred_lemmas = self.preprocessor.inverse_transform_lemmas(predictions=train_preds[self.lemma_out_idx]) score_dict['train_lemma'] = evaluation.single_label_accuracies(gold=self.train_lemmas, silver=pred_lemmas, test_tokens=self.train_tokens, known_tokens=self.preprocessor.known_tokens) if self.include_dev: print('::: Dev scores (lemmas) :::') pred_lemmas = self.preprocessor.inverse_transform_lemmas(predictions=dev_preds[self.lemma_out_idx]) score_dict['dev_lemma'] = evaluation.single_label_accuracies(gold=self.dev_lemmas, silver=pred_lemmas, test_tokens=self.dev_tokens, known_tokens=self.preprocessor.known_tokens) if self.postcorrect: print('::: Dev scores (lemmas) -> postcorrected :::') for i in range(len(pred_lemmas)): if 
pred_lemmas[i] not in self.known_lemmas: pred_lemmas[i] = min(self.known_lemmas, key=lambda x: editdistance.eval(x, pred_lemmas[i])) score_dict['dev_lemma_postcorrect'] = evaluation.single_label_accuracies(gold=self.dev_lemmas, silver=pred_lemmas, test_tokens=self.dev_tokens, known_tokens=self.preprocessor.known_tokens) if self.include_pos: print('::: Train scores (pos) :::') pred_pos = self.preprocessor.inverse_transform_pos(predictions=train_preds[self.pos_out_idx]) score_dict['train_pos'] = evaluation.single_label_accuracies(gold=self.train_pos, silver=pred_pos, test_tokens=self.train_tokens, known_tokens=self.preprocessor.known_tokens) if self.include_dev: print('::: Dev scores (pos) :::') pred_pos = self.preprocessor.inverse_transform_pos(predictions=dev_preds[self.pos_out_idx]) score_dict['dev_pos'] = evaluation.single_label_accuracies(gold=self.dev_pos, silver=pred_pos, test_tokens=self.dev_tokens, known_tokens=self.preprocessor.known_tokens) if self.include_morph: print('::: Train scores (morph) :::') pred_morph = self.preprocessor.inverse_transform_morph(predictions=train_preds[self.morph_out_idx]) if self.include_morph == 'label': score_dict['train_morph'] = evaluation.single_label_accuracies(gold=self.train_morph, silver=pred_morph, test_tokens=self.train_tokens, known_tokens=self.preprocessor.known_tokens) elif self.include_morph == 'multilabel': score_dict['train_morph'] = evaluation.multilabel_accuracies(gold=self.train_morph, silver=pred_morph, test_tokens=self.train_tokens, known_tokens=self.preprocessor.known_tokens) if self.include_dev: print('::: Dev scores (morph) :::') pred_morph = self.preprocessor.inverse_transform_morph(predictions=dev_preds[self.morph_out_idx]) if self.include_morph == 'label': score_dict['dev_morph'] = evaluation.single_label_accuracies(gold=self.train_morph, silver=pred_morph, test_tokens=self.dev_tokens, known_tokens=self.preprocessor.known_tokens) elif self.include_morph == 'multilabel': score_dict['dev_morph'] = evaluation.multilabel_accuracies(gold=self.train_morph, silver=pred_morph, test_tokens=self.dev_tokens, known_tokens=self.preprocessor.known_tokens) if autosave: self.save() return score_dict def annotate(self, tokens): X_focus = self.preprocessor.transform(tokens=tokens)['X_focus'] X_context = self.pretrainer.transform(tokens=tokens) # get predictions: new_in = {} if self.include_token: new_in['focus_in'] = X_focus if self.include_context: new_in['context_in'] = X_context preds = self.model.predict(new_in) if isinstance(preds, np.ndarray): preds = [preds] annotation_dict = {'tokens': tokens} if self.include_lemma: pred_lemmas = self.preprocessor.inverse_transform_lemmas(predictions=preds[self.lemma_out_idx]) annotation_dict['lemmas'] = pred_lemmas if self.postcorrect: for i in range(len(pred_lemmas)): if pred_lemmas[i] not in self.known_lemmas: pred_lemmas[i] = min(self.known_lemmas, key=lambda x: editdistance.eval(x, pred_lemmas[i])) annotation_dict['postcorrect_lemmas'] = pred_lemmas if self.include_pos: pred_pos = self.preprocessor.inverse_transform_pos(predictions=preds[self.pos_out_idx]) annotation_dict['pos'] = pred_pos if self.include_morph: pred_morph = self.preprocessor.inverse_transform_morph(predictions=preds[self.morph_out_idx]) annotation_dict['morph'] = pred_morph return annotation_dict
plot = False

# Init variables
print("Init all variables")
coords = np.loadtxt("patch_coordinates.txt", delimiter="\t", skiprows=1)
coords = np.multiply(coords, scale_factor)
patches = {'white': coords[:, 0:2],
           'brown': coords[:, 2:4],
           'gray': coords[:, 4:6],
           'green': coords[:, 6:8]}
box_size *= scale_factor

# Load Preprocessor
print("Preprocessing")
p = Preprocessor("../images/slum_image.jpg")
p.scale_image(scale_factor)
p.save_current_as("normal")
p.exposure_equalization(method="contrast")
p.convert_color("RGB", "RGB CIE")
p.save_current_as("contrast_rgb_cie")
p.reset()
p.scale_image(scale_factor)
p.exposure_equalization(method="equal")
p.convert_color("RGB", "HSV")
p.save_current_as("structure")

# ========== Plot img & patches =========
if plot:
def plot_intervals(output_folder):
    """
    Given a cross-validation output, plot selected triple intervals to compare
    the error of extrapolation, max-ent and the heuristic. The algorithm runs
    through each triple interval; for each sampled estimate output, the triples
    in the interval are looked up in each sample, the MAPE error is recorded
    and the average errors are accumulated. The average of these averages is
    then plotted for each interval.
    """
    from parsers import CVOutputParser
    from preprocessing import Preprocessor
    from utils import avg
    import os
    import math

    if not output_folder[-1] == '/':
        output_folder += '/'

    intervals = 30
    triple_intervals = Preprocessor.triple_intervals(
        output_folder + 'observed_frequent_items.out', intervals=intervals)

    avg_max_ent_errors = []
    avg_ext_errors = []
    avg_heu_errors = []
    pair_triple_ratios = [i / 10. for i in range(11)]  # binned ratios [0.0 to 1.0]
    max_ent_ratio_error = [0 for i in range(11)]
    ext_ratio_error = [0 for i in range(11)]

    for index, triple_interval in enumerate(triple_intervals):
        print('Triple interval {} of {}'.format(index, intervals))
        iteration = 0
        MAPE_avg_errors = []
        MAPE_avg_errors_ext = []
        # MAPE_avg_errors_heu = []
        while True:
            max_ent_est_file = output_folder + str(iteration) + '_data.tsv'
            ext_est_file = output_folder + str(iteration) + '_data_extrapolation.tsv'
            # heu_est_file = output_folder + str(iteration) + '_data_heurestic.tsv'
            # read baseline also?

            # Read until we do not find an output file
            if not os.path.exists(max_ent_est_file):
                break

            max_ent_est = CVOutputParser.read_est_obs_file(max_ent_est_file)
            ext_est = CVOutputParser.read_est_obs_file(ext_est_file)
            # heu_est = CVOutputParser.read_est_obs_file(heu_est_file)

            MAPE_errors = []
            MAPE_errors_ext = []
            # MAPE_errors_heu = []
            for triple in triple_interval:
                # Check that the triple has been estimated
                if triple in max_ent_est:
                    # Index 1 should hold the observed value parsed from the file;
                    # it is the same mapped to every estimate, so just read it once.
                    obs = max_ent_est[triple][1]
                    # maxent estimate
                    est = max_ent_est[triple][0]
                    # extrapolation estimate
                    est2 = ext_est[triple][0]
                    # independence estimate?
                    # heuristic, use max_ent for 0 triple in sample
                    # est4 = heu_est[triple][0]

                    # Index 2 should hold the pair-triple ratio;
                    # it is the same for every estimate.
                    ratio = max_ent_est[triple][2]
                    # bin the ratio to one decimal
                    ratio_binned = round(ratio, 1)
                    # add errors to the ratio
                    max_ent_ratio_error[pair_triple_ratios.index(ratio_binned)] += abs(est - obs) / float(obs)
                    ext_ratio_error[pair_triple_ratios.index(ratio_binned)] += abs(est2 - obs) / float(obs)

                    # MAPE error max ent
                    # error = abs(obs-est) #/ float(obs) * 100
                    # MAPE_errors.append(error)
                    # # MAPE error extrapolation
                    # error2 = abs(obs-est2) #/ float(obs) * 100
                    # MAPE_errors_ext.append(error2)
                    # MAPE error independence?
                    # MAPE error heuristic
                    # error4 = abs(obs-est4) #/ float(obs) * 100
                    # MAPE_errors_heu.append(error4)
                    # MAPE baseline error?

            MAPE_avg_errors.append(avg(MAPE_errors))
            MAPE_avg_errors_ext.append(avg(MAPE_errors_ext))
            # MAPE_avg_errors_heu.append(avg(MAPE_avg_errors_heu))
            iteration += 1

        avg_max_ent_errors.append(avg(MAPE_avg_errors))
        avg_ext_errors.append(avg(MAPE_avg_errors_ext))
        # avg_heu_errors.append(avg(MAPE_avg_errors_heu))

    plot(range(len(avg_max_ent_errors)), avg_max_ent_errors, color='blue')
    plot(range(len(avg_ext_errors)), avg_ext_errors, color='red')