class LoadPolicy(object):
    def __init__(self, exp_dir, iter):
        model_dir = exp_dir + '/models'
        parser = argparse.ArgumentParser()
        params = json.loads(open(exp_dir + '/config.json').read())
        for key, val in params.items():
            parser.add_argument("-" + key, default=val)
        self.args = parser.parse_args()
        self.policy = Policy4Toyota(self.args)
        self.policy.load_weights(model_dir, iter)
        self.preprocessor = Preprocessor((self.args.obs_dim,), self.args.obs_preprocess_type,
                                         self.args.reward_preprocess_type, self.args.obs_scale,
                                         self.args.reward_scale, self.args.reward_shift,
                                         gamma=self.args.gamma)
        # self.preprocessor.load_params(load_dir)

    @tf.function
    def run(self, obs):
        processed_obs = self.preprocessor.tf_process_obses(obs)
        action, logp = self.policy.compute_action(processed_obs[np.newaxis, :])
        return action[0]

    @tf.function
    def values(self, obs):
        processed_obs = self.preprocessor.tf_process_obses(obs)
        obj_v = self.policy.compute_obj_v(processed_obs)
        con_v = self.policy.compute_con_v(processed_obs)
        return obj_v, con_v
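# A minimal usage sketch for the LoadPolicy class above; the experiment directory and
# iteration number are hypothetical placeholders, assuming config.json and the saved
# weights exist under exp_dir and that obs_dim is stored in the parsed args.
import numpy as np

exp_dir = './results/experiment-example'   # hypothetical path
policy_loader = LoadPolicy(exp_dir, iter=100000)
obs = np.zeros(policy_loader.args.obs_dim, dtype=np.float32)  # dummy observation
action = policy_loader.run(obs)
obj_v, con_v = policy_loader.values(obs)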
def _data_loader_fn():
    feats_preps = [
        Preprocessor(vocab, add_bos=False, add_eos=False) for vocab in feats_vocabs
    ]
    labels_prep = Preprocessor(labels_vocab, add_bos=False, add_eos=False)
    feats_readers = [TextFileReader(path) for path in feats_paths]
    labels_reader = TextFileReader(labels_path)
    feats_gen = [
        SentenceGenerator(reader, vocab, args.batch_size,
                          max_length=args.max_len,
                          preprocessor=prep,
                          allow_residual=True)
        for reader, vocab, prep in zip(feats_readers, feats_vocabs, feats_preps)
    ]
    labels_gen = SentenceGenerator(
        labels_reader, labels_vocab, args.batch_size,
        max_length=args.max_len,
        preprocessor=labels_prep,
        allow_residual=True,
    )
    return feats_gen + [labels_gen]
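# Hedged usage sketch for _data_loader_fn above; it assumes feats_vocabs, labels_vocab,
# feats_paths, labels_path and args are already defined in the enclosing scope, and that
# each SentenceGenerator is iterable over preprocessed batches.
generators = _data_loader_fn()
*feature_generators, label_generator = generators
for batch in label_generator:
    break  # one batch of preprocessed label sentences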
def main():
    a = input()
    b = output()
    prep1 = Preprocessor()
    i = 1
    c = weights()
    f = feature_set()
    while 1:
        try:
            with open('%d.txt' % i):
                inputdataset = a.input_from_file('%d.txt' % i)
                filter1 = prep1.to_lower_case(inputdataset)
                filter2 = prep1.stop_word_eliminate(filter1)
                filter3 = prep1.stem_word(filter2)
                b.write_to_file('out%i.txt' % i, filter3)
                f.all_features += filter3 + ' '
                i += 1
        except IOError:
            break
    f.get_tot_files(i - 1)
    f.update_unique_features(f.all_features)
    for each_word in f.unique_features.split():
        c.ret_tot_freq().append(f.all_features.count(each_word))
    # j = 0
    # for each_word in f.unique_features.split():
    #     stdout.write(each_word + ' ' + str(c.ret_tot_freq()[j]) + '\n')
    #     j += 1
    c.update_term_freq_matrix(f.ret_tot_files())
    c.update_inverse_document_freq(f.ret_tot_files(), f.unique_features)
def __training_setup(self, input_data):
    """Method to initialize all the sub models/objects used as part of the classifier model"""
    logger.info("Setting up model for classifier")
    # Get Data if provided
    self.preprocessor = Preprocessor(input_data)
    self.x_train, self.x_test, self.y_train, self.y_test = self.preprocessor.get_data()

    logger.info("Setting up Vectorizer")
    # Vectorizer
    if self.vectorizer_type == 'tfidf':
        self.vectorizer = TfidfLocVectorizer(max_feat=self.max_feat, maxdf=0.8, mindf=15,
                                             n_gram_range=(1, 3))
    elif self.vectorizer_type == 'spacy':
        import spacy
        from utils.spacy_vectorizer import SpacyVectorTransformer
        nlp = spacy.load("en_core_web_md")
        self.vectorizer = SpacyVectorTransformer(nlp=nlp)
    else:
        raise ValueError("incorrect vectorizer_type, please use tfidf or spacy")

    # Balance the data
    if self.use_data_under_balancer:
        logger.info("Setting up naive data balancing")
        self.data_under_balancer = RandomUnderSampler(
            sampling_strategy={l: min(70, number - 1)
                               for l, number in self.y_test.value_counts().items()})

    logger.info("Run dimension reduction algorithm")
    self.dimension_reduction = TruncatedLocSVD(self.optimum_n_components, total_variance=0.8)

    logger.info("Setting up Classifier")
    # Classifier
    if self.classifier_type == 'xgb':
        self.classifier = XGBClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=5,
                                        min_child_weight=11, n_estimators=1000, n_jobs=4,
                                        objective='multi:softprob',  # multiclass objective
                                        random_state=RANDOM_STATE, subsample=0.8)
    elif self.classifier_type == 'lgbm':
        params = {'num_leaves': 5,
                  'objective': 'multiclass',
                  'num_class': len(np.unique(self.y_train)),
                  'learning_rate': 0.01,
                  'max_depth': 5,
                  'random_state': RANDOM_STATE}
        self.classifier = lgb.LGBMClassifier(**params)
    else:
        self.classifier = LogisticRegression(multi_class="multinomial", class_weight='balanced',
                                             solver='newton-cg', max_iter=100)

    # MLFlow Config
    logger.info("Setting up MLFlow Config")
    mlflow.set_experiment('classifier-model')
def generate_STM(self):
    preprocessor = Preprocessor()
    for sentence in self.__sentenceList:
        preprocessed_words = preprocessor.preprocess_sentence(sentence)
        sentence_weight = []
        for feature in self.tot_weight_dict().keys():
            if feature in preprocessed_words:
                sentence_weight.append(self.__tot_weight_dict[feature])
            else:
                sentence_weight.append(0)
        self.__sentenceWeight_dict[sentence] = sentence_weight
class LoadPolicy(object):
    def __init__(self, exp_dir, iter):
        model_dir = exp_dir + '/models'
        parser = argparse.ArgumentParser()
        params = json.loads(open(exp_dir + '/config.json').read())
        for key, val in params.items():
            parser.add_argument("-" + key, default=val)
        self.args = parser.parse_args()
        env = CrossroadEnd2end(training_task=self.args.env_kwargs_training_task,
                               num_future_data=self.args.env_kwargs_num_future_data)
        self.policy = Policy4Toyota(self.args)
        self.policy.load_weights(model_dir, iter)
        self.preprocessor = Preprocessor((self.args.obs_dim,), self.args.obs_preprocess_type,
                                         self.args.reward_preprocess_type, self.args.obs_scale,
                                         self.args.reward_scale, self.args.reward_shift,
                                         gamma=self.args.gamma)
        # self.preprocessor.load_params(load_dir)
        init_obs = env.reset()
        self.run(init_obs)
        self.obj_value(init_obs)

    @tf.function
    def run(self, obs):
        processed_obs = self.preprocessor.np_process_obses(obs)
        action, _ = self.policy.compute_action(processed_obs[np.newaxis, :])
        return action[0]

    @tf.function
    def obj_value(self, obs):
        processed_obs = self.preprocessor.np_process_obses(obs)
        value = self.policy.compute_obj_v(processed_obs[np.newaxis, :])
        return value

    @tf.function
    def run_batch(self, obses):
        processed_obses = self.preprocessor.np_process_obses(obses)
        actions, _ = self.policy.compute_action(processed_obses)
        return actions

    @tf.function
    def obj_value_batch(self, obses):
        processed_obses = self.preprocessor.np_process_obses(obses)
        values = self.policy.compute_obj_v(processed_obses)
        return values
class CharRNN(object):
    def __init__(self, weights, encoding, rnn_type, depth, hidden_size,
                 softmax_temp=0.9, output_lim=144):
        self.preprocessor = Preprocessor()
        self.model = char_rnn.Model(weights, encoding, rnn_type, depth, hidden_size)
        self.output_lim = output_lim
        self.temperature = softmax_temp
        self.buffer = '\n'

    def set_input(self, text):
        if len(text) > 0:
            data = self.preprocessor.process_text(text, newline=False)
            if len(data) == 0 or data[-1] == '\n':
                data = '\n'
            for c in data:
                self.buffer = str(self.model.forward(c, self.temperature))
        else:
            self.buffer = self.buffer[-1]

    def get_output(self):
        for i in range(self.output_lim - 1):
            c = self.buffer[-1]
            self.buffer += str(self.model.forward(c, self.temperature))
            if self.buffer[-1] == '\n':
                break
        return self.buffer
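# Hedged usage sketch for the CharRNN class above; the weight and encoding file names
# are hypothetical placeholders, assuming char_rnn.Model accepts them as shown in the
# constructor signature.
rnn = CharRNN(weights='weights.h5', encoding='encoding.json',
              rnn_type='lstm', depth=2, hidden_size=256)
rnn.set_input("hello world")
reply = rnn.get_output()   # generates up to output_lim characters or until a newline
print(reply)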
def __init__(self, exp_dir, iter):
    model_dir = exp_dir + '/models'
    parser = argparse.ArgumentParser()
    params = json.loads(open(exp_dir + '/config.json').read())
    for key, val in params.items():
        parser.add_argument("-" + key, default=val)
    self.args = parser.parse_args()
    self.policy = Policy4Toyota(self.args)
    self.policy.load_weights(model_dir, iter)
    self.preprocessor = Preprocessor((self.args.obs_dim,), self.args.obs_preprocess_type,
                                     self.args.reward_preprocess_type, self.args.obs_scale,
                                     self.args.reward_scale, self.args.reward_shift,
                                     gamma=self.args.gamma)
def __init__(self, exp_dir, iter):
    model_dir = exp_dir + '/models'
    parser = argparse.ArgumentParser()
    params = json.loads(open(exp_dir + '/config.json').read())
    for key, val in params.items():
        parser.add_argument("-" + key, default=val)
    self.args = parser.parse_args()
    env = CrossroadEnd2end(training_task=self.args.env_kwargs_training_task,
                           num_future_data=self.args.env_kwargs_num_future_data)
    self.policy = Policy4Toyota(self.args)
    self.policy.load_weights(model_dir, iter)
    self.preprocessor = Preprocessor((self.args.obs_dim,), self.args.obs_preprocess_type,
                                     self.args.reward_preprocess_type, self.args.obs_scale,
                                     self.args.reward_scale, self.args.reward_shift,
                                     gamma=self.args.gamma)
    # self.preprocessor.load_params(load_dir)
    init_obs = env.reset()
    self.run_batch(init_obs[np.newaxis, :])
    self.obj_value_batch(init_obs[np.newaxis, :])
def run(self, model, log_dir):
    test_dataset = DATASETS[self._config["dataset"]["name"]](self._config["dataset"], 'test', loop=False)
    preprocessor = Preprocessor(self._config["preprocessor"])

    from evaluation import EVALUATORS
    evaluator = EVALUATORS[self._config["evaluator"]]

    gt_filenames = []
    prediction_filenames = []
    for img_filename, label_filename in test_dataset:
        prediction_filename = self.trainLabelToEvalFilename(label_filename, self._config["eval_dir"])

        # only run prediction if prediction image does not exist yet
        if not os.path.exists(prediction_filename):
            prediction_dir = os.path.dirname(prediction_filename)
            if not os.path.exists(prediction_dir):
                os.makedirs(prediction_dir)

            img = imread(img_filename)
            assert img is not None

            # feed the image through the network
            x = preprocessor.process(img)
            y_pred = model.predict(np.asarray([x])).squeeze()
            y_label_pred = np.argmax(y_pred, axis=2)
            y_label_pred = np.asarray(y_label_pred, dtype=np.uint8)
            y_label_pred = resize(y_label_pred, (img.shape[1], img.shape[0]), interpolation=INTER_NEAREST)

            # store it in the eval folder
            imwrite(prediction_filename, y_label_pred)

        gt_filenames.append(label_filename)
        prediction_filenames.append(prediction_filename)

    evaluator.run(prediction_filenames, gt_filenames)
def summarize(self, input_path):
    dataset_preprocessor = Preprocessor()
    dataset_FeatureReducer = FeatureReducer()
    dataset_WeightsHandler = WeightsHandler()

    files = [f for f in os.listdir(input_path) if os.path.isfile(input_path + f)]
    preprocessed_list = dataset_preprocessor.preprocess(files, input_path)
    sentencelist = dataset_preprocessor.extract_sentences(files, input_path)

    dataset_WeightsHandler.set_preprocessed_list(preprocessed_list)
    dataset_WeightsHandler.set_sentence_list(sentencelist)
    dataset_WeightsHandler.update_totfreq_dict()
    dataset_WeightsHandler.replace_totfreq_dict(
        dataset_FeatureReducer.reduceFeatures(dataset_WeightsHandler.tot_freq_dict()))
    dataset_WeightsHandler.generate_inv_doc_freq_dict(preprocessed_list)
    dataset_WeightsHandler.generate_tot_weight_dict()
    dataset_WeightsHandler.generate_STM()

    vector_dict = dataset_WeightsHandler.sentence_weight_dict()  # vector_dict[sentence] = vector
    dataset_FeatureReducer.remove_features_with_zero_weight(vector_dict)
    sentencelist_without_stopwords = dataset_preprocessor.remove_stop_words_from_sentencelist(sentencelist)

    VectorSineRelationExtractor = SineRelationExtractor()
    sine_matrix = VectorSineRelationExtractor.extract_sine_similarity(vector_dict)

    synonym_assigner = SynonymAssigner()
    synonym_dict = synonym_assigner.assign_synonyms(sentencelist_without_stopwords)

    SentenceDissimilarityScorer = DissimilarityScorer()
    dissimilarity_matrix = SentenceDissimilarityScorer.assign_dissimilarity_score(
        synonym_dict, sentencelist_without_stopwords)
    final_score_matrix = SentenceDissimilarityScorer.multiply_sine(dissimilarity_matrix, sine_matrix)

    SentenceRanker = NodeRanker()
    scorelist_of_sentences = SentenceRanker.calculate_score_of_each_sentence(final_score_matrix)
    ranked_indices = SentenceRanker.rank_nodes(scorelist_of_sentences)
    for each_index in ranked_indices:
        print(sentencelist[each_index])
def _data_loader_fn():
    feats_preps = [Preprocessor(vocab) for vocab in feats_vocabs]
    feats_readers = [TextFileReader(path) for path in args.feats_path]
    feats_gen = [
        SentenceGenerator(reader, vocab, args.batch_size,
                          max_length=args.max_length,
                          preprocessor=prep,
                          allow_residual=True)
        for reader, vocab, prep in zip(feats_readers, feats_vocabs, feats_preps)
    ]
    return feats_gen
def run(self, model, log_dir):
    train_dataset = DATASETS[self._config["dataset"]["name"]](self._config["dataset"], 'train')
    val_dataset = DATASETS[self._config["dataset"]["name"]](self._config["dataset"], 'val')
    augmenter = Augmenter(self._config["augmenter"])
    preprocessor = Preprocessor(self._config["preprocessor"])

    from keras.callbacks import TensorBoard
    tensor_board = TensorBoard(log_dir=log_dir)

    from keras.optimizers import get as get_optimizer
    optimizer = get_optimizer(self._config["optimizer"])
    model.compile(optimizer=optimizer, loss="categorical_crossentropy",
                  metrics=["categorical_accuracy"])

    train_data_producer = TrainDataProducer(config=self._config, dataset=train_dataset,
                                            augmenter=augmenter, preprocessor=preprocessor)
    valid_data_producer = ValidDataProducer(config=self._config, dataset=val_dataset,
                                            preprocessor=preprocessor)

    train_steps_per_epoch = train_dataset.num_samples() // self._config["batch_size"]
    val_steps_per_epoch = val_dataset.num_samples() // self._config["batch_size"]

    model.fit_generator(generator=train_data_producer,
                        steps_per_epoch=train_steps_per_epoch,
                        validation_data=valid_data_producer,
                        validation_steps=val_steps_per_epoch,
                        epochs=self._config["epochs"],
                        callbacks=[tensor_board])

    # save the model in the log directory
    import os
    trained_model_filename = os.path.join(log_dir, 'trained_model.h5')
    print("Saving trained model to %s" % trained_model_filename)
    # If the saving does not work, take a look at
    # https://github.com/keras-team/keras/issues/6766
    # and then upgrade keras!
    model.save(trained_model_filename)
# df_without_undesired_words = remove_undesired_words(df_without_bot_posts)
# print("Row count after undesired words removal: ", len(df_without_undesired_words))
# output_filepath = OUTPUT_PATH + get_filename(original_data_path) + "[duplicates_bots_removed]" + FILE_EXTENSION
# os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
# json.dump(df_without_undesired_words.to_dict(orient='records'), open(output_filepath, WRITE_MODE))
# print("Data without duplicates dumped to ", output_filepath)

data = np.array(original_data_frame[field_of_interest], dtype='object')
processor = Preprocessor(posCategories, lang, lemmatize_activated)
processed_data = processor.preprocess(data, stopwords_file)
print("Size of data after preprocessing: ", len(processed_data))

df_after_preprocessing = original_data_frame.assign(body=processed_data)
df_after_preprocessing = df_after_preprocessing[
    df_after_preprocessing['body'].map(lambda field: len(field)) > 0]
print(
    f'Row count after removal of rows with empty "{field_of_interest}" fields: {len(df_after_preprocessing)}'
)

output_filepath = OUTPUT_PATH + get_filename(
    else:
        summary = (f'\n{classifier} Accuracy:\n'
                   + ' > Before preprocessing: N/A'
                   + '\n > After preprocessing: {:0.2f}%'.format(processedScore * 100)
                   + '\n > Acceptable? {}'.format(acceptable))
    return summary


if __name__ == "__main__":
    predictor = Predictor(test_size=0.2,
                          random_state=27,
                          verbose=args.verbose,
                          save=args.save,
                          acceptance=args.acceptance)
    preprocessor = Preprocessor(verbose=args.verbose)
    samples = [1, 2, 3, 4, 5] if args.all else args.sample

    raw = []
    processed = []
    for i in samples:
        raw_res, processed_res = runTests(i)
        raw.append(raw_res)
        processed.append(processed_res)

    if not args.verbose:
        print('Format: [SVC, KNN, GNB, DTREE]\n')
        for i in range(len(raw)):
            print(f'Sample {samples[i]}:',
                  f'\n > Raw: {raw[i]}',
                  f'\n > Processed: {processed[i]}\n')
from utils.preprocessor import Preprocessor
import os
from sets.size import Size
from sets.intersections import Intersections
from sets.scorer import Scorer
from graphs.node_ranker import NodeRanker
from sets.distributed_ranks import RankDistributor

input_path = '/home/animesh/T-Sum/Data sets/Inception/'
files = [f for f in os.listdir(input_path) if os.path.isfile(input_path + f)]

prep = Preprocessor()
sentence_list = prep.extract_sentences(files, input_path)
preprocessed_words_in_each_sentence = []
for s in sentence_list:
    preprocessed_words_in_each_sentence.append(prep.preprocess_sentence(s))

size = Size()
intersections = Intersections()
scorer = Scorer()
ranker = NodeRanker()
rank_counter_in_0_to_1 = RankDistributor()

size_of_sets = size.calculate_size_of_set(preprocessed_words_in_each_sentence)
number_of_intersections_of_each_sentence = intersections.count_itersections_of_each_set(
    preprocessed_words_in_each_sentence)
scores = scorer.score_sentences(number_of_intersections_of_each_sentence, size_of_sets)
normalised_scores = scorer.normalise_score(scores)
distributed_ranks = rank_counter_in_0_to_1.distribute_ranks(normalised_scores)
parser.add_argument('-v', '--test', type=str, help='Test dataset (filename, csv)', required=True)
args = parser.parse_args()

# Load data
df_train = pd.read_csv(args.train, usecols=[1, 2])
X_train_raw = df_train["question"].tolist()
y_train_raw = df_train["intention"].tolist()

df_test = pd.read_csv(args.test, usecols=[1, 2])
X_test_raw = df_test["question"].tolist()
y_test_raw = df_test["intention"].tolist()

# Preprocessing
preprocessor = Preprocessor()
preprocessor.fit(X_train_raw, y_train_raw)
X_train = preprocessor.build_sequence(X_train_raw)
X_test = preprocessor.build_sequence(X_test_raw)
y_train = preprocessor.label_transform(y_train_raw)
y_test = preprocessor.label_transform(y_test_raw)

# Intent classifier prediction
params = {
    "batch_size": 64,
    "num_epochs": 1,
    "embedding_size": 32,
    "filter_sizes": [3, 4, 5],
    'num_filters': 258,
    "patience": 20,
    "dropout": 0.7,
                    type=str,
                    default="../data/val.raw",
                    help="The path to output the validation data")
parser.add_argument('--max_train_size', type=int, default=1e6)
parser.add_argument('--max_val_size', type=int, default=0)
args = parser.parse_args()

if not (os.path.isfile(args.msg_path)):
    print("Downloading from gitter...")
    download_messages(args.gitter_token, args.chat_room, args.msg_path)

with open(args.msg_path, 'r') as input:
    print("Loading messages from disk...")
    messages = json.load(input)

preprocessor = Preprocessor()
print("Preprocessing...")
messages_ = []
for idx, message in enumerate(messages):
    if "fromUser" in message:
        messages[idx]['text'] = preprocessor.process_text(message['text'])
        messages[idx]['fromUser']['username'] = preprocessor.process_text(
            message['fromUser']['username'], newline=False)
        messages_.append(message)
messages = messages_

encoder = Encoder()
if not os.path.isfile(args.encoding_file):
    print("Generating encoding dictionary...")
    encoder.gen_dict(msg2txt(messages))
    encoder.save_enc_dict_json(path='../data/encoding.json')
#         labels.extend(subset_labels_tensor.cpu().detach().numpy())
#         counter += 1
#     evaluator.evaluate(labels, outputs)

if ("-predict" in sys.argv):
    # with open(conf.readValue("lstm_model_path"), "rb") as file:
    #     model = pickle.load(file)
    model = PolarityLSTM(embedding_dim, vocab_size, hidden_dim, output_size, n_layers)
    model.load_state_dict(torch.load(conf.readValue("lstm_model_path")))
    model.eval()
    if ("-gpu" in sys.argv):
        model.cuda(device)

    prep = Preprocessor()
    index = sys.argv.index("-predict")
    text = sys.argv[index + 1]
    text = prep.setText(text).correctSpelling().setLemmatizeFlag().setStopWordsFlag().build()
    text = [text]

    vectorized_seqs = []
    for seq in text:
        vectorized_seqs.append([
            vocab_to_int.get(word, 1) for word in TOKENIZER.tokenize(seq)
        ])

    seq_lengths = torch.LongTensor(list(map(len, vectorized_seqs)))
class ClassifierModel:
    """The model which helps in identifying the search term classes"""

    def __init__(self, input_data=None, tfidf_max_feat=500, mlflow_local=False,
                 classifier_type: str = 'lgbm', use_data_under_balancer: bool = False,
                 vectorizer_type: str = 'tfidf', optimum_n_components: int = None,
                 pipeline=None, mlflow_local_url='mlruns_new', model_save_loc='Model_Save'):
        """
        Constructor for classifier
        :param input_data:
        :param tfidf_max_feat:
        :param mlflow_local:
        :param classifier_type:
        :param use_data_under_balancer:
        :param vectorizer_type:
        :param optimum_n_components:
        :param pipeline:
        :param mlflow_local_url:
        :param model_save_loc:
        """
        self.mlflow_local_url = mlflow_local_url
        self.model_save_loc = model_save_loc
        self.text_cleaner = CleanTextTransformer()
        self.classifier = None
        self.classifier_type = classifier_type
        self.vectorizer = None
        self.vectorizer_type = vectorizer_type
        self.pipeline = pipeline
        self.use_data_under_balancer = use_data_under_balancer
        self.data_under_balancer = None
        self.dimension_reduction = None
        self.optimum_n_components = optimum_n_components
        self.max_feat = tfidf_max_feat
        self.x_train = self.y_train = self.x_test = self.y_test = None
        self.input_data = input_data
        self.is_local = mlflow_local

    def __training_setup(self, input_data):
        """Method to initialize all the sub models/objects used as part of the classifier model"""
        logger.info("Setting up model for classifier")
        # Get Data if provided
        self.preprocessor = Preprocessor(input_data)
        self.x_train, self.x_test, self.y_train, self.y_test = self.preprocessor.get_data()

        logger.info("Setting up Vectorizer")
        # Vectorizer
        if self.vectorizer_type == 'tfidf':
            self.vectorizer = TfidfLocVectorizer(max_feat=self.max_feat, maxdf=0.8, mindf=15,
                                                 n_gram_range=(1, 3))
        elif self.vectorizer_type == 'spacy':
            import spacy
            from utils.spacy_vectorizer import SpacyVectorTransformer
            nlp = spacy.load("en_core_web_md")
            self.vectorizer = SpacyVectorTransformer(nlp=nlp)
        else:
            raise ValueError("incorrect vectorizer_type, please use tfidf or spacy")

        # Balance the data
        if self.use_data_under_balancer:
            logger.info("Setting up naive data balancing")
            self.data_under_balancer = RandomUnderSampler(
                sampling_strategy={l: min(70, number - 1)
                                   for l, number in self.y_test.value_counts().items()})

        logger.info("Run dimension reduction algorithm")
        self.dimension_reduction = TruncatedLocSVD(self.optimum_n_components, total_variance=0.8)

        logger.info("Setting up Classifier")
        # Classifier
        if self.classifier_type == 'xgb':
            self.classifier = XGBClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=5,
                                            min_child_weight=11, n_estimators=1000, n_jobs=4,
                                            objective='multi:softprob',  # multiclass objective
                                            random_state=RANDOM_STATE, subsample=0.8)
        elif self.classifier_type == 'lgbm':
            params = {'num_leaves': 5,
                      'objective': 'multiclass',
                      'num_class': len(np.unique(self.y_train)),
                      'learning_rate': 0.01,
                      'max_depth': 5,
                      'random_state': RANDOM_STATE}
            self.classifier = lgb.LGBMClassifier(**params)
        else:
            self.classifier = LogisticRegression(multi_class="multinomial", class_weight='balanced',
                                                 solver='newton-cg', max_iter=100)

        # MLFlow Config
        logger.info("Setting up MLFlow Config")
        mlflow.set_experiment('classifier-model')

    def train(self, train_x=None, train_y=None, grid_search=True, run_version=None):
        """
        Method to train the model
        :param train_x: independent data to train model with
        :param train_y: dependent data to train model with
        :param grid_search: perform grid_search
        :param run_version: Load previous run_version for training if needed
        :return: None
        """
        self.__training_setup(self.input_data)
        logger.info("Training for search term classifier model")
        if not train_x:
            train_x = self.x_train
            train_y = self.y_train

        # Search for previous runs and get run_id if present
        logger.info("Searching for previous runs for given model type")
        df_runs = mlflow.search_runs(filter_string="tags.Model = '{0}'".format('XGB'))
        df_runs = df_runs.loc[~df_runs['tags.Version'].isna(), :] if 'tags.Version' in df_runs else pd.DataFrame()
        if not run_version:
            run_id = None
            load_prev = False
        else:
            try:
                run_id = df_runs.loc[df_runs['tags.Version'] == run_version, 'run_id'].iloc[0]
                load_prev = True
            except Exception as e:
                raise ValueError('run_id with version {0} not found'.format(run_version))
        run_version = len(df_runs) + 1

        # Start the MLFlow Run and train the model
        logger.info("Starting MLFlow run to train model")
        with mlflow.start_run(run_id=run_id):
            # Build pipeline. Load previous pipeline if needed
            if load_prev:
                artifact_uri = mlflow.get_artifact_uri(self.model_save_loc)
                try:
                    load_pipeline = mlflow.sklearn.load_model(artifact_uri)
                    self.pipeline = load_pipeline
                except Exception as e:
                    raise ValueError("Existing model not found / couldn't be loaded.\n" + str(e))
            else:
                if self.use_data_under_balancer:
                    self.pipeline = Pipeline([('clean_text', self.text_cleaner),
                                              (self.vectorizer_type, self.vectorizer),
                                              ('balancer', self.data_under_balancer),
                                              ('dimension_reduction', self.dimension_reduction),
                                              (self.classifier_type, self.classifier)])
                else:
                    self.pipeline = Pipeline([('clean_text', self.text_cleaner),
                                              (self.vectorizer_type, self.vectorizer),
                                              ('dimension_reduction', self.dimension_reduction),
                                              (self.classifier_type, self.classifier)])

            # Todo: Grid Search for LGBM
            if grid_search:
                xgb_parameters = {
                    'clf__njobs': [4],
                    'clf__objective': ['multiclass'],
                    'clf__learning_rate': [0.05],
                    'clf__max_depth': [6, 12, 18],
                    'clf__min_child_weight': [11, 13, 15],
                    'clf__subsample': [0.7, 0.8],
                    'clf__colsample_bytree': [0.6, 0.7],
                    'clf__n_estimators': [5, 50, 100, 1000],
                    'clf__missing': [-999],
                    'clf__random_state': [RANDOM_STATE]
                }
                if self.use_data_under_balancer:
                    xgb_pipeline = Pipeline([('clean_text', self.text_cleaner),
                                             (self.vectorizer_type, self.vectorizer),
                                             ('balancer', self.data_under_balancer),
                                             ('dimension_reduction', self.dimension_reduction),
                                             ('clf', XGBClassifier())])
                else:
                    xgb_pipeline = Pipeline([('clean_text', self.text_cleaner),
                                             (self.vectorizer_type, self.vectorizer),
                                             ('dimension_reduction', self.dimension_reduction),
                                             ('clf', XGBClassifier())])
                self.pipeline = GridSearchCV(xgb_pipeline, xgb_parameters, n_jobs=1, verbose=2,
                                             refit=True, cv=StratifiedKFold(n_splits=3, shuffle=True))

            # Train the model
            self.pipeline.fit(train_x, train_y)
            logger.info("train is done")
            train_pred = self.pipeline.predict(train_x)

            # read the dict with correct labels
            with open('data/relabeling_dict.yml', 'r') as f:
                relabeling_dict = yaml.safe_load(f)
            labeling_dict = dict(map(reversed, relabeling_dict.items()))

            # classification report on train set
            df = pd.DataFrame(classification_report(train_y, train_pred, output_dict=True)).transpose()
            logger.info("test is done")

            # Save tags and model metrics
            logger.info("Training Complete. Logging results into MLFlow")
            mlflow.log_metric("insam_macro_f1", np.round(df.loc["macro avg", "f1-score"], 5))
            mlflow.log_metric("insam_weighted_f1", np.round(df.loc["weighted avg", "f1-score"], 5))
            df = df.reset_index()
            df.columns = ['category', 'precision', 'recall', 'f1-score', 'support']
            df.loc[:, 'category'] = df['category'].apply(lambda x: labeling_dict[eval(x)] if x.isdigit() else x)
            df.to_csv("insam_full_report.csv")
            mlflow.log_artifact("insam_full_report.csv")
            os.remove("insam_full_report.csv")

            # Log params
            if self.classifier_type in ('lgbm', 'xgb'):
                if grid_search:
                    mlflow.log_param("Best Params", self.pipeline.best_params_)
                    mlflow.log_param("Best Score", self.pipeline.best_score_)
                else:
                    params = self.classifier.get_xgb_params() if self.classifier_type == 'xgb' \
                        else self.classifier.get_params()
                    for key in params:
                        mlflow.log_param(key, params[key])
            else:
                mlflow.log_param('class_weight', 'balanced')
                mlflow.log_param('solver', 'newton-cg')
                mlflow.log_param('max_iter', 100)

            if len(self.x_test):
                test_pred = self.pipeline.predict(self.x_test)
                # classification report on test set
                test_df = pd.DataFrame(classification_report(self.y_test, test_pred, output_dict=True)).transpose()
                mlflow.log_metric("macro_f1", np.round(test_df.loc["macro avg", "f1-score"], 5))
                mlflow.log_metric("weighted_f1", np.round(test_df.loc["weighted avg", "f1-score"], 5))
                test_df = test_df.reset_index()
                test_df.columns = ['category', 'precision', 'recall', 'f1-score', 'support']
                test_df.loc[:, 'category'] = test_df['category'].apply(
                    lambda x: labeling_dict[eval(x)] if x.isdigit() else x)
                test_df.to_csv("full_report.csv")
                mlflow.log_artifact("full_report.csv")
                os.remove("full_report.csv")

            mlflow.sklearn.log_model(self.pipeline, self.model_save_loc, serialization_format='pickle')
            mlflow.set_tag("Model", self.classifier_type)
            mlflow.set_tag("Version", run_version)
        logger.info("Model Trained and saved into MLFlow artifact location")

    def predict(self, data_x=None, proba=False):
        """
        Method to use the model to predict
        :param data_x: input
        :param proba: result is probability
        :return:
        """
        logger.info("Predicting using classifier model")
        data_x = data_x.loc[:, 'search_term']
        if not proba:
            test_pred = self.pipeline.predict(data_x)
        else:
            dis_index = list(self.pipeline.classes_).index('category')
            test_pred = [x[dis_index] for x in self.pipeline.predict_proba(data_x)]
        return test_pred
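# Hedged usage sketch for the ClassifierModel class above; the input DataFrame and its
# column names are assumptions, and training requires the MLFlow, vectorizer and
# classifier dependencies the class imports.
import pandas as pd

input_df = pd.DataFrame({'search_term': ['running shoes', 'laptop stand'],
                         'label': [0, 1]})   # hypothetical training data
model = ClassifierModel(input_data=input_df, classifier_type='lgbm')
model.train(grid_search=False)
predictions = model.predict(input_df)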
def __init__(self, weights, encoding, rnn_type, depth, hidden_size,
             softmax_temp=0.9, output_lim=144):
    self.preprocessor = Preprocessor()
    self.model = char_rnn.Model(weights, encoding, rnn_type, depth, hidden_size)
    self.output_lim = output_lim
    self.temperature = softmax_temp
    self.buffer = '\n'
    'KNN_95_0426212533.pkl',
    'SVC_35_0426230110.pkl',
    'RF_97_0426211322.pkl',
    'RF_71_0426211600.pkl'
]

models = []
directory = os.path.join(os.getcwd(), 'models')
for num, pickleFile in enumerate(pickleFiles, start=1):
    filePath = os.path.join(directory, f'Sample{num}', pickleFile)
    with open(filePath, 'rb') as file:
        print(f'Opening: {filePath}')
        models.append(pickle.load(file))

tester = Tester()
preprocessor = Preprocessor()

for idx, model in enumerate(models, start=1):
    print(idx)
    logging.info('importing data')
    data = importTestData(idx)
    print(f'Data {idx}', data.head())
    # print(f'Data: {data_np}')
    # print(f'type: ', type(data_np))

    logging.info('cleaning data')
    data_mean_replacement = preprocessor.replaceMissingWithMean(data)
    # processed_data = preprocessor.select_features(data_mean_replacement)
    # newData = processed_data.to_numpy()
    # print(f'type: {type(processed_data)}')

    classifier = pickleFiles[idx - 1].split('_')[0]
    logging.info('sending to tester')
help="The path to output the train data") parser.add_argument('--val_output', type=str, default="../data/val.raw", help="The path to output the validation data") parser.add_argument('--max_train_size', type=int, default=1e6) parser.add_argument('--max_val_size', type=int, default=0) args = parser.parse_args() if not (os.path.isfile(args.msg_path)): print("Downloading from gitter...") download_messages(args.gitter_token, args.chat_room, args.msg_path) with open(args.msg_path, 'r') as input: print("Loading messages form disk...") messages = json.load(input) preprocessor = Preprocessor() print("Preprocessing...") for idx, message in enumerate(messages): messages[idx]['text'] = preprocessor.process_text(message['text']) messages[idx]['fromUser']['username'] = preprocessor.process_text(message['fromUser']['username'], newline=False) encoder = Encoder() if args.encoding_file is None: print("Generating encoding dictionary...") encoder.gen_dict(msg2txt(messages)) encoder.save_enc_dict_json(path='../data/encoding.json') encoder.save_dec_dict_binary(path='../data/encoding.raw') else: print("Loading encoding dictionary from disk...") encoder.load_dict(args.encoding_file)
def get_preprocessors(lang_in, data_cfg, model_cfg):
    preproc = Preprocessor(lang_in, data_cfg["train_set"],
                           select_preprocessor_features(model_cfg, data_cfg))
    train_ldr = make_loader(lang_in, data_cfg["train_set"], preproc, batch_size)
    dev_ldr = make_loader(lang_in, data_cfg["dev_set"], preproc, batch_size)
    return preproc, train_ldr, dev_ldr
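# Hedged usage sketch for get_preprocessors above; the config dictionaries and language
# code are hypothetical and depend on the project's data/model configuration files, and
# batch_size is assumed to be defined at module level as in the function body.
data_cfg = {"train_set": "data/train.json", "dev_set": "data/dev.json"}   # assumed layout
model_cfg = {}
preproc, train_ldr, dev_ldr = get_preprocessors("en", data_cfg, model_cfg)
for batch in train_ldr:
    break  # one preprocessed training batch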
def get_data(data_dir, source, target, source_train_path, target_train_path,
             source_extension, target_extension, height, width, batch_size,
             re=0, workers=8):
    dataset = DA(data_dir, source, target, source_train_path, target_train_path,
                 source_extension, target_extension)

    normalizer = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    source_num_classes = dataset.num_source_train_ids

    train_transformer = T.Compose([
        T.RandomSizedRectCrop(height, width),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        normalizer,
        T.RandomErasing(EPSILON=re),
    ])
    test_transformer = T.Compose([
        T.Resize((height, width), interpolation=3),
        T.ToTensor(),
        normalizer,
    ])

    source_train_loader = DataLoader(
        Preprocessor(dataset.source_train,
                     root=osp.join(dataset.source_images_dir, dataset.source_train_path),
                     transform=train_transformer),
        batch_size=batch_size, num_workers=0,
        shuffle=True, pin_memory=False, drop_last=True)
    target_train_loader = DataLoader(
        Preprocessor(dataset.target_train,
                     root=osp.join(dataset.target_images_dir, dataset.target_train_path),
                     transform=train_transformer),
        batch_size=batch_size, num_workers=0,
        shuffle=True, pin_memory=False, drop_last=True)
    # source_train_loader = DataLoader(
    #     UnsupervisedCamStylePreprocessor(dataset.source_train,
    #                                      root=osp.join(dataset.source_images_dir, dataset.source_train_path),
    #                                      camstyle_root=osp.join(dataset.source_images_dir, dataset.source_train_path),
    #                                      transform=train_transformer),
    #     batch_size=batch_size, num_workers=0,
    #     shuffle=True, pin_memory=False, drop_last=True)
    # target_train_loader = DataLoader(
    #     UnsupervisedCamStylePreprocessor(dataset.target_train,
    #                                      root=osp.join(dataset.target_images_dir, dataset.target_train_path),
    #                                      camstyle_root=osp.join(dataset.target_images_dir,
    #                                                             dataset.target_train_camstyle_path),
    #                                      num_cam=dataset.target_num_cam, transform=train_transformer),
    #     batch_size=batch_size, num_workers=workers,
    #     shuffle=True, pin_memory=True, drop_last=True)
    query_loader = DataLoader(
        Preprocessor(dataset.query,
                     root=osp.join(dataset.target_images_dir, dataset.query_path),
                     transform=test_transformer),
        batch_size=batch_size, num_workers=workers,
        shuffle=False, pin_memory=True)
    gallery_loader = DataLoader(
        Preprocessor(dataset.gallery,
                     root=osp.join(dataset.target_images_dir, dataset.gallery_path),
                     transform=test_transformer),
        batch_size=batch_size, num_workers=workers,
        shuffle=False, pin_memory=True)

    return dataset, source_num_classes, source_train_loader, target_train_loader, query_loader, gallery_loader
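# Hedged usage sketch for get_data above; the dataset names, folder layout and image
# sizes are hypothetical placeholders for a cross-domain re-identification setup, and
# the exact fields yielded per batch depend on this project's Preprocessor.
dataset, num_classes, src_loader, tgt_loader, query_loader, gallery_loader = get_data(
    data_dir='data', source='market1501', target='dukemtmc',
    source_train_path='bounding_box_train', target_train_path='bounding_box_train',
    source_extension='jpg', target_extension='jpg',
    height=256, width=128, batch_size=32, re=0.5, workers=4)
for batch in src_loader:
    break  # one mini-batch of preprocessed source-domain images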
import os
import subprocess

from utils.preprocessor import Preprocessor
from utils.test_set_splitter import main

DATA_FOLDER = "data"
AGGREGATED = "aggregated"
DATA_SET = "data_set"
TEST_SET = "test_set"

if __name__ == "__main__":
    listdir = os.listdir()
    if DATA_FOLDER not in listdir:
        subprocess.run(["./download_dataset.sh"])
    if AGGREGATED not in listdir:
        os.mkdir(AGGREGATED)
        Preprocessor.aggregateData()
    if DATA_SET not in listdir or TEST_SET not in listdir:
        main()
logging.info(f'Is to remove stopwords? {remove_stopwords}')
logging.info(f'Is to remove POS categories? {remove_pos}')
logging.info(f'POS categories to keep: {posCategories}')

data_string = json.load(open(original_data_path, READ_MODE))
logging.info(f'Total of original documents: {len(data_string)}')

original_data_frame = pd.DataFrame.from_dict(data_string)
logging.info(original_data_frame.head())

data = np.array(original_data_frame[field_of_interest], dtype='object')
processor = Preprocessor(posCategories,
                         logger=logging.info,
                         language=lang,
                         lemmatize_activated=lemmatize_activated,
                         remove_pos=remove_pos,
                         remove_stopwords=remove_stopwords)
processed_data, stopwords = processor.preprocess(data, stopwords_file)
del data
logging.info(f'Size of data after preprocessing: {len(processed_data)}')

df_after_preprocessing = original_data_frame.assign(body=processed_data)
df_after_preprocessing = df_after_preprocessing[
    df_after_preprocessing['body'].map(lambda field: len(field)) > 0]
logging.info(
    f'Row count after removal of rows with empty "{field_of_interest}" fields: {len(df_after_preprocessing)}'
def preprocessor():
    metadata_loader = MetadataLoader("D:\\shared/birdsong-recognition")
    return Preprocessor(metadata_loader)
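# Hedged sketch of how the preprocessor factory above might be consumed in a test,
# assuming it is registered as a pytest fixture; the test name and assertion are
# hypothetical.
def test_preprocessor_builds(preprocessor):
    assert preprocessor is not None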