def __init__(self, data_name="data", feature_name="f1.f2", target_name="t",
             num_crossvalidation=10, random_state=312018, test_split=None,
             replace_exists=False, logger=None):
    """Initialize the data-preprocessing helper and load cached data if present.

    :param data_name: short name identifying the dataset.
    :param feature_name: dot-separated feature column name(s).
    :param target_name: target column name.
    :param num_crossvalidation: number of cross-validation folds.
    :param random_state: seed for reproducible splits.
    :param test_split: held-out test fraction, or None for CV only.
    :param replace_exists: if True, rebuild even when cached data exists.
    :param logger: optional logger; a file logger is created when omitted.
    """
    self.logger = logger or File_Logger_Helper.get_logger(
        logger_fname="data_preprocessing")
    self.model_name = ""
    self.data_name = data_name
    self.feature_name = feature_name
    self.target_name = target_name
    self.num_crossvalidation = num_crossvalidation
    self.random_state = random_state
    self.test_split = test_split
    self.replace_exists = replace_exists
    # Split containers; presumably populated by load_data_if_exists().
    # BUGFIX: X_train was never initialized, unlike its three siblings —
    # accessing it before a successful load would raise AttributeError.
    self.X_train = None
    self.y_train = None
    self.X_test = None
    self.y_test = None
    self.kfold = None
    # NOTE(review): "kod" is opaque — possibly a typo (fold counter?); confirm usage.
    self.kod = 0
    self.load_data_if_exists()
def __init__(self, dump_model_dir, filter_stopword=True, use_stemm=False,
             label_dict=None, logger=None):
    """Initialize the bag-of-words logistic-regression classifier and load any
    previously pickled model/dictionary/label-encoder from dump_model_dir.

    :param dump_model_dir: directory where model pickles are stored (created if missing).
    :param filter_stopword: if True, English stopwords are available for filtering.
    :param use_stemm: if True, apply Porter stemming (parameter name keeps the
        historical spelling for caller compatibility).
    :param label_dict: optional mapping of labels.
    :param logger: optional logger; a file logger is created when omitted.
    """
    self.logger = logger or File_Logger_Helper.get_logger(
        logger_fname="bow_logistic_regression")
    self.classifier_name = "logistic_regression_bow"
    self.use_stem = use_stemm
    self.stemmer = PorterStemmer()
    self.filter_stopword = filter_stopword
    # NOTE(review): attribute name keeps the historical typo ("stopwrods")
    # because other code may reference it; do not rename without a sweep.
    self.stopwrods = stopwords.words('english')
    self.label_dict = label_dict
    # BUGFIX: exist_ok=True avoids the exists()/makedirs() race (TOCTOU).
    os.makedirs(dump_model_dir, exist_ok=True)
    self.dump_model_fname = os.path.join(
        dump_model_dir, "bow_logistic_regression_model.pickle")
    self.dump_dictionary_fname = os.path.join(
        dump_model_dir, "bow_logistic_regression_dictionary.pickle")
    self.dump_label_encoder_fname = os.path.join(
        dump_model_dir, "bow_logistic_regression_label_encoder.pickle")
    # Lazily populated by load_model().
    self.model = None
    self.dictionary = None
    self.label_encoder = None
    self.load_model()
def __init__(self, dump_model_dir, train_file=None, model_name="gen", test_size=None,
             cv=10, normalize=False, is_bool_value=False, is_percentage=False,
             logger=None):
    """Initialize the lasso linear-regression helper and load/train the model.

    :param dump_model_dir: directory for model pickles (created if missing).
    :param train_file: optional training-data path; its basename can supply model_name.
    :param model_name: short model identifier; if None, derived from train_file.
    :param test_size: held-out split fraction; None selects LassoCV with `cv` folds.
    :param cv: number of cross-validation folds when test_size is None.
    :param normalize: whether to normalize features (passed through to load_model).
    :param is_bool_value: treat features as booleans (passed through).
    :param is_percentage: treat features as row percentages (passed through).
    :param logger: optional logger; a file logger is created when omitted.
    """
    self.logger = logger or File_Logger_Helper.get_logger(
        logger_fname="linear_regression")
    self.classifier_name = "linear_regression"
    # BUGFIX: exist_ok=True avoids the exists()/makedirs() race (TOCTOU).
    os.makedirs(dump_model_dir, exist_ok=True)
    # BUGFIX: derive model_name from train_file BEFORE building the pickle
    # paths; previously a None model_name embedded the literal "None" in
    # the dump filenames.
    if model_name is None and train_file is not None:
        model_name = re.sub(r"\.\w+$", "", basename(train_file))
    if test_size is None:  # idiom: `is None` instead of `== None`
        general_model_name = "lassocv{}_regression".format(cv)
    else:
        general_model_name = "lasso_regression_split{}".format(test_size)
    self.dump_model_fname = os.path.join(
        dump_model_dir, "{}_{}_model.pickle".format(model_name, general_model_name))
    self.dump_standard_scaler_fname = os.path.join(
        dump_model_dir,
        "{}_{}_standard_scaler.pickle".format(model_name, general_model_name))
    self.model_name = "{}_{}".format(model_name, general_model_name)
    self.out_coef_fname = os.path.join(
        config.ML_OUTPUT_DIR, "{}_coeficient.csv".format(self.model_name))
    # Lazily populated by load_model().
    self.model = None
    self.standard_scaler = None
    self.load_model(train_file, test_size, cv, normalize, is_bool_value, is_percentage)
def __init__(self, embedding_fname=os.path.join(config.WORD_EMBEDDING_DIR, 'glove.6B.100d.txt'),
             embedding_name="token_vector", num_words: int = 10000,
             embedding_vector_dimension: int = 100, max_text_len: int = 100,
             data_name="data", feature_name="f1.f2", target_name="t",
             replace_exists=False, logger=None):
    """Set up the word-embedding helper and initialize its backing model.

    :param embedding_fname: path to the pretrained embedding file.
    :param embedding_name: identifier for this embedding configuration.
    :param num_words: vocabulary size cap.
    :param embedding_vector_dimension: dimensionality of each word vector.
    :param max_text_len: maximum token length per text.
    :param data_name: short dataset identifier.
    :param feature_name: dot-separated feature column name(s).
    :param target_name: target column name.
    :param replace_exists: if True, rebuild even when a cached model exists.
    :param logger: optional logger; a file logger is created when omitted.
    """
    self.logger = logger or File_Logger_Helper.get_logger(
        logger_fname="word_embedding")
    # Embedding configuration.
    self.embedding_fname = embedding_fname
    self.embedding_name = embedding_name
    self.num_words = num_words
    self.embedding_vector_dimension = embedding_vector_dimension
    self.max_text_len = max_text_len
    # Dataset identifiers.
    self.data_name = data_name
    self.feature_name = feature_name
    self.target_name = target_name
    self.replace_exists = replace_exists
    # Built lazily by load_init_model().
    self.tokenizer = None
    self.embedding_matrix = None
    self.dump_model_dir = config.PREROCESS_PICKLES_DIR
    self.load_init_model()
def __init__(self, dump_model_dir, train_file=None, is_bool_value=False,
             standardize=False, model_name="gen", logger=None):
    """Initialize the LassoCV(10) linear-regression helper and load/train the model.

    :param dump_model_dir: directory for model pickles (created if missing).
    :param train_file: optional training-data path (passed through to load_model).
    :param is_bool_value: treat features as booleans (passed through).
    :param standardize: whether to standardize features (passed through).
    :param model_name: short model identifier used in all dump filenames.
    :param logger: optional logger; a file logger is created when omitted.
    """
    self.logger = logger or File_Logger_Helper.get_logger(
        logger_fname="linear_regression")
    self.classifier_name = "linear_regression"
    # BUGFIX: exist_ok=True avoids the exists()/makedirs() race (TOCTOU).
    os.makedirs(dump_model_dir, exist_ok=True)
    general_model_name = "lassocv10_linear_regression"
    self.dump_model_fname = os.path.join(
        dump_model_dir, "{}_{}_model.pickle".format(model_name, general_model_name))
    self.dump_dictionary_fname = os.path.join(
        dump_model_dir,
        "{}_{}_dictionary.pickle".format(model_name, general_model_name))
    self.dump_standard_scaler_fname = os.path.join(
        dump_model_dir,
        "{}_{}_standard_scaler.pickle".format(model_name, general_model_name))
    self.out_coef_fname = os.path.join(
        config.ML_OUTPUT_DIR,
        "{}_{}_coeficient.csv".format(model_name, general_model_name))
    self.model_name = "{}_{}".format(model_name, general_model_name)
    # Lazily populated by load_model().
    self.model = None
    self.dictionary = None
    self.standard_scaler = None
    self.load_model(train_file, is_bool_value, standardize)
def __init__(self, unique_name="feature_preprocess", is_sparse: bool = True,
             standardize: bool = False, one_hot_encode: bool = False,
             convert_bool: bool = False, convert_row_percentage: bool = False,
             normalize_text: bool = True, use_stem: bool = False,
             min_tokens: int = None, bag_of_word: bool = False,
             max_token_number: int = None, counter_ngram: int = None,
             embedding: bool = False, sentence_size_percentage: float = 1,
             min_word_freq: int = 1, data_name="data", feature_name="f1.f2",
             target_name="t", replace_exists=False, logger=None):
    """Configure the feature-preprocessing pipeline and load cached models.

    :param unique_name: identifier for this preprocessing configuration.
    :param is_sparse: produce sparse feature matrices.
    :param standardize: apply standard scaling.
    :param one_hot_encode: one-hot encode categorical features.
    :param convert_bool: convert features to booleans.
    :param convert_row_percentage: convert rows to percentages.
    :param normalize_text: normalize text before tokenizing.
    :param use_stem: apply Snowball stemming.
    :param min_tokens: minimum token count, or None for no minimum.
    :param bag_of_word: build bag-of-words features.
    :param max_token_number: cap on token vocabulary, or None.
    :param counter_ngram: n-gram size for count vectors, or None.
    :param embedding: build embedding features.
    :param sentence_size_percentage: fraction of sentence length to keep.
    :param min_word_freq: minimum word frequency to keep a token.
    :param data_name: short dataset identifier.
    :param feature_name: dot-separated feature column name(s).
    :param target_name: target column name.
    :param replace_exists: if True, rebuild even when cached models exist.
    :param logger: optional logger; a file logger is created when omitted.
    """
    self.logger = logger or File_Logger_Helper.get_logger(
        logger_fname="preprocessing")
    self.stemmer = SnowballStemmer('english')
    self.unique_name = unique_name
    self.is_sparse = is_sparse
    self.standardize = standardize
    self.one_hot_encode = one_hot_encode
    self.convert_bool = convert_bool
    self.convert_row_percentage = convert_row_percentage
    self.normalize_text = normalize_text
    self.use_stem = use_stem
    self.min_tokens: int = min_tokens
    self.bag_of_word = bag_of_word
    self.max_token_number = max_token_number
    self.counter_ngram = counter_ngram
    self.embedding = embedding
    self.sentence_size_percentage: float = sentence_size_percentage
    # BUGFIX: was `self.min_word_freq = 1,` — a trailing comma made this the
    # tuple (1,) and silently discarded the min_word_freq parameter.
    self.min_word_freq = min_word_freq
    # Fixme: we should use dictionary to save the features to vector
    self.standard_scaler = None
    self.one_hot_encoder = None
    self.dictionary = None
    self.counter_vector = None
    self.vocab_processor = None
    self.label_encoder = None
    self.replace_exists = replace_exists
    self.load_model_if_exists()
    # TODO: FIX ME in the future...........
    # self.data_lable_name = "fp_{}_{}_{}".format(data_name, feature_name, target_name)
    # (the .format(...) call on a placeholder-free string was a no-op;
    # the value is unchanged)
    self.data_lable_name = "fp_yes_no"
    self.load_label_model()
def __init__(self, logger=None):
    """Set up the training helper with a feature processor and empty model slots.

    :param logger: optional logger; a file logger is created when omitted.
    """
    self.logger = logger or File_Logger_Helper.get_logger(logger_fname="training")
    # Feature pipeline shared by all training runs.
    self.preprocessor = Feature_Processing()
    # Filled in once a model is selected/trained.
    self.model_name = None
    self.model = None
def __init__(self, y_gold: list, y_pred: list, X_gold: pd.Series = None,
             is_multi_class=False, logger=None):
    """Store gold/predicted labels and derive the set of class names.

    Gold and predicted labels must arrive with their original label values.

    :param y_gold: gold-standard labels.
    :param y_pred: predicted labels.
    :param X_gold: optional input series aligned with y_gold.
    :param is_multi_class: True when y_gold is a 2-D label matrix
        (one column per class) — assumes y_gold has a .shape attribute then.
    :param logger: optional logger; a file logger is created when omitted.
    """
    self.X_gold = X_gold
    self.y_gold = y_gold
    self.y_pred = y_pred
    self.is_multi_class = is_multi_class
    if is_multi_class is True:
        # One class name per column of the gold label matrix.
        self.class_names = list(range(self.y_gold.shape[1]))
    else:
        # Union of labels seen in either list.
        self.class_names = list(set(self.y_gold + self.y_pred))
    self.logger = logger or File_Logger_Helper.get_logger(
        logger_fname="evaluate.log")
def __init__(self, setting_file_list, X, y, multi_class=None, data_name="data",
             text_feature_name_list=("f1", "f2"),
             other_feature_name_list=("f1", "f2"),
             target_name_list=("t1", "t2"), num_crossvalidation=10,
             random_state=312018, test_split=None,
             evaluate_dir=os.path.join(config.EVALUATE_DATA_DIR, "crossv10"),
             predict_dir=os.path.join(config.EVALUATE_DATA_DIR, "crossv10"),
             logger=None):
    """Configure a cross-validation experiment over the given settings/data.

    :param setting_file_list: experiment setting files to run.
    :param X: feature data.
    :param y: target data.
    :param multi_class: multi-class handling mode, or None.
    :param data_name: short dataset identifier.
    :param text_feature_name_list: text feature column names (or None).
    :param other_feature_name_list: non-text feature column names (or None).
    :param target_name_list: target column names.
    :param num_crossvalidation: number of cross-validation folds.
    :param random_state: seed for reproducible splits.
    :param test_split: held-out split fraction, or None for CV only.
    :param evaluate_dir: directory for evaluation output.
    :param predict_dir: directory for prediction output.
    :param logger: optional logger; a file logger is created when omitted.
    """
    self.logger = logger or File_Logger_Helper.get_logger(logger_fname="exp")
    self.X = X
    # BUGFIX: `self.y = y` was assigned twice; the duplicate is removed.
    self.y = y
    self.multi_class = multi_class
    self.data_name = data_name
    self.text_feature_name_list = text_feature_name_list
    self.other_feature_name_list = other_feature_name_list
    # Combined feature list: text features first, then the others.
    self.feature_name = []
    if text_feature_name_list is not None:
        self.feature_name += text_feature_name_list
    if other_feature_name_list is not None:
        self.feature_name += other_feature_name_list
    self.target_name_list = target_name_list
    self.num_crossvalidation = num_crossvalidation
    self.random_state = random_state
    self.test_split = test_split
    # self.X_unlable = X_unlable
    self.evaluate_dir = evaluate_dir
    self.predict_dir = predict_dir
    self.mid = 0
    # NOTE(review): "crossv10" is hard-coded in the filename even though
    # num_crossvalidation is a parameter — confirm whether it should follow it.
    self.cv_evaluate_fname = os.path.join(
        evaluate_dir, "crossv10_{}.csv".format(time.time()))
    self.setting_file_list = setting_file_list
def __init__(self, logger=None):
    """Initialize per-fold metric accumulators and best-score trackers.

    :param logger: optional logger; a file logger is created when omitted.
    """
    self.logger = logger or File_Logger_Helper.get_logger(logger_fname="cv.log")
    # One accumulator list per metric, filled as folds complete.
    for metric_attr in ("mid_list", "roc_auc_list", "accuracy_list",
                        "micro_precision_list", "micro_recall_list",
                        "micro_f1_list", "macro_precision_list",
                        "macro_recall_list", "macro_f1_list",
                        "weighted_precision_list", "weighted_recall_list",
                        "weighted_f1_list"):
        setattr(self, metric_attr, [])
    # Best-so-far F1 values and the model ids that achieved them.
    for best_attr in ("best_macro_f1", "best_macro_f1_mid",
                      "best_micro_f1", "best_micro_f1_mid",
                      "best_weighted_f1", "best_weighted_f1_mid"):
        setattr(self, best_attr, None)
def __init__(self, classifier_name="cnn", num_words=10000, max_text_len=1600,
             embedding_vector_dimension=100,
             glove_fname=os.path.join(config.WORD_EMBEDDING_DIR, 'glove.6B.100d.txt'),
             data_name="data", feature_name="f1.f2", target_name="t", num_class=1,
             kernel_initializer='glorot_uniform', num_lstm_layer=5, drop_perc=0.1,
             learning_rate=1e-3, weight_decate_rate=0.7, l2_constraint=0,
             batch_size=100, epochs=10, logger=None):
    """Configure the CNN text classifier and its embedding layer, then try to
    load a previously trained model.

    :param classifier_name: identifier for this classifier.
    :param num_words: vocabulary size cap.
    :param max_text_len: maximum token length per text (stored as num_steps).
    :param embedding_vector_dimension: requested embedding size; overridden by
        the dimension read from glove_fname when a GloVe file is given.
    :param glove_fname: pretrained GloVe file, or None for plain token vectors.
    :param data_name: short dataset identifier.
    :param feature_name: dot-separated feature column name(s).
    :param target_name: target column name.
    :param num_class: number of output classes.
    :param kernel_initializer: Keras kernel initializer name.
    :param num_lstm_layer: number of LSTM layers.
    :param drop_perc: dropout rate.
    :param learning_rate: optimizer learning rate.
    :param weight_decate_rate: weight decay rate (name kept for compatibility).
    :param l2_constraint: L2 max-norm constraint (0 disables).
    :param batch_size: training batch size.
    :param epochs: training epochs.
    :param logger: optional logger; a file logger is created when omitted.
    """
    self.logger = logger or File_Logger_Helper.get_logger(logger_fname="CNN.log")
    self.feature_preprocessing = Feature_Processing()
    self.classifier_name = classifier_name
    self.num_class = num_class
    self.kernel_initializer = kernel_initializer
    self.num_words = num_words
    self.num_steps = max_text_len
    self.num_lstm_layer = num_lstm_layer
    self.drop_perc = drop_perc
    self.learning_rate = learning_rate
    self.weight_decate_rate = weight_decate_rate
    self.weight_decay = 1e-4
    self.l2_constraint = l2_constraint
    self.batch_size = batch_size
    self.epochs = epochs
    self.model = None
    # Initialize the embedding layer.
    if glove_fname is not None:
        self.embedding_helper = Word_Embedding(embedding_fname=glove_fname)
        if embedding_vector_dimension != self.embedding_helper.embedding_vector_dimension:
            self.logger.error(
                "Error, the embedding vector dimension should be {} instead of {}. Fix embedding_vector_dimension to {}"
                .format(
                    self.embedding_helper.embedding_vector_dimension,
                    embedding_vector_dimension,
                    self.embedding_helper.embedding_vector_dimension,
                ))
        # Trust the dimension actually read from the embedding file.
        self.embedding_vector_dimension = self.embedding_helper.embedding_vector_dimension
        self.embedding_name = "{}_{}_{}_{}".format(
            re.sub(r"\.txt", "_", os.path.basename(glove_fname)),
            data_name, feature_name, target_name)
    else:
        # If the embedding is not specified, we would use the plain token vector.
        self.embedding_helper = Word_Embedding()
        self.embedding_vector_dimension = embedding_vector_dimension
        self.embedding_name = "{}_{}_{}_{}".format(
            "token_vector", data_name, feature_name, target_name)
    # BUGFIX: pass the (possibly corrected) self.embedding_vector_dimension
    # instead of the raw parameter, so the cached-model name matches the
    # dimension actually used by the embedding helper.
    preprocess_name = self.embedding_helper.generate_model_name(
        embedding_name=self.embedding_name,
        num_words=num_words,
        embedding_vector_dimension=self.embedding_vector_dimension,
        max_text_len=max_text_len)
    self.load_model_if_exists(classifier_name=classifier_name,
                              preprocess_name=preprocess_name)