def run_fasttext_setting(setting_file, home_dir, train_data_file, test_data_file, overwrite_params=None):
    """Run a fastText holdout experiment described by a setting file.

    Loads the properties file, merges the train/test CSVs into one matrix,
    then trains/evaluates via dnn_classifier.fit_fasttext_holdout with a
    train/test split at `train_size`.

    :param setting_file: path to the experiment properties file
    :param home_dir: prefix prepended to all relative paths in the settings
    :param train_data_file: training CSV (tab-delimited)
    :param test_data_file: test CSV (tab-delimited)
    :param overwrite_params: optional dict overriding individual settings
    """
    properties = exp_util.load_properties(setting_file)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties, overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this the Gensim compatible embedding file; a path ending in 'none'
    # means "train without pre-trained embeddings"
    dnn_embedding_file = home_dir + exp_util.load_setting("embedding_file", properties, overwrite_params)
    # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    if dnn_embedding_file.endswith('none'):
        dnn_embedding_file = None

    ######## dnn #######
    print("loading dataset...")
    df, train_size, test_size = exp_util. \
        load_and_merge_train_test_csvRakuten(train_data_file, test_data_file, delimiter="\t")

    class_col = int(exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' + exp_util.load_setting("label", properties, overwrite_params))

    print("fitting model...")
    # Each '|'-separated entry describes one input text column:
    # "<column index>,<descriptor>,<max text length>"
    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("training_text_data_columns", properties, overwrite_params).split("|"):
        config = x.split(",")
        col_info = {}
        col_info["text_col"] = config[0]
        col_info["text_length"] = int(config[2])
        col_info["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = col_info
        # BUG FIX: count was never incremented, so every column config
        # overwrote input_text_info[0] and only the last column was used
        # (compare the sibling run_fasttext_model, which increments it).
        count += 1

    dnn_classifier.fit_fasttext_holdout(df=df, split_at_row=train_size,
                                        class_col=class_col,
                                        outfolder=outfolder,
                                        task=exp_util.describe_task(properties, overwrite_params, setting_file),
                                        text_norm_option=1,
                                        text_input_info=input_text_info,
                                        embedding_file=dnn_embedding_file)
    print("Completed running on this setting file")
    print(datetime.datetime.now())
def run_fasttext_model(setting_file: str, properties: dict, df: numpy.ndarray, y,
                       train_size: int, class_col: int, outfolder: str,
                       dnn_embedding_file, text_field_mapping: dict,
                       overwrite_params: dict = None):
    """Train/evaluate a fastText model on an already-loaded dataset.

    Unlike run_fasttext_setting, the caller supplies the merged data matrix,
    labels and already-resolved paths; only the per-field text configuration
    is read from `properties`.

    :param setting_file: path to the properties file (used for task naming)
    :param properties: parsed properties dict
    :param df: merged train+test data matrix
    :param y: label column values
    :param train_size: row index where the test partition starts
    :param class_col: index of the class column in df
    :param outfolder: folder to save output to
    :param dnn_embedding_file: Gensim-compatible embedding file, or None
    :param text_field_mapping: maps field names to column indices in df
    :param overwrite_params: optional dict overriding individual settings
    """
    # BUG FIX: overwrite_params was referenced below but never defined in this
    # function's scope (NameError at runtime); it is now an optional parameter.
    print("\n" + str(datetime.datetime.now()))
    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' + exp_util.load_setting("label", properties, overwrite_params))

    print("fitting model...")
    # Each '|'-separated entry: "<field name>,<max text length>"
    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("text_fieldnames", properties, overwrite_params).split("|"):
        config = x.split(",")
        col_info = {}
        col_info["text_col"] = text_field_mapping[config[0]]
        col_info["text_length"] = int(config[1])
        col_info["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = col_info
        count += 1

    dnn_classifier.fit_fasttext_holdout(df=df, split_at_row=train_size,
                                        class_col=class_col,
                                        outfolder=outfolder,
                                        task=exp_util.describe_task(properties, overwrite_params, setting_file),
                                        text_norm_option=1,
                                        text_input_info=input_text_info,
                                        embedding_file=dnn_embedding_file)
    print("Completed running on this setting file")
    print(datetime.datetime.now())
def run_dnn_setting(setting_file, home_dir, overwrite_params=None, embedding_format=None):
    """Run n-fold cross-validated DNN experiments described by a setting file.

    For each model descriptor string (parsed by classifier_learn.learn_dnn),
    builds one DNN branch per configured input text column, merges the
    branches into a single model, and fits it with n-fold CV.

    :param setting_file: path to the experiment properties file
    :param home_dir: prefix prepended to all relative paths in the settings
    :param overwrite_params: optional dict overriding individual settings
    :param embedding_format: format hint passed to embedding_util.load_emb_model
    """
    properties = exp_util.load_properties(setting_file)

    word_weights_file = exp_util.load_setting('word_weights_file', properties, overwrite_params)
    # BUG FIX: identity comparison with None ('is None', not '== None')
    if word_weights_file is None:
        word_weights = None
    else:
        print("using word weights to revise embedding vectors...")
        word_weights = load_word_weights(word_weights_file)

    csv_training_text_data = home_dir + exp_util.load_setting(
        'training_text_data', properties, overwrite_params)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties, overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this the Gensim compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties, overwrite_params)
    # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    emb_model = embedding_util.load_emb_model(embedding_format, dnn_embedding_file)

    n_fold = int(exp_util.load_setting("n_fold", properties, overwrite_params))

    # in order to test different DNN architectures, I implemented a parser that analyses a string following
    # specific syntax, creates different architectures. This one here takes word embedding, pass it to 3
    # cnn layer then concatenate the output by max pooling finally into a softmax
    #
    # So you can add mulitple descriptors in to a list, and the program will go through each model structure, apply them
    # to the same dataset for experiments
    #
    # the descriptor is passed as a param to 'Classifer', which parses the string to create a model
    # see 'classifier_learn.py - learn_dnn method for details
    model_descriptors = [
        "input=2d bilstm=100-False|dense=?-softmax|emb",
        # "input=2d cnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=?-softmax|emb",
        "input=2d han_2dinput"
    ]
    # model_descriptors = [
    #     "input=2d han_2dinput"]
    # input=3d han_full|glv,
    # input=2d lstm=100-False|dense=?-softmax|glv
    # "scnn[2,3,4](conv1d=100,maxpooling1d=4)|maxpooling1d=4|flatten|dense=6-softmax|glv",
    # "scnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=6-softmax|glv"]

    ######## dnn #######
    print("loading dataset...")
    df = pd.read_csv(
        csv_training_text_data,
        header=0,
        delimiter=";",
        quoting=0,
        encoding="utf-8",
    )
    df = df.fillna('')
    # BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0; .values is
    # the equivalent that works on both old and new pandas versions.
    df = df.values

    class_col = int(
        exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' + exp_util.load_setting("label", properties, overwrite_params))
    for model_descriptor in model_descriptors:
        print("\tML model=" + model_descriptor)
        model_descriptor = model_descriptor.split(" ")[1]

        dnn_branches = []
        dnn_branch_input_shapes = []
        # Each '|'-separated entry: "<column index>,<descriptor>,<max text length>"
        input_text_info = {}
        count = 0
        for x in exp_util.load_setting("training_text_data_columns",
                                       properties, overwrite_params).split("|"):
            config = x.split(",")
            col_info = {}
            col_info["text_col"] = config[0]
            col_info["text_length"] = int(config[2])
            col_info["text_dim"] = util.DNN_EMBEDDING_DIM
            input_text_info[count] = col_info
            # a per-column 'simple' descriptor overrides the model-wide one
            if config[1] == 'simple':
                dnn_branch = dnn_classifier.create_dnn_branch(
                    col_info["text_length"],
                    util.DNN_EMBEDDING_DIM,
                    model_descriptor='simple')
            else:
                dnn_branch = dnn_classifier.create_dnn_branch(
                    col_info["text_length"],
                    util.DNN_EMBEDDING_DIM,
                    model_descriptor=model_descriptor)
            dnn_branches.append(dnn_branch[0])
            dnn_branch_input_shapes.append(dnn_branch[1])
            count += 1

        # now create DNN branches based on the required input text column sources
        print("creating merged model (if multiple input branches)")
        final_model = \
            dnn_classifier.merge_dnn_branch(dnn_branches, dnn_branch_input_shapes,
                                            target_classes)
        print("fitting model...")
        dnn_classifier.fit_dnn(df=df,
                               nfold=n_fold,
                               class_col=class_col,
                               final_model=final_model,
                               outfolder=outfolder,
                               task=exp_util.describe_task(
                                   properties, overwrite_params, setting_file),
                               model_descriptor=model_descriptor,
                               text_norm_option=1,
                               text_input_info=input_text_info,
                               embedding_model=emb_model,
                               embedding_model_format=embedding_format,
                               word_weights=word_weights)
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
def run_fasttext_setting(setting_file, home_dir, overwrite_params=None):
    """Run an n-fold cross-validated fastText experiment from a setting file.

    NOTE(review): this redefines run_fasttext_setting and shadows the earlier
    holdout variant with the same name defined above in this module — consider
    renaming one of them (kept as-is to preserve the public interface).

    :param setting_file: path to the experiment properties file
    :param home_dir: prefix prepended to all relative paths in the settings
    :param overwrite_params: optional dict overriding individual settings
    """
    properties = exp_util.load_properties(setting_file)

    csv_training_text_data = home_dir + exp_util.load_setting(
        'training_text_data', properties, overwrite_params)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties, overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this the Gensim compatible embedding file; a path ending in 'none'
    # means "train without pre-trained embeddings"
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties, overwrite_params)
    # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    if dnn_embedding_file.endswith('none'):
        dnn_embedding_file = None

    n_fold = int(exp_util.load_setting("n_fold", properties, overwrite_params))

    ######## dnn #######
    print("loading dataset...")
    df = pd.read_csv(
        csv_training_text_data,
        header=0,
        delimiter=";",
        quoting=0,
        encoding="utf-8",
    )
    df = df.fillna('')
    # BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0; .values is
    # the equivalent that works on both old and new pandas versions.
    df = df.values

    class_col = int(
        exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' + exp_util.load_setting("label", properties, overwrite_params))

    print("fitting model...")
    # Each '|'-separated entry: "<column index>,<descriptor>,<max text length>"
    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("training_text_data_columns",
                                   properties, overwrite_params).split("|"):
        config = x.split(",")
        col_info = {}
        col_info["text_col"] = config[0]
        col_info["text_length"] = int(config[2])
        col_info["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = col_info
        # BUG FIX: count was never incremented, so every column config
        # overwrote input_text_info[0] and only the last column was used.
        count += 1

    dnn_classifier.fit_fasttext(df=df,
                                nfold=n_fold,
                                class_col=class_col,
                                outfolder=outfolder,
                                task=exp_util.describe_task(
                                    properties, overwrite_params, setting_file),
                                text_norm_option=1,
                                text_input_info=input_text_info,
                                embedding_file=dnn_embedding_file)
    print("Completed running on this setting file")
    print(datetime.datetime.now())
def run_single_setting(setting_file, home_dir,
                       remove_rare_classes,
                       remove_no_desc_instances,
                       overwrite_params=None,
                       gensimFormat=None):
    """Run n-fold cross-validated DNN experiments with optional data filtering.

    Optionally removes instances with empty descriptions and classes with fewer
    than n_fold instances, then for each model descriptor builds one DNN branch
    per configured text column (pre-vectorising the text here, unlike
    run_dnn_setting), merges the branches and fits with n-fold CV.

    :param setting_file: path to the experiment properties file
    :param home_dir: prefix prepended to all relative paths in the settings
    :param remove_rare_classes: drop classes with fewer than n_fold instances
    :param remove_no_desc_instances: drop rows whose description column is empty
    :param overwrite_params: optional dict overriding individual settings
    :param gensimFormat: True to load via KeyedVectors.load; None = infer from
        the embedding file name containing '.gensim'
    """
    properties = exp_util.load_properties(setting_file)

    csv_training_text_data = home_dir + exp_util.load_setting(
        'training_text_data', properties, overwrite_params)
    # this is the folder containing other numeric features that are already pre-extracted
    # NOTE(review): loaded but not used anywhere below — kept because
    # load_setting may validate the property's presence.
    csv_training_other_feaures = home_dir + exp_util.load_setting(
        'training_other_features', properties, overwrite_params)
    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties, overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this the Gensim compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties, overwrite_params)
    # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    if gensimFormat is None:
        gensimFormat = ".gensim" in dnn_embedding_file
    if gensimFormat:
        pretrained_embedding_models = gensim.models.KeyedVectors.load(
            dnn_embedding_file, mmap='r')
    else:
        pretrained_embedding_models = gensim.models.KeyedVectors. \
            load_word2vec_format(dnn_embedding_file, binary=True)

    n_fold = int(exp_util.load_setting("n_fold", properties, overwrite_params))

    # in order to test different DNN architectures, I implemented a parser that analyses a string following
    # specific syntax, creates different architectures. This one here takes word embedding, pass it to 3
    # cnn layer then concatenate the output by max pooling finally into a softmax
    #
    # So you can add mulitple descriptors in to a list, and the program will go through each model structure, apply them
    # to the same dataset for experiments
    #
    # the descriptor is passed as a param to 'Classifer', which parses the string to create a model
    # see 'classifier_learn.py - learn_dnn method for details
    model_descriptors = [
        # "input=2d bilstm=100-False|dense=?-softmax|emb",
        "input=2d cnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=?-softmax|emb"
    ]
    # "input=2d han_2dinput"]
    # model_descriptors = [
    #     "input=2d han_2dinput"]
    # input=3d han_full|glv,
    # input=2d lstm=100-False|dense=?-softmax|glv
    # "scnn[2,3,4](conv1d=100,maxpooling1d=4)|maxpooling1d=4|flatten|dense=6-softmax|glv",
    # "scnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=6-softmax|glv"]

    ######## dnn #######
    print("loading dataset...")
    # BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0; .values is
    # the equivalent that works on both old and new pandas versions.
    df = pd.read_csv(
        csv_training_text_data,
        header=0,
        delimiter=";",
        quoting=0,
        encoding="utf-8",
    ).values
    # NOTE(review): the original code had a discarded 'df.astype(str)' here
    # (astype returns a new array; df itself was unchanged). Assigning it back
    # would change downstream behavior (e.g. float NaNs would become the string
    # "nan" instead of being mapped to "" below), so the dead statement was
    # removed rather than "fixed" — confirm the original intent.

    if remove_no_desc_instances:
        print(
            "you have chosen to remove instances whose description are empty")
        # column 5 is assumed to hold the description — TODO confirm
        df = exp_util.remove_empty_desc_instances(df, 5)

    y = df[:, int(
        exp_util.load_setting("class_column", properties, overwrite_params))]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    remove_instance_indexes = []
    if remove_rare_classes:
        print(
            "you have chosen to remove classes whose instances are less than n_fold"
        )
        instance_labels = list(y)
        class_dist = {x: instance_labels.count(x) for x in instance_labels}
        remove_labels = []
        for k, v in class_dist.items():
            if v < n_fold:
                remove_labels.append(k)
        remove_instance_indexes = []
        for i in range(len(y)):
            label = y[i]
            if label in remove_labels:
                remove_instance_indexes.append(i)
        y = numpy.delete(y, remove_instance_indexes)
        target_classes = len(set(y))

    print('[STARTED] running settings with label=' + exp_util.load_setting("label", properties, overwrite_params))
    for model_descriptor in model_descriptors:
        print("\tML model=" + model_descriptor)

        input_shape = model_descriptor.split(" ")[0]
        model_descriptor = model_descriptor.split(" ")[1]

        # currently unused downstream — kept for parity with the descriptor syntax
        if input_shape.endswith("2d"):
            input_as_2D = True
        else:
            input_as_2D = False

        # recurrent/attention models need zero-masking on padded positions
        if "han" in model_descriptor or "lstm" in model_descriptor:
            dnn_embedding_mask_zero = True
        else:
            dnn_embedding_mask_zero = False

        input_column_sources = \
            [x for x in exp_util.load_setting("training_text_data_columns",
                                              properties, overwrite_params).split("|")]
        # now create DNN branches based on the required input text column sources
        dnn_branches = []
        dnn_branch_input_shapes = []
        dnn_branch_input_features = []
        for string in input_column_sources:
            print("\tcreating model branch=" + string)
            config = string.split(",")
            col_index = config[0]
            embedding_trainable = False
            # column 13 gets trainable embeddings — TODO confirm why
            if col_index == '13':
                embedding_trainable = True
            text_data = cc.create_text_input_data(config[0], df)
            col_text_length = int(config[2])
            # keep text rows aligned with y after rare-class filtering
            text_data = numpy.delete(text_data, remove_instance_indexes)
            data = ["" if type(x) is float else str(x) for x in text_data]
            dnn_branch = dnn_classifier.create_dnn_branch_textinput(
                pretrained_embedding_models,
                input_text_data=data,
                input_text_sentence_length=col_text_length,
                input_text_word_embedding_dim=util.DNN_EMBEDDING_DIM,
                model_descriptor=model_descriptor,
                embedding_trainable=embedding_trainable,
                embedding_mask_zero=dnn_embedding_mask_zero)
            dnn_branches.append(dnn_branch[0])
            dnn_branch_input_shapes.append(dnn_branch[1])
            dnn_branch_input_features.append(dnn_branch[2])

        print("creating merged model (if multiple input branches)")
        final_model = \
            dnn_classifier.merge_dnn_branch(dnn_branches, dnn_branch_input_shapes,
                                            target_classes)
        print("fitting model...")
        dnn_classifier.fit_dnn(inputs=dnn_branch_input_features,
                               nfold=n_fold,
                               y_train=y,
                               final_model=final_model,
                               outfolder=outfolder,
                               task=exp_util.describe_task(
                                   properties, overwrite_params, setting_file),
                               model_descriptor=model_descriptor)
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
def run_dnn_models(properties: dict, df: numpy.ndarray, y, train_size: int,
                   class_col: int, out_folder: str,
                   embeddingmodel, embeddingformat,
                   word_weights: list, text_field_mapping: dict,
                   overwrite_params: dict = None, setting_file: str = None):
    """Run holdout DNN experiments on an already-loaded dataset.

    For each model descriptor string, builds one DNN branch per configured
    text field, merges the branches and fits via fit_dnn_holdout with a
    train/test split at `train_size`.

    :param properties: parsed properties dict
    :param df: merged train+test data matrix
    :param y: label column values
    :param train_size: row index where the test partition starts
    :param class_col: index of the class column in df
    :param out_folder: folder to save output to
    :param embeddingmodel: loaded embedding model
    :param embeddingformat: format hint for the embedding model
    :param word_weights: word weights used to revise embedding vectors, or None
    :param text_field_mapping: maps field names to column indices in df
    :param overwrite_params: optional dict overriding individual settings
    :param setting_file: path to the properties file (used for task naming)
    """
    # BUG FIX: overwrite_params and setting_file were referenced below but
    # never defined in this function's scope (NameError at runtime); they are
    # now optional parameters with backward-compatible defaults.

    # in order to test different DNN architectures, I implemented a parser that analyses a string following
    # specific syntax, creates different architectures. This one here takes word embedding, pass it to 3
    # cnn layer then concatenate the output by max pooling finally into a softmax
    #
    # So you can add mulitple descriptors in to a list, and the program will go through each model structure, apply them
    # to the same dataset for experiments
    #
    # the descriptor is passed as a param to 'Classifer', which parses the string to create a model
    # see 'classifier_learn.py - learn_dnn method for details
    model_descriptors = [
        "input=2d bilstm=100-False|dense=?-softmax|emb",
        # "input=2d cnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=?-softmax|emb",
        "input=2d han_2dinput"
    ]
    # model_descriptors = [
    #     "input=2d han_2dinput"]
    # input=3d han_full|glv,
    # input=2d lstm=100-False|dense=?-softmax|glv
    # "scnn[2,3,4](conv1d=100,maxpooling1d=4)|maxpooling1d=4|flatten|dense=6-softmax|glv",
    # "scnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=6-softmax|glv"]

    ######## dnn #######
    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' + exp_util.load_setting("label", properties, overwrite_params))
    for model_descriptor in model_descriptors:
        print("\tML model=" + model_descriptor)
        model_descriptor = model_descriptor.split(" ")[1]

        dnn_branches = []
        dnn_branch_input_shapes = []
        # Each '|'-separated entry: "<field name>,<max text length>"
        input_text_info = {}
        count = 0
        for x in exp_util.load_setting("text_fieldnames",
                                       properties, overwrite_params).split("|"):
            config = x.split(",")
            col_info = {}
            col_info["text_col"] = text_field_mapping[config[0]]
            col_info["text_length"] = int(config[1])
            col_info["text_dim"] = util.DNN_EMBEDDING_DIM
            input_text_info[count] = col_info
            dnn_branch = dnn_classifier.create_dnn_branch(
                col_info["text_length"],
                util.DNN_EMBEDDING_DIM,
                model_descriptor=model_descriptor)
            dnn_branches.append(dnn_branch[0])
            dnn_branch_input_shapes.append(dnn_branch[1])
            count += 1

        # now create DNN branches based on the required input text column sources
        print("creating merged model (if multiple input branches)")
        final_model = \
            dnn_classifier.merge_dnn_branch(dnn_branches, dnn_branch_input_shapes,
                                            target_classes)
        print("fitting model...")
        dnn_classifier.fit_dnn_holdout(df=df,
                                       split_at_row=train_size,
                                       class_col=class_col,
                                       final_model=final_model,
                                       outfolder=out_folder,
                                       task=exp_util.describe_task(
                                           properties, overwrite_params, setting_file),
                                       model_descriptor=model_descriptor,
                                       text_norm_option=1,
                                       text_input_info=input_text_info,
                                       embedding_model=embeddingmodel,
                                       embedding_model_format=embeddingformat,
                                       word_weights=word_weights)
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())