Example 1
def run_fasttext_setting(setting_file, home_dir,
                         train_data_file, test_data_file,
                         overwrite_params=None):
    properties = exp_util.load_properties(setting_file)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties, overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim-compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting("embedding_file", properties,
                                                          overwrite_params)  # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    if dnn_embedding_file.endswith('none'):
        dnn_embedding_file = None

    ######## dnn #######
    print("loading dataset...")
    df, train_size, test_size = exp_util.\
        load_and_merge_train_test_csvRakuten(train_data_file, test_data_file, delimiter="\t")
    class_col = int(exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))
    print('[STARTED] running settings with label=' + exp_util.load_setting("label", properties, overwrite_params))

    print("fitting model...")

    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("training_text_data_columns", properties, overwrite_params).split("|"):
        config = x.split(",")
        col_info = {}
        col_info["text_col"] = config[0]
        col_info["text_length"] = int(config[2])
        col_info["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = col_info
        count += 1

    dnn_classifier.fit_fasttext_holdout(df=df,
                                        split_at_row=train_size,
                                        class_col=class_col,
                                        outfolder=outfolder,
                                        task=exp_util.describe_task(properties, overwrite_params, setting_file),
                                        text_norm_option=1,
                                        text_input_info=input_text_info,
                                        embedding_file=dnn_embedding_file)
    print("Completed running on this setting file")
    print(datetime.datetime.now())
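
Note: the training_text_data_columns setting drives the input_text_info dict built above. A minimal, self-contained sketch of that parsing, assuming a hypothetical setting value of the shape the loop expects (column index, a middle field unused by this loop, max text length, with '|' separating columns):

# Hypothetical setting value; the real one comes from the .properties file.
DNN_EMBEDDING_DIM = 300  # stand-in for util.DNN_EMBEDDING_DIM

setting_value = "1,name,100|5,desc,500"

input_text_info = {}
for count, x in enumerate(setting_value.split("|")):
    config = x.split(",")
    input_text_info[count] = {"text_col": config[0],
                              "text_length": int(config[2]),
                              "text_dim": DNN_EMBEDDING_DIM}

print(input_text_info)
# {0: {'text_col': '1', 'text_length': 100, 'text_dim': 300},
#  1: {'text_col': '5', 'text_length': 500, 'text_dim': 300}}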
Example 2
def run_fasttext_model(setting_file: str, properties: dict, df: numpy.ndarray,
                       y, train_size: int, class_col: int, outfolder: str,
                       dnn_embedding_file, text_field_mapping: dict,
                       overwrite_params=None):

    print("\n" + str(datetime.datetime.now()))

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))
    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    print("fitting model...")

    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("text_fieldnames", properties,
                                   overwrite_params).split("|"):
        config = x.split(",")
        col_info = {}
        col_info["text_col"] = text_field_mapping[config[0]]
        col_info["text_length"] = int(config[1])
        col_info["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = col_info

        count += 1

    dnn_classifier.fit_fasttext_holdout(df=df,
                                        split_at_row=train_size,
                                        class_col=class_col,
                                        outfolder=outfolder,
                                        task=exp_util.describe_task(
                                            properties, overwrite_params,
                                            setting_file),
                                        text_norm_option=1,
                                        text_input_info=input_text_info,
                                        embedding_file=dnn_embedding_file)
    print("Completed running on this setting file")
    print(datetime.datetime.now())
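
Note: fit_fasttext_holdout takes the merged train+test array together with split_at_row=train_size. A tiny sketch of that convention, assuming (as the load_and_merge_* helpers suggest) that training rows come first and test rows are appended:

import numpy

# Toy merged dataset: 3 training rows followed by 2 test rows.
df = numpy.array([["a", 0], ["b", 1], ["c", 0], ["d", 1], ["e", 0]], dtype=object)
train_size = 3

train, test = df[:train_size], df[train_size:]
print(len(train), len(test))  # -> 3 2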
Example 3
def run_dnn_setting(setting_file, home_dir,
                    train_data_file, test_data_file,
                    overwrite_params=None,
                    embedding_format=None):
    properties = exp_util.load_properties(setting_file)

    word_weights_file = exp_util.load_setting('word_weights_file', properties, overwrite_params)
    if word_weights_file is None:
        word_weights = None
    else:
        word_weights = load_word_weights(word_weights_file)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties, overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim-compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting("embedding_file", properties,
                                                          overwrite_params)  # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    emb_model = embedding_util.load_emb_model(embedding_format, dnn_embedding_file)

    # In order to test different DNN architectures, I implemented a parser that analyses a string
    # written in a specific syntax and creates the corresponding architecture. For example, one
    # descriptor takes word embeddings, passes them through three CNN layers, concatenates the
    # outputs by max pooling, and finally feeds them into a softmax.
    #
    # You can add multiple descriptors to the list, and the program will build each model
    # structure and apply it to the same dataset for experiments.
    #
    # The descriptor is passed as a param to 'Classifer', which parses the string to create a
    # model; see 'classifier_learn.py', method 'learn_dnn', for details.

    model_descriptors = [
        "input=2d bilstm=100-False|dense=?-softmax|emb",
        #"input=2d cnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=?-softmax|emb",
        "input=2d han_2dinput"]
    # model_descriptors = [
    #     "input=2d han_2dinput"]

    # input=3d han_full|glv,
    # input=2d lstm=100-False|dense=?-softmax|glv

    # "scnn[2,3,4](conv1d=100,maxpooling1d=4)|maxpooling1d=4|flatten|dense=6-softmax|glv",
    # "scnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=6-softmax|glv"]

    ######## dnn #######
    print("loading dataset...")
    df, train_size, test_size = exp_util.\
        load_and_merge_train_test_csvRakuten(train_data_file, test_data_file, delimiter="\t")
    class_col = int(exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))


    print('[STARTED] running settings with label=' + exp_util.load_setting("label", properties, overwrite_params))

    for model_descriptor in model_descriptors:
        print("\tML model=" + model_descriptor)

        model_descriptor = model_descriptor.split(" ")[1]

        dnn_branches = []
        dnn_branch_input_shapes = []
        input_text_info = {}
        count = 0
        for x in exp_util.load_setting("training_text_data_columns", properties, overwrite_params).split("|"):
            config = x.split(",")
            col_info = {}
            col_info["text_col"] = config[0]
            col_info["text_length"] = int(config[2])
            col_info["text_dim"] = util.DNN_EMBEDDING_DIM
            input_text_info[count] = col_info
            dnn_branch = dnn_classifier.create_dnn_branch(col_info["text_length"],
                                                          util.DNN_EMBEDDING_DIM,
                                                          model_descriptor=model_descriptor)
            dnn_branches.append(dnn_branch[0])
            dnn_branch_input_shapes.append(dnn_branch[1])
            count += 1
        # now create DNN branches based on the required input text column sources

        print("creating merged model (if multiple input branches)")
        final_model = \
            dnn_classifier.merge_dnn_branch(dnn_branches, dnn_branch_input_shapes,
                                            target_classes)
        print("fitting model..."+str(datetime.datetime.now()))

        setting = os.path.splitext(os.path.basename(setting_file))[0]
        setting = setting[setting.index("_") + 1:]
        desc = 'setting=' + setting
        desc += '|embedding='
        desc += os.path.splitext(os.path.basename(
            exp_util.load_setting('embedding_file', properties, overwrite_params)))[0]

        dnn_classifier.fit_dnn_holdout(df=df,
                                       split_at_row=train_size,
                                       class_col=class_col,
                                       final_model=final_model,
                                       outfolder=outfolder,
                                       task=desc,
                                       model_descriptor=model_descriptor,
                                       text_norm_option=1,
                                       text_input_info=input_text_info,
                                       embedding_model=emb_model,
                                       embedding_model_format=embedding_format,
                                       word_weights=word_weights)
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
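
Note: the real descriptor parser lives in classifier_learn.py (learn_dnn) and is not shown here. Below is a minimal sketch of the grammar visible in the strings above ('|' separates layers, '=' introduces parameters, '-' separates parameter values); the function name and return shape are illustrative only:

def parse_descriptor(descriptor):
    """Hypothetical re-implementation: split a descriptor such as
    'bilstm=100-False|dense=?-softmax|emb' into (layer, params) pairs."""
    layers = []
    for part in descriptor.split("|"):
        if "=" in part:
            name, params = part.split("=", 1)
            layers.append((name, params.split("-")))
        else:
            layers.append((part, []))
    return layers

print(parse_descriptor("bilstm=100-False|dense=?-softmax|emb"))
# [('bilstm', ['100', 'False']), ('dense', ['?', 'softmax']), ('emb', [])]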
Example 4
def run_cml_setting(setting_file, home_dir,
                    train_data_file, test_data_file,
                    overwrite_params=None,
                    embedding_format=None):
    properties = exp_util.load_properties(setting_file)

    csv_training_text_data = home_dir + exp_util.load_setting('training_text_data', properties, overwrite_params)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties, overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim-compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting("embedding_file", properties,
                                                          overwrite_params)  # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    # print("embedding file is========="+dnn_embedding_file)
    emb_model = embedding_util.load_emb_model(embedding_format, dnn_embedding_file)

    print("loading dataset...")
    df, train_size, test_size = exp_util.\
        load_and_merge_train_test_csvRakuten(train_data_file, test_data_file, delimiter="\t")
    class_col = int(exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))
    print('[STARTED] running settings with label=' + exp_util.load_setting("label", properties, overwrite_params))

    print("fitting model...")

    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("training_text_data_columns", properties, overwrite_params).split("|"):
        config = x.split(",")
        col_info = {}
        col_info["text_col"] = config[0]
        col_info["text_length"] = int(config[2])
        col_info["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = col_info

        count += 1

    print("creating feature matrix")
    X_all = []
    for k, v in input_text_info.items():
        X_sub = tfe.get_aggr_embedding_vectors(df=df,
                                               text_col=v["text_col"],
                                               text_norm_option=1,
                                               aggr_option=1,
                                               emb_format=embedding_format,
                                               emb_model=emb_model,
                                               emb_dim=int(v["text_dim"]))
        X_all.append(X_sub)
    X_all = numpy.concatenate(X_all, axis=1)

    setting_file = setting_file[setting_file.rfind("/") + 1:]
    models = ["svm_l"]
    for model_name in models:
        print("\tML model=" + model_name)
        print("fitting model...")

        cls = cml.Classifer(setting_file, model_name, X_all[0:train_size, :], y[0:train_size], outfolder,
                            categorical_targets=target_classes,
                            nfold=None, algorithms=[model_name])
        trained_model = cls.run()[model_name]
        cls.eval_holdout(trained_model, model_name, X_all[train_size:, :], y[train_size:])

    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
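
Note: tfe.get_aggr_embedding_vectors reduces each text column to one fixed-size vector per row so a linear SVM can consume it. The exact reduction depends on aggr_option; the sketch below shows the simplest plausible variant (mean of the word vectors, zeros when every token is out of vocabulary), using a plain dict in place of a gensim model:

import numpy

# Toy stand-in for a gensim KeyedVectors model (word -> vector).
emb_dim = 4
toy_model = {"cotton": numpy.ones(emb_dim), "shirt": numpy.full(emb_dim, 2.0)}

def mean_embedding(text, model, dim):
    """Average the embeddings of known tokens; zero vector if none are known.
    This mirrors what an 'average' aggr_option plausibly does; the actual
    options inside tfe.get_aggr_embedding_vectors may differ."""
    vectors = [model[tok] for tok in text.lower().split() if tok in model]
    if not vectors:
        return numpy.zeros(dim)
    return numpy.mean(vectors, axis=0)

print(mean_embedding("Cotton shirt blue", toy_model, emb_dim))
# [1.5 1.5 1.5 1.5]  (mean of 'cotton' and 'shirt'; 'blue' is OOV)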
Example 5
def run_cml_setting(setting_file,
                    home_dir,
                    overwrite_params=None,
                    embedding_format=None):
    properties = exp_util.load_properties(setting_file)

    word_weights_file = exp_util.load_setting('word_weights_file', properties,
                                              overwrite_params)
    if word_weights_file is None:
        word_weights = None
    else:
        print("using word weights to revise embedding vectors")
        word_weights = load_word_weights(word_weights_file)

    csv_training_text_data = home_dir + exp_util.load_setting(
        'training_text_data', properties, overwrite_params)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim-compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties,
        overwrite_params)  # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    # print("embedding file is========="+dnn_embedding_file)
    emb_model = embedding_util.load_emb_model(embedding_format,
                                              dnn_embedding_file)

    n_fold = int(exp_util.load_setting("n_fold", properties, overwrite_params))

    ######## dnn #######
    print("loading dataset...")
    df = pd.read_csv(
        csv_training_text_data,
        header=0,
        delimiter=";",
        quoting=0,
        encoding="utf-8",
    )
    df = df.fillna('')
    df = df.to_numpy()  # .as_matrix() was removed in newer pandas
    class_col = int(
        exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("training_text_data_columns", properties,
                                   overwrite_params).split("|"):
        config = x.split(",")
        col_info = {}
        col_info["text_col"] = config[0]
        col_info["text_length"] = int(config[2])
        col_info["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = col_info

        count += 1

    print("creating feature matrix")
    X_all = []
    for k, v in input_text_info.items():
        X_sub = tfe.get_aggr_embedding_vectors(df=df,
                                               text_col=v["text_col"],
                                               text_norm_option=1,
                                               aggr_option=0,
                                               emb_format=embedding_format,
                                               emb_model=emb_model,
                                               emb_dim=int(v["text_dim"]))
        X_all.append(X_sub)
    X_all = numpy.concatenate(X_all, axis=1)

    setting_file = setting_file[setting_file.rfind("/") + 1:]
    models = ["svm_l"]
    for model_name in models:
        print("\tML model=" + model_name)
        print("fitting model...")

        cls = cml.Classifer(setting_file,
                            model_name,
                            X_all,
                            y,
                            outfolder,
                            categorical_targets=target_classes,
                            nfold=n_fold,
                            algorithms=[model_name])
        cls.run()

    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
Example 6
def run_fasttext_setting(setting_file, home_dir, overwrite_params=None):
    properties = exp_util.load_properties(setting_file)

    csv_training_text_data = home_dir + exp_util.load_setting(
        'training_text_data', properties, overwrite_params)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim-compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties,
        overwrite_params)  # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    if dnn_embedding_file.endswith('none'):
        dnn_embedding_file = None

    n_fold = int(exp_util.load_setting("n_fold", properties, overwrite_params))

    ######## dnn #######
    print("loading dataset...")
    df = pd.read_csv(
        csv_training_text_data,
        header=0,
        delimiter=";",
        quoting=0,
        encoding="utf-8",
    )
    df = df.fillna('')
    df = df.to_numpy()  # .as_matrix() was removed in newer pandas
    class_col = int(
        exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    print("fitting model...")

    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("training_text_data_columns", properties,
                                   overwrite_params).split("|"):
        config = x.split(",")
        col_info = {}
        col_info["text_col"] = config[0]
        col_info["text_length"] = int(config[2])
        col_info["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = col_info
        count += 1

    dnn_classifier.fit_fasttext(df=df,
                                nfold=n_fold,
                                class_col=class_col,
                                outfolder=outfolder,
                                task=exp_util.describe_task(
                                    properties, overwrite_params,
                                    setting_file),
                                text_norm_option=1,
                                text_input_info=input_text_info,
                                embedding_file=dnn_embedding_file)
    print("Completed running on this setting file")
    print(datetime.datetime.now())
Example 7
def run_mtl_setting(setting_file,
                    home_dir,
                    overwrite_params=None,
                    embedding_format=None):
    properties = exp_util.load_properties(setting_file)

    word_weights_file = exp_util.load_setting('word_weights_file', properties,
                                              overwrite_params)
    if word_weights_file is None:
        word_weights = None
    else:
        print("using word weights to revise embedding vectors...")
        word_weights = load_word_weights(word_weights_file)

    csv_training_text_data = home_dir + exp_util.load_setting(
        'training_text_data', properties, overwrite_params)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim-compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties,
        overwrite_params)  # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    # print("embedding file is========="+dnn_embedding_file)
    emb_model = embedding_util.load_emb_model(embedding_format,
                                              dnn_embedding_file)

    n_fold = int(exp_util.load_setting("n_fold", properties, overwrite_params))

    # In order to test different DNN architectures, I implemented a parser that analyses a string
    # written in a specific syntax and creates the corresponding architecture. For example, one
    # descriptor takes word embeddings, passes them through three CNN layers, concatenates the
    # outputs by max pooling, and finally feeds them into a softmax.
    #
    # You can add multiple descriptors to the list, and the program will build each model
    # structure and apply it to the same dataset for experiments.
    #
    # The descriptor is passed as a param to 'Classifer', which parses the string to create a
    # model; see 'classifier_learn.py', method 'learn_dnn', for details.
    model_descriptors = [
        "input=2d bilstm=100-False|dense=?-softmax|emb",
        # "input=2d cnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=?-softmax|emb",
        "input=2d han_2dinput"
    ]
    # model_descriptors = [
    #     "input=2d han_2dinput"]

    # input=3d han_full|glv,
    # input=2d lstm=100-False|dense=?-softmax|glv

    # "scnn[2,3,4](conv1d=100,maxpooling1d=4)|maxpooling1d=4|flatten|dense=6-softmax|glv",
    # "scnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=6-softmax|glv"]

    ######## dnn #######
    print("loading dataset...")
    df = pd.read_csv(
        csv_training_text_data,
        header=0,
        delimiter=";",
        quoting=0,
        encoding="utf-8",
    )
    df = df.fillna('')
    df = df.as_matrix()

    # stats about the main task
    maintask_class_col = int(
        exp_util.load_setting("class_column", properties, overwrite_params))
    main_y = df[:, maintask_class_col]
    target_classes = len(set(main_y))
    print("\ttotal classes=" + str(target_classes))

    # stats about auxiliary tasks
    auxtask_class_col = exp_util.load_setting("class_auxiliary", properties,
                                              overwrite_params)
    if auxtask_class_col is None:
        print("Not MTL, quit.")
        exit(1)
    auxtask_class_cols = []
    aux_classes = []
    for i in auxtask_class_col.split(","):
        i = int(i)
        aux_y = df[:, i]
        aux_cls = len(set(aux_y))
        print("\t\tauxiliary task with classes=" + str(aux_cls))
        auxtask_class_cols.append(i)
        aux_classes.append(aux_cls)

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    for model_descriptor in model_descriptors:
        print("\tML shared model=" + model_descriptor)

        model_descriptor = model_descriptor.split(" ")[1]

        dnn_branches = []
        dnn_branch_input_shapes = []
        input_text_info = {}
        count = 0
        for x in exp_util.load_setting("training_text_data_columns",
                                       properties,
                                       overwrite_params).split("|"):
            config = x.split(",")
            col_info = {}
            col_info["text_col"] = config[0]
            col_info["text_length"] = int(config[2])
            col_info["text_dim"] = util.DNN_EMBEDDING_DIM
            input_text_info[count] = col_info
            dnn_branch = dnn_classifier.create_dnn_branch(
                col_info["text_length"],
                util.DNN_EMBEDDING_DIM,
                model_descriptor=model_descriptor)
            dnn_branches.append(dnn_branch[0])
            dnn_branch_input_shapes.append(dnn_branch[1])
            count += 1
        # now create DNN branches based on the required input text column sources

        print("creating MTL model (if multiple input branches)")
        final_model = \
            mtl_classifier.create_mtl_layers(dnn_branches, dnn_branch_input_shapes,
                                             target_classes, aux_classes)
        print("fitting model...")

        mtl_classifier.fit_dnn_mtl(df=df,
                                   nfold=n_fold,
                                   main_class_col=maintask_class_col,
                                   aux_class_cols=auxtask_class_cols,
                                   final_model=final_model,
                                   outfolder=outfolder,
                                   task=exp_util.describe_task(
                                       properties, overwrite_params,
                                       setting_file),
                                   model_descriptor=model_descriptor,
                                   text_norm_option=1,
                                   text_input_info=input_text_info,
                                   embedding_model=emb_model,
                                   embedding_model_format=embedding_format,
                                   word_weights=word_weights)
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
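
Note: mtl_classifier.create_mtl_layers is repo-internal; the sketch below only illustrates the hard-parameter-sharing layout it appears to build (shared layers plus one softmax head for the main task and one per auxiliary task). All layer sizes and class counts are made up:

# Minimal hard-parameter-sharing sketch (hypothetical sizes), not the
# actual mtl_classifier.create_mtl_layers implementation.
from keras.layers import Input, Dense
from keras.models import Model

shared_input = Input(shape=(128,))           # e.g. a merged branch output
shared = Dense(64, activation="relu")(shared_input)

main_out = Dense(10, activation="softmax", name="main_task")(shared)
aux_outs = [Dense(n, activation="softmax", name="aux_task_%d" % i)(shared)
            for i, n in enumerate([3, 5])]   # one head per auxiliary task

model = Model(inputs=shared_input, outputs=[main_out] + aux_outs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
model.summary()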
Example 8
def run_single_setting(setting_file,
                       home_dir,
                       remove_rare_classes,
                       remove_no_desc_instances,
                       overwrite_params=None,
                       gensimFormat=None):
    properties = exp_util.load_properties(setting_file)

    csv_training_text_data = home_dir + exp_util.load_setting(
        'training_text_data', properties, overwrite_params)
    # this is the folder containing other numeric features that are already pre-extracted
    csv_training_other_feaures = home_dir + exp_util.load_setting(
        'training_other_features', properties, overwrite_params)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim-compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties,
        overwrite_params)  # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    if gensimFormat is None:
        gensimFormat = ".gensim" in dnn_embedding_file
    if gensimFormat:
        pretrained_embedding_models = gensim.models.KeyedVectors.load(
            dnn_embedding_file, mmap='r')
    else:
        pretrained_embedding_models = gensim.models.KeyedVectors. \
            load_word2vec_format(dnn_embedding_file, binary=True)

    n_fold = int(exp_util.load_setting("n_fold", properties, overwrite_params))

    # In order to test different DNN architectures, I implemented a parser that analyses a string
    # written in a specific syntax and creates the corresponding architecture. For example, one
    # descriptor takes word embeddings, passes them through three CNN layers, concatenates the
    # outputs by max pooling, and finally feeds them into a softmax.
    #
    # You can add multiple descriptors to the list, and the program will build each model
    # structure and apply it to the same dataset for experiments.
    #
    # The descriptor is passed as a param to 'Classifer', which parses the string to create a
    # model; see 'classifier_learn.py', method 'learn_dnn', for details.
    model_descriptors = [
        #"input=2d bilstm=100-False|dense=?-softmax|emb",
        "input=2d cnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=?-softmax|emb"
    ]
    # "input=2d han_2dinput"]
    # model_descriptors = [
    #     "input=2d han_2dinput"]

    # input=3d han_full|glv,
    # input=2d lstm=100-False|dense=?-softmax|glv

    # "scnn[2,3,4](conv1d=100,maxpooling1d=4)|maxpooling1d=4|flatten|dense=6-softmax|glv",
    # "scnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=6-softmax|glv"]

    ######## dnn #######
    print("loading dataset...")
    df = pd.read_csv(
        csv_training_text_data,
        header=0,
        delimiter=";",
        quoting=0,
        encoding="utf-8",
    ).to_numpy()  # .as_matrix() was removed in newer pandas
    df = df.astype(str)  # astype returns a new array, so reassign
    if remove_no_desc_instances:
        print("you have chosen to remove instances whose descriptions are empty")
        df = exp_util.remove_empty_desc_instances(df, 5)

    y = df[:, int(exp_util.load_setting("class_column", properties, overwrite_params))]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))
    remove_instance_indexes = []
    if remove_rare_classes:
        print("you have chosen to remove classes with fewer than n_fold instances")
        instance_labels = list(y)
        class_dist = {x: instance_labels.count(x) for x in instance_labels}
        remove_labels = []
        for k, v in class_dist.items():
            if v < n_fold:
                remove_labels.append(k)
        remove_instance_indexes = []
        for i in range(len(y)):
            label = y[i]
            if label in remove_labels:
                remove_instance_indexes.append(i)
        y = numpy.delete(y, remove_instance_indexes)
        target_classes = len(set(y))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    for model_descriptor in model_descriptors:
        print("\tML model=" + model_descriptor)

        input_shape = model_descriptor.split(" ")[0]
        model_descriptor = model_descriptor.split(" ")[1]

        input_as_2D = input_shape.endswith("2d")

        # HAN and LSTM variants use zero-masking on the embedding layer
        dnn_embedding_mask_zero = "han" in model_descriptor or "lstm" in model_descriptor

        input_column_sources = \
            exp_util.load_setting("training_text_data_columns", properties, overwrite_params).split("|")
        # now create DNN branches based on the required input text column sources

        dnn_branches = []
        dnn_branch_input_shapes = []
        dnn_branch_input_features = []
        for string in input_column_sources:
            print("\tcreating model branch=" + string)
            config = string.split(",")
            col_index = config[0]

            embedding_trainable = False
            if col_index == '13':
                embedding_trainable = True

            text_data = cc.create_text_input_data(config[0], df)

            col_text_length = int(config[2])

            text_data = numpy.delete(text_data, remove_instance_indexes)
            data = ["" if type(x) is float else str(x) for x in text_data]

            dnn_branch = dnn_classifier.create_dnn_branch_textinput(
                pretrained_embedding_models,
                input_text_data=data,
                input_text_sentence_length=col_text_length,
                input_text_word_embedding_dim=util.DNN_EMBEDDING_DIM,
                model_descriptor=model_descriptor,
                embedding_trainable=embedding_trainable,
                embedding_mask_zero=dnn_embedding_mask_zero)

            dnn_branches.append(dnn_branch[0])
            dnn_branch_input_shapes.append(dnn_branch[1])
            dnn_branch_input_features.append(dnn_branch[2])

        print("creating merged model (if multiple input branches)")
        final_model = \
            dnn_classifier.merge_dnn_branch(dnn_branches, dnn_branch_input_shapes,
                                            target_classes)
        print("fitting model...")
        dnn_classifier.fit_dnn(inputs=dnn_branch_input_features,
                               nfold=n_fold,
                               y_train=y,
                               final_model=final_model,
                               outfolder=outfolder,
                               task=exp_util.describe_task(
                                   properties, overwrite_params, setting_file),
                               model_descriptor=model_descriptor)
        print("Completed running all models on this setting file")
        print(datetime.datetime.now())
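
Note: the rare-class filter above calls list.count once per instance, which is quadratic in the dataset size. An equivalent version of the same filtering using collections.Counter:

import numpy
from collections import Counter

def remove_rare_classes_from(y, n_fold):
    """Drop instances whose label occurs fewer than n_fold times.
    Returns the filtered labels and the removed indexes, matching
    what the loops above compute."""
    counts = Counter(y.tolist())
    remove_idx = [i for i, label in enumerate(y) if counts[label] < n_fold]
    return numpy.delete(y, remove_idx), remove_idx

y = numpy.array(["a", "a", "b", "a", "c", "c"])
filtered, removed = remove_rare_classes_from(y, n_fold=2)
print(filtered, removed)  # ['a' 'a' 'a' 'c' 'c'] [2]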
Example 9
def run_setting(
        setting_file,
        home_dir,
        train_data_file,
        test_data_file,
        model_choice,  # 'dnn' (cnn, bilstm, han), 'cml' (svm), or 'fasttext'
        dataset_type: str,  # mwpd, wdc, rakuten, icecat or fakerev
        dataset_text_field_mapping: dict,
        overwrite_params=None,
        embedding_format=None):
    properties = exp_util.load_properties(setting_file)

    word_weights_file = exp_util.load_setting('word_weights_file', properties,
                                              overwrite_params)
    if word_weights_file is None:
        word_weights = None
    else:
        print("using word weights to revise embedding vectors...")
        word_weights = load_word_weights(word_weights_file)

    print("loading dataset...")
    if dataset_type == "mwpd":
        df, train_size, test_size = exp_util. \
            load_and_merge_train_test_data_jsonMPWD(train_data_file, test_data_file)
    elif dataset_type == "rakuten":
        df, train_size, test_size = exp_util. \
            load_and_merge_train_test_csvRakuten(train_data_file, test_data_file, delimiter="\t")
    elif dataset_type == "icecat":
        df, train_size, test_size = exp_util. \
            load_and_merge_train_test_data_jsonIceCAT(train_data_file, test_data_file)
    elif dataset_type == "fakerev":
        df, train_size, test_size = exp_util. \
            load_and_merge_train_test_data_productfakerev(train_data_file, test_data_file)
    else:  #wdc
        df, train_size, test_size = exp_util. \
            load_and_merge_train_test_data_jsonWDC(train_data_file, test_data_file)

    #numpy.nan_to_num(df)

    class_fieldname = exp_util.load_setting("class_fieldname", properties,
                                            overwrite_params)
    class_col = dataset_text_field_mapping[class_fieldname]
    y = df[:, class_col]

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim-compatible embedding file
    dnn_embedding_file = exp_util.load_setting("embedding_file", properties,
                                               overwrite_params)
    if dnn_embedding_file is not None and dnn_embedding_file.lower() != 'none':
        dnn_embedding_file = home_dir + dnn_embedding_file  # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
        print("\t will use this embedding data model: " + dnn_embedding_file)

    # print("embedding file is========="+dnn_embedding_file)
    if embedding_format == 'none':
        emb_model = None
    else:
        emb_model = embedding_util.load_emb_model(embedding_format,
                                                  dnn_embedding_file)

    if model_choice == 'dnn':
        run_dnn_models(properties, df, y, train_size, class_col, outfolder,
                       emb_model, embedding_format, word_weights,
                       dataset_text_field_mapping)
    elif model_choice == 'cml':
        run_cml_models(setting_file, properties, df, y, train_size, class_col,
                       outfolder, dnn_embedding_file, emb_model,
                       embedding_format, dataset_text_field_mapping)
    else:
        run_fasttext_model(setting_file, properties, df, y, train_size,
                           class_col, outfolder, dnn_embedding_file,
                           dataset_text_field_mapping)
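
Note: a hypothetical invocation of run_setting; every path, filename and mapping below is a placeholder. The dataset_text_field_mapping keys must match the class_fieldname and text_fieldnames values in the .properties file, and map them to column indexes in the loaded array:

# Hypothetical invocation (all paths, filenames and mappings are placeholders).
field_mapping = {"Name": 1, "Description": 2, "CategoryLabel": 0}

run_setting("settings/mwpd_cnn.properties",   # setting_file
            "/home/experiments/",             # home_dir
            "data/train.json",
            "data/test.json",
            model_choice="dnn",               # 'dnn', 'cml' or 'fasttext'
            dataset_type="mwpd",
            dataset_text_field_mapping=field_mapping,
            embedding_format="gensim")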
Example 10
def run_cml_models(setting_file: str, properties: dict, df: numpy.ndarray, y,
                   train_size: int, class_col: int, out_folder: str,
                   embeddingmodel_file: str, embeddingmodel, embeddingformat,
                   text_field_mapping: dict, overwrite_params=None):
    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("text_fieldnames", properties,
                                   overwrite_params).split("|"):
        config = x.split(",")
        col_info = {}
        col_info["text_col"] = text_field_mapping[config[0]]
        col_info["text_length"] = int(config[1])
        col_info["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = col_info

        count += 1

    print("creating feature matrix")
    X_all = []
    for k, v in input_text_info.items():
        # print(v)
        # if v["text_col"]==5:
        #     print("here")
        X_sub = tfe.get_aggr_embedding_vectors(df=df,
                                               text_col=v["text_col"],
                                               text_norm_option=1,
                                               aggr_option=0,
                                               emb_format=embeddingformat,
                                               emb_model=embeddingmodel,
                                               emb_dim=int(v["text_dim"]))
        X_all.append(X_sub)
    X_all = numpy.concatenate(X_all, axis=1)

    X_train = X_all[0:train_size]
    X_test = X_all[train_size:]
    y_train = y[0:train_size]
    y_test = y[train_size:]

    setting_file = setting_file[setting_file.rfind("/") + 1:]

    models = ["svm_l"]
    for model_name in models:
        identifier = model_name + "|" + embeddingmodel_file[embeddingmodel_file.rfind("/") + 1:]
        print("\tML model and embedding=" + model_name)
        print("fitting model...")

        cls = cml.Classifer(setting_file,
                            identifier,
                            X_train,
                            y_train,
                            out_folder,
                            categorical_targets=len(set(y)),
                            nfold=None,
                            algorithms=[model_name])
        trained_model = cls.run()[model_name]
        cls.eval_holdout(trained_model, model_name, X_test, y_test)

    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
Example 11
def run_dnn_models(properties: dict, df: numpy.ndarray, y, train_size: int,
                   class_col: int, out_folder: str, embeddingmodel,
                   embeddingformat, word_weights: list,
                   text_field_mapping: dict, setting_file: str = None,
                   overwrite_params=None):
    # In order to test different DNN architectures, I implemented a parser that analyses a string
    # written in a specific syntax and creates the corresponding architecture. For example, one
    # descriptor takes word embeddings, passes them through three CNN layers, concatenates the
    # outputs by max pooling, and finally feeds them into a softmax.
    #
    # You can add multiple descriptors to the list, and the program will build each model
    # structure and apply it to the same dataset for experiments.
    #
    # The descriptor is passed as a param to 'Classifer', which parses the string to create a
    # model; see 'classifier_learn.py', method 'learn_dnn', for details.
    model_descriptors = [
        "input=2d bilstm=100-False|dense=?-softmax|emb",
        #"input=2d cnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=?-softmax|emb",
        "input=2d han_2dinput"
    ]
    # model_descriptors = [
    #     "input=2d han_2dinput"]

    # input=3d han_full|glv,
    # input=2d lstm=100-False|dense=?-softmax|glv

    # "scnn[2,3,4](conv1d=100,maxpooling1d=4)|maxpooling1d=4|flatten|dense=6-softmax|glv",
    # "scnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=6-softmax|glv"]

    ######## dnn #######

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    for model_descriptor in model_descriptors:
        print("\tML model=" + model_descriptor)

        model_descriptor = model_descriptor.split(" ")[1]

        dnn_branches = []
        dnn_branch_input_shapes = []
        input_text_info = {}
        count = 0
        for x in exp_util.load_setting("text_fieldnames", properties,
                                       overwrite_params).split("|"):
            config = x.split(",")
            col_info = {}
            col_info["text_col"] = text_field_mapping[config[0]]
            col_info["text_length"] = int(config[1])
            col_info["text_dim"] = util.DNN_EMBEDDING_DIM
            input_text_info[count] = col_info

            # if config[1] == 'simple':
            #     dnn_branch = dnn_classifier.create_dnn_branch(col_info["text_length"],
            #                                                   util.DNN_EMBEDDING_DIM,
            #                                                   model_descriptor='simple'
            #                                                   )
            # else:
            dnn_branch = dnn_classifier.create_dnn_branch(
                col_info["text_length"],
                util.DNN_EMBEDDING_DIM,
                model_descriptor=model_descriptor)
            dnn_branches.append(dnn_branch[0])
            dnn_branch_input_shapes.append(dnn_branch[1])
            count += 1
        # now create DNN branches based on the required input text column sources

        print("creating merged model (if multiple input branches)")
        final_model = \
            dnn_classifier.merge_dnn_branch(dnn_branches, dnn_branch_input_shapes,
                                            target_classes)
        print("fitting model...")

        dnn_classifier.fit_dnn_holdout(df=df,
                                       split_at_row=train_size,
                                       class_col=class_col,
                                       final_model=final_model,
                                       outfolder=out_folder,
                                       task=exp_util.describe_task(
                                           properties, overwrite_params,
                                           setting_file),
                                       model_descriptor=model_descriptor,
                                       text_norm_option=1,
                                       text_input_info=input_text_info,
                                       embedding_model=embeddingmodel,
                                       embedding_model_format=embeddingformat,
                                       word_weights=word_weights)
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
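
Note: dnn_classifier.merge_dnn_branch is repo-internal; this sketch only illustrates the merge it appears to perform, concatenating the per-column branch outputs before a final softmax sized to target_classes. All shapes and sizes are hypothetical:

# Hypothetical merge of two text branches (not the repo's exact code).
from keras.layers import Input, Dense, Embedding, Flatten, concatenate
from keras.models import Model

def toy_branch(seq_len, vocab=1000, dim=50):
    inp = Input(shape=(seq_len,))
    out = Flatten()(Embedding(vocab, dim)(inp))
    return inp, out

in1, out1 = toy_branch(100)   # e.g. the 'name' column
in2, out2 = toy_branch(500)   # e.g. the 'description' column

merged = concatenate([out1, out2])
softmax = Dense(10, activation="softmax")(merged)  # 10 = target_classes
model = Model(inputs=[in1, in2], outputs=softmax)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")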