Example #1
def __grid_search(remaining_params, current_params, results_dict, train_set,
                  dev_set, kb, embeddings_array, ind2emoji, dataset_name,
                  in_dim, learning_rate, threshold):
    if len(remaining_params) > 0:
        # Get a parameter
        param, values = remaining_params.popitem()

        # For each candidate value of this parameter, copy current_params and record the value in next_params
        for value in values:
            next_params = current_params.copy()
            next_params[param] = value

            # Perform grid search on the remaining params
            __grid_search(remaining_params=remaining_params.copy(),
                          current_params=next_params,
                          results_dict=results_dict,
                          train_set=train_set,
                          dev_set=dev_set,
                          kb=kb,
                          embeddings_array=embeddings_array,
                          ind2emoji=ind2emoji,
                          dataset_name=dataset_name,
                          in_dim=in_dim,
                          learning_rate=learning_rate,
                          threshold=threshold)
    else:
        model_params = ModelParams(in_dim=in_dim,
                                   out_dim=current_params["out_dim"],
                                   max_epochs=current_params["max_epochs"],
                                   pos_ex=current_params["pos_ex"],
                                   neg_ratio=current_params["ratio"],
                                   learning_rate=learning_rate,
                                   dropout=current_params["dropout"],
                                   class_threshold=threshold)

        name = model_params.model_folder(dataset_name)
        # The larger the effective batch size (pos_ex * (neg_ratio + 1)), the more epochs
        # are needed to converge, so we scale max_epochs here rather than the batch size
        model_params.max_epochs = int(model_params.max_epochs *
                                      math.sqrt(model_params.pos_ex) *
                                      (model_params.neg_ratio + 1))
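        # Example: with pos_ex=4 and neg_ratio=1 the scale factor is sqrt(4) * (1 + 1) = 4,
        # so max_epochs=40 becomes 160.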

        results_dict[name] = train_save_evaluate(
            params=model_params,
            train_set=train_set,
            dev_set=dev_set,
            kb=kb,
            embeddings_array=embeddings_array,
            ind2emoji=ind2emoji,
            dataset_name=dataset_name)

    return results_dict
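
A minimal sketch of how this recursive search might be started; the grid keys mirror the current_params lookups above, and the data arguments (train_set, dev_set, kb, and so on) are assumed to be prepared elsewhere in the same module:

# Hypothetical entry call from the same module (the double-underscore name is
# only mangled inside classes, so a module-level call like this works).
param_grid = {
    "out_dim": [100, 200, 300],
    "max_epochs": [20, 40],
    "pos_ex": [4, 16],
    "ratio": [1, 2],
    "dropout": [0.0, 0.1],
}
results = __grid_search(remaining_params=param_grid,
                        current_params={},
                        results_dict={},
                        train_set=train_set,            # assumed prepared elsewhere
                        dev_set=dev_set,
                        kb=kb,
                        embeddings_array=embeddings_array,
                        ind2emoji=ind2emoji,
                        dataset_name="unicode",
                        in_dim=300,
                        learning_rate=0.001,
                        threshold=0.5)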
Example #2
def __run_fold(self, fold: DataPaths):
    conf = self.get_subconfig("input")
    padding: int = conf.get("padding")
    train: DataSet = DataSet(fold.train_path,
                             fold.meta_path,
                             padding=padding)
    test: DataSet = DataSet(fold.test_path,
                            fold.meta_path,
                            padding=padding)
    valid: DataSet = DataSet(fold.valid_path,
                             fold.meta_path,
                             padding=padding) if fold.valid_path else None
    features: List[Feature] = self.__create_features(
        train, self.base_path, fold)
    model_params = ModelParams(**self.get_subconfig("model"))
    model = TaggingModel(features,
                         train.column(self.config["input.target_col"]),
                         model_params)
    model.train(train, valid=valid, **self.get_subconfig("train"))
    pred: TaggingPrediction = model.test(test)
    self.after_fold(model, pred, fold)
    del model, features
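
The DataPaths type consumed here is not shown on this page; judging only from the attribute accesses above, it is presumably a small container along these lines (a hedged sketch, not the project's actual definition):

from dataclasses import dataclass
from typing import Optional

@dataclass
class DataPaths:
    # Field names inferred from fold.train_path / fold.test_path /
    # fold.meta_path / fold.valid_path in __run_fold above.
    train_path: str
    test_path: str
    meta_path: str
    valid_path: Optional[str] = None  # __run_fold tolerates a missing validation split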
Example #3
    return final_sample, final_review


if __name__ == '__main__':

    # Set global variables for emoji2vec
    in_dim = 100  # Length of word2vec vectors
    out_dim = 100  # Desired dimension of output vectors
    pos_ex = 4
    neg_ratio = 1
    max_epochs = 40
    dropout = 0.1
    params = ModelParams(in_dim=in_dim,
                         out_dim=out_dim,
                         pos_ex=pos_ex,
                         max_epochs=max_epochs,
                         neg_ratio=neg_ratio,
                         learning_rate=0.001,
                         dropout=dropout,
                         class_threshold=0.5)

    e2v_ours_path = params.model_folder('unicode') + '/emoji2vec_100.bin'

    # Load the FastText word vectors and emoji vectors
    w2v = gs.FastText.load(os.path.join(w2v_path, 'fasttext_model'))
    e2v_ours = gs.KeyedVectors.load_word2vec_format(e2v_ours_path, binary=True)
    # Combine the word vectors and emoji vectors together
    p2v_our_emoji = p2v.Phrase2Vec(out_dim, w2v, e2v=e2v_ours)

    # ========================= For the unprocessed text =========================
    tweet_combined_dataframe = utils.read_local_csv_file(
        path=read_data.tweet_combined_path,
Example #4
File: main.py Project: ollpu/qml
# struct.append(([0], 'Y', 1))
struct.append(([1], 'Y', 0))
struct.append(([0], 'Z', 1))
struct.append(([1], 'X', 0))
struct += [
    ([], 'X', 0),
    ([], 'Y', 0),
]

model = Model(2, struct)
# model = Model(1, [
#     ([], 'Y', 0)
# ])

params = UnsetParams()
tparams = ModelParams(model,
                      2 * np.pi * np.random.rand(len(model.structure) + 1))
tparams.params[-1] = np.random.rand(1) - 0.5
cost_evolution = []

for repi in range(5):
    tparams, one_ce = learner.learn(tparams, X, Y, 0.01, 1000)
    cost_evolution.append(one_ce)
    if tparams.cost < params.cost: params = tparams.copy()
    print(tparams.cost)
    tparams.params += np.random.normal(0, 1, tparams.params.shape)
print(params.cost)

export.write_qs(params)

Yc = params.classify(X)
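
The restart loop above keeps the best parameters across five perturbed runs, which only works on the first iteration if UnsetParams reports an infinite cost. A minimal sketch of such a sentinel (an assumption; the real definition lives elsewhere in ollpu/qml):

class UnsetParams:
    """Sentinel that loses every cost comparison, so the first trained
    ModelParams always replaces it via `params = tparams.copy()`."""
    cost = float("inf")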
Example #5
    def __init__(self):
        self.parser = arg.ArgumentParser(
            description="Parser for training/evaluationg emoji2vec model"
        )

        # Directories/files
        self.parser.add_argument(
            "-d",
            "--dir",
            default="./data/training/",
            type=str,
            help="directory for training data",
        )
        self.parser.add_argument(
            "-w",
            "--word",
            default="./data/word2vec/GoogleNews-vectors-negative300.bin.gz",
            type=str,
            help="path to the word2vec file",
        )
        self.parser.add_argument(
            "-m",
            "--mapping",
            default="emoji_mapping.p",
            type=str,
            help="emoji index mapping file",
        )
        self.parser.add_argument(
            "-em",
            "--embeddings",
            default="generated_embeddings.p",
            type=str,
            help="file for generated embeddings",
        )

        # Model parameters
        self.parser.add_argument(
            "-k",
            "--dim",
            default=300,
            type=int,
            help="train a 300 x k projection matrix",
        )
        self.parser.add_argument(
            "-b",
            "--batch",
            default=64,
            type=int,
            help="positive examples in minibatch (total size=batch*(1+ratio)",
        )
        self.parser.add_argument(
            "-e",
            "--epochs",
            default=20,
            type=int,
            help="number of training epochs",
        )
        self.parser.add_argument(
            "-r",
            "--ratio",
            default=1,
            type=int,
            help="ratio of negative examples to positive",
        )
        self.parser.add_argument(
            "-l", "--learning", default=0.001, type=float, help="learning rate"
        )
        self.parser.add_argument(
            "-dr",
            "--dropout",
            default=0.1,
            type=float,
            help="amount of dropout to use",
        )
        self.parser.add_argument(
            "-t",
            "--threshold",
            default=0.5,
            type=float,
            help="threshold for binary classification",
        )

        # Miscellaneous
        self.parser.add_argument(
            "-ds",
            "--dataset",
            default="unicode",
            type=str,
            help="unicode or emojipedia",
        )
        self.parser.add_argument("-D", "--debug", help="enable debugging")

        args = self.parser.parse_args()

        # dimensions of projected embeddings
        self.model_params = ModelParams(
            300,
            out_dim=args.dim,
            pos_ex=args.batch,
            max_epochs=args.epochs,
            neg_ratio=args.ratio,
            learning_rate=args.learning,
            dropout=args.dropout,
            class_threshold=args.threshold,
        )

        # debug mode?
        self.debug = args.debug

        # data folder
        self.data_folder = args.dir

        # file for generated embeddings
        self.embeddings_file = args.embeddings

        # file for emoji mappings
        self.mapping_file = args.mapping

        # word2vec file
        self.word2vec_file = args.word

        # dataset to choose: unicode or emojipedia
        self.dataset = args.dataset
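
For quick experimentation, the parser above can be exercised without a shell by overriding sys.argv before construction; Parser is a hypothetical name for the class this __init__ belongs to:

import sys

# Hypothetical smoke test; flag names are taken from the add_argument calls above.
sys.argv = ["train.py", "--dim", "200", "--epochs", "40", "--dataset", "unicode"]
p = Parser()
print(p.model_params)  # ModelParams built with out_dim=200, max_epochs=40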
Example #6
    def __init__(self):
        self.parser = arg.ArgumentParser(
            description='Parser for training/evaluating emoji2vec model')

        # Directories/files
        self.parser.add_argument('-d',
                                 '--dir',
                                 default='./data/training/',
                                 type=str,
                                 help='directory for training data')
        self.parser.add_argument(
            '-w',
            '--word',
            default='./data/w2v/w2v.twitter.edinburgh10M.400d.txt.word2vec.bin',
            type=str,
            help='path to the word2vec file')
        self.parser.add_argument('-m',
                                 '--mapping',
                                 default='emoji_mapping.p',
                                 type=str,
                                 help='emoji index mapping file')
        self.parser.add_argument('-em',
                                 '--embeddings',
                                 default='generated_embeddings.p',
                                 type=str,
                                 help='file for generated embeddings')

        # Model parameters
        self.parser.add_argument('-k',
                                 '--dim',
                                 default=400,
                                 type=int,
                                 help='train a 400 x k projection matrix')
        self.parser.add_argument(
            '-b',
            '--batch',
            default=4,
            type=int,
            help='positive examples in minibatch (total size=batch*(1+ratio))')
        self.parser.add_argument('-e',
                                 '--epochs',
                                 default=40,
                                 type=int,
                                 help='number of training epochs')
        self.parser.add_argument('-r',
                                 '--ratio',
                                 default=1,
                                 type=int,
                                 help='ratio of negative examples to positive')
        self.parser.add_argument('-l',
                                 '--learning',
                                 default=0.001,
                                 type=float,
                                 help='learning rate')
        self.parser.add_argument('-dr',
                                 '--dropout',
                                 default=0.1,
                                 type=float,
                                 help='amount of dropout to use')
        self.parser.add_argument('-t',
                                 '--threshold',
                                 default=0.5,
                                 type=float,
                                 help='threshold for binary classification')

        # Miscellaneous
        self.parser.add_argument('-ds',
                                 '--dataset',
                                 default='unicode',
                                 type=str,
                                 help='unicode or emojipedia')
        self.parser.add_argument('-D', '--debug', help='enable debugging')

        args = self.parser.parse_args()

        # dimensions of projected embeddings
        self.model_params = ModelParams(400,
                                        out_dim=args.dim,
                                        pos_ex=args.batch,
                                        max_epochs=args.epochs,
                                        neg_ratio=args.ratio,
                                        learning_rate=args.learning,
                                        dropout=args.dropout,
                                        class_threshold=args.threshold)

        # debug mode?
        self.debug = args.debug

        # data folder
        self.data_folder = args.dir

        # file for generated embeddings
        self.embeddings_file = args.embeddings

        # file for emoji mappings
        self.mapping_file = args.mapping

        # word2vec file
        self.word2vec_file = args.word

        # dataset to choose: unicode or emojipedia
        self.dataset = args.dataset
Example #7
def __grid_search(
    remaining_params,
    current_params,
    results_dict,
    train_set,
    dev_set,
    kb,
    embeddings_array,
    ind2emoji,
    dataset_name,
    in_dim,
    learning_rate,
    threshold,
):
    if len(remaining_params) > 0:
        # Get a parameter
        param, values = remaining_params.popitem()

        # For each candidate value of this parameter, copy current_params and record the value in next_params
        for value in values:
            next_params = current_params.copy()
            next_params[param] = value

            # Perform grid search on the remaining params
            __grid_search(
                remaining_params=remaining_params.copy(),
                current_params=next_params,
                results_dict=results_dict,
                train_set=train_set,
                dev_set=dev_set,
                kb=kb,
                embeddings_array=embeddings_array,
                ind2emoji=ind2emoji,
                dataset_name=dataset_name,
                in_dim=in_dim,
                learning_rate=learning_rate,
                threshold=threshold,
            )
    else:
        model_params = ModelParams(
            in_dim=in_dim,
            out_dim=current_params["out_dim"],
            max_epochs=current_params["max_epochs"],
            pos_ex=current_params["pos_ex"],
            neg_ratio=current_params["ratio"],
            learning_rate=learning_rate,
            dropout=current_params["dropout"],
            class_threshold=threshold,
        )

        name = model_params.model_folder(dataset_name)
        # The larger the effective batch size (pos_ex * (neg_ratio + 1)), the more epochs
        # are needed to converge, so we scale max_epochs here rather than the batch size
        model_params.max_epochs = int(
            model_params.max_epochs
            * math.sqrt(model_params.pos_ex)
            * (model_params.neg_ratio + 1)
        )

        results_dict[name] = train_save_evaluate(
            params=model_params,
            train_set=train_set,
            dev_set=dev_set,
            kb=kb,
            embeddings_array=embeddings_array,
            ind2emoji=ind2emoji,
            dataset_name=dataset_name,
        )

    return results_dict
Example #8
    path: ProjectPath = ProjectPath("ATIS_PATH")
    meta = path.join("meta.json").get()
    train_paths = [
        path.join("train.sequences.txt").get(),
        path.join("train.labels.txt").get()
    ]
    train: DataSet = DataSet(train_paths[0], meta, train_paths[1], padding=30)
    train, dev = train.train_test_split(0.9)
    test_paths = [
        path.join("test.sequences.txt").get(),
        path.join("test.labels.txt").get()
    ]
    test: DataSet = DataSet(test_paths[0], meta, test_paths[1], padding=30)

    features: List[Feature] = create_features(path)
    params: ModelParams = ModelParams(lstm_layers=1,
                                      lstm_size=200,
                                      learning_rate=0.008)
    params.restore_best_weights()
    params.sgd_with_restarts_scheduler(train,
                                       batch_size=32,
                                       max_lr=0.008,
                                       min_lr=0.001)
    model = TaggingModel(features, train.column("label"),
                         train.column("doclabel"), params)
    model.train(train, dev, epochs=21)  # 3 cycles -> (3 + 6 + 12 = 21 epochs)
    TaggingModel.save(model, path.join("model").get())
    pred: Tuple[TaggingPrediction, ClassificationPrediction] = model.test(test)
    pred[0].evaluate()
    pred[1].evaluate()
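
The epochs=21 choice matches the doubling warm-restart schedule noted in the comment above; a quick check of the cycle arithmetic:

# Three SGDR cycles whose lengths double at each restart: 3 + 6 + 12 = 21 epochs.
cycles = [3 * 2 ** i for i in range(3)]  # [3, 6, 12]
assert sum(cycles) == 21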