def __grid_search(remaining_params, current_params, results_dict, train_set,
                  dev_set, kb, embeddings_array, ind2emoji, dataset_name,
                  in_dim, learning_rate, threshold):
    if len(remaining_params) > 0:
        # Take one unexplored hyperparameter off the grid
        param, values = remaining_params.popitem()

        # For each candidate value, copy current_params and record the value in next_params
        for value in values:
            next_params = current_params.copy()
            next_params[param] = value

            # Recurse on the remaining hyperparameters
            __grid_search(remaining_params=remaining_params.copy(),
                          current_params=next_params,
                          results_dict=results_dict,
                          train_set=train_set,
                          dev_set=dev_set,
                          kb=kb,
                          embeddings_array=embeddings_array,
                          ind2emoji=ind2emoji,
                          dataset_name=dataset_name,
                          in_dim=in_dim,
                          learning_rate=learning_rate,
                          threshold=threshold)
    else:
        model_params = ModelParams(in_dim=in_dim,
                                   out_dim=current_params["out_dim"],
                                   max_epochs=current_params["max_epochs"],
                                   pos_ex=current_params["pos_ex"],
                                   neg_ratio=current_params["ratio"],
                                   learning_rate=learning_rate,
                                   dropout=current_params["dropout"],
                                   class_threshold=threshold)
        name = model_params.model_folder(dataset_name)

        # The larger the batch size, the more epochs are needed to converge,
        # so we scale max_epochs (not the batch size) accordingly here
        model_params.max_epochs = int(model_params.max_epochs
                                      * math.sqrt(model_params.pos_ex)
                                      * (model_params.neg_ratio + 1))

        results_dict[name] = train_save_evaluate(params=model_params,
                                                 train_set=train_set,
                                                 dev_set=dev_set,
                                                 kb=kb,
                                                 embeddings_array=embeddings_array,
                                                 ind2emoji=ind2emoji,
                                                 dataset_name=dataset_name)

    return results_dict
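# --- Illustrative only: a minimal driver for the recursive grid search
# above. The grid keys must match the current_params lookups (out_dim,
# max_epochs, pos_ex, ratio, dropout); the value ranges and the surrounding
# variables (train_set, dev_set, kb, embeddings_array, ind2emoji) are
# assumptions, not taken from the original code.
example_grid = {
    "out_dim": [100, 300],
    "max_epochs": [20, 40],
    "pos_ex": [4, 16],
    "ratio": [1, 2],
    "dropout": [0.0, 0.1],
}
results = __grid_search(remaining_params=example_grid,
                        current_params={},
                        results_dict={},
                        train_set=train_set,
                        dev_set=dev_set,
                        kb=kb,
                        embeddings_array=embeddings_array,
                        ind2emoji=ind2emoji,
                        dataset_name="unicode",
                        in_dim=100,
                        learning_rate=0.001,
                        threshold=0.5)
# Assuming train_save_evaluate returns a scalar score, pick the best setup
best_name = max(results, key=results.get)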
def __run_fold(self, fold: DataPaths):
    conf = self.get_subconfig("input")
    padding: int = conf.get("padding")
    train: DataSet = DataSet(fold.train_path, fold.meta_path, padding=padding)
    test: DataSet = DataSet(fold.test_path, fold.meta_path, padding=padding)
    valid: DataSet = (DataSet(fold.valid_path, fold.meta_path, padding=padding)
                      if fold.valid_path else None)
    features: List[Feature] = self.__create_features(train, self.base_path, fold)
    model_params = ModelParams(**self.get_subconfig("model"))
    model = TaggingModel(features, train.column(self.config["input.target_col"]),
                         model_params)
    model.train(train, valid=valid, **self.get_subconfig("train"))
    pred: TaggingPrediction = model.test(test)
    self.after_fold(model, pred, fold)
    del model, features
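# --- Illustrative only: DataPaths is not shown above, so its fields are
# inferred from the attribute accesses in __run_fold; the driver method
# below (meant to sit inside the same class) and the fold layout are
# assumptions.
from dataclasses import dataclass
from typing import Optional

@dataclass
class DataPaths:
    train_path: str
    test_path: str
    meta_path: str
    valid_path: Optional[str] = None

def run_all_folds(self, n_folds: int = 5):
    # Hypothetical cross-validation driver: one model per fold
    for i in range(n_folds):
        self.__run_fold(DataPaths(train_path=f"fold{i}/train.txt",
                                  test_path=f"fold{i}/test.txt",
                                  meta_path=f"fold{i}/meta.json",
                                  valid_path=f"fold{i}/valid.txt"))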
    return final_sample, final_review


if __name__ == '__main__':
    # Set global variables for emoji2vec
    in_dim = 100     # Length of word2vec vectors
    out_dim = 100    # Desired dimension of output vectors
    pos_ex = 4
    neg_ratio = 1
    max_epochs = 40
    dropout = 0.1

    params = ModelParams(in_dim=in_dim, out_dim=out_dim, pos_ex=pos_ex,
                         max_epochs=max_epochs, neg_ratio=neg_ratio,
                         learning_rate=0.001, dropout=dropout,
                         class_threshold=0.5)
    e2v_ours_path = params.model_folder('unicode') + '/emoji2vec_100.bin'

    # Load the FastText word vectors and emoji vectors
    w2v = gs.FastText.load(os.path.join(w2v_path, 'fasttext_model'))
    e2v_ours = gs.KeyedVectors.load_word2vec_format(e2v_ours_path, binary=True)

    # Combine the word vectors and emoji vectors together
    p2v_our_emoji = p2v.Phrase2Vec(out_dim, w2v, e2v=e2v_ours)

    # ===================== For the unprocessed text =====================
    tweet_combined_dataframe = utils.read_local_csv_file(
        path=read_data.tweet_combined_path,
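# --- Illustrative only: assuming Phrase2Vec supports item-style lookup on
# mixed word/emoji text, the combined space could be queried like this;
# the phrase itself is just an example.
example_vec = p2v_our_emoji['good morning \u2600\ufe0f']
print(example_vec.shape)  # expected: (out_dim,), i.e. (100,)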
# struct.append(([0], 'Y', 1))
struct.append(([1], 'Y', 0))
struct.append(([0], 'Z', 1))
struct.append(([1], 'X', 0))
struct += [
    ([], 'X', 0),
    ([], 'Y', 0),
]

model = Model(2, struct)
# model = Model(1, [
#     ([], 'Y', 0)
# ])

params = UnsetParams()
tparams = ModelParams(model, 2 * np.pi * np.random.rand(len(model.structure) + 1))
tparams.params[-1] = np.random.rand(1) - 0.5

cost_evolution = []
for repi in range(5):
    tparams, one_ce = learner.learn(tparams, X, Y, 0.01, 1000)
    cost_evolution.append(one_ce)
    if tparams.cost < params.cost:
        params = tparams.copy()
    print(tparams.cost)
    tparams.params += np.random.normal(0, 1, tparams.params.shape)

print(params.cost)
export.write_qs(params)

Yc = params.classify(X)
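# --- Illustrative only: a quick training-set accuracy check on the labels
# predicted above; assumes params.classify returns hard labels directly
# comparable to Y (the label convention is an assumption).
accuracy = np.mean(np.asarray(Yc) == np.asarray(Y))
print(f'train accuracy: {accuracy:.3f}')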
def __init__(self):
    self.parser = arg.ArgumentParser(
        description="Parser for training/evaluating emoji2vec model"
    )

    # Directories/files
    self.parser.add_argument(
        "-d",
        "--dir",
        default="./data/training/",
        type=str,
        help="directory for training data",
    )
    self.parser.add_argument(
        "-w",
        "--word",
        default="./data/word2vec/GoogleNews-vectors-negative300.bin.gz",
        type=str,
        help="path to the word2vec file",
    )
    self.parser.add_argument(
        "-m",
        "--mapping",
        default="emoji_mapping.p",
        type=str,
        help="emoji index mapping file",
    )
    self.parser.add_argument(
        "-em",
        "--embeddings",
        default="generated_embeddings.p",
        type=str,
        help="file for generated embeddings",
    )

    # Model parameters
    self.parser.add_argument(
        "-k",
        "--dim",
        default=300,
        type=int,
        help="train a 300 x k projection matrix",
    )
    self.parser.add_argument(
        "-b",
        "--batch",
        default=64,
        type=int,
        help="positive examples in minibatch (total size = batch * (1 + ratio))",
    )
    self.parser.add_argument(
        "-e",
        "--epochs",
        default=20,
        type=int,
        help="number of training epochs",
    )
    self.parser.add_argument(
        "-r",
        "--ratio",
        default=1,
        type=int,
        help="ratio of negative examples to positive",
    )
    self.parser.add_argument(
        "-l", "--learning", default=0.001, type=float, help="learning rate"
    )
    self.parser.add_argument(
        "-dr",
        "--dropout",
        default=0.1,
        type=float,
        help="amount of dropout to use",
    )
    self.parser.add_argument(
        "-t",
        "--threshold",
        default=0.5,
        type=float,
        help="threshold for binary classification",
    )

    # Miscellaneous
    self.parser.add_argument(
        "-ds",
        "--dataset",
        default="unicode",
        type=str,
        help="unicode or emojipedia",
    )
    self.parser.add_argument("-D", "--debug", help="enable debugging")

    args = self.parser.parse_args()

    # dimensions of projected embeddings
    self.model_params = ModelParams(
        300,
        out_dim=args.dim,
        pos_ex=args.batch,
        max_epochs=args.epochs,
        neg_ratio=args.ratio,
        learning_rate=args.learning,
        dropout=args.dropout,
        class_threshold=args.threshold,
    )

    # debug mode?
    self.debug = args.debug
    # data folder
    self.data_folder = args.dir
    # file for generated embeddings
    self.embeddings_file = args.embeddings
    # file for emoji mappings
    self.mapping_file = args.mapping
    # word2vec file
    self.word2vec_file = args.word
    # dataset to choose: unicode or emojipedia
    self.dataset = args.dataset
def __init__(self):
    self.parser = arg.ArgumentParser(
        description='Parser for training/evaluating emoji2vec model')

    # Directories/files
    self.parser.add_argument('-d', '--dir', default='./data/training/',
                             type=str, help='directory for training data')
    self.parser.add_argument(
        '-w', '--word',
        default='./data/w2v/w2v.twitter.edinburgh10M.400d.txt.word2vec.bin',
        type=str, help='path to the word2vec file')
    self.parser.add_argument('-m', '--mapping', default='emoji_mapping.p',
                             type=str, help='emoji index mapping file')
    self.parser.add_argument('-em', '--embeddings',
                             default='generated_embeddings.p', type=str,
                             help='file for generated embeddings')

    # Model parameters
    self.parser.add_argument('-k', '--dim', default=400, type=int,
                             help='train a 400 x k projection matrix')
    self.parser.add_argument(
        '-b', '--batch', default=4, type=int,
        help='positive examples in minibatch (total size = batch * (1 + ratio))')
    self.parser.add_argument('-e', '--epochs', default=40, type=int,
                             help='number of training epochs')
    self.parser.add_argument('-r', '--ratio', default=1, type=int,
                             help='ratio of negative examples to positive')
    self.parser.add_argument('-l', '--learning', default=0.001, type=float,
                             help='learning rate')
    self.parser.add_argument('-dr', '--dropout', default=0.1, type=float,
                             help='amount of dropout to use')
    self.parser.add_argument('-t', '--threshold', default=0.5, type=float,
                             help='threshold for binary classification')

    # Miscellaneous
    self.parser.add_argument('-ds', '--dataset', default='unicode', type=str,
                             help='unicode or emojipedia')
    self.parser.add_argument('-D', '--debug', help='enable debugging')

    args = self.parser.parse_args()

    # dimensions of projected embeddings
    self.model_params = ModelParams(400, out_dim=args.dim,
                                    pos_ex=args.batch,
                                    max_epochs=args.epochs,
                                    neg_ratio=args.ratio,
                                    learning_rate=args.learning,
                                    dropout=args.dropout,
                                    class_threshold=args.threshold)

    # debug mode?
    self.debug = args.debug
    # data folder
    self.data_folder = args.dir
    # file for generated embeddings
    self.embeddings_file = args.embeddings
    # file for emoji mappings
    self.mapping_file = args.mapping
    # word2vec file
    self.word2vec_file = args.word
    # dataset to choose: unicode or emojipedia
    self.dataset = args.dataset
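# --- Illustrative only: a minimal entry point for either parser class
# above; the class name ArgParser is a placeholder, since the snippets
# show only the __init__ body.
if __name__ == '__main__':
    cli = ArgParser()
    print('dataset: %s' % cli.dataset)
    print('model folder: %s' % cli.model_params.model_folder(cli.dataset))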
path: ProjectPath = ProjectPath("ATIS_PATH")
meta = path.join("meta.json").get()

train_paths = [
    path.join("train.sequences.txt").get(),
    path.join("train.labels.txt").get()
]
train: DataSet = DataSet(train_paths[0], meta, train_paths[1], padding=30)
train, dev = train.train_test_split(0.9)

test_paths = [
    path.join("test.sequences.txt").get(),
    path.join("test.labels.txt").get()
]
test: DataSet = DataSet(test_paths[0], meta, test_paths[1], padding=30)

features: List[Feature] = create_features(path)

params: ModelParams = ModelParams(lstm_layers=1, lstm_size=200, learning_rate=0.008)
params.restore_best_weights()
params.sgd_with_restarts_scheduler(train, batch_size=32, max_lr=0.008, min_lr=0.001)

model = TaggingModel(features, train.column("label"), train.column("doclabel"), params)
model.train(train, dev, epochs=21)  # 3 restart cycles -> (3 + 6 + 12 = 21 epochs)
TaggingModel.save(model, path.join("model").get())

pred: Tuple[TaggingPrediction, ClassificationPrediction] = model.test(test)
pred[0].evaluate()
pred[1].evaluate()
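# --- Illustrative only: reloading the saved model for later inference;
# assumes TaggingModel.load mirrors the TaggingModel.save call above.
restored = TaggingModel.load(path.join("model").get())
restored_pred = restored.test(test)
restored_pred[0].evaluate()  # token-level tagging metrics
restored_pred[1].evaluate()  # sequence-level classification metrics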