import argparse
import random
from collections import OrderedDict

import model
import prepro


def main():
    # Parse command-line arguments: train/dev/test files, an optional
    # dictionary, and the output model name.
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', required=True, help='Path to a training file')
    parser.add_argument('--dev', required=True, help='Path to a development file')
    parser.add_argument('--test', required=True, help='Path to a test file')
    parser.add_argument('--dict', default=None, help='Path to a dictionary file')
    parser.add_argument('--output', required=True, help='Output model name')
    args = parser.parse_args()

    # Set hyperparameters
    hp = {
        'LAYERS': 1,
        'THRESHOLD': 2,
        'DECAY': 3,
        'EPOCH': 25,
        'WINDOW_SIZE': 3,
        'DIM_UNI': 32,
        'DIM_BI': 16,
        'DIM_WORD': 16,
        'DIM_CTYPE': 8,
        'DIM_TAGEMB': 16,
        'DIM_HIDDEN': 100,
        'LEARNING_RATE': 0.075,
        'DROPOUT_RATE': 0.2,
        'TRAINSET': args.train,
        'TESTSET': args.test,
        'DEVSET': args.dev,
        'DICTIONARY': args.dict,
        'HYPERPARAMS': 'data/' + args.output + '.hp',
        'MODEL': 'data/' + args.output + '.model',
        'VOCAB': 'data/' + args.output + '.dict',
        'EPOCH_MODEL': 'data/epoch.model',
        'TMP_PRED': 'data/pred',
        'TMP_GOLD': 'data/gold'
    }

    # Set up vocabulary files
    vocabs = prepro.create_vocabs_from_trainset(trainset=hp['TRAINSET'],
                                                fn_dictionary=hp['DICTIONARY'],
                                                fn_vocabs=hp['VOCAB'])

    # Update hyperparameters with the vocabulary sizes
    hp['VOCAB_SIZE_UNI'] = len(vocabs[0])
    hp['VOCAB_SIZE_BI'] = len(vocabs[1])
    hp['VOCAB_SIZE_WORD'] = len(vocabs[2])
    hp['VOCAB_SIZE_POSTAG'] = len(vocabs[3])

    # Preprocess
    ws = hp['WINDOW_SIZE']
    TrainData = prepro.from_file(filename=hp['TRAINSET'], window_size=ws, vocabs=vocabs)
    TestData = prepro.from_file(filename=hp['TESTSET'], window_size=ws, vocabs=vocabs)
    DevData = prepro.from_file(filename=hp['DEVSET'], window_size=ws, vocabs=vocabs)

    # Construct networks
    _model = model.Model(hp=hp)

    # Start training (the `_start` training loop, defined elsewhere in this
    # module, is also used by fit() below)
    _start(hp, model=_model, train_data=TrainData, test_data=TestData,
           dev_data=DevData)
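# A hypothetical command-line invocation of main(), assuming this module is
# saved as train.py. The flag names match the argparse setup above; all file
# and model names are illustrative only:
#
#   python train.py --train train.txt --dev dev.txt --test test.txt \
#                   --dict user.dict --output sample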
def fit(train_file, dev_file, test_file, model_name, dict_file=None,
        emb_file=None, delimiter='\t', newline='EOS', layers=1, min_count=3,
        decay=1, epoch=10, window_size=3, dim_uni=32, dim_bi=16, dim_word=16,
        dim_ctype=8, dim_tagemb=16, dim_hidden=100, learning_rate=0.1,
        dropout_rate=0.3, seed=1234):
    """Train a joint word segmentation and sequence labeling (e.g., POS tagging,
    NER) model.

    args:
        - train_file (str): Path to a training file.
        - dev_file (str): Path to a development file for early stopping.
        - test_file (str): Path to a test file for evaluation.
        - model_name (str): Output model filename.
        - dict_file (str, optional): Path to a dictionary file.
        - emb_file (str, optional): Path to a pre-trained embedding file
          (word2vec format).
        - delimiter (str, optional): Character that separates a word from its
          tag in each line.
        - newline (str, optional): Symbol that separates sentences in the file.
        - layers (int, optional): Number of BiLSTM layers.
        - min_count (int, optional): Ignore all words with a total frequency
          lower than this.
        - decay (int, optional): Learning rate decay.
        - epoch (int, optional): Number of training epochs.
        - window_size (int, optional): Window size of the context characters
          for word segmentation.
        - dim_uni (int, optional): Dimensionality of the character-unigram vectors.
        - dim_bi (int, optional): Dimensionality of the character-bigram vectors.
        - dim_word (int, optional): Dimensionality of the word vectors.
        - dim_ctype (int, optional): Dimensionality of the character-type vectors.
        - dim_tagemb (int, optional): Dimensionality of the tag vectors.
        - dim_hidden (int, optional): Dimensionality of the BiLSTM's hidden layer.
        - learning_rate (float, optional): Learning rate of SGD.
        - dropout_rate (float, optional): Dropout rate of the input vectors
          for the BiLSTMs.
        - seed (int, optional): Random seed.

    return:
        - Nothing. After training finishes, however, the three model files
          (*.vocabs, *.params, *.hp) are saved in the current directory.
""" random.seed(seed) hp = OrderedDict({ 'LAYERS': layers, 'THRESHOLD': min_count, 'DECAY': decay, 'EPOCH': epoch, 'WINDOW_SIZE': window_size, 'DIM_UNI': dim_uni, 'DIM_BI': dim_bi, 'DIM_WORD': dim_word, 'DIM_CTYPE': dim_ctype, 'DIM_TAGEMB': dim_tagemb, 'DIM_HIDDEN': dim_hidden, 'LEARNING_RATE': learning_rate, 'DROPOUT_RATE': dropout_rate, 'SEED': seed, 'TRAINSET': train_file, 'TESTSET': test_file, 'DEVSET': dev_file, 'DICTIONARY': dict_file, 'EMBEDDING': emb_file, 'HYPERPARAMS': model_name + '.hp', 'MODEL': model_name + '.params', 'VOCAB': model_name + '.vocabs', 'EPOCH_MODEL': model_name + '_epoch.params' }) # Preprocess vocabs = prepro.create_vocabs_from_trainset(trainset=hp['TRAINSET'], fn_dictionary=hp['DICTIONARY'], fn_vocabs=hp['VOCAB'], delimiter=delimiter, newline=newline) if emb_file is not None: embs, dim_word = prepro.embedding_loader(fn_embedding=hp['EMBEDDING'], word2id=vocabs[2]) hp['DIM_WORD'] = dim_word else: embs = None TrainData = prepro.from_file(filename=hp['TRAINSET'], window_size=hp['WINDOW_SIZE'], vocabs=vocabs, delimiter=delimiter, newline=newline) TestData = prepro.from_file(filename=hp['TESTSET'], window_size=hp['WINDOW_SIZE'], vocabs=vocabs, delimiter=delimiter, newline=newline) DevData = prepro.from_file(filename=hp['DEVSET'], window_size=hp['WINDOW_SIZE'], vocabs=vocabs, delimiter=delimiter, newline=newline) # Update hyper-parameters hp['NUM_TRAIN'] = len(TrainData.ws_data) hp['NUM_TEST'] = len(TestData.ws_data) hp['NUM_DEV'] = len(DevData.ws_data) hp['VOCAB_SIZE_UNI'] = len(vocabs[0]) hp['VOCAB_SIZE_BI'] = len(vocabs[1]) hp['VOCAB_SIZE_WORD'] = len(vocabs[2]) hp['VOCAB_SIZE_POSTAG'] = len(vocabs[3]) # Construct networks _model = model.Model(hp=hp, embs=embs) # Start training _start(hp, model=_model, train_data=TrainData, test_data=TestData, dev_data=DevData)