Example #1
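# Assumed context (not shown in this snippet): `prepro` and `model` are local
# modules of the surrounding project, and `fit` is defined elsewhere in the
# same file. `args` is a parsed argparse namespace; a hypothetical parser for
# it is sketched after this example.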
def main():
    # Set hyperparameters
    hp = {
        'LAYERS': 1,
        'THRESHOLD': 2,
        'DECAY': 3,
        'EPOCH': 25,
        'WINDOW_SIZE': 3,
        'DIM_UNI': 32,
        'DIM_BI': 16,
        'DIM_WORD': 16,
        'DIM_CTYPE': 8,
        'DIM_TAGEMB': 16,
        'DIM_HIDDEN': 100,
        'LEARNING_RATE': 0.075,
        'DROPOUT_RATE': 0.2,
        'TRAINSET': args.train,
        'TESTSET': args.test,
        'DEVSET': args.dev,
        'DICTIONARY': args.dict,
        'HYPERPARAMS': 'data/' + args.output + '.hp',
        'MODEL': 'data/' + args.output + '.model',
        'VOCAB': 'data/' + args.output + '.dict',
        'EPOCH_MODEL': 'data/epoch.model',
        'TMP_PRED': 'data/pred',
        'TMP_GOLD': 'data/gold'
    }

    # Set up vocabulary files
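    # `vocabs` holds four vocabularies (char-unigram, char-bigram, word,
    # POS-tag), as the VOCAB_SIZE_* updates below indicate.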
    vocabs = prepro.create_vocabs_from_trainset(trainset=hp['TRAINSET'],
                                                fn_dictionary=hp['DICTIONARY'],
                                                fn_vocabs=hp['VOCAB'])

    # Update hyper-parameters
    hp['VOCAB_SIZE_UNI'] = len(vocabs[0])
    hp['VOCAB_SIZE_BI'] = len(vocabs[1])
    hp['VOCAB_SIZE_WORD'] = len(vocabs[2])
    hp['VOCAB_SIZE_POSTAG'] = len(vocabs[3])
    # Preprocess
    ws = hp['WINDOW_SIZE']
    TrainData = prepro.from_file(filename=hp['TRAINSET'],
                                 window_size=ws,
                                 vocabs=vocabs)
    TestData = prepro.from_file(filename=hp['TESTSET'],
                                window_size=ws,
                                vocabs=vocabs)
    DevData = prepro.from_file(filename=hp['DEVSET'],
                               window_size=ws,
                               vocabs=vocabs)
    # Construct networks
    _model = model.Model(hp=hp)
    # Start training
    fit(hp,
        model=_model,
        train_data=TrainData,
        test_data=TestData,
        dev_data=DevData)
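
The snippet above reads a module-level `args` object that the excerpt never defines. Below is a minimal sketch of the command-line wiring it implies; the flag names and help strings are hypothetical, inferred only from the attributes used (args.train, args.test, args.dev, args.dict, args.output):

import argparse

# Hypothetical parser; only the attribute names are taken from the snippet.
parser = argparse.ArgumentParser()
parser.add_argument('--train', required=True, help='path to the training set')
parser.add_argument('--test', required=True, help='path to the test set')
parser.add_argument('--dev', required=True, help='path to the development set')
parser.add_argument('--dict', default=None, help='optional dictionary file')
parser.add_argument('--output', required=True,
                    help='basename for the files written under data/')
args = parser.parse_args()

if __name__ == '__main__':
    main()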
Example #2
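import random
from collections import OrderedDict

# `prepro`, `model`, and `_start` are assumed to be provided by the
# surrounding module/package (not shown in this snippet).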
def fit(train_file,
        dev_file,
        test_file,
        model_name,
        dict_file=None,
        emb_file=None,
        delimiter='\t',
        newline='EOS',
        layers=1,
        min_count=3,
        decay=1,
        epoch=10,
        window_size=3,
        dim_uni=32,
        dim_bi=16,
        dim_word=16,
        dim_ctype=8,
        dim_tagemb=16,
        dim_hidden=100,
        learning_rate=0.1,
        dropout_rate=0.3,
        seed=1234):
    """Train a joint word segmentation and sequence labeling (e.g, POS-tagging, NER) model.

    args:
        - train_file (str): Path to a train file.
        - dev_file (str): Path to a development file for early stopping.
        - test_file (str): Path to a test file for evaluation.
        - model_name (str): Output model filename.
        - dict_file (str, optional): Path to a dictionary file.
        - emb_file (str, optional): Path to a pre-trained embedding file (word2vec format).
        - delimiter (str, optional): Separator between the word and its tag on each line.
        - newline (str, optional): Marker that separates sentences in the file.
        - layers (int, optional): Number of RNN layers.
        - min_count (int, optional): Ignores all words with total frequency lower than this.
        - decay (int, optional): Learning rate decay.
        - epoch (int, optional): Number of training epochs.
        - window_size (int, optional): Window size of the context characters for word segmentation.
        - dim_uni (int, optional): Dimensionality of the char-unigram vectors.
        - dim_bi (int, optional): Dimensionality of the char-bigram vectors.
        - dim_word (int, optional): Dimensionality of the word vectors.
        - dim_ctype (int, optional): Dimensionality of the character-type vectors.
        - dim_tagemb (int, optional): Dimensionality of the tag vectors.
        - dim_hidden (int, optional): Dimensionality of the BiLSTM's hidden layer.
        - learning_rate (float, optional): Learning rate of SGD.
        - dropout_rate (float, optional): Dropout rate of the input vector for BiLSTMs.
        - seed (int, optional): Random seed.

    return:
        - Nothing. When training finishes, however, the three model files
          (*.vocabs, *.params, *.hp) are saved in the current directory.

    """

    random.seed(seed)

    hp = OrderedDict({
        'LAYERS': layers,
        'THRESHOLD': min_count,
        'DECAY': decay,
        'EPOCH': epoch,
        'WINDOW_SIZE': window_size,
        'DIM_UNI': dim_uni,
        'DIM_BI': dim_bi,
        'DIM_WORD': dim_word,
        'DIM_CTYPE': dim_ctype,
        'DIM_TAGEMB': dim_tagemb,
        'DIM_HIDDEN': dim_hidden,
        'LEARNING_RATE': learning_rate,
        'DROPOUT_RATE': dropout_rate,
        'SEED': seed,
        'TRAINSET': train_file,
        'TESTSET': test_file,
        'DEVSET': dev_file,
        'DICTIONARY': dict_file,
        'EMBEDDING': emb_file,
        'HYPERPARAMS': model_name + '.hp',
        'MODEL': model_name + '.params',
        'VOCAB': model_name + '.vocabs',
        'EPOCH_MODEL': model_name + '_epoch.params'
    })

    # Preprocess
    vocabs = prepro.create_vocabs_from_trainset(trainset=hp['TRAINSET'],
                                                fn_dictionary=hp['DICTIONARY'],
                                                fn_vocabs=hp['VOCAB'],
                                                delimiter=delimiter,
                                                newline=newline)

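    # If a pre-trained embedding file is given, load its vectors for the word
    # vocabulary; the file's dimensionality overrides the `dim_word` argument.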
    if emb_file is not None:
        embs, dim_word = prepro.embedding_loader(fn_embedding=hp['EMBEDDING'],
                                                 word2id=vocabs[2])
        hp['DIM_WORD'] = dim_word
    else:
        embs = None

    TrainData = prepro.from_file(filename=hp['TRAINSET'],
                                 window_size=hp['WINDOW_SIZE'],
                                 vocabs=vocabs,
                                 delimiter=delimiter,
                                 newline=newline)
    TestData = prepro.from_file(filename=hp['TESTSET'],
                                window_size=hp['WINDOW_SIZE'],
                                vocabs=vocabs,
                                delimiter=delimiter,
                                newline=newline)
    DevData = prepro.from_file(filename=hp['DEVSET'],
                               window_size=hp['WINDOW_SIZE'],
                               vocabs=vocabs,
                               delimiter=delimiter,
                               newline=newline)

    # Update hyper-parameters
    hp['NUM_TRAIN'] = len(TrainData.ws_data)
    hp['NUM_TEST'] = len(TestData.ws_data)
    hp['NUM_DEV'] = len(DevData.ws_data)
    hp['VOCAB_SIZE_UNI'] = len(vocabs[0])
    hp['VOCAB_SIZE_BI'] = len(vocabs[1])
    hp['VOCAB_SIZE_WORD'] = len(vocabs[2])
    hp['VOCAB_SIZE_POSTAG'] = len(vocabs[3])

    # Construct networks
    _model = model.Model(hp=hp, embs=embs)

    # Start training
    _start(hp,
           model=_model,
           train_data=TrainData,
           test_data=TestData,
           dev_data=DevData)
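
A minimal call sketch for this fit, assuming the three corpus files exist in the word<delimiter>tag format the docstring describes; all file names here are placeholders:

# Hypothetical usage; the paths and model name are placeholders.
fit(train_file='corpus.train',
    dev_file='corpus.dev',
    test_file='corpus.test',
    model_name='mymodel')  # writes mymodel.hp, mymodel.params, mymodel.vocabs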