def Base(eta, l2, tagversion, hidden, batchsize):
    params.outfile = 'ccctag_CRF_Bilstm_Viterbi_'
    params.dataf = '../supertag_data/train.dat'
    params.dev = '../supertag_data/dev.dat'
    params.test = '../supertag_data/test.dat'
    params.batchsize = batchsize
    params.hidden = hidden
    params.embedsize = 100
    params.emb = 0
    params.eta = eta
    params.L2 = l2
    params.dropout = 0
    params.tagversion = tagversion

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    if (tagversion == 0):
        tagger = getTagger('../supertag_data/tagger_100')
    elif (tagversion == 1):
        tagger = getTagger('../supertag_data/tagger_200')
    elif (tagversion == 2):
        tagger = getTagger('../supertag_data/tagger_400')
    else:
        tagger = getTagger('../supertag_data/tagger')

    params.num_labels = len(tagger)
    print len(tagger)
    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_dropout_' + str(
            params.dropout) + "_LearningRate" + '_' + str(
                params.eta) + '_' + str(l2) + str(
                    hidden) + '_tagversion_' + str(tagversion)
    #examples are shuffled data

    traindata = getSupertagData(params.dataf, words, tagger)
    trainx0, trainy0 = traindata
    devdata = getSupertagData(params.dev, words, tagger, train=False)
    devx0, devy0 = devdata
    print 'dev set', len(devx0)
    testdata = getSupertagData(params.test, words, tagger, train=False)
    testx0, testy0 = testdata

    print 'test set', len(testx0)
    #print Y
    print "Using Training Data" + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile
    #lm = LM_model(params)
    #lm.train(trainy0, devy0, params)

    tm = CRF_model(We, params)
    tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
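# getGloveWordmap is defined elsewhere. A minimal sketch of what the callers
# here assume: a GloVe text file with one 'word v1 ... v100' line per word,
# loaded into a word->index dict plus a list of float vectors. Indices are
# assumed to start at 1 so the zero row prepended for 'UUUNKKK' lands at
# index 0; the real loader may differ.
def getGloveWordmap_sketch(path):
    words, vectors = {}, []
    with open(path) as f:
        for i, line in enumerate(f):
            parts = line.rstrip().split(' ')
            words[parts[0]] = i + 1
            vectors.append([float(v) for v in parts[1:]])
    return words, vectors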
def Base(eta, l2, num_filters, emb, hidden):
    params.outfile = 'POS_Bilstm_CNN_CRF_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'
    params.batchsize = 10
    params.hidden = hidden
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.L2 = l2
    params.dropout = 1
    params.num_labels = 25
    params.char_embedd_dim = 30
    params.num_filters = num_filters

    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    #words.update({'UUUNKKK':0})
    #a=[0]*len(We[0])
    #newWe = []
    #newWe.append(a)
    #We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../pos_data/tagger')

    char_dic = getTagger('../pos_data/char_dic')

    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    print char_dic
    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_dropout_' + str(
            params.dropout) + "_LearningRate" + '_' + str(
                params.eta) + '_' + str(l2) + str(num_filters) + '_emb_' + str(
                    emb) + '_hidden_' + str(hidden)
    #examples are shuffled data

    traindata = getData_and_Char(params.dataf, words, tagger, char_dic)
    devdata = getData_and_Char(params.dev, words, tagger, char_dic)
    testdata = getData_and_Char(params.test, words, tagger, char_dic)

    print 'test set', len(testdata[2])
    #print Y
    print "Using Training Data " + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile
    #lm = LM_model(params)
    #lm.train(trainy0, devy0, params)

    tm = CRF_model(We, char_embedd_table, params)
    tm.train(traindata, devdata, testdata, params)
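# getTagger is also external to this file; it is used both for tag
# inventories and for char_dic. A minimal sketch of the assumed behavior
# (one symbol per line, mapped to its line number); the real implementation
# may differ:
def getTagger_sketch(path):
    tagger = {}
    with open(path) as f:
        for i, line in enumerate(f):
            symbol = line.strip()
            if symbol:
                tagger[symbol] = i
    return tagger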
def Base(eta, epoches):
    params.outfile = 'Pos_sgd_Inf_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'

    num_filters = 30
    emb = 1
    params.char_embedd_dim = 30
    params.num_filters = num_filters
    params.batchsize = 10
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 1

    params.hidden_inf = 300

    params.epoches = epoches

    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    We = np.asarray(We).astype('float32')
    #print We.shape
    tagger = getTagger('../pos_data/tagger')
    #print tagger
    params.words = words
    params.tagger = tagger

    params.num_labels = len(tagger)

    char_dic = getTagger('../pos_data/char_dic')

    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.outfile = params.outfile + '_LearningRate_' + str(
        params.eta) + '_epoches_' + str(epoches)
    print params.outfile

    train = getData_and_Char(params.dataf, words, tagger, char_dic)
    dev = getData_and_Char(params.dev, words, tagger, char_dic)
    test = getData_and_Char(params.test, words, tagger, char_dic)

    from model_selection_pos_finalTuning_sgd_inference import CRF_model
    tm = CRF_model(We, char_embedd_table, params)
    tm.train(train, dev, test, params)
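# Why scale = np.sqrt(3.0 / char_embedd_dim) above: a Uniform(-a, a) draw has
# variance a**2 / 3, so a = sqrt(3/d) gives each embedding coordinate variance
# 1/d (a fan-in style heuristic). A quick numeric sanity check:
import numpy as np

d = 30
a = np.sqrt(3.0 / d)
sample = np.random.uniform(-a, a, [100000, d])
print sample.var(), 1.0 / d  # both come out near 0.033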
def Base(eta, epoches):
    params.outfile = 'Pos_sgd_Inf_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'

    l3 = 0
    emb = 0
    params.char_embedd_dim = 30

    params.batchsize = 10
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 0

    params.regutype = 0
    params.annealing = 0
    params.L3 = l3

    params.epoches = epoches

    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    We = np.asarray(We).astype('float32')
    #print We.shape
    tagger = getTagger('../pos_data/tagger')
    #print tagger
    params.words = words
    params.tagger = tagger

    params.num_labels = len(tagger)

    char_dic = getTagger('../pos_data/char_dic')

    params.char_dic = char_dic

    params.outfile = params.outfile + '_LearningRate_' + str(
        params.eta) + '_epoches_' + str(epoches)
    print params.outfile

    train = getData_and_Char(params.dataf, words, tagger, char_dic)
    dev = getData_and_Char(params.dev, words, tagger, char_dic)
    test = getData_and_Char(params.test, words, tagger, char_dic)

    from model_selection_sgd_simple_inference import CRF_model
    tm = CRF_model(We, params)
    tm.train(train, dev, test, params)
def Base(eta, l2):
    params.outfile = 'Base_model'
    params.dataf = 'data/oct27.traindev.proc.cnn'
    params.dev = 'data/oct27.test.proc.cnn'
    params.test = 'data/daily547.proc.cnn'
    params.batchsize = 10
    params.hidden = 100
    params.embedsize = 100
    params.eta = eta
    params.L2 = l2
    params.dropout = 0
    params.frac = 0.1
    params.emb = 0

    (words, We) = getWordmap('wordvects.tw100w5-m40-it2')
    We = np.asarray(We).astype('float32')
    tagger = getTagger('data/tagger')
    print tagger
    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_' + "LearningRate" + '_' + str(
            params.eta) + '_' + str(
                params.hidden) + '_' + str(l2) + '.pickle'

    traindata = getData(params.dataf, words, tagger)
    trainx0, trainy0 = traindata
    devdata = getData(params.dev, words, tagger)
    devx0, devy0 = devdata
    testdata = getData(params.test, words, tagger)
    testx0, testy0 = testdata

    tm = base_model(We, params)
    tm.train(traindata, devdata, testdata, params)
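# The outfile names throughout this file are assembled with long '+' chains.
# A hypothetical str.format equivalent of the name built just above:
def make_outfile(prefix, batchsize, eta, hidden, l2):
    return '{0}.Batchsize_{1}_LearningRate_{2}_{3}_{4}.pickle'.format(
        prefix, batchsize, eta, hidden, l2)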
def Base(eta, l2, morepara, emb, batchsize):
    params.outfile = 'NER_CRF_lstm_Viterbi_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'
    params.batchsize = batchsize
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.L2 = l2
    params.dropout = 0
    params.num_labels = 17

    params.morepara = morepara

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../ner_data/ner_bioes')
    print tagger
    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')
    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize
    ) + '_dropout_' + str(params.dropout) + "_LearningRate" + '_' + str(
        params.eta) + '_' + str(l2) + str(morepara) + '_emb_' + str(emb)
    #examples are shuffled data

    trainx0, trainy0, _, _ = Get_Ner_bioes(params.dataf, words, tagger)
    traindata = trainx0, trainy0
    #N = int(params.frac*len(trainx0))
    #traindata = trainx0[:N], trainy0[:N]

    devx0, devy0, params.devrawx, params.devpos = Get_Ner_bioes(
        params.dev, words, tagger)
    devdata = devx0, devy0
    print devy0[:10]
    print 'dev set', len(devx0)
    testx0, testy0, params.testrawx, params.testpos = Get_Ner_bioes(
        params.test, words, tagger)
    testdata = testx0, testy0

    print 'test set', len(testx0)
    #print Y
    print "Using Training Data" + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile

    tm = CRF_model(We, params)
    tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
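# The UUUNKKK block above (register the unknown token at index 0, prepend a
# zero row, cast to float32) recurs in most GloVe-based entry points in this
# file. A hypothetical helper capturing the same lines:
import numpy as np

def add_unk_row(words, We, unk='UUUNKKK'):
    words.update({unk: 0})
    zero_row = [0] * len(We[0])  # We is still a list of lists at this point
    return words, np.asarray([zero_row] + We).astype('float32')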
def Base(eta, l2, morepara, emb, batchsize):
    params.outfile = 'POS_CRF_Bilstm_Viterbi_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'
    params.batchsize = batchsize
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.L2 = l2
    params.dropout = 0
    params.num_labels = 25

    params.morepara = morepara

    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    #words.update({'UUUNKKK':0})
    #a=[0]*len(We[0])
    #newWe = []
    #newWe.append(a)
    #We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../pos_data/tagger')
    print tagger
    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize
    ) + '_dropout_' + str(params.dropout) + "_LearningRate" + '_' + str(
        params.eta) + '_' + str(l2) + str(morepara) + '_emb_' + str(emb)
    #examples are shuffled data

    traindata = getData(params.dataf, words, tagger)
    trainx0, trainy0 = traindata
    devdata = getData(params.dev, words, tagger)
    devx0, devy0 = devdata
    print 'dev set', len(devx0)
    testdata = getData(params.test, words, tagger)
    testx0, testy0 = testdata

    print 'test set', len(testx0)
    #print Y
    print "Using Training Data" + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile
    #lm = LM_model(params)
    #lm.train(trainy0, devy0, params)

    tm = CRF_model(We, params)
    tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
def Base(eta, l3, batchsize, inf, hidden_inf, tagversion, num_filters):
    params.outfile = 'CRF_Inf_ccctag'
    params.dataf = '../supertag_data/train.dat'
    params.dev = '../supertag_data/dev.dat'
    params.test = '../supertag_data/test.dat'

    params.batchsize = batchsize
    params.hidden = 400
    params.embedsize = 100
    params.emb = 1
    params.eta = eta
    params.dropout = 1
    params.hidden_inf = hidden_inf

    params.char_embedd_dim = 30
    params.num_filters = num_filters

    params.inf = inf
    params.regutype = 0
    params.annealing = 0
    params.L3 = l3

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape

    if (tagversion == 0):
        tagger = getTagger('../supertag_data/tagger_100')
    elif (tagversion == 1):
        tagger = getTagger('../supertag_data/tagger_200')
    else:
        tagger = getTagger('../supertag_data/tagger_400')
    params.num_labels = len(tagger)

    params.words = words
    params.tagger = tagger

    char_dic = getTagger('../supertag_data/char_dic')

    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_dropout_' + str(
            params.dropout) + '_LearningRate_' + str(params.eta) + '_' + str(
                l3) + '_num_filters_' + str(num_filters) + '_inf_' + str(
                    params.inf) + '_hidden_' + str(
                        params.hidden_inf) + '_tagversion_' + str(tagversion)

    train = getSupertagData_and_Char(params.dataf, words, tagger, char_dic)

    dev = getSupertagData_and_Char(params.dev,
                                   words,
                                   tagger,
                                   char_dic,
                                   train=False)

    test = getSupertagData_and_Char(params.test,
                                    words,
                                    tagger,
                                    char_dic,
                                    train=False)

    if (inf == 0) or (inf == 1):
        from model_selection_ccctag_inference import CRF_model
        tm = CRF_model(We, char_embedd_table, params)
        tm.train(train, dev, test, params)

    elif (inf == 2):
        from model_selection_inference_ccctag_seq2seq import CRF_seq2seq_model
        params.de_hidden_size = hidden_inf  # 512
        params.en_hidden_size = hidden_inf

        tm = CRF_seq2seq_model(We, char_embedd_table, params)
        tm.train(train, dev, test, params)
    else:
        from model_selection_inference_ccctag_seq2seq_beamsearch import CRF_seq2seq_model
        params.de_hidden_size = hidden_inf  # 512
        params.en_hidden_size = hidden_inf

        tm = CRF_seq2seq_model(We, char_embedd_table, params)
        tm.train(train, dev, test, params)
def Base(eta, l2, num_filters, inf, hidden_size):
    params.outfile = 'base_pos_inf_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'
    params.batchsize = 10
    params.hidden = hidden_size
    params.embedsize = 100
    params.eta = eta
    params.L2 = l2
    params.dropout = 1
    params.emb = 1
    params.inf = inf

    params.char_embedd_dim = 30
    params.num_filters = num_filters
    params.en_hidden_size = hidden_size
    """
	change it later
	"""
    params.de_hidden_size = hidden_size
    params.lstm_layers_num = 1
    params.num_labels = 25
    params.layers_num = 3

    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    #words.update({'UUUNKKK':0})
    #a=[0]*len(We[0])
    #newWe = []
    #newWe.append(a)
    #We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../pos_data/tagger')
    print tagger

    char_dic = getTagger('../pos_data/char_dic')

    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.outfile = params.outfile + ".num_filters" + '_' + str(
        num_filters) + '_LearningRate_' + str(
            params.eta) + '_inf_' + str(inf) + '_hidden_' + str(
                params.hidden) + '_' + str(l2)

    train = getData_and_Char(params.dataf, words, tagger, char_dic)
    dev = getData_and_Char(params.dev, words, tagger, char_dic)
    test = getData_and_Char(params.test, words, tagger, char_dic)

    if (inf == 0) or (inf == 1):
        from base_model_selection import base_model
        tm = base_model(We, char_embedd_table, params)
        tm.train(train, dev, test, params)

    elif (inf == 2):
        from seq2seq_att_pos import Seq2Seq
        tm = Seq2Seq(We, char_embedd_table, params)
        tm.train(train, dev, test, params)

    elif (inf == 3):
        from self_att import Transformer
        tm = Transformer(We, char_embedd_table, params)
        tm.train(train, dev, test, params)
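# The (inf == k) import-and-train chains recur across these entry points. A
# hypothetical dispatch helper expressing the same choice for the POS models
# above (module names taken from this file; a sketch, not the original code):
def build_pos_model(inf, We, char_embedd_table, params):
    if inf in (0, 1):
        from base_model_selection import base_model
        return base_model(We, char_embedd_table, params)
    elif inf == 2:
        from seq2seq_att_pos import Seq2Seq
        return Seq2Seq(We, char_embedd_table, params)
    elif inf == 3:
        from self_att import Transformer
        return Transformer(We, char_embedd_table, params)
    raise ValueError('unknown inf mode: %d' % inf)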
def Base(eta, epoches):
    params.outfile = 'CRF_Inf_ccctag'
    params.dataf = '../supertag_data/train.dat'
    params.dev = '../supertag_data/dev.dat'
    params.test = '../supertag_data/test.dat'

    l3 = 0
    tagversion = 2
    batchsize = 10

    params.epoches = epoches
    params.batchsize = batchsize
    params.hidden = 512
    params.embedsize = 100
    params.emb = 0
    params.eta = eta
    params.dropout = 0

    num_filters = 30
    params.char_embedd_dim = 30
    params.num_filters = num_filters

    params.regutype = 0
    params.annealing = 0
    params.L3 = l3

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    #print We.shape

    if (tagversion == 0):
        tagger = getTagger('../supertag_data/tagger_100')
    elif (tagversion == 1):
        tagger = getTagger('../supertag_data/tagger_200')
    else:
        tagger = getTagger('../supertag_data/tagger_400')
    params.num_labels = len(tagger)

    params.words = words
    params.tagger = tagger

    char_dic = getTagger('../supertag_data/char_dic')

    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.outfile = params.outfile + '_LearningRate_' + str(
        params.eta) + '_epoches_' + str(epoches)

    print params.outfile
    train = getSupertagData_and_Char(params.dataf, words, tagger, char_dic)

    dev = getSupertagData_and_Char(params.dev, words, tagger, char_dic,
                                   train=False)

    test = getSupertagData_and_Char(params.test, words, tagger, char_dic,
                                    train=False)

    #print len(test[0])

    from model_selection_ccctag_sgd_simple_inference import CRF_model
    tm = CRF_model(We, params)
    tm.train(train, dev, test, params)
def Base(eta, l3, emb, batchsize, inf, hidden_inf):
    params.outfile = 'h_CRF_Inf_NER_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'

    params.batchsize = batchsize
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 0
    params.hidden_inf = hidden_inf

    params.inf = inf
    params.regutype = 0
    params.annealing = 0
    params.L3 = l3

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    tagger = getTagger('../ner_data/ner_bioes')
    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')

    params.words = words
    params.tagger = tagger

    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_dropout_' + str(
            params.dropout) + '_LearningRate_' + str(params.eta) + '_' + str(
                l3) + '_emb_' + str(emb) + '_inf_' + str(
                    params.inf) + '_hidden_' + str(params.hidden_inf)

    trainx0, trainy0, _, _ = Get_Ner_bioes(params.dataf, words, tagger)
    traindata = trainx0, trainy0

    devx0, devy0, params.devrawx, params.devpos = Get_Ner_bioes(
        params.dev, words, tagger)
    devdata = devx0, devy0
    print devy0[:10]
    print 'dev set', len(devx0)
    testx0, testy0, params.testrawx, params.testpos = Get_Ner_bioes(
        params.test, words, tagger)
    testdata = testx0, testy0

    print "Using Training Data" + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile
    #lm = LM_model(params)
    #lm.train(trainy0, devy0, params)

    if (inf == 0) or (inf == 1):
        from model_selection_NER_inference import CRF_model
        tm = CRF_model(We, params)
        tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
    elif (inf == 2):
        from model_selection_inference_NER_seq2seq_h import CRF_seq2seq_model
        params.de_hidden_size = hidden_inf
        params.outfile = 'h_de_hidden_' + str(
            params.de_hidden_size) + '_' + params.outfile
        tm = CRF_seq2seq_model(We, params)
        tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)

    else:
        from model_selection_inference_NER_seq2seq_h_beamsearch import CRF_seq2seq_model
        params.de_hidden_size = hidden_inf
        params.outfile = 'h_de_hidden_' + str(
            params.de_hidden_size) + '_' + params.outfile
        tm = CRF_seq2seq_model(We, params)
        tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
def Base(eta, l2, num_filters, emb, hidden):
    params.outfile = 'NER_BiLSTM_CNN_CRF_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'
    params.batchsize = 10
    params.hidden = hidden
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.L2 = l2
    params.dropout = 1
    params.num_labels = 17
    params.char_embedd_dim = 30
    params.num_filters = num_filters

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../ner_data/ner_bioes')
    print tagger
    char_dic = getTagger('../ner_data/char_dic')
    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')
    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_dropout_' + str(
            params.dropout) + "_LearningRate" + '_' + str(
                params.eta) + '_' + str(l2) + '_' + str(
                    num_filters) + '_hidden_' + str(hidden)

    trainx0, trainx0_char, trainy0, _, _ = Get_Ner_bioes_and_Char(
        params.dataf, words, tagger, char_dic)
    train = trainx0, trainy0, trainx0_char

    devx0, devx0_char, devy0, params.devrawx, params.devpos = Get_Ner_bioes_and_Char(
        params.dev, words, tagger, char_dic)
    dev = devx0, devy0, devx0_char
    print devy0[:10]
    print 'dev set', len(devx0)
    testx0, testx0_char, testy0, params.testrawx, params.testpos = Get_Ner_bioes_and_Char(
        params.test, words, tagger, char_dic)
    test = testx0, testy0, testx0_char

    print 'test set', len(testx0)
    #print Y
    print "Using Training Data" + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile

    tm = CRF_model(We, char_embedd_table, params)
    tm.train(train, dev, test, params)
def Base(eta, l2, inf, hidden_size):
    params.outfile = 'h_base_ner_inf_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'

    params.batchsize = 10
    params.hidden = hidden_size
    params.embedsize = 100
    params.eta = eta
    params.L2 = l2
    params.dropout = 0
    params.emb = 0
    params.inf = inf

    params.en_hidden_size = hidden_size
    params.de_hidden_size = hidden_size
    params.lstm_layers_num = 1
    params.num_labels = 17

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../ner_data/ner_bioes')

    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')
    print tagger
    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_LearningRate_' + str(params.eta) + '_inf_' + str(
            inf) + '_' + str(l2) + '_' + str(hidden_size)
    #examples are shuffled data

    trainx0, trainy0, _, _ = Get_Ner_bioes(params.dataf, words, tagger)
    traindata = trainx0, trainy0
    #N = int(params.frac*len(trainx0))
    #traindata = trainx0[:N], trainy0[:N]

    devx0, devy0, params.devrawx, params.devpos = Get_Ner_bioes(
        params.dev, words, tagger)
    devdata = devx0, devy0
    print devy0[:10]
    print 'dev set', len(devx0)
    testx0, testy0, params.testrawx, params.testpos = Get_Ner_bioes(
        params.test, words, tagger)
    testdata = testx0, testy0

    print 'test set', len(testx0)
    #print Y
    print "Using Training Data " + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile

    if (inf == 0) or (inf == 1):
        tm = base_model(We, params)
        tm.train(traindata, devdata, testdata, params)
    #elif(inf ==2):
    #    from seq2seq import Seq2Seq
    #    tm = Seq2Seq(We, params)
    #    tm.train(traindata, devdata, testdata, params)
    elif (inf == 2):
        from seq2seq_att_ner_h import Seq2Seq
        #from seq2seq_att_ner_beamsearch import Seq2Seq
        #params.de_hidden_size=200
        #params.outfile = 'de_hidden_200_' + params.outfile
        tm = Seq2Seq(We, params)
        tm.train(traindata, devdata, testdata, params)
    elif (inf == 3):
        #from seq2seq_att_ner import Seq2Seq
        from seq2seq_att_ner_h_beamsearch import Seq2Seq
        #params.de_hidden_size=200
        #params.outfile = 'de_hidden_200_' + params.outfile
        tm = Seq2Seq(We, params)
        tm.train(traindata, devdata, testdata, params)

    elif (inf == 4):
        #from seq2seq_att_all import Seq2Seq
        from seq2seq_local_att_ner import Seq2Seq

        params.window = int(sys.argv[5])
        params.outfile = 'local_att_window_' + str(
            params.window) + '_attweight_' + sys.argv[6] + params.outfile
        tm = Seq2Seq(We, params)
        tm.train(traindata, devdata, testdata, params)
def Base(eta, l3, emb, batchsize, inf, hidden_inf, tagversion):
    params.outfile = 'h_ccctag_CRF_Inf_'
    params.dataf = '../supertag_data/train.dat'
    params.dev = '../supertag_data/dev.dat'
    params.test = '../supertag_data/test.dat'

    params.batchsize = batchsize
    params.hidden = 512
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 0
    params.hidden_inf = hidden_inf

    params.inf = inf
    params.regutype = 0
    params.annealing = 0
    params.L3 = l3

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape

    if (tagversion == 0):
        tagger = getTagger('../supertag_data/tagger_100')
    elif (tagversion == 1):
        tagger = getTagger('../supertag_data/tagger_200')
    else:
        tagger = getTagger('../supertag_data/tagger_400')
    params.num_labels = len(tagger)

    params.words = words
    params.tagger = tagger

    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_dropout_' + str(
            params.dropout) + '_LearningRate_' + str(params.eta) + '_' + str(
                l3) + '_emb_' + str(emb) + '_inf_' + str(
                    params.inf) + '_hidden_' + str(
                        params.hidden_inf) + '_tagversion_' + str(tagversion)

    traindata = getSupertagData(params.dataf, words, tagger)
    trainx0, trainy0 = traindata
    devdata = getSupertagData(params.dev, words, tagger)
    devx0, devy0 = devdata
    print 'dev set', len(devx0)
    testdata = getSupertagData(params.test, words, tagger)
    testx0, testy0 = testdata

    print 'test set', len(testx0)
    #print Y
    print "Using Training Data" + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile
    #lm = LM_model(params)
    #lm.train(trainy0, devy0, params)

    if (inf == 0) or (inf == 1):
        tm = CRF_model(We, params)
        tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
    elif (inf == 2):
        #from model_selection_inference_ccctag_seq2seq import CRF_seq2seq_model
        from model_selection_inference_ccctag_seq2seq_new import CRF_seq2seq_model
        params.de_hidden_size = hidden_inf  # 512
        params.en_hidden_size = hidden_inf
        params.outfile = 'h_ccctag_de_hidden_size_' + str(
            params.de_hidden_size) + '_' + params.outfile
        tm = CRF_seq2seq_model(We, params)
        tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
    else:
        #from model_selection_inference_ccctag_seq2seq_beamsearch import CRF_seq2seq_model
        from model_selection_inference_ccctag_seq2seq_new_beamsearch import CRF_seq2seq_model
        params.de_hidden_size = hidden_inf  # 512
        params.en_hidden_size = hidden_inf
        params.outfile = 'h_ccctag_de_hidden_size_' + str(
            params.de_hidden_size) + '_' + params.outfile
        tm = CRF_seq2seq_model(We, params)
        tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
    help=
    "different training method 0: margin rescaling, 1: contrastive, 2: perceptron, 3: slack rescaling",
    type=int,
    default=0)
params = parser.parse_args()

params.dataf = '../pos_data/oct27.traindev.proc.cnn'
params.dev = '../pos_data/oct27.test.proc.cnn'
params.test = '../pos_data/daily547.proc.cnn'

params.hidden = 100
params.embedsize = 100

(words, We) = getWordmap('wordvects.tw100w5-m40-it2')
We = np.asarray(We).astype('float32')
tagger = getTagger('../pos_data/tagger')
params.tagger = tagger
params.words = words
params.outfile = "ADV_CRF_LSTM_LM_Batchsize" + '_' + str(
    params.batchsize) + '_dropout_' + str(
        params.dropout) + '_LearningRate_' + str(params.eta) + '_' + str(
            params.l2) + '_' + str(params.l3) + '_emb_' + str(params.emb)

traindata = getData(params.dataf, words, tagger)
trainx0, trainy0 = traindata
devdata = getData(params.dev, words, tagger)
devx0, devy0 = devdata
testdata = getData(params.test, words, tagger)
testx0, testy0 = testdata

tm = GAN_CRF_model(We, params)
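# The snippet above starts mid-way through its argparse setup. A hypothetical
# reconstruction of the options it implies (flag names inferred from the
# params.* attributes used afterwards; the original flags may differ):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--batchsize', type=int, default=10)
parser.add_argument('--dropout', type=int, default=0)
parser.add_argument('--eta', type=float, default=0.01)
parser.add_argument('--l2', type=float, default=0.0)
parser.add_argument('--l3', type=float, default=0.0)
parser.add_argument('--emb', type=int, default=0)
parser.add_argument(
    '--training_method',
    help="different training method 0: margin rescaling, 1: contrastive, "
    "2: perceptron, 3: slack rescaling",
    type=int,
    default=0)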
def Base(eta, l2, inf, tagversion, hidden):
    params.outfile = 'h_base_ccctag_inf_'
    params.dataf = '../supertag_data/train.dat'
    params.dev = '../supertag_data/dev.dat'
    params.test = '../supertag_data/test.dat'
    params.batchsize = 10
    params.hidden = hidden
    params.embedsize = 100
    params.eta = eta
    params.L2 = l2
    params.dropout = 0
    params.emb = 0
    params.inf = inf

    params.en_hidden_size = hidden
    params.de_hidden_size = hidden
    params.lstm_layers_num = 1

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape

    if (tagversion == 0):
        tagger = getTagger('../supertag_data/tagger_100')
    elif (tagversion == 1):
        tagger = getTagger('../supertag_data/tagger_200')
    else:
        tagger = getTagger('../supertag_data/tagger_400')
    params.num_labels = len(tagger)

    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_LearningRate_' + str(
            params.eta) + '_inf_' + str(inf) + '_hidden_' + str(
                params.hidden) + '_' + str(tagversion) + '_' + str(l2)
    #examples are shuffled data

    traindata = getSupertagData(params.dataf, words, tagger)
    trainx0, trainy0 = traindata
    devdata = getSupertagData(params.dev, words, tagger, train=False)
    devx0, devy0 = devdata
    print 'dev set', len(devx0)
    testdata = getSupertagData(params.test, words, tagger, train=False)
    testx0, testy0 = testdata

    print 'test set', len(testx0)
    print "Using Training Data" + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile

    if (inf == 0) or (inf == 1):
        tm = base_model(We, params)
        tm.train(traindata, devdata, testdata, params)
    #elif(inf ==2):
    #	from seq2seq import Seq2Seq
    #	tm = Seq2Seq(We, params)
    #	tm.train(traindata, devdata, testdata, params)
    elif (inf == 2):
        #from seq2seq_att_pos import Seq2Seq
        from seq2seq_att_pos_new import Seq2Seq
        tm = Seq2Seq(We, params)
        tm.train(traindata, devdata, testdata, params)
    elif (inf == 3):
        ##from seq2seq_att_pos_beamsearch import Seq2Seq
        from seq2seq_att_pos_new_beamsearch import Seq2Seq
        tm = Seq2Seq(We, params)
        tm.train(traindata, devdata, testdata, params)
def Base(eta, l2, inf, hidden_size):
    params.outfile = 'h_base_pos_inf_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'
    params.batchsize = 10
    params.hidden = hidden_size
    params.embedsize = 100
    params.eta = eta
    params.L2 = l2
    params.dropout = 0
    params.emb = 0
    params.inf = inf

    params.en_hidden_size = hidden_size
    # change it later
    params.de_hidden_size = hidden_size
    params.lstm_layers_num = 1
    params.num_labels = 25

    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    #words.update({'UUUNKKK':0})
    #a=[0]*len(We[0])
    #newWe = []
    #newWe.append(a)
    #We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../pos_data/tagger')
    print tagger
    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_LearningRate_' + str(params.eta) + '_inf_' + str(
            inf) + '_hidden_' + str(params.hidden) + '_' + str(l2)
    #examples are shuffled data

    traindata = getData(params.dataf, words, tagger)
    trainx0, trainy0 = traindata
    #N = int(params.frac*len(trainx0))
    #traindata = trainx0[:N], trainy0[:N]

    devdata = getData(params.dev, words, tagger)
    devx0, devy0 = devdata
    print 'dev set', len(devx0)
    testdata = getData(params.test, words, tagger)
    testx0, testy0 = testdata

    print 'test set', len(testx0)
    print "Using Training Data " + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile

    if (inf == 0) or (inf == 1):
        tm = base_model(We, params)
        tm.train(traindata, devdata, testdata, params)
    #elif(inf ==2):
    #    from seq2seq import Seq2Seq
    #    tm = Seq2Seq(We, params)
    #    tm.train(traindata, devdata, testdata, params)
    elif (inf == 2):
        from seq2seq_att_pos_h import Seq2Seq
        tm = Seq2Seq(We, params)
        tm.train(traindata, devdata, testdata, params)
    elif (inf == 3):
        from seq2seq_att_pos_h_beamsearch import Seq2Seq
        tm = Seq2Seq(We, params)
        tm.train(traindata, devdata, testdata, params)
def Base(eta, l3, epoches, warmstart):
    params.outfile = 'CRF_Inf_NER_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'

    emb = 1
    params.batchsize = 10
    params.hidden = 200
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 1

    params.char_embedd_dim = 30
    num_filters = 50
    params.num_filters = num_filters
    params.epoches = epoches

    params.regutype = 0
    params.annealing = 0
    params.L3 = l3

    params.hidden_inf = 200

    params.WarmStart = warmstart

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    tagger = getTagger('../ner_data/ner_bioes')
    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')

    char_dic = getTagger('../ner_data/char_dic')
    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.words = words
    params.tagger = tagger

    params.outfile = params.outfile + ".num_filters" + '_' + str(
        num_filters) + '_LearningRate_' + str(
            params.eta) + '_' + str(l3) + '_emb_' + str(emb)
    print params.outfile

    trainx0, trainx0_char, trainy0, _, _ = Get_Ner_bioes_and_Char(
        params.dataf, words, tagger, char_dic)
    train = trainx0, trainy0, trainx0_char

    devx0, devx0_char, devy0, params.devrawx, params.devpos = Get_Ner_bioes_and_Char(
        params.dev, words, tagger, char_dic)
    dev = devx0, devy0, devx0_char

    testx0, testx0_char, testy0, params.testrawx, params.testpos = Get_Ner_bioes_and_Char(
        params.test, words, tagger, char_dic)
    test = testx0, testy0, testx0_char

    from model_selection_NER_sgd_inference import CRF_model
    tm = CRF_model(We, char_embedd_table, params)
    tm.train(train, dev, test, params)
def Base(eta, l3, emb, num_filters, inf, hidden_inf):
    params.outfile = 'Pos_CRF_Inf_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'

    params.char_embedd_dim = 30
    params.num_filters = num_filters
    params.batchsize = 10
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 1
    params.hidden_inf = hidden_inf
    params.small = 0

    params.inf = inf
    params.regutype = 0
    params.annealing = 0
    params.L3 = l3

    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../pos_data/tagger')
    print tagger
    params.words = words
    params.tagger = tagger

    params.num_labels = len(tagger)

    char_dic = getTagger('../pos_data/char_dic')

    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.outfile = params.outfile + ".num_filters" + '_' + str(
        num_filters) + '_dropout_' + str(
            params.dropout) + '_LearningRate_' + str(params.eta) + '_' + str(
                l3) + '_emb_' + str(emb) + '_inf_' + str(
                    params.inf) + '_hidden_inf_' + str(params.hidden_inf)

    train = getData_and_Char(params.dataf, words, tagger, char_dic)
    dev = getData_and_Char(params.dev, words, tagger, char_dic)
    test = getData_and_Char(params.test, words, tagger, char_dic)

    if (inf == 0) or (inf == 1):
        from model_selection_inference import CRF_model
        tm = CRF_model(We, char_embedd_table, params)
        tm.train(train, dev, test, params)

    elif (inf == 2):
        from model_selection_inference_seq2seq import CRF_seq2seq_model

        params.de_hidden_size = hidden_inf
        tm = CRF_seq2seq_model(We, char_embedd_table, params)
        tm.train(train, dev, test, params)

    elif (inf == 3):
        from model_selection_inference_seq2seq_beamsearch import CRF_seq2seq_model
        params.de_hidden_size = hidden_inf
        tm = CRF_seq2seq_model(We, char_embedd_table, params)
        tm.train(train, dev, test, params)
def Base(eta, l3, emb, num_filters, inf, hidden_inf):
    params.outfile = 'CRF_Inf_NER_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'

    params.batchsize = 10
    params.hidden = 200
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 1
    params.hidden_inf = hidden_inf

    params.char_embedd_dim = 30
    params.num_filters = num_filters

    params.inf = inf
    params.regutype = 0
    params.annealing = 1
    params.L3 = l3

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    tagger = getTagger('../ner_data/ner_bioes')
    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')

    char_dic = getTagger('../ner_data/char_dic')
    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.words = words
    params.tagger = tagger

    params.outfile = params.outfile + ".num_filters" + '_' + str(
        num_filters
    ) + '_dropout_' + str(params.dropout) + '_LearningRate_' + str(
        params.eta) + '_' + str(l3) + '_emb_' + str(emb) + '_inf_' + str(
            params.inf) + '_hidden_' + str(
                params.hidden_inf) + '_annealing_' + str(params.annealing)

    trainx0, trainx0_char, trainy0, _, _ = Get_Ner_bioes_and_Char(
        params.dataf, words, tagger, char_dic)
    train = trainx0, trainy0, trainx0_char

    devx0, devx0_char, devy0, params.devrawx, params.devpos = Get_Ner_bioes_and_Char(
        params.dev, words, tagger, char_dic)
    dev = devx0, devy0, devx0_char

    testx0, testx0_char, testy0, params.testrawx, params.testpos = Get_Ner_bioes_and_Char(
        params.test, words, tagger, char_dic)
    test = testx0, testy0, testx0_char

    if (inf == 0) or (inf == 1):
        from model_selection_NER_inference import CRF_model
        tm = CRF_model(We, char_embedd_table, params)
        tm.train(train, dev, test, params)

    elif (inf == 2):
        from model_selection_inference_NER_seq2seq import CRF_seq2seq_model
        params.de_hidden_size = hidden_inf
        #params.outfile = 'de_hidden_' + str(params.de_hidden_size) + '_' + params.outfile
        tm = CRF_seq2seq_model(We, char_embedd_table, params)
        tm.train(train, dev, test, params)

    else:
        from model_selection_inference_NER_seq2seq_beamsearch import CRF_seq2seq_model
        params.de_hidden_size = hidden_inf
        #params.outfile = 'de_hidden_' + str(params.de_hidden_size) + '_' + params.outfile
        tm = CRF_seq2seq_model(We, char_embedd_table, params)
        tm.train(train, dev, test, params)
def Base(eta, l2, tagversion, hidden, num_filters):
    params.outfile = 'ccctag_BiLSTM_CNN_CRF_'
    params.dataf = '../supertag_data/train.dat'
    params.dev = '../supertag_data/dev.dat'
    params.test = '../supertag_data/test.dat'
    params.batchsize = 10
    params.hidden = hidden
    params.embedsize = 100
    params.emb = 1
    params.eta = eta
    params.L2 = l2
    params.dropout = 1
    params.tagversion = tagversion
    params.char_embedd_dim = 30
    params.num_filters = num_filters

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    if (tagversion == 0):
        tagger = getTagger('../supertag_data/tagger_100')
    elif (tagversion == 1):
        tagger = getTagger('../supertag_data/tagger_200')
    else:
        tagger = getTagger('../supertag_data/tagger_400')
    params.num_labels = len(tagger)
    print len(tagger)

    char_dic = getTagger('../supertag_data/char_dic')

    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.outfile = params.outfile + "num_filters_" + str(
        num_filters
    ) + '_dropout_' + str(params.dropout) + "_LearningRate" + '_' + str(
        params.eta) + '_' + str(l2) + '_' + str(hidden) + '_emb_' + str(
            params.emb) + '_tagversoin_' + str(tagversion)

    train = getSupertagData_and_Char(params.dataf, words, tagger, char_dic)

    dev = getSupertagData_and_Char(params.dev,
                                   words,
                                   tagger,
                                   char_dic,
                                   train=False)

    test = getSupertagData_and_Char(params.test,
                                    words,
                                    tagger,
                                    char_dic,
                                    train=False)

    #print Y
    print "Using Training Data" + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile

    tm = CRF_model(We, char_embedd_table, params)
    tm.train(train, dev, test, params)
def Base(eta, l2, num_filters, inf, hidden_size):
    params.outfile = 'base_ner_inf_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'

    params.batchsize = 10
    params.hidden = hidden_size
    params.embedsize = 100
    params.eta = eta
    params.L2 = l2
    params.dropout = 1
    params.emb = 1
    params.inf = inf

    params.char_embedd_dim = 30
    params.num_filters = num_filters
    params.en_hidden_size = hidden_size
    params.de_hidden_size = hidden_size
    params.lstm_layers_num = 1
    params.num_labels = 17
    params.layers_num = 3

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../ner_data/ner_bioes')

    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')
    print tagger

    char_dic = getTagger('../ner_data/char_dic')
    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.outfile = params.outfile + '_dropout_' + str(
        params.dropout) + "_LearningRate" + '_' + str(
            params.eta) + '_inf_' + str(inf) + '_' + str(l2) + '_' + str(
                num_filters) + '_hidden_' + str(hidden_size)

    trainx0, trainx0_char, trainy0, _, _ = Get_Ner_bioes_and_Char(
        params.dataf, words, tagger, char_dic)
    train = trainx0, trainy0, trainx0_char

    devx0, devx0_char, devy0, params.devrawx, params.devpos = Get_Ner_bioes_and_Char(
        params.dev, words, tagger, char_dic)
    dev = devx0, devy0, devx0_char
    print devy0[:10]
    print 'dev set', len(devx0)
    testx0, testx0_char, testy0, params.testrawx, params.testpos = Get_Ner_bioes_and_Char(
        params.test, words, tagger, char_dic)
    test = testx0, testy0, testx0_char

    if (inf == 0) or (inf == 1):
        from base_ner_model_selection import base_model
        tm = base_model(We, char_embedd_table, params)
        tm.train(train, dev, test, params)

    elif (inf == 2):
        from seq2seq_att_ner import Seq2Seq
        tm = Seq2Seq(We, char_embedd_table, params)
        tm.train(train, dev, test, params)

    elif (inf == 3):
        from self_att_ner import Transformer
        tm = Transformer(We, char_embedd_table, params)
        tm.train(train, dev, test, params)
    # Here the file 'sample' includes the tweets
    params.dataf = '../data/sample'
    params.hiddensize = 512
    params.window1 = 0
    params.taggerhiddensize = 512
    params.encodesize = 256
    # the context window size
    params.contextsize = 1

    (words, We) = getWordmap('../embeddings/wordvects.tw100w5-m40-it2')
    params.words = words

    params.embedsize = len(We[0])
    words.update({'<s>': 0})
    a = np.random.rand(len(We[0]))
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')

    tagger = getTagger('../data/tagger')
    params.tagger = tagger

    taggerlist = getTaggerList('../data/tagger')
    params.taggerlist = taggerlist

    wordlist = getWordlist('../embeddings/wordvects.tw100w5-m40-it2')
    params.wordlist = wordlist
    tm = aetagger_model(We, params)
def Base(eta, l3, emb, batchsize, inf, hidden_inf):
    params.outfile = 'h_pos_CRF_Inf_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'

    params.batchsize = batchsize
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 0
    params.hidden_inf = hidden_inf
    params.small = 0

    params.inf = inf
    params.regutype = 0
    params.annealing = 0
    params.L3 = l3

    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    #words.update({'UUUNKKK':0})
    #a=[0]*len(We[0])
    #newWe = []
    #newWe.append(a)
    #We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../pos_data/tagger')
    print tagger
    params.words = words
    params.tagger = tagger

    params.num_labels = len(tagger)

    #params.outfile = params.outfile+".Batchsize"+'_'+str(params.batchsize)+'_dropout_'+ str(params.dropout) + '_LearningRate_'+str(params.eta)+ '_'  + str(l3) +'_emb_'+ str(emb)+ '_inf_'+ str(params.inf)+ '_regutype_'+ str(params.regutype)+ '_annealing_'+ str(params.annealing)

    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_dropout_' + str(
            params.dropout) + '_LearningRate_' + str(params.eta) + '_' + str(
                l3) + '_emb_' + str(emb) + '_inf_' + str(
                    params.inf) + '_hidden_inf_' + str(params.hidden_inf)

    traindata = getData(params.dataf, words, tagger)
    trainx0, trainy0 = traindata
    devdata = getData(params.dev, words, tagger)
    devx0, devy0 = devdata
    print 'dev set', len(devx0)
    testdata = getData(params.test, words, tagger)
    testx0, testy0 = testdata

    print 'test set', len(testx0)
    #print Y
    print "Using Training Data" + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile
    #lm = LM_model(params)
    #lm.train(trainy0, devy0, params)

    if (inf == 0) or (inf == 1):
        tm = CRF_model(We, params)
        tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
    elif (inf == 2):
        from model_selection_inference_seq2seq_h import CRF_seq2seq_model

        params.de_hidden_size = hidden_inf
        #params.de_hidden_size = 50
        params.outfile = 'h_pos_de_hidden_size_' + str(
            params.de_hidden_size) + '_' + params.outfile
        tm = CRF_seq2seq_model(We, params)
        tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
    elif (inf == 3):
        from model_selection_inference_seq2seq_h_beamsearch import CRF_seq2seq_model

        params.de_hidden_size = hidden_inf
        #params.de_hidden_size = 50
        params.outfile = 'h_pos_de_hidden_size_' + str(
            params.de_hidden_size) + '_' + params.outfile
        tm = CRF_seq2seq_model(We, params)
        tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
def Base(eta, l2, num_filters, inf, tagversion, hidden):
    params.outfile = 'base_ccctag_inf_'
    params.dataf = '../supertag_data/train.dat'
    params.dev = '../supertag_data/dev.dat'
    params.test = '../supertag_data/test.dat'
    params.batchsize = 10
    params.hidden = hidden
    params.embedsize = 100
    params.eta = eta
    params.L2 = l2
    params.dropout = 1
    params.emb = 1
    params.inf = inf

    params.char_embedd_dim = 30
    params.num_filters = num_filters

    params.en_hidden_size = hidden
    params.de_hidden_size = hidden
    params.lstm_layers_num = 1

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape

    if (tagversion == 0):
        tagger = getTagger('../supertag_data/tagger_100')
    elif (tagversion == 1):
        tagger = getTagger('../supertag_data/tagger_200')
    else:
        tagger = getTagger('../supertag_data/tagger_400')
    params.num_labels = len(tagger)

    char_dic = getTagger('../supertag_data/char_dic')

    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.outfile = params.outfile + ".num_filters_" + str(
        num_filters) + '_LearningRate_' + str(
            params.eta) + '_inf_' + str(inf) + '_hidden_' + str(
                params.hidden) + '_' + str(tagversion) + '_' + str(l2)
    #examples are shuffled data

    train = getSupertagData_and_Char(params.dataf, words, tagger, char_dic)

    dev = getSupertagData_and_Char(params.dev,
                                   words,
                                   tagger,
                                   char_dic,
                                   train=False)

    test = getSupertagData_and_Char(params.test,
                                    words,
                                    tagger,
                                    char_dic,
                                    train=False)

    if (inf == 0) or (inf == 1):
        from base_model_selection_ccctag import base_model
        tm = base_model(We, char_embedd_table, params)
        tm.train(train, dev, test, params)

    elif (inf == 2):
        from seq2seq_att_pos import Seq2Seq
        #from seq2seq_att_pos_new import Seq2Seq
        tm = Seq2Seq(We, char_embedd_table, params)
        tm.train(train, dev, test, params)
    elif (inf == 3):
        ##from seq2seq_att_pos_beamsearch import Seq2Seq
        from seq2seq_att_pos_new_beamsearch import Seq2Seq
        tm = Seq2Seq(We, char_embedd_table, params)
        tm.train(train, dev, test, params)