def Base(eta, l2, morepara, emb, batchsize):
    params.outfile = 'NER_CRF_lstm_Viterti_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'
    params.batchsize = batchsize
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.L2 = l2
    params.dropout = 0
    params.num_labels = 17

    params.morepara = morepara

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../ner_data/ner_bioes')
    print tagger
    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')
    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize
    ) + '_dropout_' + str(params.dropout) + "_LearningRate" + '_' + str(
        params.eta) + '_' + str(l2) + str(morepara) + '_emb_' + str(emb)
    #examples are shuffled data

    trainx0, trainy0, _, _ = Get_Ner_bioes(params.dataf, words, tagger)
    traindata = trainx0, trainy0
    #N = int(params.frac*len(trainx0))
    #traindata = trainx0[:N], trainy0[:N]

    devx0, devy0, params.devrawx, params.devpos = Get_Ner_bioes(
        params.dev, words, tagger)
    devdata = devx0, devy0
    print devy0[:10]
    print 'dev set', len(devx0)
    testx0, testy0, params.testrawx, params.testpos = Get_Ner_bioes(
        params.test, words, tagger)
    testdata = testx0, testy0

    print 'test set', len(testx0)
    #print Y
    print "Using Training Data" + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile

    tm = CRF_model(We, params)
    tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
def Base(eta, l2, num_filters, emb, hidden):
    params.outfile = 'NER_BiLSTM_CNN_CRF_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'
    params.batchsize = 10
    params.hidden = hidden
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.L2 = l2
    params.dropout = 1
    params.num_labels = 17
    params.char_embedd_dim = 30
    params.num_filters = num_filters

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../ner_data/ner_bioes')
    print tagger
    char_dic = getTagger('../ner_data/char_dic')
    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')
    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_dropout_' + str(
            params.dropout) + "_LearningRate" + '_' + str(
                params.eta) + '_' + str(l2) + '_' + str(
                    num_filters) + '_hidden_' + str(hidden)

    trainx0, trainx0_char, trainy0, _, _ = Get_Ner_bioes_and_Char(
        params.dataf, words, tagger, char_dic)
    train = trainx0, trainy0, trainx0_char

    devx0, devx0_char, devy0, params.devrawx, params.devpos = Get_Ner_bioes_and_Char(
        params.dev, words, tagger, char_dic)
    dev = devx0, devy0, devx0_char
    print devy0[:10]
    print 'dev set', len(devx0)
    testx0, testx0_char, testy0, params.testrawx, params.testpos = Get_Ner_bioes_and_Char(
        params.test, words, tagger, char_dic)
    test = testx0, testy0, testx0_char

    print 'test set', len(testx0)
    #print Y
    print "Using Training Data" + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile

    tm = CRF_model(We, char_embedd_table, params)
    tm.train(train, dev, test, params)
# Example #3 (scraped page marker, originally "Пример #3" / "0" — not valid Python, commented out)
def Base(eta, l2, num_filters, inf, hidden_size):
    params.outfile = 'base_ner_inf_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'

    params.batchsize = 10
    params.hidden = hidden_size
    params.embedsize = 100
    params.eta = eta
    params.L2 = l2
    params.dropout = 1
    params.emb = 1
    params.inf = inf

    params.char_embedd_dim = 30
    params.num_filters = num_filters
    params.en_hidden_size = hidden_size
    params.de_hidden_size = hidden_size
    params.lstm_layers_num = 1
    params.num_labels = 17
    params.layers_num = 3

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../ner_data/ner_bioes')

    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')
    print tagger

    char_dic = getTagger('../ner_data/char_dic')
    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')
    params.outfile = params.outfile + '_dropout_' + str(
        params.dropout) + "_LearningRate" + '_' + str(
            params.eta) + '_inf_' + str(inf) + '_' + str(l2) + '_' + str(
                num_filters) + '_hidden_' + str(hidden_size)

    trainx0, trainx0_char, trainy0, _, _ = Get_Ner_bioes_and_Char(
        params.dataf, words, tagger, char_dic)
    train = trainx0, trainy0, trainx0_char

    devx0, devx0_char, devy0, params.devrawx, params.devpos = Get_Ner_bioes_and_Char(
        params.dev, words, tagger, char_dic)
    dev = devx0, devy0, devx0_char
    print devy0[:10]
    print 'dev set', len(devx0)
    testx0, testx0_char, testy0, params.testrawx, params.testpos = Get_Ner_bioes_and_Char(
        params.test, words, tagger, char_dic)
    test = testx0, testy0, testx0_char

    if (inf == 0) or (inf == 1):
        from base_ner_model_selection import base_model
        tm = base_model(We, char_embedd_table, params)
        tm.train(train, dev, test, params)

    elif (inf == 2):
        from seq2seq_att_ner import Seq2Seq
        tm = Seq2Seq(We, char_embedd_table, params)
        tm.train(train, dev, test, params)

    elif (inf == 3):
        from self_att_ner import Transformer
        tm = Transformer(We, char_embedd_table, params)
        tm.train(train, dev, test, params)
# Example #4 (scraped page marker, originally "Пример #4" / "0" — not valid Python, commented out)
def Base(eta, l2, inf, hidden_size):
	params.outfile = 'h_base_ner_inf_'
	params.dataf = '../ner_data/eng.train.bioes.conll'
        params.dev = '../ner_data/eng.dev.bioes.conll'
        params.test = '../ner_data/eng.test.bioes.conll'

	params.batchsize = 10
	params.hidden = hidden_size
	params.embedsize = 100
	params.eta = eta
	params.L2 = l2
	params.dropout = 0
	params.emb =0	
	params.inf = inf

	params.en_hidden_size= hidden_size
	params.de_hidden_size= hidden_size
	params.lstm_layers_num =1
	params.num_labels = 17	

	(words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
	words.update({'UUUNKKK':0})
	a=[0]*len(We[0])
	newWe = []
	newWe.append(a)
	We = newWe + We
	We = np.asarray(We).astype('float32')
	print We.shape
	tagger = getTagger('../ner_data/ner_bioes')

	params.taggerlist = getTaggerlist('../ner_data/ner_bioes')
	print tagger
	params.outfile = params.outfile+".Batchsize"+'_'+str(params.batchsize)+'_LearningRate_'+str(params.eta)+ '_inf_' +str(inf) + '_' + str(l2) + '_'+ str(hidden_size)
                                #examples are shuffled data
	trainx0, trainy0, _ , _ = Get_Ner_bioes(params.dataf, words, tagger)
        traindata = trainx0, trainy0
        #N = int(params.frac*len(trainx0))
        #traindata = trainx0[:N], trainy0[:N]


        devx0, devy0,  params.devrawx, params.devpos = Get_Ner_bioes(params.dev, words, tagger)
        devdata = devx0, devy0
        print devy0[:10]
        print 'dev set',  len(devx0)
        testx0, testy0, params.testrawx, params.testpos  = Get_Ner_bioes(params.test, words, tagger)
        testdata = testx0, testy0


        print 'test set', len(testx0)
        #print Y
        print "Using Training Data"+params.dataf
        print "Using Word Embeddings with Dimension "+str(params.embedsize)
        print "Saving models to: "+params.outfile

	
		
	if (inf ==0) or (inf==1):
		tm = base_model(We, params)
		tm.train(traindata, devdata, testdata, params)
	#elif(inf ==2):
	#	from seq2seq import Seq2Seq
	#	tm = Seq2Seq(We, params)
	#	tm.train(traindata, devdata, testdata, params)
	elif(inf ==2):
                from seq2seq_att_ner_h import Seq2Seq
		#from seq2seq_att_ner_beamsearch import Seq2Seq
		#params.de_hidden_size=200
		#params.outfile = 'de_hidden_200_' + params.outfile
                tm = Seq2Seq(We, params)
                tm.train(traindata, devdata, testdata, params)
	elif(inf ==3):
                #from seq2seq_att_ner import Seq2Seq
                from seq2seq_att_ner_h_beamsearch import Seq2Seq
                #params.de_hidden_size=200
                #params.outfile = 'de_hidden_200_' + params.outfile
                tm = Seq2Seq(We, params)
                tm.train(traindata, devdata, testdata, params)

	elif(inf ==4):
                #from seq2seq_att_all import Seq2Seq
		from seq2seq_local_att_ner import Seq2Seq

		params.window =int(sys.argv[5])
		params.outfile = 'local_att_window_' + str(params.window)+ '_attweight_' +  sys.argv[6] + params.outfile
                tm = Seq2Seq(We, params)
                tm.train(traindata, devdata, testdata, params)
# Example #5 (scraped page marker, originally "Пример #5" / "0" — not valid Python, commented out)
def Base(eta, l3, emb, num_filters, inf, hidden_inf):
    """Configure and train the CRF inference-network NER model.

    eta         -- learning rate
    l3          -- L3 regularization weight, stored on params
    emb         -- embedding fine-tuning switch, stored on params
    num_filters -- number of character-CNN filters
    inf         -- inference mode: 0/1 local inference network,
                   2 seq2seq, otherwise seq2seq + beam search
    hidden_inf  -- inference-network / decoder hidden size
    """
    # Corpus locations and fixed hyper-parameters.
    params.outfile = 'CRF_Inf_NER_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'

    params.batchsize = 10
    params.hidden = 200
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 1
    params.hidden_inf = hidden_inf

    params.char_embedd_dim = 30
    params.num_filters = num_filters

    params.inf = inf
    params.regutype = 0
    params.annealing = 1
    params.L3 = l3

    # GloVe vectors with a zero row prepended for the unknown token.
    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    zero_row = [0] * len(We[0])
    We = np.asarray([zero_row] + We).astype('float32')
    tagger = getTagger('../ner_data/ner_bioes')
    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')

    char_dic = getTagger('../ner_data/char_dic')
    params.char_dic = char_dic

    # Character embeddings initialized uniformly in [-sqrt(3/d), sqrt(3/d)].
    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.words = words
    params.tagger = tagger

    # Encode every hyper-parameter in the output file name.
    params.outfile = ''.join([
        params.outfile,
        '.num_filters_', str(num_filters),
        '_dropout_', str(params.dropout),
        '_LearningRate_', str(params.eta),
        '_', str(l3),
        '_emb_', str(emb),
        '_inf_', str(params.inf),
        '_hidden_', str(params.hidden_inf),
        '_annealing_', str(params.annealing),
    ])

    trainx0, trainx0_char, trainy0, _, _ = Get_Ner_bioes_and_Char(
        params.dataf, words, tagger, char_dic)
    train = trainx0, trainy0, trainx0_char

    devx0, devx0_char, devy0, params.devrawx, params.devpos = Get_Ner_bioes_and_Char(
        params.dev, words, tagger, char_dic)
    dev = devx0, devy0, devx0_char

    testx0, testx0_char, testy0, params.testrawx, params.testpos = Get_Ner_bioes_and_Char(
        params.test, words, tagger, char_dic)
    test = testx0, testy0, testx0_char

    # Select the model class for the requested inference mode; branches
    # import lazily so only the chosen backend is loaded.
    if (inf == 0) or (inf == 1):
        from model_selection_NER_inference import CRF_model
        tm = CRF_model(We, char_embedd_table, params)
    elif inf == 2:
        from model_selection_inference_NER_seq2seq import CRF_seq2seq_model
        params.de_hidden_size = hidden_inf
        tm = CRF_seq2seq_model(We, char_embedd_table, params)
    else:
        from model_selection_inference_NER_seq2seq_beamsearch import CRF_seq2seq_model
        params.de_hidden_size = hidden_inf
        tm = CRF_seq2seq_model(We, char_embedd_table, params)
    tm.train(train, dev, test, params)
def Base(eta, l3, epoches, warmstart):
    params.outfile = 'CRF_Inf_NER_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'

    emb = 1
    params.batchsize = 10
    params.hidden = 200
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 1

    params.char_embedd_dim = 30
    num_filters = 50
    params.num_filters = num_filters
    params.epoches = epoches

    params.regutype = 0
    params.annealing = 0
    params.L3 = l3

    params.hidden_inf = 200

    params.WarmStart = warmstart

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    tagger = getTagger('../ner_data/ner_bioes')
    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')

    char_dic = getTagger('../ner_data/char_dic')
    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.words = words
    params.tagger = tagger

    params.outfile = params.outfile + ".num_filters" + '_' + str(
        num_filters) + '_LearningRate_' + str(
            params.eta) + '_' + str(l3) + '_emb_' + str(emb)
    print params.outfile

    trainx0, trainx0_char, trainy0, _, _ = Get_Ner_bioes_and_Char(
        params.dataf, words, tagger, char_dic)
    train = trainx0, trainy0, trainx0_char

    devx0, devx0_char, devy0, params.devrawx, params.devpos = Get_Ner_bioes_and_Char(
        params.dev, words, tagger, char_dic)
    dev = devx0, devy0, devx0_char

    testx0, testx0_char, testy0, params.testrawx, params.testpos = Get_Ner_bioes_and_Char(
        params.test, words, tagger, char_dic)
    test = testx0, testy0, testx0_char

    from model_selection_NER_sgd_inference import CRF_model
    tm = CRF_model(We, char_embedd_table, params)
    tm.train(train, dev, test, params)
# Example #7 (scraped page marker, originally "Пример #7" / "0" — not valid Python, commented out)
def Base(eta, l3, emb, batchsize, inf, hidden_inf):
    params.outfile = 'h_CRF_Inf_NER_'
    params.dataf = '../ner_data/eng.train.bioes.conll'
    params.dev = '../ner_data/eng.dev.bioes.conll'
    params.test = '../ner_data/eng.test.bioes.conll'

    params.batchsize = batchsize
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 0
    params.hidden_inf = hidden_inf

    params.inf = inf
    params.regutype = 0
    params.annealing = 0
    params.L3 = l3

    (words, We) = getGloveWordmap('../embedding/glove.6B.100d.txt')
    words.update({'UUUNKKK': 0})
    a = [0] * len(We[0])
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')
    tagger = getTagger('../ner_data/ner_bioes')
    params.taggerlist = getTaggerlist('../ner_data/ner_bioes')

    params.words = words
    params.tagger = tagger

    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_dropout_' + str(
            params.dropout) + '_LearningRate_' + str(params.eta) + '_' + str(
                l3) + '_emb_' + str(emb) + '_inf_' + str(
                    params.inf) + '_hidden_' + str(params.hidden_inf)

    trainx0, trainy0, _, _ = Get_Ner_bioes(params.dataf, words, tagger)
    traindata = trainx0, trainy0

    devx0, devy0, params.devrawx, params.devpos = Get_Ner_bioes(
        params.dev, words, tagger)
    devdata = devx0, devy0
    print devy0[:10]
    print 'dev set', len(devx0)
    testx0, testy0, params.testrawx, params.testpos = Get_Ner_bioes(
        params.test, words, tagger)
    testdata = testx0, testy0

    print "Using Training Data" + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile
    #lm = LM_model(params)
    #lm.train(trainy0, devy0, params)

    if (inf == 0) or (inf == 1):
        from model_selection_NER_inference import CRF_model
        tm = CRF_model(We, params)
        tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
    elif (inf == 2):
        from model_selection_inference_NER_seq2seq_h import CRF_seq2seq_model
        params.de_hidden_size = hidden_inf
        params.outfile = 'h_de_hidden_' + str(
            params.de_hidden_size) + '_' + params.outfile
        tm = CRF_seq2seq_model(We, params)
        tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)

    else:
        from model_selection_inference_NER_seq2seq_h_beamsearch import CRF_seq2seq_model
        params.de_hidden_size = hidden_inf
        params.outfile = 'h_de_hidden_' + str(
            params.de_hidden_size) + '_' + params.outfile
        tm = CRF_seq2seq_model(We, params)
        tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)