Example #1
def Base(eta, l2):
    params.outfile = 'Base_model'
    params.dataf = 'data/oct27.traindev.proc.cnn'
    params.dev = 'data/oct27.test.proc.cnn'
    params.test = 'data/daily547.proc.cnn'
    params.batchsize = 10
    params.hidden = 100
    params.embedsize = 100
    params.eta = eta
    params.L2 = l2
    params.dropout = 0
    params.frac = 0.1
    params.emb = 0

    (words, We) = getWordmap('wordvects.tw100w5-m40-it2')
    We = np.asarray(We).astype('float32')
    tagger = getTagger('data/tagger')
    print tagger
    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_' + "LearningRate" + '_' + str(
            params.eta) + '_' + str(
                params.hiddensize) + '_' + str(l2) + '.pickle'

    traindata = getData(params.dataf, words, tagger)
    trainx0, trainy0 = traindata
    devdata = getData(params.dev, words, tagger)
    devx0, devy0 = devdata
    testdata = getData(params.test, words, tagger)
    testx0, testy0 = testdata

    tm = base_model(We, params)
    tm.train(traindata, devdata, testdata, params)
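These snippets all assign hyperparameters as attributes on a module-level params object (Example #16 below constructs it with params = params()). For running an excerpt standalone, a minimal stand-in is enough; the params_holder name here is hypothetical, not the repo's own class:

# Hypothetical stand-in for the module-level `params` object the
# snippets mutate; any empty class gives the same attribute access.
class params_holder(object):
    pass

params = params_holder()
params.batchsize = 10  # attributes are created on first assignment
params.eta = 0.05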
Example #2
def LModel(eta, batchsize, dSize, relSize, updatewords):
    trainSize = [50]

    acti = ['relu', 'tanh']
    evaT = ['sum', 'max', 'cause']

    layersize = dSize

    params.frac = 1.0
    params.outfile = 'Model_FA' + '_eta_' + str(eta) + '_dSize_' + str(
        dSize) + '_batchsize_' + str(batchsize) + '_relSize_' + str(
            relSize) + '_trainSize_' + str(
                trainSize[0]) + '_updatewords_' + str(updatewords)
    #params.dataf = '../data/conceptnet/AddModelData/omcs_train_new'+str(trainSize[0])+'.txt'
    #params.dataf = '../data/conceptnet/AddModelData/causes_omcs.txt'
    params.dataf = '../data/conceptnet/AddModelData/new_omcs100.txt'
    params.batchsize = batchsize
    params.hiddensize = 25
    params.type = "MAX"
    params.save = True
    params.constraints = False
    params.embedsize = dSize
    params.relsize = relSize
    params.activation = acti[0]
    params.evaType = evaT[0]
    params.usepeep = True
    params.LC = 0.00001
    params.Lw = 0.01
    params.eta = eta
    params.margin = 1

    (words, We) = getWordmap('../data/conceptnet/embeddings/embeddings.skip.newtask.en.d' + str(dSize) + '.m1.w5.s0.it20.txt')
    #print We.shape
    rel = getRelation('../data/conceptnet/rel.txt')
    params.outfile = "../models/" + params.outfile + "_" + str(params.LC) + "_" + str(params.Lw) + ".txt"
    #examples are shuffled data
    examples = getData(params.dataf)

    params.data = examples[0:int(params.frac * len(examples))]

    #print "Using Training Data "+params.dataf
    #print "Using Word Embeddings with Dimension "+str(dSize)

    #print "Training on "+str(len(params.data))
    #print "Saving models to: "+params.outfile

    Rel_init = np.zeros((35, params.relsize, params.relsize))
    for k in range(35):
        for i in range(params.relsize):
            for j in range(params.relsize):
                if i == j:
                    Rel_init[k][i][j] = 1 + random.uniform(-0.2, 0.2)
                else:
                    Rel_init[k][i][j] = random.uniform(-0.2, 0.2)

    tm = theano_word_model(We, words, layersize, params.embedsize, rel, params.relsize, Rel_init, params.LC, params.Lw, params.eta, params.margin, params.usepeep, updatewords)
    tm.train(params.data, params)
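The nested loop initializes each of the 35 relation matrices to an identity matrix perturbed by uniform noise in [-0.2, 0.2]. A vectorized NumPy equivalent of the same initialization (a sketch; relsize stands in for params.relsize):

# Identity plus uniform noise for all 35 relation matrices at once;
# broadcasting (1, r, r) + (35, r, r) matches the loop above exactly.
import numpy as np
relsize = 25  # hypothetical value; the snippet uses params.relsize
Rel_init = np.eye(relsize)[None, :, :] + np.random.uniform(
    -0.2, 0.2, (35, relsize, relsize))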
Example #3
def Base(eta, l2, num_filters, emb, hidden):
    params.outfile = 'POS_Bilstm_CNN_CRF_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'
    params.batchsize = 10
    params.hidden = hidden
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.L2 = l2
    params.dropout = 1
    params.num_labels = 25
    params.char_embedd_dim = 30
    params.num_filters = num_filters

    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    #words.update({'UUUNKKK':0})
    #a=[0]*len(We[0])
    #newWe = []
    #newWe.append(a)
    #We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../pos_data/tagger')

    char_dic = getTagger('../pos_data/char_dic')

    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    print char_dic
    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_dropout_' + str(
            params.dropout) + "_LearningRate" + '_' + str(
                params.eta) + '_' + str(l2) + str(num_filters) + '_emb_' + str(
                    emb) + '_hidden_' + str(hidden)
    #examples are shuffled data

    traindata = getData_and_Char(params.dataf, words, tagger, char_dic)
    devdata = getData_and_Char(params.dev, words, tagger, char_dic)
    testdata = getData_and_Char(params.test, words, tagger, char_dic)

    print 'train set', len(traindata[2])
    #print Y
    print "Using Training Data " + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile
    #lm = LM_model(params)
    #lm.train(trainy0, devy0, params)

    tm = CRF_model(We, char_embedd_table, params)
    tm.train(traindata, devdata, testdata, params)
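The character table above is drawn from Uniform(-scale, scale) with scale = sqrt(3 / char_embedd_dim); since Var[Uniform(-a, a)] = a^2 / 3, each embedding dimension starts with variance 1/char_embedd_dim. A quick numerical check (a sketch):

# The sqrt(3/d) scale gives each dimension variance 1/d at init.
import numpy as np
d = 30
scale = np.sqrt(3.0 / d)
sample = np.random.uniform(-scale, scale, 100000)
print sample.var(), 1.0 / d  # both come out near 0.033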
Example #4
def Base(eta, epoches):
    params.outfile = 'Pos_sgd_Inf_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'

    num_filters = 30
    emb = 1
    params.char_embedd_dim = 30
    params.num_filters = num_filters
    params.batchsize = 10
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 1

    params.hidden_inf = 300

    params.epoches = epoches

    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    We = np.asarray(We).astype('float32')
    #print We.shape
    tagger = getTagger('../pos_data/tagger')
    #print tagger
    params.words = words
    params.tagger = tagger

    params.num_labels = len(tagger)

    char_dic = getTagger('../pos_data/char_dic')

    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.outfile = params.outfile + '_LearningRate_' + str(
        params.eta) + '_epoches_' + str(epoches)
    print params.outfile

    train = getData_and_Char(params.dataf, words, tagger, char_dic)
    dev = getData_and_Char(params.dev, words, tagger, char_dic)
    test = getData_and_Char(params.test, words, tagger, char_dic)

    from model_selection_pos_finalTuning_sgd_inference import CRF_model
    tm = CRF_model(We, char_embedd_table, params)
    tm.train(train, dev, test, params)
Example #5
def Base(eta, l2, morepara, emb, batchsize):
    params.outfile = 'POS_CRF_Bilstm_Viterbi_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'
    params.batchsize = batchsize
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.L2 = l2
    params.dropout = 0
    params.num_labels = 25

    params.morepara = morepara

    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    #words.update({'UUUNKKK':0})
    #a=[0]*len(We[0])
    #newWe = []
    #newWe.append(a)
    #We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../pos_data/tagger')
    print tagger
    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize
    ) + '_dropout_' + str(params.dropout) + "_LearningRate" + '_' + str(
        params.eta) + '_' + str(l2) + str(morepara) + '_emb_' + str(emb)
    #examples are shuffled data

    traindata = getData(params.dataf, words, tagger)
    trainx0, trainy0 = traindata
    devdata = getData(params.dev, words, tagger)
    devx0, devy0 = devdata
    print 'dev set', len(devx0)
    testdata = getData(params.test, words, tagger)
    testx0, testy0 = testdata

    print 'test set', len(testx0)
    #print Y
    print "Using Training Data" + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile
    #lm = LM_model(params)
    #lm.train(trainy0, devy0, params)

    tm = CRF_model(We, params)
    tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
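Each example rebuilds its output filename by chaining str() conversions, which is easy to mistype. An equivalent single format string for the name above (a sketch of the same result, not a change to the repo's code):

# Same outfile as the concatenation above, in one format call.
params.outfile = ('POS_CRF_Bilstm_Viterbi_.Batchsize_{0}_dropout_{1}'
                  '_LearningRate_{2}_{3}{4}_emb_{5}').format(
                      params.batchsize, params.dropout, params.eta,
                      l2, morepara, emb)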
Example #6
def Base(eta, epoches):
    params.outfile = 'Pos_sgd_Inf_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'

    l3 = 0
    emb = 0
    params.char_embedd_dim = 30

    params.batchsize = 10
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 0

    params.regutype = 0
    params.annealing = 0
    params.L3 = l3

    params.epoches = epoches

    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    We = np.asarray(We).astype('float32')
    #print We.shape
    tagger = getTagger('../pos_data/tagger')
    #print tagger
    params.words = words
    params.tagger = tagger

    params.num_labels = len(tagger)

    char_dic = getTagger('../pos_data/char_dic')

    params.char_dic = char_dic

    params.outfile = params.outfile + '_LearningRate_' + str(
        params.eta) + '_epoches_' + str(epoches)
    print params.outfile

    train = getData_and_Char(params.dataf, words, tagger, char_dic)
    dev = getData_and_Char(params.dev, words, tagger, char_dic)
    test = getData_and_Char(params.test, words, tagger, char_dic)

    from model_selection_sgd_simple_inference import CRF_model
    tm = CRF_model(We, params)
    tm.train(train, dev, test, params)
Example #7
parser.add_argument("-learner", help="Either AdaGrad or Adam.")
parser.add_argument("-num_examples", help="Number of examples to use in training. If not set, will use all examples.", type=int)

args = parser.parse_args()

params.LW = args.LW
params.outfile = args.outfile
params.batchsize = args.batchsize
params.dim = args.dim
params.wordfile = args.wordfile
params.save = str2bool(args.save)
params.train = args.train
params.margin = args.margin
params.type = args.samplingtype
params.epochs = args.epochs
params.evaluate = str2bool(args.evaluate)
params.learner = str2learner(args.learner)
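# NOTE: the next line pins AdaGrad and unconditionally overrides the
# -learner flag parsed just above; drop it to honor str2learner's choice.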
params.learner = lasagne.updates.adagrad

(words, We) = getWordmap(params.wordfile)
examples = getData(params.train, words)

if args.num_examples:
    examples = examples[0:args.num_examples]

print "Number of training examples: ", len(examples)
print sys.argv

model = paragram_word_model(We, params)

train(model,examples,words,params)
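str2bool and str2learner are used above but not defined in this excerpt. A plausible minimal sketch of both (hypothetical; the repo's own helpers may differ):

import lasagne

def str2bool(v):
    # Treat common truthy spellings as True, everything else as False.
    return v is not None and v.lower() in ('true', 't', '1', 'yes')

def str2learner(v):
    # Map the -learner flag to a Lasagne update rule.
    if v is None or v.lower() == 'adagrad':
        return lasagne.updates.adagrad
    if v.lower() == 'adam':
        return lasagne.updates.adam
    raise ValueError("learner must be AdaGrad or Adam")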
Example #8
#learning rate
params.eta = float(sys.argv[7])
#size of hidden layer
params.hiddensize = int(sys.argv[8])
#way to sample negative examples. possible inputs: 'MIX','RAND','MAX'
params.type = str(sys.argv[9])
#fraction of the training data to use; 1 means the full training set
params.frac = float(sys.argv[10])
params.outfile = 'DNN_Hinge'+'trainSize100dSize'+str(sys.argv[1])+'relSize'+str(sys.argv[2])+'acti'+str(sys.argv[3])
params.dataf = '../commonsendata/Training/new_omcs100.txt'
params.save = False
params.constraints = False
params.evaType = 'cause'
params.margin = 1

(words, We) = getWordmap('../commonsendata/embeddings/tuples/embeddings.skip.newtask.en.d'+str(sys.argv[1])+'.m1.w5.s0.it20.txt')
# print We.shape
rel = getRelation('../commonsendata/Training/rel.txt')
params.outfile = "../models/"+params.outfile+"."+str(params.lam)+"."+str(params.batchsize)+"."+params.type+"."+params.activation+".txt"

examples = getData(params.dataf)

params.data = examples[0:int(params.frac*len(examples))]

print "Using Training Data"+params.dataf
print "Using Word Embeddings with Dimension "+str(sys.argv[1])

print "Training on "+str(len(params.data))+" examples using lambda="+str(params.lam)
print "Saving models to: "+params.outfile

Rel_init = np.zeros((35,params.relsize))
Example #9
    parser.add_argument("-learner",
                        help="Either AdaGrad or Adam.",
                        default='AdaGrad')

    args = parser.parse_args()

    params.LW = args.LW
    params.LC = args.LC
    params.eta = args.eta
    params.outfile = args.outfile
    params.batchsize = args.batchsize
    params.hiddensize = args.dim
    params.wordfile = args.wordfile
    params.layersize = args.layersize
    params.updatewords = str2bool(args.updatewords)
    params.wordstem = args.wordstem
    params.save = str2bool(args.save)
    params.train = args.train
    params.margin = args.margin
    params.clip = args.clip
    params.type = args.samplingtype
    params.epochs = args.epochs
    params.learner = learner2bool(args.learner)

    (words, We) = utils.getWordmap(params.wordfile)
    examples = utils.getData(params.train, words)

    model = word_ass_model(We, params)

    train(model, examples, words, params)
Example #10
parser.add_argument(
    "--margin_type",
    help="different training method: 0 = margin rescaling, 1 = contrastive, "
    "2 = perceptron, 3 = slack rescaling",
    type=int,
    default=0)
params = parser.parse_args()

params.dataf = '../pos_data/oct27.traindev.proc.cnn'
params.dev = '../pos_data/oct27.test.proc.cnn'
params.test = '../pos_data/daily547.proc.cnn'

params.hidden = 100
params.embedsize = 100

(words, We) = getWordmap('wordvects.tw100w5-m40-it2')
We = np.asarray(We).astype('float32')
tagger = getTagger('../pos_data/tagger')
params.tagger = tagger
params.words = words
params.outfile = "ADV_CRF_LSTM_LM_Batchsize" + '_' + str(
    params.batchsize) + '_dropout_' + str(
        params.dropout) + '_LearningRate_' + str(params.eta) + '_' + str(
            params.l2) + '_' + str(params.l3) + '_emb_' + str(params.emb)

traindata = getData(params.dataf, words, tagger)
trainx0, trainy0 = traindata
devdata = getData(params.dev, words, tagger)
devx0, devy0 = devdata
testdata = getData(params.test, words, tagger)
testx0, testy0 = testdata
Example #11
def Base(eta, l3, emb, batchsize, inf, hidden_inf):
    params.outfile = 'h_pos_CRF_Inf_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'

    params.batchsize = batchsize
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 0
    params.hidden_inf = hidden_inf
    params.small = 0

    params.inf = inf
    params.regutype = 0
    params.annealing = 0
    params.L3 = l3

    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    #words.update({'UUUNKKK':0})
    #a=[0]*len(We[0])
    #newWe = []
    #newWe.append(a)
    #We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../pos_data/tagger')
    print tagger
    params.words = words
    params.tagger = tagger

    params.num_labels = len(tagger)

    #params.outfile = params.outfile+".Batchsize"+'_'+str(params.batchsize)+'_dropout_'+ str(params.dropout) + '_LearningRate_'+str(params.eta)+ '_'  + str(l3) +'_emb_'+ str(emb)+ '_inf_'+ str(params.inf)+ '_regutype_'+ str(params.regutype)+ '_annealing_'+ str(params.annealing)

    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_dropout_' + str(
            params.dropout) + '_LearningRate_' + str(params.eta) + '_' + str(
                l3) + '_emb_' + str(emb) + '_inf_' + str(
                    params.inf) + '_hidden_inf_' + str(params.hidden_inf)

    traindata = getData(params.dataf, words, tagger)
    trainx0, trainy0 = traindata
    devdata = getData(params.dev, words, tagger)
    devx0, devy0 = devdata
    print 'dev set', len(devx0)
    testdata = getData(params.test, words, tagger)
    testx0, testy0 = testdata

    print 'test set', len(testx0)
    #print Y
    print "Using Training Data" + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile
    #lm = LM_model(params)
    #lm.train(trainy0, devy0, params)

    if (inf == 0) or (inf == 1):
        tm = CRF_model(We, params)
        tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
    elif (inf == 2):
        from model_selection_inference_seq2seq_h import CRF_seq2seq_model

        params.de_hidden_size = hidden_inf
        #params.de_hidden_size = 50
        params.outfile = 'h_pos_de_hidden_size_' + str(
            params.de_hidden_size) + '_' + params.outfile
        tm = CRF_seq2seq_model(We, params)
        tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
    elif (inf == 3):
        from model_selection_inference_seq2seq_h_beamsearch import CRF_seq2seq_model

        params.de_hidden_size = hidden_inf
        #params.de_hidden_size = 50
        params.outfile = 'h_pos_de_hidden_size_' + str(
            params.de_hidden_size) + '_' + params.outfile
        tm = CRF_seq2seq_model(We, params)
        tm.train(trainx0, trainy0, devx0, devy0, testx0, testy0, params)
Example #12
def Base(eta, l3, emb, num_filters, inf, hidden_inf):
    params.outfile = 'Pos_CRF_Inf_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'

    params.char_embedd_dim = 30
    params.num_filters = num_filters
    params.batchsize = 10
    params.hidden = 100
    params.embedsize = 100
    params.emb = emb
    params.eta = eta
    params.dropout = 1
    params.hidden_inf = hidden_inf
    params.small = 0

    params.inf = inf
    params.regutype = 0
    params.annealing = 0
    params.L3 = l3

    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../pos_data/tagger')
    print tagger
    params.words = words
    params.tagger = tagger

    params.num_labels = len(tagger)

    char_dic = getTagger('../pos_data/char_dic')

    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.outfile = params.outfile + ".num_filters" + '_' + str(
        num_filters) + '_dropout_' + str(
            params.dropout) + '_LearningRate_' + str(params.eta) + '_' + str(
                l3) + '_emb_' + str(emb) + '_inf_' + str(
                    params.inf) + '_hidden_inf_' + str(params.hidden_inf)

    train = getData_and_Char(params.dataf, words, tagger, char_dic)
    dev = getData_and_Char(params.dev, words, tagger, char_dic)
    test = getData_and_Char(params.test, words, tagger, char_dic)

    if (inf == 0) or (inf == 1):
        from model_selection_inference import CRF_model
        tm = CRF_model(We, char_embedd_table, params)
        tm.train(train, dev, test, params)

    elif (inf == 2):
        from model_selection_inference_seq2seq import CRF_seq2seq_model

        params.de_hidden_size = hidden_inf
        tm = CRF_seq2seq_model(We, char_embedd_table, params)
        tm.train(train, dev, test, params)

    elif (inf == 3):
        from model_selection_inference_seq2seq_beamsearch import CRF_seq2seq_model
        params.de_hidden_size = hidden_inf
        tm = CRF_seq2seq_model(We, char_embedd_table, params)
        tm.train(train, dev, test, params)
Example #13
def Base(eta, l2, inf, hidden_size):
    params.outfile = 'h_base_pos_inf_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'
    params.batchsize = 10
    params.hidden = hidden_size
    params.embedsize = 100
    params.eta = eta
    params.L2 = l2
    params.dropout = 0
    params.emb = 0
    params.inf = inf

    params.en_hidden_size = hidden_size
    """
    change it later
    """
    params.de_hidden_size = hidden_size
    params.lstm_layers_num = 1
    params.num_labels = 25

    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    #words.update({'UUUNKKK':0})
    #a=[0]*len(We[0])
    #newWe = []
    #newWe.append(a)
    #We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../pos_data/tagger')
    print tagger
    params.outfile = params.outfile + ".Batchsize" + '_' + str(
        params.batchsize) + '_LearningRate_' + str(params.eta) + '_inf_' + str(
            inf) + '_hidden_' + str(params.hidden) + '_' + str(l2)
    #examples are shuffled data

    traindata = getData(params.dataf, words, tagger)
    trainx0, trainy0 = traindata
    #N = int(params.frac*len(trainx0))
    #traindata = trainx0[:N], trainy0[:N]

    devdata = getData(params.dev, words, tagger)
    devx0, devy0 = devdata
    print 'dev set', len(devx0)
    testdata = getData(params.test, words, tagger)
    testx0, testy0 = testdata

    print 'test set', len(testx0)
    print "Using Training Data " + params.dataf
    print "Using Word Embeddings with Dimension " + str(params.embedsize)
    print "Saving models to: " + params.outfile

    if (inf == 0) or (inf == 1):
        tm = base_model(We, params)
        tm.train(traindata, devdata, testdata, params)
    #elif(inf == 2):
    #    from seq2seq import Seq2Seq
    #    tm = Seq2Seq(We, params)
    #    tm.train(traindata, devdata, testdata, params)
    elif inf == 2:
        from seq2seq_att_pos_h import Seq2Seq
        tm = Seq2Seq(We, params)
        tm.train(traindata, devdata, testdata, params)
    elif inf == 3:
        from seq2seq_att_pos_h_beamsearch import Seq2Seq
        tm = Seq2Seq(We, params)
        tm.train(traindata, devdata, testdata, params)
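When inf == 3 the decoder swaps greedy prediction for beam search. The imported module isn't shown on this page; as a reference point, here is a minimal beam search over per-step tag scores (a sketch, independent of the repo's implementation, which re-scores each step conditioned on the decoded prefix):

def beam_search_tags(log_probs, beam_size=5):
    # log_probs: a (seq_len, num_labels) array (or list of lists) of
    # per-step tag log-probabilities. Keeps the beam_size best partial
    # tag sequences at each step and returns the top-scoring one.
    beams = [([], 0.0)]
    for step in log_probs:
        candidates = [(seq + [tag], score + step[tag])
                      for seq, score in beams
                      for tag in range(len(step))]
        candidates.sort(key=lambda c: -c[1])
        beams = candidates[:beam_size]
    return beams[0]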
Example #14
def Base(eta, l2, num_filters, inf, hidden_size):
    params.outfile = 'base_pos_inf_'
    params.dataf = '../pos_data/oct27.traindev.proc.cnn'
    params.dev = '../pos_data/oct27.test.proc.cnn'
    params.test = '../pos_data/daily547.proc.cnn'
    params.batchsize = 10
    params.hidden = hidden_size
    params.embedsize = 100
    params.eta = eta
    params.L2 = l2
    params.dropout = 1
    params.emb = 1
    params.inf = inf

    params.char_embedd_dim = 30
    params.num_filters = num_filters
    params.en_hidden_size = hidden_size
    """
	change it later
	"""
    params.de_hidden_size = hidden_size
    params.lstm_layers_num = 1
    params.num_labels = 25
    params.layers_num = 3

    (words, We) = getWordmap('../embedding/wordvects.tw100w5-m40-it2')
    #words.update({'UUUNKKK':0})
    #a=[0]*len(We[0])
    #newWe = []
    #newWe.append(a)
    #We = newWe + We
    We = np.asarray(We).astype('float32')
    print We.shape
    tagger = getTagger('../pos_data/tagger')
    print tagger

    char_dic = getTagger('../pos_data/char_dic')

    params.char_dic = char_dic

    scale = np.sqrt(3.0 / params.char_embedd_dim)
    char_embedd_table = np.random.uniform(
        -scale, scale,
        [len(char_dic), params.char_embedd_dim]).astype(theano.config.floatX)

    params.outfile = params.outfile + ".num_filters" + '_' + str(
        num_filters) + '_LearningRate_' + str(
            params.eta) + '_inf_' + str(inf) + '_hidden_' + str(
                params.hidden) + '_' + str(l2)

    train = getData_and_Char(params.dataf, words, tagger, char_dic)
    dev = getData_and_Char(params.dev, words, tagger, char_dic)
    test = getData_and_Char(params.test, words, tagger, char_dic)

    if (inf == 0) or (inf == 1):
        from base_model_selection import base_model
        tm = base_model(We, char_embedd_table, params)
        tm.train(train, dev, test, params)

    elif (inf == 2):
        from seq2seq_att_pos import Seq2Seq
        tm = Seq2Seq(We, char_embedd_table, params)
        tm.train(train, dev, test, params)

    elif (inf == 3):
        from self_att import Transformer
        tm = Transformer(We, char_embedd_table, params)
        tm.train(train, dev, test, params)
Example #15
    binaryScore = []
    Exp_S_sorted = sorted(Exp_S)
    for j in xrange(len(Exp_S)):
        temp_thr = Exp_S_sorted[j]
        for j1 in xrange(int(len(Exp_S) / 2)):
            if (Exp_S[j1] >= temp_thr):
                right = right + 1
            else:
                wrong = wrong + 1
        for j2 in xrange(int(len(Exp_S) / 2), int(len(Exp_S)), 1):
            if (Exp_S[j2] <= temp_thr):
                right = right + 1
            else:
                wrong = wrong + 1
        if ((right / (len(Exp_S))) > accurancy):
            accurancy = (1.0 * right / (len(Exp_S)))
            threshold = temp_thr
        right = 0
        wrong = 0

    #print 'Dev1-Accurancy',accurancy
    return threshold, accurancy
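The scan above re-walks the whole score list for every candidate threshold with explicit loops. A loop-free NumPy equivalent under the same convention (first half of Exp_S are positive pairs, correct when score >= threshold; second half negative, correct when score <= threshold) — a sketch:

import numpy as np

def best_threshold(Exp_S):
    # Same search as the loop above: evaluate every score as a
    # candidate threshold and return the best (threshold, accuracy).
    s = np.asarray(Exp_S, dtype=float)
    half = len(s) // 2
    thresholds = np.sort(s)
    pos_right = (s[None, :half] >= thresholds[:, None]).sum(axis=1)
    neg_right = (s[None, half:] <= thresholds[:, None]).sum(axis=1)
    acc = (pos_right + neg_right) / float(len(s))
    best = acc.argmax()
    return thresholds[best], acc[best]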


if __name__ == "__main__":
    (words, We) = getWordmap('../data/conceptnet/embeddings.txt')
    tm = theano_word_model(We)
    rel = getRelation('../data/conceptnet/rel.txt')
    Rel = tm.getRel()
    evaluate_adagrad(We, words, Rel, rel)
Example #16
warnings.filterwarnings("ignore")
params = params()

if __name__ == "__main__":

    # Here the file 'sample' includes the tweets
    params.dataf = '../data/sample'
    params.hiddensize = 512
    params.window1 = 0
    params.taggerhiddensize = 512
    params.encodesize = 256
    # the context window size
    params.contextsize = 1

    (words, We) = getWordmap('../embeddings/wordvects.tw100w5-m40-it2')
    params.words = words

    params.embedsize = len(We[0])
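    # Prepend a '<s>' (sentence-start) vector at index 0. This assumes
    # getWordmap reserves index 0, so existing word indices stay valid.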
    words.update({'<s>': 0})
    a = np.random.rand(len(We[0]))
    newWe = []
    newWe.append(a)
    We = newWe + We
    We = np.asarray(We).astype('float32')

    tagger = getTagger('../data/tagger')
    params.tagger = tagger

    taggerlist = getTaggerList('../data/tagger')
    params.taggerlist = taggerlist
Example #17
    num_unks = 0
    for i in examples:
        (v1,t1) = utils.lookup_with_unk(We,words,i[0])
        (v2,t2) = utils.lookup_with_unk(We,words,i[1])
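        # Assuming cosine is scipy.spatial.distance.cosine, a distance
        # equal to 1 - cos; -1*d + 1 recovers the cosine similarity.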
        pred.append(-1*cosine(v1,v2)+1)
        if t1:
            num_unks += 1
        if t2:
            num_unks += 1
        gold.append(i[2])
    return (spearmanr(pred,gold)[0], num_unks)

def evaluateWordSim(We, words):
    ws353ex = read_data('../data/wordsim353.txt')
    ws353sim = read_data('../data/wordsim-sim.txt')
    ws353rel = read_data('../data/wordsim-rel.txt')
    simlex = read_data('../data/SimLex-999.txt')
    (c1,u1) = getCorrelation(ws353ex,We,words)
    (c2,u2) = getCorrelation(ws353sim,We,words)
    (c3,u3) = getCorrelation(ws353rel,We,words)
    (c4,u4) = getCorrelation(simlex,We,words)
    return ([c1,c2,c3,c4],[u1,u2,u3,u4])

def evaluate_all(model,words):
    (corr, unk) = evaluateWordSim(model.all_params[0].get_value(),words)
    s="{0} {1} {2} {3} ws353 ws-sim ws-rel sl999".format(corr[0], corr[1], corr[2], corr[3])
    print s

if __name__ == "__main__":
    (words, We) = utils.getWordmap('../data/glove_small.txt')
    print evaluateWordSim(We, words)
Example #18
        pred.append(-1 * cosine(v1, v2) + 1)
        if t1:
            num_unks += 1
        if t2:
            num_unks += 1
        gold.append(i[2])
    return (spearmanr(pred, gold)[0], num_unks)


def evaluateWordSim(We, words):
    ws353ex = read_data('../data/wordsim353.txt')
    ws353sim = read_data('../data/wordsim-sim.txt')
    ws353rel = read_data('../data/wordsim-rel.txt')
    simlex = read_data('../data/SimLex-999.txt')
    (c1, u1) = getCorrelation(ws353ex, We, words)
    (c2, u2) = getCorrelation(ws353sim, We, words)
    (c3, u3) = getCorrelation(ws353rel, We, words)
    (c4, u4) = getCorrelation(simlex, We, words)
    return ([c1, c2, c3, c4], [u1, u2, u3, u4])


def evaluate_all(model, words):
    (corr, unk) = evaluateWordSim(model.all_params[0].get_value(), words)
    s = "{0} {1} {2} {3} ws353 ws-sim ws-rel sl999".format(
        corr[0], corr[1], corr[2], corr[3])
    print s


if __name__ == "__main__":
    (words, We) = utils.getWordmap('../data/glove_small.txt')
    print evaluateWordSim(We, words)
Example #19
params.outfile = 'Bilinear_Hinge' + 'trainSize100dSize' + str(
    sys.argv[1]) + 'relSize' + str(sys.argv[2]) + 'acti' + str(sys.argv[3])
params.dataf = '../commonsendata/Training/new_omcs100.txt'
#if you want to save the model, just change this to 'True'
params.save = False
params.constraints = False
params.activation = 'tanh'
params.evaType = 'cause'
params.usepeep = True
params.margin = 1

# (words, We) = getWordmap('../commonsendata/embeddings/tuples/embeddings.skip.newtask.en.d'+str(sys.argv[1])+'.m1.w5.s0.it20.txt')
# print We.shape
# if downloading data from http://ttic.uchicago.edu/~kgimpel/commonsense.html
(words, We) = getWordmap('../commonsendata/embeddings/embeddings.txt')
rel = getRelation('../commonsendata/Training/rel.txt')
params.outfile = "../models/" + params.outfile + "." + str(
    params.lam) + "." + str(
        params.batchsize
    ) + "." + params.type + "." + params.activation + "." + str(
        params.frac) + ".txt"

#Examples are shuffled data
examples = getData(params.dataf)

params.data = examples[0:int(params.frac * len(examples))]

print "Using Training Data" + params.dataf
print "Using Word Embeddings with Dimension " + str(sys.argv[1])
Example #20
)

args = parser.parse_args()

params.LW = args.LW
params.outfile = args.outfile
params.batchsize = args.batchsize
params.dim = args.dim
params.wordfile = args.wordfile
params.save = str2bool(args.save)
params.train = args.train
params.margin = args.margin
params.type = args.samplingtype
params.epochs = args.epochs
params.evaluate = str2bool(args.evaluate)
params.learner = str2learner(args.learner)
params.learner = lasagne.updates.adagrad

(words, We) = getWordmap(params.wordfile)
examples = getData(params.train, words)

if args.num_examples:
    examples = examples[0 : args.num_examples]

print "Number of training examples: ", len(examples)
print sys.argv

model = paragram_word_model(We, params)

train(model, examples, words, params)