def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"):
    print "mode: ", mode
    print "data_name: ", data_name
    print "pooling_mode: ", pooling_mode
    print "Started!"
    rng = numpy.random.RandomState(23455)
    docSentenceCount = T.ivector("docSentenceCount")
    sentenceWordCount = T.ivector("sentenceWordCount")
    corpus = T.matrix("corpus")
    docLabel = T.ivector('docLabel')

    # for list-type data
    layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
                 sentenceLayerNodesNum=50, \
                 sentenceLayerNodesSize=[5, 200], \
                 docLayerNodesNum=10, \
                 docLayerNodesSize=[3, 50],
                 pooling_mode=pooling_mode)
    # 	layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
    # 													 sentenceLayerNodesNum=100, \
    # 													 sentenceLayerNodesSize=[5, 200], \
    # 													 docLayerNodesNum=100, \
    # 													 docLayerNodesSize=[3, 100],
    # 													 pooling_mode=pooling_mode)

    layer1 = HiddenLayer(rng,
                         input=layer0.output,
                         n_in=layer0.outputDimension,
                         n_out=10,
                         activation=T.tanh)

    layer2 = LogisticRegression(input=layer1.output, n_in=10, n_out=2)

    # construct the parameter array.
    params = layer2.params + layer1.params + layer0.params

    # Optionally load the parameters saved last time.

    # 	data_name = "car"

    para_path = "data/" + data_name + "/model/" + pooling_mode + ".model"
    traintext = "data/" + data_name + "/train/text"
    trainlabel = "data/" + data_name + "/train/label"
    testtext = "data/" + test_dataname + "/test/text"
    testlabel = "data/" + test_dataname + "/test/label"

    loadParamsVal(para_path, params)

    if (mode == "train" or mode == "test"):
        learning_rate = 0.1
        error = layer2.errors(docLabel)
        cost = layer2.negative_log_likelihood(docLabel)

        grads = T.grad(cost, params)
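        # Plain SGD: each parameter is stepped against its gradient, scaled by the fixed
        # learning_rate of 0.1.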

        updates = [(param_i, param_i - learning_rate * grad_i)
                   for param_i, grad_i in zip(params, grads)]

        print "Loading test data."
        cr_test = CorpusReader(minDocSentenceNum=5,
                               minSentenceWordNum=5,
                               dataset=testtext,
                               labelset=testlabel)
        validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels, _, _ = cr_test.getCorpus(
            [0, 1000])

        # 		print "Right answer: "
        # 		print zip(validIds, validLabels)

        validDocMatrixes = transToTensor(validDocMatrixes,
                                         theano.config.floatX)
        validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32)
        validSentenceWordNums = transToTensor(validSentenceWordNums,
                                              numpy.int32)
        validLabels = transToTensor(validLabels, numpy.int32)
        print "Data loaded."

        valid_model = theano.function(
            [], [
                cost, error, layer2.y_pred, docLabel,
                T.transpose(layer2.p_y_given_x)[1]
            ],
            givens={
                corpus: validDocMatrixes,
                docSentenceCount: validDocSentenceNums,
                sentenceWordCount: validSentenceWordNums,
                docLabel: validLabels
            })
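        # valid_model takes no explicit inputs: the entire held-out set is bound through
        # `givens`, so a single call returns cost, error, predictions, true labels, and the
        # positive-class probabilities.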

        # ####Validate the model####
        costNum, errorNum, pred_label, real_label, pred_prob = valid_model()
        print "Valid current model:"
        print "Cost: ", costNum
        print "Error: ", errorNum
        # 		print "Valid Pred: ", pred_label
        # 		print "pred_prob: ", pred_prob

        fpr, tpr, _ = roc_curve(real_label, pred_prob)
        if mode == "test":
            print "tpr_all: ", tpr
            print "fpr_all: ", fpr
        roc_auc = auc(fpr, tpr)
        print "data_name: ", data_name
        print "test_dataname: ", test_dataname
        print "ROC: ", roc_auc

        fpr, tpr, threshold = roc_curve(real_label, pred_label)

        index_of_one = list(threshold).index(1)
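        # AR is the balanced accuracy at the threshold == 1 operating point:
        # (TPR + TNR) / 2, with TNR = 1 - FPR.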
        ar = (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
        print "TPR: ", tpr[index_of_one]
        print "FPR: ", fpr[index_of_one]
        print "AR: ", ar
        print "threshold: ", threshold[index_of_one]
        if mode == "test":
            valid_model.free()
            return errorNum, roc_auc, tpr[index_of_one], fpr[index_of_one], ar

        print "Loading train data."
        cr_train = CorpusReader(minDocSentenceNum=5,
                                minSentenceWordNum=5,
                                dataset=traintext,
                                labelset=trainlabel)
        docMatrixes, docSentenceNums, sentenceWordNums, ids, labels, _, _ = cr_train.getCorpus(
            [0, 100000])

        # 		print "Right answer: "
        # 		print zip(ids, labels)

        docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
        docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
        sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
        labels = transToTensor(labels, numpy.int32)

        # 	valid_cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/valid/split", labelset="data/valid/label.txt")
        print
        index = T.lscalar("index")
        batchSize = 10
        n_batches = (len(docSentenceNums.get_value()) - 1 - 1) / batchSize + 1
        print
        print "Train set size is ", len(docMatrixes.get_value())
        print "Validating set size is ", len(validDocMatrixes.get_value())
        print "Batch size is ", batchSize
        print "Number of training batches  is ", n_batches

        print "Compiling computing graph."

        # for list-type data
        train_model = theano.function(
            [index], [cost, error, layer2.y_pred, docLabel],
            updates=updates,
            givens={
                corpus:
                docMatrixes,
                docSentenceCount:
                docSentenceNums[index * batchSize:(index + 1) * batchSize + 1],
                sentenceWordCount:
                sentenceWordNums,
                docLabel:
                labels[index * batchSize:(index + 1) * batchSize]
            })
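        # Note: the docSentenceCount slice takes one extra element per batch and n_batches above
        # subtracts an extra 1, which suggests docSentenceNums stores cumulative sentence
        # boundaries (one more entry than there are documents) rather than per-document counts.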

        print "Compiled."
        print "Start to train."
        epoch = 0
        n_epochs = 10
        ite = 0

        while (epoch < n_epochs):
            epoch = epoch + 1
            #######################
            for i in range(n_batches):
                # for list-type data
                costNum, errorNum, pred_label, real_label = train_model(i)
                ite = ite + 1
                # for padding data
                # 			costNum, errorNum = train_model(docMatrixes, labels)
                # 			del docMatrixes, docSentenceNums, sentenceWordNums, labels
                # print ".",
                if (ite % 10 == 0):
                    print
                    print "@iter: ", ite
                    print "Cost: ", costNum
                    print "Error: ", errorNum

            # Validate the model
            costNum, errorNum, pred_label, real_label, pred_prob = valid_model(
            )
            print "Valid current model:"
            print "Cost: ", costNum
            print "Error: ", errorNum
            # 			print "pred_prob: ", pred_prob
            # 			print "Valid Pred: ", pred_label

            fpr, tpr, _ = roc_curve(real_label, pred_prob)
            roc_auc = auc(fpr, tpr)
            print "data_name: ", data_name
            print "test_dataname: ", test_dataname
            print "ROC: ", roc_auc

            fpr, tpr, threshold = roc_curve(real_label, pred_label)
            index_of_one = list(threshold).index(1)
            print "TPR: ", tpr[index_of_one]
            print "FPR: ", fpr[index_of_one]
            print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
            print "threshold: ", threshold[index_of_one]
            # Save model
            print "Saving parameters."
            saveParamsVal(para_path, params)
            print "Saved."
        valid_model.free()
        train_model.free()
    elif (mode == "deploy"):
        print "Compiling computing graph."
        output_model = theano.function(
            [corpus, docSentenceCount, sentenceWordCount], [layer2.y_pred])
        print "Compiled."
        cr = CorpusReader(minDocSentenceNum=5,
                          minSentenceWordNum=5,
                          dataset="data/train_valid/split")
        count = 21000
        while (count <= 21000):
            docMatrixes, docSentenceNums, sentenceWordNums, ids = cr.getCorpus(
                [count, count + 100])
            docMatrixes = numpy.matrix(docMatrixes, dtype=theano.config.floatX)
            docSentenceNums = numpy.array(docSentenceNums, dtype=numpy.int32)
            sentenceWordNums = numpy.array(sentenceWordNums, dtype=numpy.int32)
            print "start to predict."
            pred_y = output_model(docMatrixes, docSentenceNums,
                                  sentenceWordNums)
            print "End predicting."
            print "Writing resfile."
            # 		print zip(ids, pred_y[0])
            f = file("data/test/res/res" + str(count), "w")
            f.write(str(zip(ids, pred_y[0])))
            f.close()
            print "Written." + str(count)
            count += 100
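# A minimal invocation sketch for the variant above (argument values are assumptions; "car"
# appears only in a commented-out default elsewhere in this file):
# work("train", "car", "car", pooling_mode="average_exc_pad")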
def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"):
	print "mode: ", mode
	print "data_name: ", data_name
	print "pooling_mode: ", pooling_mode
	print "Started!"
	
	data_names = data_name.split(":")
	data_count = len(data_names)
	print "Train dataset:"
	for i in xrange(data_count):
		print "%d: %s" % (i, data_names[i])
		
	print "Test dataset:"
	test_data_names = test_dataname.split(":")
	test_data_count = len(test_data_names)
	for i in xrange(test_data_count):
		print "%d: %s" % (i, test_data_names[i])
	
	if test_data_count != data_count:
		raise Exception("The amount of test and train dataset must be the same.")
	
	rng = numpy.random.RandomState(23455)
	docSentenceCount = T.ivector("docSentenceCount")
	sentenceWordCount = T.ivector("sentenceWordCount")
	corpus = T.matrix("corpus")
	docLabel = T.ivector('docLabel')
	
	hidden_layer_w = None
	hidden_layer_b = None
	logistic_layer_w = None
	logistic_layer_b = None
	layer0 = list()
	layer1 = list()
	layer2 = list()
	local_params = list()
	# for list-type data
	for i in xrange(data_count):
		layer0.append(DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
														 sentenceLayerNodesNum=50, \
														 sentenceLayerNodesSize=[5, 200], \
														 docLayerNodesNum=10, \
														 docLayerNodesSize=[3, 50],
														 pooling_mode=pooling_mode))

		layer1.append(HiddenLayer(
			rng,
			input=layer0[i].output,
			n_in=layer0[i].outputDimension,
			n_out=10,
			activation=T.tanh,
			W=hidden_layer_w,
			b=hidden_layer_b
		))
		
# 		hidden_layer_w = layer1[i].W
# 		hidden_layer_b = layer1[i].b
	
		layer2.append(LogisticRegression(input=layer1[i].output, n_in=10, n_out=2, W=logistic_layer_w, b=logistic_layer_b))
		logistic_layer_w = layer2[i].W
		logistic_layer_b = layer2[i].b
		
		local_params.append(layer0[i].params + layer1[i].params)
	
	share_params = list(layer2[0].params)
	# construct the parameter array.
	params = list(layer2[0].params)
	
	for i in xrange(data_count):
		params += layer1[i].params + layer0[i].params
		
	
# 	data_name = "car"
	
	para_path = "data/" + data_name + "/log_model/" + pooling_mode + ".model"
	traintext = ["data/" + data_names[i] + "/train/text"  for i in xrange(data_count)]
	trainlabel = ["data/" + data_names[i] + "/train/label"  for i in xrange(data_count)]
	testtext = ["data/" + test_data_names[i] + "/test/text"  for i in xrange(data_count)]
	testlabel = ["data/" + test_data_names[i] + "/test/label"  for i in xrange(data_count)]
	
	# Optionally load the parameters saved last time.
	loadParamsVal(para_path, params)

	if(mode == "train" or mode == "test"):
		train_model = list()
		valid_model = list()
		print "Loading train data."
		batchSize = 10
		share_learning_rate = 0.01
		local_learning_rate = 0.1
		n_batches = list()
		
		print "Loading test data."
		
		for i in xrange(data_count):
			cr_train = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=traintext[i], labelset=trainlabel[i])
			docMatrixes, docSentenceNums, sentenceWordNums, ids, labels, _, _ = cr_train.getCorpus([0, 100000])
			
			docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
			docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
			sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
			labels = transToTensor(labels, numpy.int32)
			
			index = T.lscalar("index")
			
			n_batches.append((len(docSentenceNums.get_value())  - 1 - 1) / batchSize + 1)
			print "Dataname: %s" % data_names[i]
			print "Train set size is ", len(docMatrixes.get_value())
			print "Batch size is ", batchSize
			print "Number of training batches  is ", n_batches[i]
			error = layer2[i].errors(docLabel)
			cost = layer2[i].negative_log_likelihood(docLabel)
			
		
			share_grads = T.grad(cost, share_params)
			share_updates = [
				(param_i, param_i - share_learning_rate * grad_i)
				for param_i, grad_i in zip(share_params, share_grads)
			]
			
			grads = T.grad(cost, local_params[i])
			local_updates = [
				(param_i, param_i - local_learning_rate * grad_i)
				for param_i, grad_i in zip(local_params[i], grads)
			]
			updates = share_updates + local_updates
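			# Two learning rates are used here: the shared logistic layer (share_params) takes
			# share_learning_rate (0.01), while each dataset's embedding and hidden layers take
			# local_learning_rate (0.1).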
			print "Compiling train computing graph."
			if mode == "train":
				train_model.append(theano.function(
			 		[index],
			 		[cost, error, layer2[i].y_pred, docLabel],
			 		updates=updates,
			 		givens={
									corpus: docMatrixes,
									docSentenceCount: docSentenceNums[index * batchSize: (index + 1) * batchSize + 1],
									sentenceWordCount: sentenceWordNums,
									docLabel: labels[index * batchSize: (index + 1) * batchSize]
								}
		 		))
			print "Compiled."
			
			print "Load test dataname: %s" % test_data_names[i]
			cr_test = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=testtext[i], labelset=testlabel[i])
			validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels, _, _ = cr_test.getCorpus([0, 1000])
			validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX)
			validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32)
			validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32)
			validLabels = transToTensor(validLabels, numpy.int32)
			print "Validating set size is ", len(validDocMatrixes.get_value())
			print "Data loaded."
			
			print "Compiling test computing graph."
			valid_model.append(theano.function(
		 		[],
		 		[cost, error, layer2[i].y_pred, docLabel, T.transpose(layer2[i].p_y_given_x)[1]],
		 		givens={
								corpus: validDocMatrixes,
								docSentenceCount: validDocSentenceNums,
								sentenceWordCount: validSentenceWordNums,
								docLabel: validLabels
						}
		 	))
			print "Compiled."
			costNum, errorNum, pred_label, real_label, pred_prob = valid_model[i]()
			print "Valid current model :", data_names[i]
			print "Cost: ", costNum
			print "Error: ", errorNum
	 		
			fpr, tpr, _ = roc_curve(real_label, pred_prob)
			roc_auc = auc(fpr, tpr)
			print "data_name: ", data_name
			print "ROC: ", roc_auc
			fpr, tpr, threshold = roc_curve(real_label, pred_label)
			if 1 in threshold:
				index_of_one = list(threshold).index(1)
				print "TPR: ", tpr[index_of_one]
				print "FPR: ", fpr[index_of_one]
				print "threshold: ", threshold[index_of_one]

		if mode == "test":
			return

		print "Start to train."
		epoch = 0
		n_epochs = 10
		ite = 0
		
		# ####Validate the model####
# 		for dataset_index in xrange(data_count):
# 			costNum, errorNum, pred_label, real_label, pred_prob = valid_model[dataset_index]()
# 			print "Valid current model :", data_names[dataset_index]
# 			print "Cost: ", costNum
# 			print "Error: ", errorNum
# 	 		
# 			fpr, tpr, _ = roc_curve(real_label, pred_prob)
# 			roc_auc = auc(fpr, tpr)
# 			print "data_name: ", data_name
# 			print "ROC: ", roc_auc
# 			fpr, tpr, threshold = roc_curve(real_label, pred_label)
# 			index_of_one = list(threshold).index(1)
# 			print "TPR: ", tpr[index_of_one]
# 			print "FPR: ", fpr[index_of_one]
# 			print "threshold: ", threshold[index_of_one]
			
		while (epoch < n_epochs):
			epoch = epoch + 1
			#######################
			for i in range(max(n_batches)):
				for dataset_index in xrange(data_count):
					if i >= n_batches[dataset_index]:
						continue
					# for list-type data
					print "dataset_index: %d, i: %d" %(dataset_index, i)
					costNum, errorNum, pred_label, real_label = train_model[dataset_index](i)
					ite = ite + 1
					# for padding data
					if(ite % 10 == 0):
						print
						print "Dataset name: ", data_names[dataset_index]
						print "@iter: ", ite
						print "Cost: ", costNum
						print "Error: ", errorNum
						
			# Validate the model
			for dataset_index in xrange(data_count):
				costNum, errorNum, pred_label, real_label, pred_prob = valid_model[dataset_index]()
				print "Valid current model :", data_names[dataset_index]
				print "Cost: ", costNum
				print "Error: ", errorNum
		 		
				fpr, tpr, _ = roc_curve(real_label, pred_prob)
				roc_auc = auc(fpr, tpr)
				print "data_name: ", data_name
				print "ROC: ", roc_auc
					
				fpr, tpr, threshold = roc_curve(real_label, pred_label)
				index_of_one = list(threshold).index(1)
				print "TPR: ", tpr[index_of_one]
				print "FPR: ", fpr[index_of_one]
				print "threshold: ", threshold[index_of_one]
			# Save model
			print "Saving parameters."
			saveParamsVal(para_path, params)
			print "Saved."
def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"):
	print "mode: ", mode
	print "data_name: ", data_name
	print "pooling_mode: ", pooling_mode
	print "Started!"
	rng = numpy.random.RandomState(23455)
	docSentenceCount = T.ivector("docSentenceCount")
	sentenceWordCount = T.ivector("sentenceWordCount")
	corpus = T.matrix("corpus")
	docLabel = T.ivector('docLabel') 
	
	# for list-type data
	layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
													 sentenceLayerNodesNum=50, \
													 sentenceLayerNodesSize=[5, 200], \
													 docLayerNodesNum=10, \
													 docLayerNodesSize=[3, 50],
													 pooling_mode=pooling_mode)
# 	layer0 = DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
# 													 sentenceLayerNodesNum=100, \
# 													 sentenceLayerNodesSize=[5, 200], \
# 													 docLayerNodesNum=100, \
# 													 docLayerNodesSize=[3, 100],
# 													 pooling_mode=pooling_mode)
	
	layer2 = LogisticRegression(input=layer0.output, n_in=10, n_out=2)
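	# No hidden layer in this variant: the DocEmbeddingNN output feeds the logistic layer
	# directly, and its parameters are stored under the model_nohidden path below.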

	# construct the parameter array.
	params = layer2.params  + layer0.params
	
	# Optionally load the parameters saved last time.
	
# 	data_name = "car"
	
	para_path = "data/" + data_name + "/model_nohidden/" + pooling_mode + ".model"
	traintext = "data/" + data_name + "/train/text"
	trainlabel = "data/" + data_name + "/train/label"
	testtext = "data/" + test_dataname + "/test/text"
	testlabel = "data/" + test_dataname + "/test/label"
	
	
	loadParamsVal(para_path, params)

	if(mode == "train"  or mode == "test"):
		learning_rate = 0.1
		error = layer2.errors(docLabel)
		cost = layer2.negative_log_likelihood(docLabel)
		
		grads = T.grad(cost, params)
	
		updates = [
			(param_i, param_i - learning_rate * grad_i)
			for param_i, grad_i in zip(params, grads)
		]
		
		print "Loading test data."
		cr_test = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=testtext, labelset=testlabel)
		validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels, _, _ = cr_test.getCorpus([0, 1000])
		
# 		print "Right answer: "
# 		print zip(validIds, validLabels)
		
		validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX)
		validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32)
		validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32)
		validLabels = transToTensor(validLabels, numpy.int32)
		print "Data loaded."
		
		valid_model = theano.function(
	 		[],
	 		[cost, error, layer2.y_pred, docLabel, T.transpose(layer2.p_y_given_x)[1]],
	 		givens={
							corpus: validDocMatrixes,
							docSentenceCount: validDocSentenceNums,
							sentenceWordCount: validSentenceWordNums,
							docLabel: validLabels
					}
	 	)
		
		# ####Validate the model####
		costNum, errorNum, pred_label, real_label, pred_prob = valid_model()
		print "Valid current model:"
		print "Cost: ", costNum
		print "Error: ", errorNum
# 		print "Valid Pred: ", pred_label
# 		print "pred_prob: ", pred_prob
		
		fpr, tpr, _ = roc_curve(real_label, pred_prob)
		if mode == "test":
			print "tpr_all: ", tpr
			print "fpr_all: ", fpr
		roc_auc = auc(fpr, tpr)
		print "data_name: ", data_name
		print "test_dataname: ", test_dataname
		print "ROC: ", roc_auc
		
		fpr, tpr, threshold = roc_curve(real_label, pred_label)
		
		index_of_one = list(threshold).index(1)
		ar = (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
		print "TPR: ", tpr[index_of_one]
		print "FPR: ", fpr[index_of_one]
		print "AR: ", ar
		print "threshold: ", threshold[index_of_one]
		if mode == "test":
			valid_model.free()
			return errorNum, roc_auc, tpr[index_of_one], fpr[index_of_one], ar
		
		print "Loading train data."
		cr_train = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=traintext, labelset=trainlabel)
		docMatrixes, docSentenceNums, sentenceWordNums, ids, labels, _, _  = cr_train.getCorpus([0, 100000])
		
# 		print "Right answer: "
# 		print zip(ids, labels)
		
		docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
		docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
		sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
		labels = transToTensor(labels, numpy.int32)
		
	# 	valid_cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/valid/split", labelset="data/valid/label.txt")
		print
		index = T.lscalar("index")
		batchSize = 10
		n_batches = (len(docSentenceNums.get_value()) - 1) / batchSize + 1
		print
		print "Train set size is ", len(docMatrixes.get_value())
		print "Validating set size is ", len(validDocMatrixes.get_value())
		print "Batch size is ", batchSize
		print "Number of training batches  is ", n_batches
		
		print "Compiling computing graph."
		
		# for list-type data
		train_model = theano.function(
	 		[index],
	 		[cost, error, layer2.y_pred, docLabel],
	 		updates=updates,
	 		givens={
							corpus: docMatrixes,
							docSentenceCount: docSentenceNums[index * batchSize: (index + 1) * batchSize + 1],
							sentenceWordCount: sentenceWordNums,
							docLabel: labels[index * batchSize: (index + 1) * batchSize]
						}
	 	)
		
		print "Compiled."
		print "Start to train."
		epoch = 0
		n_epochs = 10
		ite = 0
			
		while (epoch < n_epochs):
			epoch = epoch + 1
			#######################
			for i in range(n_batches):
				# for list-type data
				costNum, errorNum, pred_label, real_label = train_model(i)
				ite = ite + 1
				# for padding data
	# 			costNum, errorNum = train_model(docMatrixes, labels)
	# 			del docMatrixes, docSentenceNums, sentenceWordNums, labels
				# print ".", 
				if(ite % 10 == 0):
					print
					print "@iter: ", ite
					print "Cost: ", costNum
					print "Error: ", errorNum
					
			# Validate the model
			costNum, errorNum, pred_label, real_label, pred_prob = valid_model()
			print "Valid current model:"
			print "Cost: ", costNum
			print "Error: ", errorNum
# 			print "pred_prob: ", pred_prob
# 			print "Valid Pred: ", pred_label
			
			fpr, tpr, _ = roc_curve(real_label, pred_prob)
			roc_auc = auc(fpr, tpr)
			print "data_name: ", data_name
			print "test_dataname: ", test_dataname
			print "ROC: ", roc_auc
			
			fpr, tpr, threshold = roc_curve(real_label, pred_label)
			index_of_one = list(threshold).index(1)
			print "TPR: ", tpr[index_of_one]
			print "FPR: ", fpr[index_of_one]
			print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
			print "threshold: ", threshold[index_of_one]
			# Save model
			print "Saving parameters."
			saveParamsVal(para_path, params)
			print "Saved."
		valid_model.free()
		train_model.free()
	elif(mode == "deploy"):
		print "Compiling computing graph."
		output_model = theano.function(
	 		[corpus, docSentenceCount, sentenceWordCount],
	 		[layer2.y_pred]
	 	)
		print "Compiled."
		cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/train_valid/split")
		count = 21000
		while(count <= 21000):
			docMatrixes, docSentenceNums, sentenceWordNums, ids = cr.getCorpus([count, count + 100])
			docMatrixes = numpy.matrix(docMatrixes, dtype=theano.config.floatX)
			docSentenceNums = numpy.array(docSentenceNums, dtype=numpy.int32)
			sentenceWordNums = numpy.array(sentenceWordNums, dtype=numpy.int32)
			print "start to predict."
			pred_y = output_model(docMatrixes, docSentenceNums, sentenceWordNums)
			print "End predicting."
			print "Writing resfile."
	# 		print zip(ids, pred_y[0])
			f = file("data/test/res/res" + str(count), "w")
			f.write(str(zip(ids, pred_y[0])))
			f.close()
			print "Written." + str(count)
			count += 100
def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"):
    print "mode: ", mode
    print "data_name: ", data_name
    print "pooling_mode: ", pooling_mode
    print "Started!"

    data_names = data_name.split(":")
    data_count = len(data_names)
    print "Train dataset:"
    for i in xrange(data_count):
        print "%d: %s" % (i, data_names[i])

    print "Test dataset:"
    test_data_names = test_dataname.split(":")
    test_data_count = len(test_data_names)
    for i in xrange(test_data_count):
        print "%d: %s" % (i, test_data_names[i])

    if test_data_count != data_count:
        raise Exception(
            "The numbers of train and test datasets must be the same.")

    rng = numpy.random.RandomState(23455)
    docSentenceCount = T.ivector("docSentenceCount")
    sentenceWordCount = T.ivector("sentenceWordCount")
    corpus = T.matrix("corpus")
    docLabel = T.ivector('docLabel')

    sentenceW = None
    sentenceB = None
    docW = None
    docB = None

    hidden_layer_w = None
    hidden_layer_b = None
    logistic_layer_w = None
    logistic_layer_b = None
    layer0 = list()
    layer1 = list()
    layer2 = list()
    local_params = list()
    # for list-type data
    for i in xrange(data_count):
        layer0.append(DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
                     sentenceLayerNodesNum=50, \
                     sentenceLayerNodesSize=[5, 200], \
                     docLayerNodesNum=10, \
                     docLayerNodesSize=[3, 50],
                     sentenceW=sentenceW,
                     sentenceB=sentenceB,
                     docW=docW,
                     docB=docB,
                     pooling_mode=pooling_mode))

        sentenceW = layer0[i].sentenceW
        sentenceB = layer0[i].sentenceB
        docW = layer0[i].docW
        docB = layer0[i].docB

        layer1.append(
            HiddenLayer(rng,
                        input=layer0[i].output,
                        n_in=layer0[i].outputDimension,
                        n_out=10,
                        activation=T.tanh,
                        W=hidden_layer_w,
                        b=hidden_layer_b))

        hidden_layer_w = layer1[i].W
        hidden_layer_b = layer1[i].b

        layer2.append(
            LogisticRegression(input=layer1[i].output,
                               n_in=10,
                               n_out=2,
                               W=logistic_layer_w,
                               b=logistic_layer_b))
        # 		logistic_layer_w = layer2[i].W
        # 		logistic_layer_b = layer2[i].b

        local_params.append(layer2[i].params)

    share_params = list(layer0[0].params + layer1[0].params)
    # construct the parameter array.
    params = list(layer0[0].params) + layer1[0].params

    for i in xrange(data_count):
        params += layer2[i].params

# 	data_name = "car"

    para_path = "data/" + data_name + "/share_hidden_low_model/" + pooling_mode + ".model"
    traintext = [
        "data/" + data_names[i] + "/train/text" for i in xrange(data_count)
    ]
    trainlabel = [
        "data/" + data_names[i] + "/train/label" for i in xrange(data_count)
    ]
    testtext = [
        "data/" + test_data_names[i] + "/test/text" for i in xrange(data_count)
    ]
    testlabel = [
        "data/" + test_data_names[i] + "/test/label"
        for i in xrange(data_count)
    ]

    # Optionally load the parameters saved last time.
    loadParamsVal(para_path, params)

    if (mode == "train" or mode == "test"):
        train_model = list()
        valid_model = list()
        print "Loading train data."
        batchSize = 10
        share_learning_rate = 0.1
        local_learning_rate = 0.1
        n_batches = list()

        print "Loading test data."

        all_pred_label = list()
        all_real_label = list()
        all_pred_prob = list()
        for i in xrange(data_count):
            cr_train = CorpusReader(minDocSentenceNum=5,
                                    minSentenceWordNum=5,
                                    dataset=traintext[i],
                                    labelset=trainlabel[i])
            docMatrixes, docSentenceNums, sentenceWordNums, ids, labels, _, _ = cr_train.getCorpus(
                [0, 100000])

            docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
            docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
            sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
            labels = transToTensor(labels, numpy.int32)

            index = T.lscalar("index")

            n_batches.append((len(docSentenceNums.get_value()) - 1 - 1) /
                             batchSize + 1)
            print "Dataname: %s" % data_names[i]
            print "Train set size is ", len(docMatrixes.get_value())
            print "Batch size is ", batchSize
            print "Number of training batches  is ", n_batches[i]
            error = layer2[i].errors(docLabel)
            cost = layer2[i].negative_log_likelihood(docLabel)

            share_grads = T.grad(cost, share_params)
            share_updates = [
                (param_i, param_i - share_learning_rate * grad_i)
                for param_i, grad_i in zip(share_params, share_grads)
            ]

            grads = T.grad(cost, local_params[i])
            local_updates = [
                (param_i, param_i - local_learning_rate * grad_i)
                for param_i, grad_i in zip(local_params[i], grads)
            ]
            updates = share_updates + local_updates
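            # In this variant the DocEmbeddingNN and hidden layer are the shared parameters
            # (reused across datasets) and each dataset keeps its own logistic layer; both
            # groups happen to use the same 0.1 learning rate here.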
            print "Compiling train computing graph."
            if mode == "train":
                train_model.append(
                    theano.function(
                        [index], [cost, error, layer2[i].y_pred, docLabel],
                        updates=updates,
                        givens={
                            corpus:
                            docMatrixes,
                            docSentenceCount:
                            docSentenceNums[index *
                                            batchSize:(index + 1) * batchSize +
                                            1],
                            sentenceWordCount:
                            sentenceWordNums,
                            docLabel:
                            labels[index * batchSize:(index + 1) * batchSize]
                        }))
            print "Compiled."

            print "Load test dataname: %s" % test_data_names[i]
            cr_test = CorpusReader(minDocSentenceNum=5,
                                   minSentenceWordNum=5,
                                   dataset=testtext[i],
                                   labelset=testlabel[i])
            validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels, _, _ = cr_test.getCorpus(
                [0, 1000])
            validDocMatrixes = transToTensor(validDocMatrixes,
                                             theano.config.floatX)
            validDocSentenceNums = transToTensor(validDocSentenceNums,
                                                 numpy.int32)
            validSentenceWordNums = transToTensor(validSentenceWordNums,
                                                  numpy.int32)
            validLabels = transToTensor(validLabels, numpy.int32)
            print "Validating set size is ", len(validDocMatrixes.get_value())
            print "Data loaded."

            print "Compiling test computing graph."
            valid_model.append(
                theano.function(
                    [], [
                        cost, error, layer2[i].y_pred, docLabel,
                        T.transpose(layer2[i].p_y_given_x)[1]
                    ],
                    givens={
                        corpus: validDocMatrixes,
                        docSentenceCount: validDocSentenceNums,
                        sentenceWordCount: validSentenceWordNums,
                        docLabel: validLabels
                    }))
            print "Compiled."
            costNum, errorNum, pred_label, real_label, pred_prob = valid_model[
                i]()

            all_pred_label.extend(pred_label)
            all_real_label.extend(real_label)
            all_pred_prob.extend(pred_prob)

            print "Valid current model :", data_names[i]
            print "Cost: ", costNum
            print "Error: ", errorNum

            fpr, tpr, _ = roc_curve(real_label, pred_prob)
            roc_auc = auc(fpr, tpr)
            print "data_name: ", data_name
            print "ROC: ", roc_auc
            fpr, tpr, threshold = roc_curve(real_label, pred_label)
            if 1 in threshold:
                index_of_one = list(threshold).index(1)
                print "TPR: ", tpr[index_of_one]
                print "FPR: ", fpr[index_of_one]
                print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
                print "threshold: ", threshold[index_of_one]

        print "Valid current model :", data_names
        errorNum = 1 - accuracy_score(all_real_label, all_pred_label)
        print "Error: ", errorNum

        fpr, tpr, _ = roc_curve(all_real_label, all_pred_prob)
        if mode == "test":
            print "tpr_all: ", tpr
            print "fpr_all: ", fpr
        roc_auc = auc(fpr, tpr)
        print "data_name: ", data_name
        print "ROC: ", roc_auc
        fpr, tpr, threshold = roc_curve(all_real_label, all_pred_label)
        if 1 in threshold:
            index_of_one = list(threshold).index(1)
            print "TPR: ", tpr[index_of_one]
            print "FPR: ", fpr[index_of_one]
            print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
            print "threshold: ", threshold[index_of_one]

        if mode == "test":
            return

        print "Start to train."
        epoch = 0
        n_epochs = 10
        ite = 0

        while (epoch < n_epochs):
            epoch = epoch + 1
            #######################
            for i in range(max(n_batches)):
                for dataset_index in xrange(data_count):
                    if i >= n_batches[dataset_index]:
                        continue
                    # for list-type data
                    costNum, errorNum, pred_label, real_label = train_model[
                        dataset_index](i)
                    ite = ite + 1
                    # for padding data
                    if (ite % 10 == 0):
                        print
                        print "Dataset name: ", data_names[dataset_index]
                        print "@iter: ", ite
                        print "Cost: ", costNum
                        print "Error: ", errorNum

            # Validate the model
            all_pred_label = list()
            all_real_label = list()
            all_pred_prob = list()
            for dataset_index in xrange(data_count):
                costNum, errorNum, pred_label, real_label, pred_prob = valid_model[
                    dataset_index]()

                all_pred_label.extend(pred_label)
                all_real_label.extend(real_label)
                all_pred_prob.extend(pred_prob)

                print "Valid current model :", data_names[dataset_index]
                print "Cost: ", costNum
                print "Error: ", errorNum

                fpr, tpr, _ = roc_curve(real_label, pred_prob)
                roc_auc = auc(fpr, tpr)
                print "data_name: ", data_name
                print "ROC: ", roc_auc

                fpr, tpr, threshold = roc_curve(real_label, pred_label)
                index_of_one = list(threshold).index(1)
                print "TPR: ", tpr[index_of_one]
                print "FPR: ", fpr[index_of_one]
                print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
                print "threshold: ", threshold[index_of_one]

            print "Valid current model :", data_names
            errorNum = 1 - accuracy_score(all_real_label, all_pred_label)
            print "Error: ", errorNum

            fpr, tpr, _ = roc_curve(all_real_label, all_pred_prob)
            roc_auc = auc(fpr, tpr)
            print "data_name: ", data_name
            print "ROC: ", roc_auc
            fpr, tpr, threshold = roc_curve(all_real_label, all_pred_label)
            index_of_one = list(threshold).index(1)
            print "TPR: ", tpr[index_of_one]
            print "FPR: ", fpr[index_of_one]
            print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
            print "threshold: ", threshold[index_of_one]
            # Save model
            print "Saving parameters."
            saveParamsVal(para_path, params)
            print "Saved."
def work(model_name, dataset_name, pooling_mode):
	print "model_name: ", model_name
	print "dataset_name: ", dataset_name
	print "pooling_mode: ", pooling_mode
	print "Started!"
	rng = numpy.random.RandomState(23455)
	sentenceWordCount = T.ivector("sentenceWordCount")
	corpus = T.matrix("corpus")
# 	docLabel = T.ivector('docLabel') 
	
	# for list-type data
	layer0 = DocEmbeddingNNOneDoc(corpus, sentenceWordCount, rng, wordEmbeddingDim=200, \
													 sentenceLayerNodesNum=100, \
													 sentenceLayerNodesSize=[5, 200], \
													 docLayerNodesNum=100, \
													 docLayerNodesSize=[3, 100],
													 pooling_mode=pooling_mode)

	layer1_output_num = 100
	layer1 = HiddenLayer(
		rng,
		input=layer0.output,
		n_in=layer0.outputDimension,
		n_out=layer1_output_num,
		activation=T.tanh
	)
	
	layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2)

	cost = layer2.negative_log_likelihood(1 - layer2.y_pred)
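	# The cost is the negative log-likelihood of the class opposite to the prediction, so the
	# gradient-times-input products computed below act as saliency scores for how strongly each
	# sentence, word, or hidden cell supports the predicted class (an interpretation of this
	# code, not stated elsewhere in the source).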
		
	# calculate sentence_score for each sentence
	sentence_grads = T.grad(cost, layer0.sentenceResults)
	sentence_score = T.diag(T.dot(sentence_grads, T.transpose(layer0.sentenceResults)))
	
	# calculate word_score for each word against the whole network
	word_grad = T.grad(cost, corpus)
	word_score = T.diag(T.dot(word_grad, T.transpose(corpus)))
	
	# calculate cell_scores for the hidden-layer cells
	cell_scores = T.grad(cost, layer1.output)
	
	# calculate word score against cells
	word_score_against_cell = [T.diag(T.dot(T.grad(layer1.output[i], corpus), T.transpose(corpus))) for i in xrange(layer1_output_num)]
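	# One symbolic gradient per hidden unit: the gradient of each of the layer1_output_num cell
	# activations w.r.t. the word matrix is dotted with the words themselves (diag of the
	# product) to give per-word scores against that cell.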

	
	# construct the parameter array.
	params = layer2.params + layer1.params + layer0.params
	
	# Optionally load the parameters saved last time.
	model_path = "data/" + dataset_name + "/model_100,100,100,100,parameters/" + pooling_mode + ".model"
	loadParamsVal(model_path, params)
	print "Compiling computing graph."
	output_model = theano.function(
 		[corpus, sentenceWordCount],
 		[layer2.y_pred, sentence_score, word_score, layer1.output, cell_scores] + word_score_against_cell
 	)
	
	print "Compiled."
	input_filename = "data/" + dataset_name + "/train/small_text"
	cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=input_filename)
	count = 0
	while(count < cr.getDocNum()):
		info = cr.getCorpus([count, count + 1])
		count += 1
		if info is None:
			print "Pass"
			continue
		docMatrixes, _, sentenceWordNums, ids, sentences, _ = info
		docMatrixes = numpy.matrix(docMatrixes, dtype=theano.config.floatX)
		sentenceWordNums = numpy.array(sentenceWordNums, dtype=numpy.int32)
		print "start to predict: %s." % ids[0]
		info = output_model(docMatrixes, sentenceWordNums)
		pred_y = info[0]
		g = info[1]
		word_scores = info[2]
		cell_outputs = info[3]
		cell_scores = info[4]
		word_scores_against_cell = info[5:]
		
		if len(word_scores_against_cell) != len(cell_outputs):
			print "The dimension of word_socre and word are different."
			raise Exception("The dimension of word_socre and word are different.")
		print "End predicting."
		
		print "Writing resfile."

		score_sentence_list = zip(g, sentences)
		score_sentence_list.sort(key=lambda x:-x[0])
		
		current_doc_dir = "data/output/" + model_name + "/" + pooling_mode + "/" + dataset_name + "/" + str(pred_y[0]) + "/" + ids[0]
		if not os.path.exists(current_doc_dir):
			os.makedirs(current_doc_dir)
		# write per-sentence scores
		with codecs.open(current_doc_dir + "/sentence_score", "w", 'utf-8', "ignore") as f:
			f .write("pred_y: %i\n" % pred_y[0])
			for g0, s in score_sentence_list:
				f.write("%f\t%s\n" % (g0, string.join(s, " ")))
	
		wordList = list()
		for s in sentences:
			wordList.extend(s)
		print "length of word_scores", len(word_scores)
		print "length of wordList", len(wordList)
		score_word_list = zip(wordList , word_scores)
		with codecs.open(current_doc_dir + "/nn_word", "w", 'utf-8', "ignore") as f:
			for word, word_score in score_word_list:
				f.write("%s\t%f\n" % (word, word_score))
		
		with codecs.open(current_doc_dir + "/nn_word_merged", "w", 'utf-8', "ignore") as f:
			merged_score_word_list = merge_kv(score_word_list)
			for word, word_score in merged_score_word_list:
				f.write("%s\t%f\n" % (word, word_score))
		
		if not os.path.exists(current_doc_dir + "/nc_word"):
			os.makedirs(current_doc_dir + "/nc_word")
		neu_num = 0
		
		for w, c_output, c_score in zip(word_scores_against_cell, cell_outputs, cell_scores):
			with codecs.open(current_doc_dir + "/nc_word/" + str(neu_num), "w", 'utf-8', "ignore") as f:
				f.write("cell sentence_score: %lf\n" % c_output)
				for word, word_score in zip(wordList, w):
					f.write("%s\t%f\n" % (word, word_score))
			merged_score_word_list = merge_kv(zip(wordList, w))
			with codecs.open(current_doc_dir + "/nc_word/" + str(neu_num) + "_merged", "w", 'utf-8', "ignore") as f:
				f.write("cell_scores: %lf\n" % c_score)
				f.write("cell_output: %lf\n" % c_output)
				for word, word_score in merged_score_word_list:
					f.write("%s\t%f\n" % (word, word_score))
			neu_num += 1
		print "Written." + str(count)
		
	print "All finished!"
def work(mode, data_name, test_dataname, pooling_mode):
	print "mode: ", mode
	print "data_name: ", data_name
	print "Started!"
	
	data_names = data_name.split(":")
	data_count = len(data_names)
	print "Train dataset:"
	for i in xrange(data_count):
		print "%d: %s" % (i, data_names[i])
		
	print "Test dataset:"
	test_data_names = test_dataname.split(":")
	test_data_count = len(test_data_names)
	for i in xrange(test_data_count):
		print "%d: %s" % (i, test_data_names[i])
	
	if test_data_count != data_count:
		raise Exception("The amount of test and train dataset must be the same.")
	
	rng = numpy.random.RandomState(23455)
	docSentenceCount = T.ivector("docSentenceCount")
	sentenceWordCount = T.ivector("sentenceWordCount")
	corpus = T.matrix("corpus")
	docLabel = T.ivector('docLabel')
	
	hidden_layer_w = None
	hidden_layer_b = None
	logistic_layer_w = None
	logistic_layer_b = None
	layer0 = list()
	layer1 = list()
	layer2 = list()
	local_params = list()
	# for list-type data
	for i in xrange(data_count):
		layer0.append(DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
														 sentenceLayerNodesNum=100, \
														 sentenceLayerNodesSize=[5, 200], \
														 docLayerNodesNum=100, \
														 docLayerNodesSize=[3, 100]))

		layer1.append(HiddenLayer(
			rng,
			input=layer0[i].output,
			n_in=layer0[i].outputDimension,
			n_out=100,
			activation=T.tanh,
			W=hidden_layer_w,
			b=hidden_layer_b
		))
		
		hidden_layer_w = layer1[i].W
		hidden_layer_b = layer1[i].b
	
		layer2.append(LogisticRegression(input=layer1[i].output, n_in=100, n_out=2, W=logistic_layer_w, b=logistic_layer_b))
		logistic_layer_w = layer2[i].W
		logistic_layer_b = layer2[i].b
		
		local_params.append(layer0[i].params)
	
	share_params = layer2[0].params + layer1[0].params
	# construct the parameter array.
	params = layer2[0].params + layer1[0].params
	
	for i in xrange(data_count):
		params += layer0[i].params
		
	
# 	data_name = "car"
	
	para_path = "data/" + data_name + "/model/scnn.model"
	traintext = ["data/" + data_names[i] + "/train/text"  for i in xrange(data_count)]
	trainlabel = ["data/" + data_names[i] + "/train/label"  for i in xrange(data_count)]
	testtext = ["data/" + test_data_names[i] + "/test/text"  for i in xrange(data_count)]
	testlabel = ["data/" + test_data_names[i] + "/test/label"  for i in xrange(data_count)]
	
	# Optionally load the parameters saved last time.
	loadParamsVal(para_path, params)

	if(mode == "train"):
		train_model = list()
		valid_model = list()
		print "Loading train data."
		batchSize = 10
		share_learning_rate = 0.01
		local_learning_rate = 0.1
		n_batches = list()
		
		for i in xrange(data_count):
			cr_train = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=traintext[i], labelset=trainlabel[i])
			docMatrixes, docSentenceNums, sentenceWordNums, ids, labels = cr_train.getCorpus([0, 100000])
			
			docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
			docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
			sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
			labels = transToTensor(labels, numpy.int32)
			
			index = T.lscalar("index")
			
			n_batches.append((len(docSentenceNums.get_value())  - 1 - 1) / batchSize + 1)
			print "Dataname: %s" % data_names[i]
			print "Train set size is ", len(docMatrixes.get_value())
			print "Batch size is ", batchSize
			print "Number of training batches  is ", n_batches[i]
			error = layer2[i].errors(docLabel)
			cost = layer2[i].negative_log_likelihood(docLabel)
			
		
			share_grads = T.grad(cost, share_params)
			share_updates = [
				(param_i, param_i - share_learning_rate * grad_i)
				for param_i, grad_i in zip(share_params, share_grads)
			]
			
			grads = T.grad(cost, local_params[i])
			local_updates =  [
				(param_i, param_i - local_learning_rate * grad_i)
				for param_i, grad_i in zip(local_params[i], grads)
			]
			updates = share_updates + local_updates
			print "Compiling train computing graph."
			
			train_model.append(theano.function(
		 		[index],
		 		[cost, error, layer2[i].y_pred, docLabel],
		 		updates=updates,
		 		givens={
								corpus: docMatrixes,
								docSentenceCount: docSentenceNums[index * batchSize: (index + 1) * batchSize + 1],
								sentenceWordCount: sentenceWordNums,
								docLabel: labels[index * batchSize: (index + 1) * batchSize]
							}
	 		))
			print "Compiled."

		print "Start to train."
		epoch = 0
		n_epochs = 10
		ite = 0
		
		# ####Validate the model####
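		# NOTE: valid_model is never populated in this variant, so indexing it below would raise
		# an IndexError at runtime; the other variants in this file show how the per-dataset
		# validation functions are compiled.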
		for dataset_index in xrange(data_count):
			costNum, errorNum, pred_label, real_label, pred_prob = valid_model[dataset_index]()
			print "Valid current model :", data_names[dataset_index]
			print "Cost: ", costNum
			print "Error: ", errorNum
	 		
			fpr, tpr, _ = roc_curve(real_label, pred_prob)
			roc_auc = auc(fpr, tpr)
			print "data_name: ", data_name
			print "ROC: ", roc_auc
			
	print "All finished!"
def work(model_name, dataset_name, pooling_mode):
    print "model_name: ", model_name
    print "dataset_name: ", dataset_name
    print "pooling_mode: ", pooling_mode
    print "Started!"
    rng = numpy.random.RandomState(23455)
    sentenceWordCount = T.ivector("sentenceWordCount")
    corpus = T.matrix("corpus")
    # 	docLabel = T.ivector('docLabel')

    # for list-type data
    layer0 = DocEmbeddingNNOneDoc(
        corpus,
        sentenceWordCount,
        rng,
        wordEmbeddingDim=200,
        sentenceLayerNodesNum=100,
        sentenceLayerNodesSize=[5, 200],
        docLayerNodesNum=100,
        docLayerNodesSize=[3, 100],
        pooling_mode=pooling_mode,
    )

    layer1_output_num = 100
    layer1 = HiddenLayer(
        rng, input=layer0.output, n_in=layer0.outputDimension, n_out=layer1_output_num, activation=T.tanh
    )

    layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2)

    cost = layer2.negative_log_likelihood(1 - layer2.y_pred)

    # calculate sentence_score for each sentence
    sentence_grads = T.grad(cost, layer0.sentenceResults)
    sentence_score = T.diag(T.dot(sentence_grads, T.transpose(layer0.sentenceResults)))

    # calculate word_score for each word against the whole network
    word_grad = T.grad(cost, corpus)
    word_score = T.diag(T.dot(word_grad, T.transpose(corpus)))

    # calculate cell_scores for the hidden-layer cells
    cell_scores = T.grad(cost, layer1.output)

    # calculate word score against cells
    word_score_against_cell = [
        T.diag(T.dot(T.grad(layer1.output[i], corpus), T.transpose(corpus))) for i in xrange(layer1_output_num)
    ]

    # construct the parameter array.
    params = layer2.params + layer1.params + layer0.params

    # Optionally load the parameters saved last time.
    model_path = "data/" + dataset_name + "/model_100,100,100,100,parameters/" + pooling_mode + ".model"
    loadParamsVal(model_path, params)
    print "Compiling computing graph."
    output_model = theano.function(
        [corpus, sentenceWordCount],
        [layer2.y_pred, sentence_score, word_score, layer1.output, cell_scores] + word_score_against_cell,
    )

    print "Compiled."
    input_filename = "data/" + dataset_name + "/train/small_text"
    cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=input_filename)
    count = 0
    while count < cr.getDocNum():
        info = cr.getCorpus([count, count + 1])
        count += 1
        if info is None:
            print "Pass"
            continue
        docMatrixes, _, sentenceWordNums, ids, sentences, _ = info
        docMatrixes = numpy.matrix(docMatrixes, dtype=theano.config.floatX)
        sentenceWordNums = numpy.array(sentenceWordNums, dtype=numpy.int32)
        print "start to predict: %s." % ids[0]
        info = output_model(docMatrixes, sentenceWordNums)
        pred_y = info[0]
        g = info[1]
        word_scores = info[2]
        cell_outputs = info[3]
        cell_scores = info[4]
        word_scores_against_cell = info[5:]

        if len(word_scores_against_cell) != len(cell_outputs):
            print "The dimension of word_socre and word are different."
            raise Exception("The dimension of word_socre and word are different.")
        print "End predicting."

        print "Writing resfile."

        score_sentence_list = zip(g, sentences)
        score_sentence_list.sort(key=lambda x: -x[0])

        current_doc_dir = (
            "data/output/" + model_name + "/" + pooling_mode + "/" + dataset_name + "/" + str(pred_y[0]) + "/" + ids[0]
        )
        if not os.path.exists(current_doc_dir):
            os.makedirs(current_doc_dir)
        # write per-sentence scores
        with codecs.open(current_doc_dir + "/sentence_score", "w", "utf-8", "ignore") as f:
            f.write("pred_y: %i\n" % pred_y[0])
            for g0, s in score_sentence_list:
                f.write("%f\t%s\n" % (g0, string.join(s, " ")))

        wordList = list()
        for s in sentences:
            wordList.extend(s)
        print "length of word_scores", len(word_scores)
        print "length of wordList", len(wordList)
        score_word_list = zip(wordList, word_scores)
        with codecs.open(current_doc_dir + "/nn_word", "w", "utf-8", "ignore") as f:
            for word, word_score in score_word_list:
                f.write("%s\t%f\n" % (word, word_score))

        with codecs.open(current_doc_dir + "/nn_word_merged", "w", "utf-8", "ignore") as f:
            merged_score_word_list = merge_kv(score_word_list)
            for word, word_score in merged_score_word_list:
                f.write("%s\t%f\n" % (word, word_score))

        if not os.path.exists(current_doc_dir + "/nc_word"):
            os.makedirs(current_doc_dir + "/nc_word")
        neu_num = 0

        for w, c_output, c_score in zip(word_scores_against_cell, cell_outputs, cell_scores):
            with codecs.open(current_doc_dir + "/nc_word/" + str(neu_num), "w", "utf-8", "ignore") as f:
                f.write("cell sentence_score: %lf\n" % c_output)
                for word, word_score in zip(wordList, w):
                    f.write("%s\t%f\n" % (word, word_score))
            merged_score_word_list = merge_kv(zip(wordList, w))
            with codecs.open(current_doc_dir + "/nc_word/" + str(neu_num) + "_merged", "w", "utf-8", "ignore") as f:
                f.write("cell_scores: %lf\n" % c_score)
                f.write("cell_output: %lf\n" % c_output)
                for word, word_score in merged_score_word_list:
                    f.write("%s\t%f\n" % (word, word_score))
            neu_num += 1
        print "Written." + str(count)

    print "All finished!"
def work(mode, data_name, test_dataname):
	print "mode: ", mode
	print "data_name: ", data_name
	print "Started!"
	
	data_names = data_name.split(":")
	data_count = len(data_names)
	print "Train dataset:"
	for i in xrange(data_count):
		print "%d: %s" % (i, data_names[i])
		
	print "Test dataset:"
	test_data_names = test_dataname.split(":")
	test_data_count = len(test_data_names)
	for i in xrange(test_data_count):
		print "%d: %s" % (i, test_data_names[i])
	
	if test_data_count != data_count:
		raise Exception("The amount of test and train dataset must be the same.")
	
	rng = numpy.random.RandomState(23455)
	docSentenceCount = T.ivector("docSentenceCount")
	sentenceWordCount = T.ivector("sentenceWordCount")
	corpus = T.matrix("corpus")
	docLabel = T.ivector('docLabel')
	
	hidden_layer_w = None
	hidden_layer_b = None
	logistic_layer_w = None
	logistic_layer_b = None
	layer0 = list()
	layer1 = list()
	layer2 = list()
	local_params = list()
	# for list-type data
	for i in xrange(data_count):
		layer0.append(DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
														 sentenceLayerNodesNum=100, \
														 sentenceLayerNodesSize=[5, 200], \
														 docLayerNodesNum=100, \
														 docLayerNodesSize=[3, 100]))

		layer1.append(HiddenLayer(
			rng,
			input=layer0[i].output,
			n_in=layer0[i].outputDimension,
			n_out=100,
			activation=T.tanh,
			W=hidden_layer_w,
			b=hidden_layer_b
		))
		
		hidden_layer_w = layer1[i].W
		hidden_layer_b = layer1[i].b
	
		layer2.append(LogisticRegression(input=layer1[i].output, n_in=100, n_out=2, W=logistic_layer_w, b=logistic_layer_b))
		logistic_layer_w = layer2[i].W
		logistic_layer_b = layer2[i].b
		
		local_params.append(layer2[i].params + layer1[i].params + layer0[i].params)
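		# Since hidden_layer_w/b and logistic_layer_w/b are fed back in on every iteration,
		# layer1 and layer2 share a single set of weights across datasets; the local_params
		# entries therefore point at those shared variables as well.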

	# construct the parameter array.
	params = layer2[0].params + layer1[0].params
	
	for i in xrange(data_count):
		params += layer0[i].params
		
	# Optionally load the parameters saved last time.
	
# 	data_name = "car"
	
	para_path = "data/" + data_name + "/model/scnn.model"
	traintext = ["data/" + data_names[i] + "/train/text"  for i in xrange(data_count)]
	trainlabel = ["data/" + data_names[i] + "/train/label"  for i in xrange(data_count)]
	testtext = ["data/" + test_data_names[i] + "/test/text"  for i in xrange(data_count)]
	testlabel =  ["data/" + test_data_names[i] + "/test/label"  for i in xrange(data_count)]
	
	loadParamsVal(para_path, params)

	if(mode == "train"):
		train_model = list()
		valid_model = list()
		print "Loading train data."
		batchSize = 10
		learning_rate = 0.1
		n_batches = list()
		
		print "Loading test data."
 		
		for i in xrange(data_count):
			cr_train = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=traintext[i], labelset=trainlabel[i])
			docMatrixes, docSentenceNums, sentenceWordNums, ids, labels = cr_train.getCorpus([0, 100000])
			
			docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
			docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
			sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
			labels = transToTensor(labels, numpy.int32)
			
			index = T.lscalar("index")
			
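			# Number of mini-batches, rounded up so a final partial batch is
			# still trained on.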
			n_batches.append((len(docSentenceNums.get_value()) - 1) / batchSize + 1)
			print "Dataname: %s" % data_names[i]
			print "Train set size is ", len(docMatrixes.get_value())
			print "Batch size is ", batchSize
			print "Number of training batches  is ", n_batches[i]
			error = layer2[i].errors(docLabel)
			cost = layer2[i].negative_log_likelihood(docLabel)
			
			grads = T.grad(cost, local_params[i])
		
			updates = [
				(param_i, param_i - learning_rate * grad_i)
				for param_i, grad_i in zip(local_params[i], grads)
			]
			print "Compiling train computing graph."
			
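			# The docSentenceCount slice takes batchSize + 1 entries; the extra
			# element appears to be the closing boundary of the final document in
			# the batch, while sentenceWordCount is passed in full.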
			train_model.append(theano.function(
		 		[index],
		 		[cost, error, layer2[i].y_pred, docLabel],
		 		updates=updates,
		 		givens={
								corpus: docMatrixes,
								docSentenceCount: docSentenceNums[index * batchSize: (index + 1) * batchSize + 1],
								sentenceWordCount: sentenceWordNums,
								docLabel: labels[index * batchSize: (index + 1) * batchSize]
							}
	 		))
			print "Compiled."
			
			print "Load test dataname: %s" % test_data_names[i]
			cr_test = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=testtext[i], labelset=testlabel[i])
			validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels = cr_test.getCorpus([0, 1000])
			validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX)
			validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32)
			validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32)
			validLabels = transToTensor(validLabels, numpy.int32)
			print "Validating set size is ", len(validDocMatrixes.get_value())
			print "Data loaded."
			
			print "Compiling test computing graph."
			valid_model.append(theano.function(
		 		[],
		 		[cost, error, layer2[i].y_pred, docLabel, T.transpose(layer2[i].p_y_given_x)[1]],
		 		givens={
								corpus: validDocMatrixes,
								docSentenceCount: validDocSentenceNums,
								sentenceWordCount: validSentenceWordNums,
								docLabel: validLabels
						}
		 	))
			print "Compiled."
		# for list-type data

		print "Start to train."
		epoch = 0
		n_epochs = 2000
		ite = 0
		
		# ####Validate the model####
		for dataset_index in xrange(data_count):
			costNum, errorNum, pred_label, real_label, pred_prob = valid_model[dataset_index]()
			print "Validating current model:", data_names[dataset_index]
			print "Cost: ", costNum
			print "Error: ", errorNum
# 			print "Valid Pred: ", pred_label
# 			print "pred_prob: ", pred_prob
	 		
			fpr, tpr, _ = roc_curve(real_label, pred_prob)
			roc_auc = auc(fpr, tpr)
			print "data_name: ", data_name
			print "test_dataname: ", test_dataname
			print "ROC: ", roc_auc
			
		while (epoch < n_epochs):
			epoch = epoch + 1
			#######################
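			# Interleave mini-batches from every dataset; a dataset that has run
			# out of batches simply skips the remaining iterations of this epoch.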
			for i in range(max(n_batches)):
				for dataset_index in xrange(data_count):
					if i >= n_batches[dataset_index]:
						continue
					# for list-type data
					costNum, errorNum, pred_label, real_label = train_model[dataset_index](i)
					ite = ite + 1
					# for padding data
		# 			costNum, errorNum = train_model(docMatrixes, labels)
		# 			del docMatrixes, docSentenceNums, sentenceWordNums, labels
					# print ".", 
					if(ite % 10 == 0):
						print
						print "Dataset name: ", data_names[dataset_index]
						print "@iter: ", ite
						print "Cost: ", costNum
						print "Error: ", errorNum
						
			# Validate the model
			for dataset_index in xrange(data_count):
				costNum, errorNum, pred_label, real_label, pred_prob = valid_model[dataset_index]()
				print "Validating current model:", data_names[dataset_index]
				print "Cost: ", costNum
				print "Error: ", errorNum
	# 			print "Valid Pred: ", pred_label
	# 			print "pred_prob: ", pred_prob
		 		
				fpr, tpr, _ = roc_curve(real_label, pred_prob)
				roc_auc = auc(fpr, tpr)
				print "data_name: ", data_name
				print "test_dataname: ", test_dataname
				print "ROC: ", roc_auc
		
				# Save model
				print "Saving parameters."
				saveParamsVal(para_path, params)
				print "Saved."
# 	elif(mode == "deploy"):
# 		print "Compiling computing graph."
# 		output_model = theano.function(
# 	 		[corpus, docSentenceCount, sentenceWordCount],
# 	 		[layer2.y_pred]
# 	 	)
# 		print "Compiled."
# 		cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/train_valid/split")
# 		count = 21000
# 		while(count <= 21000):
# 			docMatrixes, docSentenceNums, sentenceWordNums, ids = cr.getCorpus([count, count + 100])
# 			docMatrixes = numpy.matrix(
# 			            docMatrixes,
# 			            dtype=theano.config.floatX
# 			        )
# 			docSentenceNums = numpy.array(
# 			            docSentenceNums,
# 			            dtype=numpy.int32
# 			        )
# 			sentenceWordNums = numpy.array(
# 			            sentenceWordNums,
# 			            dtype=numpy.int32
# 			        )
# 			print "start to predict."
# 			pred_y = output_model(docMatrixes, docSentenceNums, sentenceWordNums)
# 			print "End predicting."
# 			print "Writing resfile."
# 	# 		print zip(ids, pred_y[0])
# 			f = file("data/test/res/res" + str(count), "w")
# 			f.write(str(zip(ids, pred_y[0])))
# 			f.close()
# 			print "Written." + str(count)
# 			count += 100
		
		
	print "All finished!"
def work(mode, data_name, test_dataname, pooling_mode):
    print "mode: ", mode
    print "data_name: ", data_name
    print "Started!"

    data_names = data_name.split(":")
    data_count = len(data_names)
    print "Train dataset:"
    for i in xrange(data_count):
        print "%d: %s" % (i, data_names[i])

    print "Test dataset:"
    test_data_names = test_dataname.split(":")
    test_data_count = len(test_data_names)
    for i in xrange(test_data_count):
        print "%d: %s" % (i, test_data_names[i])

    if test_data_count != data_count:
        raise Exception(
            "The number of test and train datasets must be the same.")

    rng = numpy.random.RandomState(23455)
    docSentenceCount = T.ivector("docSentenceCount")
    sentenceWordCount = T.ivector("sentenceWordCount")
    corpus = T.matrix("corpus")
    docLabel = T.ivector('docLabel')

    hidden_layer_w = None
    hidden_layer_b = None
    logistic_layer_w = None
    logistic_layer_b = None
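    # As above, the hidden and logistic layers are shared across datasets; only
    # the per-dataset CNNs (layer0) are kept as "local" parameters and later
    # updated with their own learning rate.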
    layer0 = list()
    layer1 = list()
    layer2 = list()
    local_params = list()
    # for list-type data
    for i in xrange(data_count):
        layer0.append(DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng, wordEmbeddingDim=200, \
                     sentenceLayerNodesNum=100, \
                     sentenceLayerNodesSize=[5, 200], \
                     docLayerNodesNum=100, \
                     docLayerNodesSize=[3, 100],
                     pooling_mode=pooling_mode))

        layer1.append(
            HiddenLayer(rng,
                        input=layer0[i].output,
                        n_in=layer0[i].outputDimension,
                        n_out=100,
                        activation=T.tanh,
                        W=hidden_layer_w,
                        b=hidden_layer_b))

        hidden_layer_w = layer1[i].W
        hidden_layer_b = layer1[i].b

        layer2.append(
            LogisticRegression(input=layer1[i].output,
                               n_in=100,
                               n_out=2,
                               W=logistic_layer_w,
                               b=logistic_layer_b))
        logistic_layer_w = layer2[i].W
        logistic_layer_b = layer2[i].b

        local_params.append(layer0[i].params)

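    # Shared classifier parameters (hidden + logistic layer), created once and
    # reused by every dataset's branch.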
    share_params = layer2[0].params + layer1[0].params
    # construct the parameter array.
    params = layer2[0].params + layer1[0].params

    for i in xrange(data_count):
        params += layer0[i].params


# 	data_name = "car"

    para_path = "data/" + data_name + "/model/scnn.model"
    traintext = [
        "data/" + data_names[i] + "/train/text" for i in xrange(data_count)
    ]
    trainlabel = [
        "data/" + data_names[i] + "/train/label" for i in xrange(data_count)
    ]
    testtext = [
        "data/" + test_data_names[i] + "/test/text" for i in xrange(data_count)
    ]
    testlabel = [
        "data/" + test_data_names[i] + "/test/label"
        for i in xrange(data_count)
    ]

    # Load the parameters last time, optionally.
    loadParamsVal(para_path, params)

    if (mode == "train"):
        train_model = list()
        valid_model = list()
        print "Loading train data."
        batchSize = 10
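        # The shared classifier layers take smaller steps than the per-dataset
        # CNNs, presumably so no single dataset dominates the shared weights.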
        share_learning_rate = 0.01
        local_learning_rate = 0.1
        n_batches = list()

        for i in xrange(data_count):
            cr_train = CorpusReader(minDocSentenceNum=5,
                                    minSentenceWordNum=5,
                                    dataset=traintext[i],
                                    labelset=trainlabel[i])
            docMatrixes, docSentenceNums, sentenceWordNums, ids, labels = cr_train.getCorpus(
                [0, 100000])

            docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
            docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
            sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
            labels = transToTensor(labels, numpy.int32)

            index = T.lscalar("index")

            n_batches.append((len(docSentenceNums.get_value()) - 1 - 1) /
                             batchSize + 1)
            print "Dataname: %s" % data_names[i]
            print "Train set size is ", len(docMatrixes.get_value())
            print "Batch size is ", batchSize
            print "Number of training batches  is ", n_batches[i]
            error = layer2[i].errors(docLabel)
            cost = layer2[i].negative_log_likelihood(docLabel)

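            # Separate SGD update rules: shared parameters use
            # share_learning_rate, this dataset's CNN uses local_learning_rate;
            # both lists are concatenated into one update list below.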
            share_grads = T.grad(cost, share_params)
            share_updates = [
                (param_i, param_i - share_learning_rate * grad_i)
                for param_i, grad_i in zip(share_params, share_grads)
            ]

            grads = T.grad(cost, local_params[i])
            local_updates = [
                (param_i, param_i - local_learning_rate * grad_i)
                for param_i, grad_i in zip(local_params[i], grads)
            ]
            updates = share_updates + local_updates
            print "Compiling train computing graph."

            train_model.append(
                theano.function(
                    [index], [cost, error, layer2[i].y_pred, docLabel],
                    updates=updates,
                    givens={
                        corpus:
                        docMatrixes,
                        docSentenceCount:
                        docSentenceNums[index *
                                        batchSize:(index + 1) * batchSize + 1],
                        sentenceWordCount:
                        sentenceWordNums,
                        docLabel:
                        labels[index * batchSize:(index + 1) * batchSize]
                    }))
            print "Compiled."

        print "Start to train."
        epoch = 0
        n_epochs = 10
        ite = 0

        # ####Validate the model####
        for dataset_index in xrange(data_count):
            costNum, errorNum, pred_label, real_label, pred_prob = valid_model[
                dataset_index]()
            print "Valid current model :", data_names[dataset_index]
            print "Cost: ", costNum
            print "Error: ", errorNum

            fpr, tpr, _ = roc_curve(real_label, pred_prob)
            roc_auc = auc(fpr, tpr)
            print "data_name: ", data_name
            print "ROC: ", roc_auc

    print "All finished!"