import os
import string
import codecs

import numpy
import theano
import theano.tensor as T

# NOTE: the project-local classes and helpers used below (DocEmbeddingNNOneDoc,
# HiddenLayer, LogisticRegression, CorpusReader, loadParamsVal, merge_kv) are
# assumed to be imported from the surrounding project.

def work(model_name, dataset_name, pooling_mode):
	print "model_name: ", model_name
	print "dataset_name: ", dataset_name
	print "pooling_mode: ", pooling_mode
	print "Started!"
	rng = numpy.random.RandomState(23455)
	sentenceWordCount = T.ivector("sentenceWordCount")
	corpus = T.matrix("corpus")
# 	docLabel = T.ivector('docLabel') 
	
	# for list-type data
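	# layer0 builds the document representation: per the arguments below, a sentence-level
	# convolution (100 feature maps, 5x200 filters over 200-dim word vectors) followed by a
	# document-level convolution (100 maps, 3x100 filters), with pooling_mode controlling pooling.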
	layer0 = DocEmbeddingNNOneDoc(corpus, sentenceWordCount, rng,
		wordEmbeddingDim=200,
		sentenceLayerNodesNum=100,
		sentenceLayerNodesSize=[5, 200],
		docLayerNodesNum=100,
		docLayerNodesSize=[3, 100],
		pooling_mode=pooling_mode)

	layer1_output_num = 100
	layer1 = HiddenLayer(
		rng,
		input=layer0.output,
		n_in=layer0.outputDimension,
		n_out=layer1_output_num,
		activation=T.tanh
	)
	
	layer2 = LogisticRegression(input=layer1.output, n_in=layer1_output_num, n_out=2)

	cost = layer2.negative_log_likelihood(1 - layer2.y_pred)
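	# With two classes, 1 - y_pred flips the predicted label, so this cost is the
	# negative log-likelihood of the non-predicted class; it is used only to take the
	# gradients below for saliency scoring, never for parameter updates.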
		
	# calculate sentence scores: gradient of the cost w.r.t. each sentence vector,
	# dotted with that vector (gradient-times-input saliency)
	sentence_grads = T.grad(cost, layer0.sentenceResults)
	sentence_score = T.diag(T.dot(sentence_grads, T.transpose(layer0.sentenceResults)))
	
	# calculate word scores against the whole network (gradient of the cost w.r.t.
	# each word embedding, dotted with that embedding)
	word_grad = T.grad(cost, corpus)
	word_score = T.diag(T.dot(word_grad, T.transpose(corpus)))
	
	# calculate cell scores: gradient of the cost w.r.t. each hidden-layer output
	cell_scores = T.grad(cost, layer1.output)
	
	# calculate word score against cells
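	# (one gradient-times-embedding saliency vector per hidden unit, i.e. how much
	# each word contributes to that unit's activation)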
	word_score_against_cell = [T.diag(T.dot(T.grad(layer1.output[i], corpus), T.transpose(corpus))) for i in xrange(layer1_output_num)]

	
	# construct the parameter array.
	params = layer2.params + layer1.params + layer0.params
	
	# Load the parameter values saved by a previous run, if available.
	model_path = "data/" + dataset_name + "/model_100,100,100,100,parameters/" + pooling_mode + ".model"
	loadParamsVal(model_path, params)
	print "Compiling computing graph."
	output_model = theano.function(
		[corpus, sentenceWordCount],
		[layer2.y_pred, sentence_score, word_score, layer1.output, cell_scores] + word_score_against_cell
	)
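	# output_model returns, in order: the predicted label, per-sentence scores,
	# per-word scores, the hidden-layer outputs, per-cell scores, and then one
	# per-word score vector for every hidden unit.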
	
	print "Compiled."
	input_filename = "data/" + dataset_name + "/train/small_text"
	cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset=input_filename)
	count = 0
	while count < cr.getDocNum():
		info = cr.getCorpus([count, count + 1])
		count += 1
		if info is None:
			print "Pass"
			continue
		docMatrixes, _, sentenceWordNums, ids, sentences, _ = info
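		# docMatrixes: stacked word vectors for this document; sentenceWordNums: word
		# count per sentence; ids/sentences: the document id and its tokenized sentences.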
		docMatrixes = numpy.matrix(docMatrixes, dtype=theano.config.floatX)
		sentenceWordNums = numpy.array(sentenceWordNums, dtype=numpy.int32)
		print "start to predict: %s." % ids[0]
		info = output_model(docMatrixes, sentenceWordNums)
		pred_y = info[0]
		g = info[1]
		word_scores = info[2]
		cell_outputs = info[3]
		cell_scores = info[4]
		word_scores_against_cell = info[5:]
		
		if len(word_scores_against_cell) != len(cell_outputs):
			print "The lengths of word_scores_against_cell and cell_outputs differ."
			raise Exception("The lengths of word_scores_against_cell and cell_outputs differ.")
		print "End predicting."
		
		print "Writing resfile."

		score_sentence_list = zip(g, sentences)
		score_sentence_list.sort(key=lambda x:-x[0])
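		# (sorted by descending score, so the most salient sentences are written first)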
		
		current_doc_dir = "data/output/" + model_name + "/" + pooling_mode + "/" + dataset_name + "/" + str(pred_y[0]) + "/" + ids[0]
		if not os.path.exists(current_doc_dir):
			os.makedirs(current_doc_dir)
		# sentence scores
		with codecs.open(current_doc_dir + "/sentence_score", "w", 'utf-8', "ignore") as f:
			f.write("pred_y: %i\n" % pred_y[0])
			for g0, s in score_sentence_list:
				f.write("%f\t%s\n" % (g0, string.join(s, " ")))
	
		wordList = list()
		for s in sentences:
			wordList.extend(s)
		print "length of word_scores", len(word_scores)
		print "length of wordList", len(wordList)
		score_word_list = zip(wordList, word_scores)
		with codecs.open(current_doc_dir + "/nn_word", "w", 'utf-8', "ignore") as f:
			for word, word_score in score_word_list:
				f.write("%s\t%f\n" % (word, word_score))
		
		with codecs.open(current_doc_dir + "/nn_word_merged", "w", 'utf-8', "ignore") as f:
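			# merge_kv presumably collapses repeated words into a single (word, score) entry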
			merged_score_word_list = merge_kv(score_word_list)
			for word, word_score in merged_score_word_list:
				f.write("%s\t%f\n" % (word, word_score))
		
		if not os.path.exists(current_doc_dir + "/nc_word"):
			os.makedirs(current_doc_dir + "/nc_word")
		neu_num = 0
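		# write one pair of files per hidden unit: raw per-word scores, plus a
		# merge_kv-processed "_merged" version that also records the unit's score and output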
		
		for w, c_output, c_score in zip(word_scores_against_cell, cell_outputs, cell_scores):
			with codecs.open(current_doc_dir + "/nc_word/" + str(neu_num), "w", 'utf-8', "ignore") as f:
				f.write("cell_output: %lf\n" % c_output)
				for word, word_score in zip(wordList, w):
					f.write("%s\t%f\n" % (word, word_score))
			merged_score_word_list = merge_kv(zip(wordList, w))
			with codecs.open(current_doc_dir + "/nc_word/" + str(neu_num) + "_merged", "w", 'utf-8', "ignore") as f:
				f.write("cell_scores: %lf\n" % c_score)
				f.write("cell_output: %lf\n" % c_output)
				for word, word_score in merged_score_word_list:
					f.write("%s\t%f\n" % (word, word_score))
			neu_num += 1
		print "Written: " + str(count)
		
	print "All finished!"