Example #1
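# Loads a saved SentenceSimilarlyObj model and writes pairwise similarities of the query corpus to docSim_simi.txt.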
def main():
	sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
	test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
	test_sentences = []
	for sen in test_sentences_doc:
		sen_iterms = sen.strip().split("\t")
		if len(sen_iterms) >= 2:
			test_sentences.append(sen_iterms[1])

	# train_sentences = [
	# 	'0无偿居间介绍买卖毒品的行为应如何定性',
	# 	'1吸毒男动态持有大量毒品的行为该如何认定',
	# 	'2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
	# 	'3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
	# 	'4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
	# 	'5为获报酬帮人购买毒品的行为该如何认定',
	# 	'6毒贩出狱后再次够买毒品途中被抓的行为认定',
	# 	'7虚夸毒品功效劝人吸食毒品的行为该如何认定',
	# 	'8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻',
	# 	'9一方未签字办理的结婚登记是否有效',
	# 	'10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚',
	# 	'11结婚前对方父母出资购买的住房写我们二人的名字有效吗',
	# 	'12身份证被别人冒用无法登记结婚怎么办?',
	# 	'13同居后又与他人登记结婚是否构成重婚罪',
	# 	'14未办登记只举办结婚仪式可起诉离婚吗',
	# 	'15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
	# ]
	# print type(train_sentences[0])
	# print len(sentences)
	train_sentences = []
	for sen in sentences:
		sen_iterms = sen.split("\t")
		# print sen_iterms[1]
		if len(sen_iterms) >= 2:
			# print sen_iterms[1].strip().replace(" ","")
			train_sentences.append(sen_iterms[1].strip().replace(" ",""))
	print type(train_sentences[0])
	#
	# print "build simi_model"
	SSO = SentenceSimilarlyObj()

	# corpus = SSO.getCorpus(train_sentences)
	# SSO.setSimilar(corpus=corpus)
	# print "save simi model"
	# SSO.save()
	# SSO.save("simi_model_little","word_dic_little")
	# print "build success"

	print "load model"
	SSO.load()
	# print SSO.similar

	print "test"
	# indexs = SSO.calSentenceSimilarly(sentence=u"说说后天是礼拜几")
	# for index in indexs:
	# 	print index[0],train_sentences[index[0]],index[1]
	result = SSO.calSentencesSimilarly(train_sentences,train_sentences)
	Wr = WriteResult()
	can_not_deal = Wr.WriteSimilarlySentence(result,"docSim_simi.txt")
Example #2
def main():
    sentences = ReadFile.readTXTFile(config.SimilarlySentencePath +
                                     "AllQueriesWithID.txt")
    test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath +
                                              "corpus_0829.txt")
    test_sentences = []
    for sen in test_sentences_doc:
        sen_iterms = sen.strip().split("\t")
        if len(sen_iterms) >= 2:
            test_sentences.append(sen_iterms[1])

    # train_sentences = [
    # 	'0无偿居间介绍买卖毒品的行为应如何定性',
    # 	'1吸毒男动态持有大量毒品的行为该如何认定',
    # 	'2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
    # 	'3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
    # 	'4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
    # 	'5为获报酬帮人购买毒品的行为该如何认定',
    # 	'6毒贩出狱后再次够买毒品途中被抓的行为认定',
    # 	'7虚夸毒品功效劝人吸食毒品的行为该如何认定',
    # 	'8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻',
    # 	'9一方未签字办理的结婚登记是否有效',
    # 	'10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚',
    # 	'11结婚前对方父母出资购买的住房写我们二人的名字有效吗',
    # 	'12身份证被别人冒用无法登记结婚怎么办?',
    # 	'13同居后又与他人登记结婚是否构成重婚罪',
    # 	'14未办登记只举办结婚仪式可起诉离婚吗',
    # 	'15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
    # ]
    # print type(train_sentences[0])
    # print len(sentences)
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])
    #
    # print "build simi_model"
    SSO = SentenceSimilarlyObj()

    corpus = SSO.getCorpus(train_sentences)
    SSO.setSimilar(corpus=corpus)
    print "save simi model"
    # SSO.save()
    # SSO.save("simi_model_little","word_dic_little")
    # print "build success"

    print "load model"
    SSO.load()
    # print SSO.similar

    print "test"
Example #3
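# Reads birds.txt from the word-dictionary path and returns its space-separated tokens as a flat word list.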
 def getNewWords(self):
     file = config.WordDicPath + "birds.txt"
     lines = ReadFile.readTXTFile(file)
     words = []
     for line in lines:
         words.extend(line.strip().split(" "))
     return words
Example #4
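# Loads queries from AllQueriesWithID.txt, ranks similar sentences with Ranker for the first 100, and writes the result to rank_simi.txt.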
def MyTest():
	print "1"
	filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
	sentences = ReadFile.readTXTFile(filename)

	# sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
	# test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
	test_sentences = []
	# for sen in test_sentences_doc:
	# 	sen_iterms = sen.strip().split("\t")
	# 	if len(sen_iterms) >= 2:
	# 		test_sentences.append(sen_iterms[1])
	train_sentences = []
	for sen in sentences:
		sen_iterms = sen.split("\t")
		# print sen_iterms[1]
		if len(sen_iterms) >= 2:
			# print sen_iterms[1].strip().replace(" ","")
			train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
	print type(train_sentences[0])
	test_sentences = train_sentences

	tsf = Ranker()
	tsf.load(config.SimilarlySentencePath+"AllQueriesWithID.txt")
	result= tsf.getSimilarSentences(test_sentences[:100])

	wr = WriteResult()
	wr.WriteSimilarSentence(result,file=config.SimilarlySentencePath+"rank_simi.txt")
Example #5
 def getNewWords(self):
     file = config.WordDicPath + "birds.txt"
     lines = ReadFile.readTXTFile(file)
     words = []
     for line in lines:
         words.extend(line.strip().split(" "))
     return words
Example #6
def MyTest():
    print "1"
    filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
    sentences = ReadFile.readTXTFile(filename)

    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    # test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    test_sentences = []
    # for sen in test_sentences_doc:
    # 	sen_iterms = sen.strip().split("\t")
    # 	if len(sen_iterms) >= 2:
    # 		test_sentences.append(sen_iterms[1])
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])
    test_sentences = train_sentences

    tsf = Ranker()
    tsf.load(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    result = tsf.getSimilarSentences(test_sentences)

    wr = WriteResult()
    wr.WriteSimilarSentence(result,
                            file=config.SimilarlySentencePath +
                            "rank_simi.txt")
Example #7
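# Reads corpus.txt, word-segments the first 10 sentences, and removes stop words before returning them.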
def getFileContext_Participle(dirPath=MyCode.config.CorpusFilePath):
    # files = ReadFile.getAllFilesInDir(dirPath)
    # sentences = ReadFile.getAllFilesContext(files,dirPath)
    #for test
    sentences = ReadFile.readTXTFile(dirPath + 'corpus.txt')
    par_sentences = Participle.Participle(sentences[:10])
    par_filter_sentences = filterStopWords.filterStopWords(par_sentences)
    # return wordTostr(par_filter_sentences)
    return par_filter_sentences
Example #8
def getFileContext_Participle(dirPath=MyCode.config.CorpusFilePath):
    # files = ReadFile.getAllFilesInDir(dirPath)
    # sentences = ReadFile.getAllFilesContext(files,dirPath)
    #for test
    sentences = ReadFile.readTXTFile(dirPath + 'corpus.txt')
    par_sentences = Participle.Participle(sentences[:10])
    par_filter_sentences = filterStopWords.filterStopWords(par_sentences)
    # return wordTostr(par_filter_sentences)
    return par_filter_sentences
Example #9
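# Reads QRpair.txt, computes topic assignments for the QA queries and responses, and writes the mappings with WriteResult.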
def main():
	sentences = ReadFile.readTXTFile(config.TopicFilePath+"QRpair.txt")
	qaQueryPairTopic = QAQueryPairTopic()
	result = qaQueryPairTopic.getgetQAQueriesTopicId(sentences)
	wr = WriteResult()
	wr.WriteTopicRegular(result[0])
	wr.WriteTopic(result[1])
	result = qaQueryPairTopic.getResponsesTopic(sentences)
	wr.WriteResponseWithTopicId(result)
Example #10
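# Appends a "client_id" field to the JSON column of each query line and writes the rows to AllQueriesWithIDfinished.txt.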
def insertDicItem():
    file = MyCode.config.CaiCaiPath + 'AllQueriesWithID_mid2.txt'
    fileEnd = MyCode.config.CaiCaiPath + 'AllQueriesWithIDfinished.txt'
    sentences = ReadFile.readTXTFile(file)
    with open(fileEnd,'a+') as fp:
        for sen in sentences:
            lines = sen.split("\t")
            lines[2] = lines[2][:-3]+', "client_id": "c_00000007"}'
            # print lines[2]
            fp.write(lines[0] +"\t"+lines[1]+'\t'+lines[2]+'\n')
Example #11
def Topic2Vec_v2():
    """
	分析句子在,将句子转换为topic 向量
	:return:
	"""
    lda = LDA()
    sentences = ReadFile.readTXTFile(config.BTMData +
                                     "topic_data_processed.txt")
    docs = []
    lab = []
    for index, line in enumerate(sentences):
        term = line.strip().split("\t")
        if len(term) != 3:
            continue

        docs.append(term[1])
        lab.append(term[2])
    documents = line_Cut_Word(docs)
    documents = [" ".join(doc) for doc in documents]

    lda.load_word_dic()
    lda.load_LdaModel()
    # lda.build_word_dic(lines(documents))
    # print len(lda.word_dic.keys())
    # lda.buildModel(lines(documents))

    result_lab = []
    topic2vec = []
    x_index, y_index = [], []
    count = 0
    print len(lab)
    for index, doc_lab in enumerate(list(zip(docs, lab))):
        if index % 1000 == 0 and index != 0:
            print doc_lab[0], doc_lab[1]
            # break
        doc = doc_lab[0]
        la = doc_lab[1]
        topics = lda.getQuerySimilarly(doc)

        if topics:
            # print doc, "\t", la
            for topic in topics:
                x_index.append(count)
                y_index.append(topic[0])
                topic2vec.append(topic[1])
            count += 1
            result_lab.append(la)

    print len(x_index), len(y_index), len(topic2vec), len(result_lab), count

    result = [x_index, y_index, topic2vec, result_lab]
    with open(config.BTMData + "topic2vec_2.txt", 'wb') as fp:
        cPickle.dump(result, fp)
Example #12
def Topic2Vec_v2():
	"""
	Analyze the sentences and convert each one into a topic vector.
	:return:
	"""
	lda = LDA()
	sentences = ReadFile.readTXTFile(config.BTMData + "topic_data_processed.txt")
	docs = []
	lab = []
	for index, line in enumerate(sentences):
		term = line.strip().split("\t")
		if len(term) != 3:
			continue

		docs.append(term[1])
		lab.append(term[2])
	documents = line_Cut_Word(docs)
	documents = [" ".join(doc) for doc in documents]

	lda.load_word_dic()
	lda.load_LdaModel()
	# lda.build_word_dic(lines(documents))
	# print len(lda.word_dic.keys())
	# lda.buildModel(lines(documents))

	result_lab = []
	topic2vec = []
	x_index, y_index = [], []
	count = 0
	print len(lab)
	for index, doc_lab in enumerate(list(zip(docs, lab))):
		if index % 1000 == 0 and index != 0:
			print doc_lab[0], doc_lab[1]
			# break
		doc = doc_lab[0]
		la = doc_lab[1]
		topics = lda.getQuerySimilarly(doc)

		if topics:
			# print doc, "\t", la
			for topic in topics:
				x_index.append(count)
				y_index.append(topic[0])
				topic2vec.append(topic[1])
			count += 1
			result_lab.append(la)

	print len(x_index), len(y_index), len(topic2vec), len(result_lab), count

	result = [x_index, y_index, topic2vec, result_lab]
	with open(config.BTMData + "topic2vec_2.txt", 'wb') as fp:
		cPickle.dump(result, fp)
Example #13
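# Reads test_topic.txt and prints the TopicAnalysis response for each query, or a fallback message when none is found.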
 def TestTopic(self):
     Ta = TopicAnalysis()
     sentences = ReadFile.readTXTFile(config.TopicFilePath+"test_topic.txt")
     for text in sentences:
         # text = raw_input('query:\n')
         print "问 :"+text
         sentence = Sentence()
         sentence.text = text.strip()
         response = Ta.getResponse(sentence)
         if response:
             print "答 :"+response
         else:
             print "没有合适转移话题!"
Example #14
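# Tries to load a saved word2vec model from ModelPath; if loading fails, trains a model on lines from souhu_fenci.txt and saves it.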
def buildModel(Name = "wordRank_filter"):
    file = MyCode.config.ModelPath + Name + '.model'
    model = None
    try:
        model = word2vec.Word2Vec.load(file)
    except :
        # word_sentences = getFileContext_Participle()
        # sentences = wordTostr(word_sentences)
        # sentences = TextIter()
        sentences = ReadFile.readTXTFile(MyCode.config.CorpusFilePath + 'souhu_fenci.txt')
        model = word2vec.Word2Vec(sentences[:10],min_count=1,workers=8)
        model.save(file)
    return model
Example #15
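# Reads semantic_wordgroup_new.txt and builds a dict keyed by the second column, collecting the first-column words under each key.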
 def getWords(self):
     file = config.Semantic_dicPath + "semantic_wordgroup_new.txt"
     words = {}
     sentences = ReadFile.readTXTFile(file)
     for sen in sentences:
         items = sen.strip().split(' ')
         if len(items) < 2:
             continue
         if words.has_key(items[1]):
             words[items[1]].append(items[0])
         else:
             words[items[1]] = [items[0]]
     return words
Example #16
 def getWords(self):
     file = config.Semantic_dicPath+"semantic_wordgroup_new.txt"
     words = {}
     sentences = ReadFile.readTXTFile(file)
     for sen in sentences:
         items = sen.strip().split(' ')
         if len(items) < 2:
             continue
         if words.has_key(items[1]):
             words[items[1]].append(items[0])
         else:
             words[items[1]] = [items[0]]
     return words
Example #17
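# Builds a dict from semantic_wordgroup_new.txt keyed by the first column, collecting the distinct second-column labels for each entry.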
def filter(file=config.WordDicPath + "semantic_wordgroup_new.txt"):
    word_sentences = ReadFile.readTXTFile(file)
    word_dic = {}
    for word in word_sentences:
        iterms = word.strip().split(" ")
        if len(iterms) != 2:
            continue
        if iterms[0] in word_dic:
            if iterms[1] not in word_dic[iterms[0]]:
                word_dic[iterms[0]].append(iterms[1])
        else:
            word_dic[iterms[0]] = [iterms[1]]
    return word_dic
Example #18
 def TestTopic(self):
     Ta = TopicAnalysis()
     sentences = ReadFile.readTXTFile(config.TopicFilePath +
                                      "test_topic.txt")
     for text in sentences:
         # text = raw_input('query:\n')
         print "问 :" + text
         sentence = Sentence()
         sentence.text = text.strip()
         response = Ta.getResponse(sentence)
         if response:
             print "答 :" + response
         else:
             print "没有合适转移话题!"
Example #19
def buildModel(Name="wordRank_filter"):
    file = MyCode.config.ModelPath + Name + '.model'
    model = None
    try:
        model = word2vec.Word2Vec.load(file)
    except:
        # word_sentences = getFileContext_Participle()
        # sentences = wordTostr(word_sentences)
        # sentences = TextIter()
        sentences = ReadFile.readTXTFile(MyCode.config.CorpusFilePath +
                                         'souhu_fenci.txt')
        model = word2vec.Word2Vec(sentences[:10], min_count=1, workers=8)
        model.save(file)
    return model
Example #20
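# Word-segments sentences 10000-11000 of sentence1.txt and returns the space-joined strings; the write to fenci.txt is commented out.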
def buildWordFromTxt():
    file = "../Result/sentence1.txt"
    sentences = ReadFile.readTXTFile(file)
    par_Sentences = []
    par_Sentences = Participle.Participle(sentences[10000:11000])
    w_Sentence = []
    with open("../Result/fenci.txt", 'w') as fp:
        for s in par_Sentences:
            p_sentence = ''
            for word in s:
                p_sentence += word + ' '
            w_Sentence.append(p_sentence)
        # print 'Start writing ... ...'
        # fp.writelines(w_Sentence)
        # print 'Finished writing !'
    return w_Sentence
Example #21
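# Loads a Doc2VecObj model and clusters the first 100 queries from AllQueriesWithID.txt with SentencesClusters(20, model).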
def main():
	model = Doc2VecObj()
	model.load()
	sc = SentencesClusters(20,model)
	filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
	sentences = ReadFile.readTXTFile(filename)

	train_sentences = []
	for sen in sentences:
		sen_iterms = sen.split("\t")
		# print sen_iterms[1]
		if len(sen_iterms) >= 2:
			# print sen_iterms[1].strip().replace(" ","")
			train_sentences.append(sen_iterms[1].strip().replace(" ", ""))

	sc.getCluster(train_sentences[:100])
Example #22
def buildWordFromTxt():
    file = "../Result/sentence1.txt"
    sentences = ReadFile.readTXTFile(file)
    par_Sentences = []
    par_Sentences = Participle.Participle(sentences[10000:11000])
    w_Sentence = []
    with open("../Result/fenci.txt",'w') as fp:
        for s in par_Sentences:
            p_sentence = ''
            for word in s:
                p_sentence += word + ' '
            w_Sentence.append(p_sentence)
        # print 'Start writing ... ...'
        # fp.writelines(w_Sentence)
        # print 'Finished writing !'
    return w_Sentence
Example #23
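# Reads document_corpus.txt, computes topic assignments for the first 100 queries, groups them by topic, then displays and writes the result.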
def main():
    # Old version
    # Lda_model = LdaTopic(modelName='LDAsimiWord_5')
    # # print Lda_model[0]
    # lda_model = Lda_model[0]
    # words = Lda_model[1]
    # n_top_words = 1
    # for i ,topic_dist in enumerate(lda_model.topic_word_):
    #     topic_words = numpy.array(words)[numpy.argsort(topic_dist)][:n_top_words:-1]
    #     print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    #
    # Lda_model2 = LdaTopic(modelName='LDAsimiWord_6')
    # # print Lda_model[0]
    # lda_model2 = Lda_model2[0]
    # words2 = Lda_model2[1]
    # n_top_words2 = 1
    # for i, topic_dist in enumerate(lda_model2.topic_word_):
    #     topic_words = numpy.array(words2)[numpy.argsort(topic_dist)][:n_top_words2:-1]
    #     print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    # documents = getFileSentences()

    # sentences = [u"你知道周杰伦是谁么?",u"周杰伦是谁?",u"你知道周杰伦吗?",u"你认识周杰伦吗?",u"你认识周杰伦么?",u"你知道周杰伦么?",\
    #              u"周杰伦知道么?",u"周杰伦知道是谁么?",u"周杰伦知道吗",u"周杰伦知道是谁吗?",u"周杰伦是谁?",u"周杰伦你认识么?",u"周杰伦你知道是谁么?",\
    #              u"周杰伦你认识吗?",u"周杰伦你知道是谁吗?",u"你认识周杰伦吗?",u"你知道周杰伦吗?",u"你知道周杰伦么?",u"你认识周杰伦么?",u"你认识周杰伦吗?",\
    #              u"你认识周杰伦是谁么?",u"你认识周杰伦是谁吗?",u"你知道周杰伦吗?",u"你知道周杰伦是谁吗?",u"你知道周杰伦是谁么?"]
    sentences = ReadFile.readTXTFile("./BitermTopicModel/document_corpus.txt")

    queries = getQueries(sentences[:100])
    for query in queries:
        for q in query:
            print q,
        print
    docs_topic = getQueriySimilarly(queries)
    # for topic in docs_topic:
    #     for re in topic:
    #         print re,1
    #     print
    results = groupByTopic(docs_topic,sentences)
    sh = Show()
    sh.showDocTopicResult(results)
    Wr = WriteResult()
    Wr.WriteTopicResult(results)
Example #24
def main():
    # Old version
    # Lda_model = LdaTopic(modelName='LDAsimiWord_5')
    # # print Lda_model[0]
    # lda_model = Lda_model[0]
    # words = Lda_model[1]
    # n_top_words = 1
    # for i ,topic_dist in enumerate(lda_model.topic_word_):
    #     topic_words = numpy.array(words)[numpy.argsort(topic_dist)][:n_top_words:-1]
    #     print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    #
    # Lda_model2 = LdaTopic(modelName='LDAsimiWord_6')
    # # print Lda_model[0]
    # lda_model2 = Lda_model2[0]
    # words2 = Lda_model2[1]
    # n_top_words2 = 1
    # for i, topic_dist in enumerate(lda_model2.topic_word_):
    #     topic_words = numpy.array(words2)[numpy.argsort(topic_dist)][:n_top_words2:-1]
    #     print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    # documents = getFileSentences()

    # sentences = [u"你知道周杰伦是谁么?",u"周杰伦是谁?",u"你知道周杰伦吗?",u"你认识周杰伦吗?",u"你认识周杰伦么?",u"你知道周杰伦么?",\
    #              u"周杰伦知道么?",u"周杰伦知道是谁么?",u"周杰伦知道吗",u"周杰伦知道是谁吗?",u"周杰伦是谁?",u"周杰伦你认识么?",u"周杰伦你知道是谁么?",\
    #              u"周杰伦你认识吗?",u"周杰伦你知道是谁吗?",u"你认识周杰伦吗?",u"你知道周杰伦吗?",u"你知道周杰伦么?",u"你认识周杰伦么?",u"你认识周杰伦吗?",\
    #              u"你认识周杰伦是谁么?",u"你认识周杰伦是谁吗?",u"你知道周杰伦吗?",u"你知道周杰伦是谁吗?",u"你知道周杰伦是谁么?"]
    sentences = ReadFile.readTXTFile("./BitermTopicModel/document_corpus.txt")

    queries = getQueries(sentences[:100])
    for query in queries:
        for q in query:
            print q,
        print
    docs_topic = getQueriySimilarly(queries)
    # for topic in docs_topic:
    #     for re in topic:
    #         print re,1
    #     print
    results = groupByTopic(docs_topic, sentences)
    sh = Show()
    sh.showDocTopicResult(results)
    Wr = WriteResult()
    Wr.WriteTopicResult(results)
Example #25
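# Builds labeled sentences from AllQueriesWithID.txt, loads a Doc2VecObj model, and writes similarities of the first 100 queries against the full set to Doc2Vec_simi.txt.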
def main():
	# train_sentences = [
	# 	'0无偿居间介绍买卖毒品的行为应如何定性',
	# 	'1吸毒男动态持有大量毒品的行为该如何认定',
	# 	'2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
	# 	'3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
	# 	'4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
	# 	'5为获报酬帮人购买毒品的行为该如何认定',
	# 	'6毒贩出狱后再次够买毒品途中被抓的行为认定',
	# 	'7虚夸毒品功效劝人吸食毒品的行为该如何认定',
	# 	'8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻',
	# 	'9一方未签字办理的结婚登记是否有效',
	# 	'10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚',
	# 	'11结婚前对方父母出资购买的住房写我们二人的名字有效吗',
	# 	'12身份证被别人冒用无法登记结婚怎么办?',
	# 	'13同居后又与他人登记结婚是否构成重婚罪',
	# 	'14未办登记只举办结婚仪式可起诉离婚吗',
	# 	'15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
	# ]
	filename = config.SimilarlySentencePath+"AllQueriesWithID.txt"
	sentences = ReadFile.readTXTFile(filename)

	# sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
	test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
	test_sentences = []
	for sen in test_sentences_doc:
		sen_iterms = sen.strip().split("\t")
		if len(sen_iterms) >= 2:
			test_sentences.append(sen_iterms[1])
	train_sentences = []
	for sen in sentences:
		sen_iterms = sen.split("\t")
		# print sen_iterms[1]
		if len(sen_iterms) >= 2:
			# print sen_iterms[1].strip().replace(" ","")
			train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
	print type(train_sentences[0])
	docs = LabelSentences(filename=None,sentences=train_sentences)
	# docs = LabelSentences.LabelSentences(sentences=train_sentences)

	# sentences = ReadFile.readTXTFile(config.SimilarlySentencePath+"corpus_0829.txt")

	# train_sentences = ReadFile.getFileSentence(config.SimilarlySentencePath + "")
	# print len(sentences)
	# train_sentences = []
	# for sen in sentences:
	# 	sen_iterms = sen.split("\t")
	# 	if len(sen_iterms) == 2:
	# 		print sen_iterms[1]
	# 		train_sentences.append(sen_iterms[1])
	# test_sentences = ReadFile.readTXTFile(config.SimilarlySentencePath+"corpus_0829_t.txt")
	# test_sentences = ['周涛知道是谁吗']
	test_sentences = train_sentences[:100]
	SSO = Doc2VecObj()
	# corpus = SSO.getCorpus(docs)
	# SSO.buildModel(docs)
	# SSO.save()

	# load model
	SSO.load()
	result = SSO.calSentencesSimilarly(test_sentences,train_sentences)
	Wr = WriteResult()
	can_not_deal = Wr.WriteSimilarlySentence(result,"Doc2Vec_simi.txt")
Example #26
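# Trains an LDA model with 21 topics on btm_text_corpus.txt.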
def train_lad():
    lda = LDA()
    sentences = ReadFile.readTXTFile(config.BTMData + "btm_text_corpus.txt")
    # line = LineSetence(sentences=sentences)
    lda.buildModel(lines(sentences), num_topics=21)
Example #27
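# Assigns new CAICAI_Q_/CAICAI_R_ IDs to unseen query/response pairs and rewrites the question, mapping, and response files with a "client_id" field.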
def getQuestionFile(Q_R_sentences):
    QIdfile = MyCode.config.CaiCaiDataPath + "AllQueriesWithID.txt"
    QnIdfile = MyCode.config.CaiCaiPath + "AllQueriesWithID.txt"
    mapQRfile = MyCode.config.CaiCaiDataPath + "AllQueryResponseIdMap.txt"
    mapnQRfile = MyCode.config.CaiCaiPath + "AllQueryResponseIdMap.txt"
    RIdfile = MyCode.config.CaiCaiDataPath + "AllResponsesWithID.txt"
    RnIdfile = MyCode.config.CaiCaiPath + "AllResponsesWithID.txt"

    Q_sentences = ReadFile.readTXTFile(QIdfile)
    QR_map = []
    OldMapQid = {}
    MapRId = {}
    MapQid = {}
    r_id = 1
    q_id = 1
    OldRIdSentences = ReadFile.readTXTFile(RIdfile)
    R_sens = []
    for line in OldRIdSentences:
        R_sens.append(line.strip().split("\t")[1])

    R_sens = list(set(R_sens))

    for qr_sentence in Q_R_sentences:
        csentence = []
        exist = False
        for sentence in Q_sentences:
            csentence = sentence.strip().split('\t')
            line = csentence[1].replace(' ','')
            if line == qr_sentence[0]:
                exist = True
                break
        if exist:
            sq_id = csentence[0]
            OldMapQid.setdefault(sq_id,'')
        else:
            sq_id = "CAICAI_Q_"+str(time.strftime("%Y%m%d%H%M", time.localtime()))+"%05d"%q_id
            q_id += 1
            MapQid.setdefault(sq_id,qr_sentence[1].replace(' ',''))
        for i in xrange(2, 5):
            if qr_sentence[i] in R_sens:
                continue
            if qr_sentence[i] != '' and len(qr_sentence[i]) > 2:
                print qr_sentence[i]
                sr_id = 'CAICAI_R_'+str(time.strftime("%Y%m%d%H%M", time.localtime()))+'%05d' % r_id
                QR_map.append((sq_id, sr_id))
                MapRId.setdefault(sr_id,qr_sentence[i])
                r_id += 1

    fileEnd = MyCode.config.CaiCaiPath + 'AllQueriesWithIDfinished.txt'
    # Rewrite the Questions file
    with open(fileEnd,'w') as fp:
        # print len(OldMapQid.keys())
        for sen in Q_sentences:
            lines = sen.split('\t')[0]
            if OldMapQid.has_key(lines):
                fp.write(sen[:-2]+',"client_id": "c_00000007"}\n')
            else:
                fp.write(sen)
    # Write the results to file
    with open(QnIdfile,'w') as fp:
        MapQid = sorted(MapQid.iteritems(),key=lambda asd:asd[0],reverse=False)
        for id in  MapQid:
            fp.write(id[0]+'\t'+id[1]+"\n")
    with open(mapnQRfile,'w') as fp:
        sen = ReadFile.readTXTFile(mapQRfile)
        for s in sen:
            lines = s.split('\t')
            print lines
            QR_map.append((lines[0],lines[1][:-1]))
        QR_map = list(set(QR_map))
        for qr in sorted(QR_map,key=lambda asd:asd[0],reverse=False):
            fp.write(qr[0]+'\t'+qr[1]+'\n')

    with open(RnIdfile,'w') as fp:
        MapRId = sorted(MapRId.iteritems(),key=lambda asd:asd[0],reverse=False)
        for id in MapRId:
            fp.write(id[0]+'\t'+id[1].strip()+'\t{"client_id": "c_00000007"}\n')
Example #28
def main():
    # train_sentences = [
    # 	'0无偿居间介绍买卖毒品的行为应如何定性',
    # 	'1吸毒男动态持有大量毒品的行为该如何认定',
    # 	'2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
    # 	'3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
    # 	'4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
    # 	'5为获报酬帮人购买毒品的行为该如何认定',
    # 	'6毒贩出狱后再次够买毒品途中被抓的行为认定',
    # 	'7虚夸毒品功效劝人吸食毒品的行为该如何认定',
    # 	'8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻',
    # 	'9一方未签字办理的结婚登记是否有效',
    # 	'10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚',
    # 	'11结婚前对方父母出资购买的住房写我们二人的名字有效吗',
    # 	'12身份证被别人冒用无法登记结婚怎么办?',
    # 	'13同居后又与他人登记结婚是否构成重婚罪',
    # 	'14未办登记只举办结婚仪式可起诉离婚吗',
    # 	'15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
    # ]
    filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
    sentences = ReadFile.readTXTFile(filename)

    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath +
                                              "corpus_0829.txt")
    test_sentences = []
    for sen in test_sentences_doc:
        sen_iterms = sen.strip().split("\t")
        if len(sen_iterms) >= 2:
            test_sentences.append(sen_iterms[1])
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])
    docs = LabelSentences(filename=None, sentences=train_sentences)
    # docs = LabelSentences.LabelSentences(sentences=train_sentences)

    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath+"corpus_0829.txt")

    # train_sentences = ReadFile.getFileSentence(config.SimilarlySentencePath + "")
    # print len(sentences)
    # train_sentences = []
    # for sen in sentences:
    # 	sen_iterms = sen.split("\t")
    # 	if len(sen_iterms) == 2:
    # 		print sen_iterms[1]
    # 		train_sentences.append(sen_iterms[1])
    # test_sentences = ReadFile.readTXTFile(config.SimilarlySentencePath+"corpus_0829_t.txt")
    # test_sentences = ['周涛知道是谁吗']
    test_sentences = train_sentences[:100]
    SSO = Doc2VecObj()
    # corpus = SSO.getCorpus(docs)
    # SSO.buildModel(docs)
    # SSO.save()

    print " load model"
    SSO.load()
    value = SSO.similarly(u"早起吃的油条,很好吃。", u"今天吃什么")
    # result = SSO.most_similarSentence(test_sentences[9],test_sentences[:200],topn=10)
    # print test_sentences[9]
    # for re in result:
    # 	print re[0],re[1]
    print "similarly : ", value
Example #29
def train_lad():
	lda = LDA()
	sentences = ReadFile.readTXTFile(config.BTMData + "btm_text_corpus.txt")
	# line = LineSetence(sentences=sentences)
	lda.buildModel(lines(sentences), num_topics=21)