Example #1
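# Loads the query corpus and a test corpus, strips IDs and whitespace, then loads a
# previously saved SentenceSimilarlyObj model and scores every training query against
# every other one, writing the similar-sentence pairs to docSim_simi.txt (the
# model-building calls are left commented out). ReadFile, config and WriteResult are
# project-local helpers.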
def main():
	sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
	test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
	test_sentences = []
	for sen in test_sentences_doc:
		sen_iterms = sen.strip().split("\t")
		if len(sen_iterms) >= 2:
			test_sentences.append(sen_iterms[1])

	# train_sentences = [
	# 	'0无偿居间介绍买卖毒品的行为应如何定性',
	# 	'1吸毒男动态持有大量毒品的行为该如何认定',
	# 	'2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
	# 	'3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
	# 	'4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
	# 	'5为获报酬帮人购买毒品的行为该如何认定',
	# 	'6毒贩出狱后再次够买毒品途中被抓的行为认定',
	# 	'7虚夸毒品功效劝人吸食毒品的行为该如何认定',
	# 	'8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻',
	# 	'9一方未签字办理的结婚登记是否有效',
	# 	'10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚',
	# 	'11结婚前对方父母出资购买的住房写我们二人的名字有效吗',
	# 	'12身份证被别人冒用无法登记结婚怎么办?',
	# 	'13同居后又与他人登记结婚是否构成重婚罪',
	# 	'14未办登记只举办结婚仪式可起诉离婚吗',
	# 	'15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
	# ]
	# print type(train_sentences[0])
	# print len(sentences)
	train_sentences = []
	for sen in sentences:
		sen_iterms = sen.split("\t")
		# print sen_iterms[1]
		if len(sen_iterms) >= 2:
			# print sen_iterms[1].strip().replace(" ","")
			train_sentences.append(sen_iterms[1].strip().replace(" ",""))
	print type(train_sentences[0])
	#
	# print "build simi_model"
	SSO = SentenceSimilarlyObj()

	# corpus = SSO.getCorpus(train_sentences)
	# SSO.setSimilar(corpus=corpus)
	# print "save simi model"
	# SSO.save()
	# SSO.save("simi_model_little","word_dic_little")
	# print "build success"

	print "load model"
	SSO.load()
	# print SSO.similar

	print "test"
	# indexs = SSO.calSentenceSimilarly(sentence=u"说说后天是礼拜几")
	# for index in indexs:
	# 	print index[0],train_sentences[index[0]],index[1]
	result = SSO.calSentencesSimilarly(train_sentences,train_sentences)
	Wr = WriteResult()
	can_not_deal = Wr.WriteSimilarlySentence(result,"docSim_simi.txt")
Example #2
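# Same preprocessing as Example #1, but this variant also rebuilds the similarity
# index from the training queries (SSO.getCorpus / SSO.setSimilar) before reloading
# the saved model; the snippet stops right after the load step.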
def main():
    sentences = ReadFile.readTXTFile(config.SimilarlySentencePath +
                                     "AllQueriesWithID.txt")
    test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath +
                                              "corpus_0829.txt")
    test_sentences = []
    for sen in test_sentences_doc:
        sen_iterms = sen.strip().split("\t")
        if len(sen_iterms) >= 2:
            test_sentences.append(sen_iterms[1])

    # train_sentences = [
    # 	'0无偿居间介绍买卖毒品的行为应如何定性',
    # 	'1吸毒男动态持有大量毒品的行为该如何认定',
    # 	'2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
    # 	'3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
    # 	'4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
    # 	'5为获报酬帮人购买毒品的行为该如何认定',
    # 	'6毒贩出狱后再次够买毒品途中被抓的行为认定',
    # 	'7虚夸毒品功效劝人吸食毒品的行为该如何认定',
    # 	'8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻',
    # 	'9一方未签字办理的结婚登记是否有效',
    # 	'10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚',
    # 	'11结婚前对方父母出资购买的住房写我们二人的名字有效吗',
    # 	'12身份证被别人冒用无法登记结婚怎么办?',
    # 	'13同居后又与他人登记结婚是否构成重婚罪',
    # 	'14未办登记只举办结婚仪式可起诉离婚吗',
    # 	'15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
    # ]
    # print type(train_sentences[0])
    # print len(sentences)
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])
    #
    # print "build simi_model"
    SSO = SentenceSimilarlyObj()

    corpus = SSO.getCorpus(train_sentences)
    SSO.setSimilar(corpus=corpus)
    print "save simi model"
    # SSO.save()
    # SSO.save("simi_model_little","word_dic_little")
    # print "build success"

    print "load model"
    SSO.load()
    # print SSO.similar

    print "test"
Example #3
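# Extracts jieba keywords from every sentence of the given Excel sheet, keeps the
# ones whose similarity to "节日" is at least 0.4 according to the supplied
# word-vector model, expands them with model.most_similar(), and writes the merged
# keyword list to Topic_festival.txt. extend_Word and ReadFile are project helpers.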
def analysisSimilary_Word(model,
                          file=MyCode.config.CorpusFilePath + "favor0721.xlsx",
                          sheet='节日210'):
    sentences, R_S = ReadFile.getOneSheetContext(file, sheet)
    cla_Key_words = []
    for sentence in sentences:
        # print sentence
        key_sentence = jieba.analyse.extract_tags(sentence)
        for w in key_sentence:
            cla_Key_words.append(w)
    cla_Key_words = list(set(cla_Key_words))
    all_key_words = []
    for word in cla_Key_words:
        try:
            if model.similarity(str("节日"), str(word)) < 0.4:
                continue
            all_key_words.append(word)
            simi_words = model.most_similar(str(unicode(word)))
        except KeyError:
            continue
        for w in simi_words:
            all_key_words.append(w[0])
    all_key_words = list(set(all_key_words))
    all_key_words = extend_Word(model, all_key_words)
    # write to file
    with open(MyCode.config.ResultFilePath + "Topic_festival.txt", 'w') as fp:
        for key_word in all_key_words:
            fp.writelines(key_word + "\n")
Example #4
 def getNewWords(self):
     file = config.WordDicPath + "birds.txt"
     lines = ReadFile.readTXTFile(file)
     words = []
     for line in lines:
         words.extend(line.strip().split(" "))
     return words
Example #5
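# Reads the query corpus, strips IDs and whitespace, then uses the project-local
# Ranker to retrieve similar sentences for every query and writes the result to
# rank_simi.txt via WriteResult.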
def MyTest():
    print "1"
    filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
    sentences = ReadFile.readTXTFile(filename)

    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    # test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
    test_sentences = []
    # for sen in test_sentences_doc:
    # 	sen_iterms = sen.strip().split("\t")
    # 	if len(sen_iterms) >= 2:
    # 		test_sentences.append(sen_iterms[1])
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])
    test_sentences = train_sentences

    tsf = Ranker()
    tsf.load(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    result = tsf.getSimilarSentences(test_sentences)

    wr = WriteResult()
    wr.WriteSimilarSentence(result,
                            file=config.SimilarlySentencePath +
                            "rank_simi.txt")
Example #6
def MyTest():
	print "1"
	filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
	sentences = ReadFile.readTXTFile(filename)

	# sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
	# test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
	test_sentences = []
	# for sen in test_sentences_doc:
	# 	sen_iterms = sen.strip().split("\t")
	# 	if len(sen_iterms) >= 2:
	# 		test_sentences.append(sen_iterms[1])
	train_sentences = []
	for sen in sentences:
		sen_iterms = sen.split("\t")
		# print sen_iterms[1]
		if len(sen_iterms) >= 2:
			# print sen_iterms[1].strip().replace(" ","")
			train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
	print type(train_sentences[0])
	test_sentences = train_sentences

	tsf = Ranker()
	tsf.load(config.SimilarlySentencePath+"AllQueriesWithID.txt")
	result= tsf.getSimilarSentences(test_sentences[:100])

	wr = WriteResult()
	wr.WriteSimilarSentence(result,file=config.SimilarlySentencePath+"rank_simi.txt")
Example #7
def getFileSentence():
    filepath = "../Data/corpus/"
    files = ['favor0721.xlsx', 'inter0721.xlsx', 'Sentence_QR_pair_0714.xlsx']
    sentences = []
    for file in files:
        Q, R = ReadFile.getFileSentence(filepath + file)
        sentences.extend(Q)
    return sentences
Example #8
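# Reads corpus.txt and word-segments the first 10 sentences; Participle and
# filterStopWords are project helpers (presumably jieba-based segmentation followed
# by stop-word removal).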
def getFileContext_Participle(dirPath=MyCode.config.CorpusFilePath):
    # files = ReadFile.getAllFilesInDir(dirPath)
    # sentences = ReadFile.getAllFilesContext(files,dirPath)
    #for test
    sentences = ReadFile.readTXTFile(dirPath + 'corpus.txt')
    par_sentences = Participle.Participle(sentences[:10])
    par_filter_sentences = filterStopWords.filterStopWords(par_sentences)
    # return wordTostr(par_filter_sentences)
    return par_filter_sentences
Example #9
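# Assigns topic ids to the question/response pairs read from QRpair.txt and writes
# the results through the project's WriteResult helpers (WriteTopicRegular,
# WriteTopic, WriteResponseWithTopicId).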
def main():
	sentences = ReadFile.readTXTFile(config.TopicFilePath+"QRpair.txt")
	qaQueryPairTopic = QAQueryPairTopic()
	result = qaQueryPairTopic.getgetQAQueriesTopicId(sentences)
	wr = WriteResult()
	wr.WriteTopicRegular(result[0])
	wr.WriteTopic(result[1])
	result = qaQueryPairTopic.getResponsesTopic(sentences)
	wr.WriteResponseWithTopicId(result)
Example #10
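# Loads the pre-segmented Sohu corpus and splits each line on spaces, returning one
# token list per sentence.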
def getFileSentences(subfilename="souhu_fenci"):
    sentences = ReadFile.read_souhu_fenci_file(subfilename=subfilename)
    logging.info("data size :%d" % len(sentences))
    print("data size :%d" % len(sentences))
    par_sentences = []
    for sentence in sentences:
        snetence_words = sentence.split(" ")
        par_sentences.append(snetence_words)
    return par_sentences
Example #11
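# Extracts keywords/tags for every deduplicated sentence in badcase.xlsx
# (get_tagByjieba is a project helper, presumably jieba-based), writes the pairs to
# a file and returns them as (sentence, tags) tuples.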
def ExtracKeyWordFromSentence(sentencefile=MyCode.config.SentenceKeyWordPath + "badcase.xlsx"):
    print "Extract key words from sentence"
    sentences = ReadFile.getFileSentence(sentencefile)[0]
    sentences = list(set(sentences))
    tag_sentence = get_tagByjieba(sentences)
    writeToFile(sentences,tag_sentence)
    sentences_keywords = []
    for i in xrange(len(sentences)):
        sentences_keywords.append((sentences[i],tag_sentence[i]))
    return sentences_keywords
Example #12
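# Rewrites the tail of the JSON-like third column of every line in
# AllQueriesWithID_mid2.txt so that it carries a fixed client_id, appending the
# rewritten lines to AllQueriesWithIDfinished.txt.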
def insertDicItem():
    file = MyCode.config.CaiCaiPath + 'AllQueriesWithID_mid2.txt'
    fileEnd = MyCode.config.CaiCaiPath + 'AllQueriesWithIDfinished.txt'
    sentences = ReadFile.readTXTFile(file)
    with open(fileEnd,'a+') as fp:
        for sen in sentences:
            lines = sen.split("\t")
            lines[2] = lines[2][:-3]+', "client_id": "c_00000007"}'
            # print lines[2]
            fp.write(lines[0] +"\t"+lines[1]+'\t'+lines[2]+'\n')
Example #13
def buildWordfromExcel():
    file = "../Data/Sentence_QR_pair_0714.xlsx"
    Q_s, R_s = ReadFile.getFileSentence(file)
    sentences = Participle.Participle(Q_s)
    Q_sentences = []
    for s in sentences:
        snetence = ''
        for word in s:
            snetence += word + ' '
        Q_sentences.append(snetence)
    return Q_sentences
Example #14
 def TagSenetnceTest(self,method=2):
     tag_sentence = Sentence_Tag()
     sentences = ReadFile.getQueriesWithId('AllQueriesWithID')
     re_sentences = tag_sentence.tagSentences(sentences[:100],method=method)
     for sen in re_sentences:
         if method == 1:
             for w in sen:
                 print w[0],w[1]
             print
         else:
             print sen
Example #15
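# Loads a trained LDA model and dictionary, then maps every labelled document from
# topic_data_processed.txt to its topic distribution; getQuerySimilarly is assumed
# to return (topic id, weight) pairs. The sparse triples and the labels are pickled
# to topic2vec_2.txt.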
def Topic2Vec_v2():
    """
	分析句子在,将句子转换为topic 向量
	:return:
	"""
    lda = LDA()
    sentences = ReadFile.readTXTFile(config.BTMData +
                                     "topic_data_processed.txt")
    docs = []
    lab = []
    for index, line in enumerate(sentences):
        term = line.strip().split("\t")
        if len(term) != 3:
            continue

        docs.append(term[1])
        lab.append(term[2])
    documents = line_Cut_Word(docs)
    documents = [" ".join(doc) for doc in documents]

    lda.load_word_dic()
    lda.load_LdaModel()
    # lda.build_word_dic(lines(documents))
    # print len(lda.word_dic.keys())
    # lda.buildModel(lines(documents))

    result_lab = []
    topic2vec = []
    x_index, y_index = [], []
    count = 0
    print len(lab)
    for index, doc_lab in enumerate(list(zip(docs, lab))):
        if index % 1000 == 0 and index != 0:
            print doc_lab[0], doc_lab[1]
            # break
        doc = doc_lab[0]
        la = doc_lab[1]
        topics = lda.getQuerySimilarly(doc)

        if topics:
            # print doc, "\t", la
            for topic in topics:
                x_index.append(count)
                y_index.append(topic[0])
                topic2vec.append(topic[1])
            count += 1
            result_lab.append(la)

    print len(x_index), len(y_index), len(topic2vec), len(result_lab), count

    result = [x_index, y_index, topic2vec, result_lab]
    with open(config.BTMData + "topic2vec_2.txt", 'wb') as fp:
        cPickle.dump(result, fp)
Example #16
    def TestSentencesStruct(self):
        method = 2
        tag_sentence = Sentence_Tag()
        sentences = ReadFile.getQueriesWithId('AllQueriesWithID')
        sentenceStruct = SentenceStruct()

        tag_sentences = tag_sentence.tagSentences(sentences,method)
        sentences_class_list = sentenceStruct.SimiSentenceStruct(tag_sentences)
        # sh = Show()
        # sh.showSenetenceStructResult(sentences_class_list)
        # wr = WriteResult()
        # wr.WriteSentenceStruct(sentences_class_list)
        return sentences_class_list
Example #17
 def getWords(self):
     file = config.Semantic_dicPath + "semantic_wordgroup_new.txt"
     words = {}
     sentences = ReadFile.readTXTFile(file)
     for sen in sentences:
         items = sen.strip().split(' ')
         if len(items) < 2:
             continue
         if words.has_key(items[1]):
             words[items[1]].append(items[0])
         else:
             words[items[1]] = [items[0]]
     return words
Example #18
 def TestTopic(self):
     Ta = TopicAnalysis()
     sentences = ReadFile.readTXTFile(config.TopicFilePath+"test_topic.txt")
     for text in sentences:
         # text = raw_input('query:\n')
         print "问 :"+text
         sentence = Sentence()
         sentence.text = text.strip()
         response = Ta.getResponse(sentence)
         if response:
             print "答 :"+response
         else:
             print "没有合适转移话题!"
Example #19
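# Loads a cached word2vec model if it exists, otherwise trains a small test model on
# the first 10 lines of the segmented Sohu corpus and saves it. Note that gensim's
# Word2Vec expects an iterable of token lists, so raw lines would normally be split
# first; see the sketch after this example.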
def buildModel(Name="wordRank_filter"):
    file = MyCode.config.ModelPath + Name + '.model'
    model = None
    try:
        model = word2vec.Word2Vec.load(file)
    except:
        # word_sentences = getFileContext_Participle()
        # sentences = wordTostr(word_sentences)
        # sentences = TextIter()
        sentences = ReadFile.readTXTFile(MyCode.config.CorpusFilePath + 'souhu_fenci.txt')
        model = word2vec.Word2Vec(sentences[:10], min_count=1, workers=8)
        model.save(file)
    return model
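# A minimal sketch (an assumption, not the project's code) of the more usual gensim
# pattern, where the pre-segmented lines are split into token lists before training:
from gensim.models import word2vec

def build_model_sketch(lines, model_file):
    # each line of the corpus is already space-separated, so splitting yields tokens
    sentences = [line.strip().split(" ") for line in lines]
    model = word2vec.Word2Vec(sentences, min_count=1, workers=8)
    model.save(model_file)
    return model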
Example #20
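# Builds a word -> semantic-group dictionary from semantic_wordgroup_new.txt,
# skipping malformed lines and avoiding duplicate group entries per word.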
def filter(file=config.WordDicPath + "semantic_wordgroup_new.txt"):
    word_sentences = ReadFile.readTXTFile(file)
    word_dic = {}
    for word in word_sentences:
        iterms = word.strip().split(" ")
        if len(iterms) != 2:
            continue
        if iterms[0] in word_dic:
            if iterms[1] not in word_dic[iterms[0]]:
                word_dic[iterms[0]].append(iterms[1])
        else:
            word_dic[iterms[0]] = [iterms[1]]
    return word_dic
Example #21
def getQRsentences(file=MyCode.config.CaiCaiPath + "0812caicai.xlsx"):
    tables = ReadFile.readExcel(file)
    Q_R_sentences = []
    for table in tables:
        for i in xrange(1,table.nrows):
            lines = table.row_values(i)[0:]
            values = []
            for va in lines:
                values.append(va)
            # print values
            Q_R_sentences.append(values)
    # Q_par(Q_R_sentences)
    print len(Q_R_sentences)
    return Q_R_sentences
Example #22
def buildWordFromTxt():
    file = "../Result/sentence1.txt"
    sentences = ReadFile.readTXTFile(file)
    par_Sentences = []
    par_Sentences = Participle.Participle(sentences[10000:11000])
    w_Sentence = []
    with open("../Result/fenci.txt",'w') as fp:
        for s in par_Sentences:
            p_sentence = ''
            for word in s:
                p_sentence += word + ' '
            w_Sentence.append(p_sentence)
        # print 'Start writing ... ...'
        # fp.writelines(w_Sentence)
        # print 'Finished writing !'
    return w_Sentence
Example #23
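# Loads a trained Doc2Vec model and clusters the first 100 queries;
# SentencesClusters(20, model) is a project-local wrapper (20 is presumably the
# number of clusters).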
def main():
	model = Doc2VecObj()
	model.load()
	sc = SentencesClusters(20,model)
	filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
	sentences = ReadFile.readTXTFile(filename)

	train_sentences = []
	for sen in sentences:
		sen_iterms = sen.split("\t")
		# print sen_iterms[1]
		if len(sen_iterms) >= 2:
			# print sen_iterms[1].strip().replace(" ","")
			train_sentences.append(sen_iterms[1].strip().replace(" ", ""))

	sc.getCluster(train_sentences[:100])
Example #24
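# Splits the traditional-to-simplified converted zhwiki dump into div_size output
# files: every line is whitespace-split, optionally stripped of alphabetic tokens,
# segmented with jieba and filtered against a stop-word list before being written
# round-robin to the output files.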
def zhwiki_segment(_config, div_size=10, remove_alpha=True):
    i = 0
    files = []
    for i in xrange(div_size):
        files.append(
            codecs.open(os.path.join(
                _config.data_path,
                _config.zhwiki_seg_t2s + str("%02d.txt" % i)),
                        'w',
                        encoding='utf-8'))
    print('Start...')
    stopWords = ReadFile.readStopWord(_config.stopWordPath + 'stop.txt')
    file_len = 0
    with codecs.open(os.path.join(_config.data_path, _config.zhwiki_raw_t2s),
                     'r',
                     encoding='utf-8') as raw_input:
        # file_len = raw_input.
        line = raw_input.readline()
        while line:
            line = line.strip()
            i += 1
            if i % 100 == 0:
                print('line ' + str(i))
            # print(line)
            text = line.split()
            if remove_alpha:
                text = [w for w in text if not is_alpha(w)]
            word_cut_seed = [jieba.cut(t) for t in text]
            tmp = ''
            for sent in word_cut_seed:
                for tok in sent:
                    if tok in stopWords:
                        continue
                    tmp += tok + ' '
            tmp = tmp.strip()
            if tmp:
                try:
                    files[i % div_size].write(tmp + '\n')
                except:
                    print("file write error!")
                    continue
            line = raw_input.readline()
        for i in xrange(div_size):
            files[i].close()
Example #25
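# Groups the first 100 queries by topic: getQueries, getQueriySimilarly and
# groupByTopic are project helpers, and the grouped result is both displayed (Show)
# and written out (WriteResult). The commented-out block at the top of the function
# is the older LDA-based version.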
def main():
    # old version
    # Lda_model = LdaTopic(modelName='LDAsimiWord_5')
    # # print Lda_model[0]
    # lda_model = Lda_model[0]
    # words = Lda_model[1]
    # n_top_words = 1
    # for i ,topic_dist in enumerate(lda_model.topic_word_):
    #     topic_words = numpy.array(words)[numpy.argsort(topic_dist)][:n_top_words:-1]
    #     print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    #
    # Lda_model2 = LdaTopic(modelName='LDAsimiWord_6')
    # # print Lda_model[0]
    # lda_model2 = Lda_model2[0]
    # words2 = Lda_model2[1]
    # n_top_words2 = 1
    # for i, topic_dist in enumerate(lda_model2.topic_word_):
    #     topic_words = numpy.array(words2)[numpy.argsort(topic_dist)][:n_top_words2:-1]
    #     print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    # documents = getFileSentences()

    # sentences = [u"你知道周杰伦是谁么?",u"周杰伦是谁?",u"你知道周杰伦吗?",u"你认识周杰伦吗?",u"你认识周杰伦么?",u"你知道周杰伦么?",\
    #              u"周杰伦知道么?",u"周杰伦知道是谁么?",u"周杰伦知道吗",u"周杰伦知道是谁吗?",u"周杰伦是谁?",u"周杰伦你认识么?",u"周杰伦你知道是谁么?",\
    #              u"周杰伦你认识吗?",u"周杰伦你知道是谁吗?",u"你认识周杰伦吗?",u"你知道周杰伦吗?",u"你知道周杰伦么?",u"你认识周杰伦么?",u"你认识周杰伦吗?",\
    #              u"你认识周杰伦是谁么?",u"你认识周杰伦是谁吗?",u"你知道周杰伦吗?",u"你知道周杰伦是谁吗?",u"你知道周杰伦是谁么?"]
    sentences = ReadFile.getQueriesSentence(config.SimilarlySentencePath +
                                            "AllQueriesWithID.txt")

    queries = getQueries(sentences[:100])
    for query in queries:
        for q in query:
            print q,
        print
    docs_topic = getQueriySimilarly(queries)
    # for topic in docs_topic:
    #     for re in topic:
    #         print re,1
    #     print
    results = groupByTopic(docs_topic, sentences)
    sh = Show()
    sh.showDocTopicResult(results)
    Wr = WriteResult()
    Wr.WriteTopicResult(results)
Example #26
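# Trains an LDA model with 21 topics on the BTM text corpus; see the lines() sketch
# after this example for the tokenization the call is assumed to rely on.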
def train_lad():
	lda = LDA()
	sentences = ReadFile.readTXTFile(config.BTMData + "btm_text_corpus.txt")
	# line = LineSetence(sentences=sentences)
	lda.buildModel(lines(sentences), num_topics=21)
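# The lines() helper used above is not defined in this example. A minimal sketch,
# assuming it only has to yield whitespace-tokenized documents to the LDA trainer:
def lines(sentences):
    # the BTM corpus file is assumed to be pre-segmented, one document per line
    for sentence in sentences:
        yield sentence.strip().split(" ")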
Example #27
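# Builds tagged documents from the query corpus, loads a previously trained Doc2Vec
# model and scores the first 100 queries against the full training set, writing the
# similarity pairs to Doc2Vec_simi.txt. A sketch of the assumed LabelSentences
# helper follows this example.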
def main():
	# train_sentences = [
	# 	'0无偿居间介绍买卖毒品的行为应如何定性',
	# 	'1吸毒男动态持有大量毒品的行为该如何认定',
	# 	'2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
	# 	'3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
	# 	'4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
	# 	'5为获报酬帮人购买毒品的行为该如何认定',
	# 	'6毒贩出狱后再次够买毒品途中被抓的行为认定',
	# 	'7虚夸毒品功效劝人吸食毒品的行为该如何认定',
	# 	'8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻',
	# 	'9一方未签字办理的结婚登记是否有效',
	# 	'10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚',
	# 	'11结婚前对方父母出资购买的住房写我们二人的名字有效吗',
	# 	'12身份证被别人冒用无法登记结婚怎么办?',
	# 	'13同居后又与他人登记结婚是否构成重婚罪',
	# 	'14未办登记只举办结婚仪式可起诉离婚吗',
	# 	'15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
	# ]
	filename = config.SimilarlySentencePath+"AllQueriesWithID.txt"
	sentences = ReadFile.readTXTFile(filename)

	# sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
	test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath + "corpus_0829.txt")
	test_sentences = []
	for sen in test_sentences_doc:
		sen_iterms = sen.strip().split("\t")
		if len(sen_iterms) >= 2:
			test_sentences.append(sen_iterms[1])
	train_sentences = []
	for sen in sentences:
		sen_iterms = sen.split("\t")
		# print sen_iterms[1]
		if len(sen_iterms) >= 2:
			# print sen_iterms[1].strip().replace(" ","")
			train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
	print type(train_sentences[0])
	docs = LabelSentences(filename=None,sentences=train_sentences)
	# docs = LabelSentences.LabelSentences(sentences=train_sentences)

	# sentences = ReadFile.readTXTFile(config.SimilarlySentencePath+"corpus_0829.txt")

	# train_sentences = ReadFile.getFileSentence(config.SimilarlySentencePath + "")
	# print len(sentences)
	# train_sentences = []
	# for sen in sentences:
	# 	sen_iterms = sen.split("\t")
	# 	if len(sen_iterms) == 2:
	# 		print sen_iterms[1]
	# 		train_sentences.append(sen_iterms[1])
	# test_sentences = ReadFile.readTXTFile(config.SimilarlySentencePath+"corpus_0829_t.txt")
	# test_sentences = ['周涛知道是谁吗']
	test_sentences = train_sentences[:100]
	SSO = Doc2VecObj()
	# corpus = SSO.getCorpus(docs)
	# SSO.buildModel(docs)
	# SSO.save()

	# load model
	SSO.load()
	result = SSO.calSentencesSimilarly(test_sentences,train_sentences)
	Wr = WriteResult()
	can_not_deal = Wr.WriteSimilarlySentence(result,"Doc2Vec_simi.txt")
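# LabelSentences is a project-local helper. A minimal sketch of what a Doc2Vec
# pipeline typically consumes (a hypothetical stand-in, not the author's class):
import jieba
from gensim.models.doc2vec import TaggedDocument

def label_sentences_sketch(sentences):
    docs = []
    for i, sentence in enumerate(sentences):
        # segment the Chinese query and tag it with a stable document id
        words = list(jieba.cut(sentence))
        docs.append(TaggedDocument(words=words, tags=["SENT_%d" % i]))
    return docs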
Example #28
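# Same setup as Example #27, but instead of batch similarity it loads the Doc2Vec
# model and prints the similarity score between two hand-picked sentences.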
def main():
    # train_sentences = [
    # 	'0无偿居间介绍买卖毒品的行为应如何定性',
    # 	'1吸毒男动态持有大量毒品的行为该如何认定',
    # 	'2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
    # 	'3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
    # 	'4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
    # 	'5为获报酬帮人购买毒品的行为该如何认定',
    # 	'6毒贩出狱后再次够买毒品途中被抓的行为认定',
    # 	'7虚夸毒品功效劝人吸食毒品的行为该如何认定',
    # 	'8妻子下落不明丈夫又与他人登记结婚是否为无效婚姻',
    # 	'9一方未签字办理的结婚登记是否有效',
    # 	'10夫妻双方1990年按农村习俗举办婚礼没有结婚证 一方可否起诉离婚',
    # 	'11结婚前对方父母出资购买的住房写我们二人的名字有效吗',
    # 	'12身份证被别人冒用无法登记结婚怎么办?',
    # 	'13同居后又与他人登记结婚是否构成重婚罪',
    # 	'14未办登记只举办结婚仪式可起诉离婚吗',
    # 	'15同居多年未办理结婚登记,是否可以向法院起诉要求离婚'
    # ]
    filename = config.SimilarlySentencePath + "AllQueriesWithID.txt"
    sentences = ReadFile.readTXTFile(filename)

    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath + "AllQueriesWithID.txt")
    test_sentences_doc = ReadFile.readTXTFile(config.SimilarlySentencePath +
                                              "corpus_0829.txt")
    test_sentences = []
    for sen in test_sentences_doc:
        sen_iterms = sen.strip().split("\t")
        if len(sen_iterms) >= 2:
            test_sentences.append(sen_iterms[1])
    train_sentences = []
    for sen in sentences:
        sen_iterms = sen.split("\t")
        # print sen_iterms[1]
        if len(sen_iterms) >= 2:
            # print sen_iterms[1].strip().replace(" ","")
            train_sentences.append(sen_iterms[1].strip().replace(" ", ""))
    print type(train_sentences[0])
    docs = LabelSentences(filename=None, sentences=train_sentences)
    # docs = LabelSentences.LabelSentences(sentences=train_sentences)

    # sentences = ReadFile.readTXTFile(config.SimilarlySentencePath+"corpus_0829.txt")

    # train_sentences = ReadFile.getFileSentence(config.SimilarlySentencePath + "")
    # print len(sentences)
    # train_sentences = []
    # for sen in sentences:
    # 	sen_iterms = sen.split("\t")
    # 	if len(sen_iterms) == 2:
    # 		print sen_iterms[1]
    # 		train_sentences.append(sen_iterms[1])
    # test_sentences = ReadFile.readTXTFile(config.SimilarlySentencePath+"corpus_0829_t.txt")
    # test_sentences = ['周涛知道是谁吗']
    test_sentences = train_sentences[:100]
    SSO = Doc2VecObj()
    # corpus = SSO.getCorpus(docs)
    # SSO.buildModel(docs)
    # SSO.save()

    print "load model"
    SSO.load()
    value = SSO.similarly(u"早起吃的油条,很好吃。", u"今天吃什么")
    # result = SSO.most_similarSentence(test_sentences[9],test_sentences[:200],topn=10)
    # print test_sentences[9]
    # for re in result:
    # 	print re[0],re[1]
    print "similarly : ", value
Example #29
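# Merges new question/response pairs into the existing ID files: questions already
# present keep their old id (and get a client_id appended when the Questions file is
# rewritten), new questions and responses receive timestamp-based CAICAI_* ids, and
# the query-response id map is deduplicated and rewritten along with the response
# file.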
def getQuestionFile(Q_R_sentences):
    QIdfile = MyCode.config.CaiCaiDataPath + "AllQueriesWithID.txt"
    QnIdfile = MyCode.config.CaiCaiPath + "AllQueriesWithID.txt"
    mapQRfile = MyCode.config.CaiCaiDataPath + "AllQueryResponseIdMap.txt"
    mapnQRfile = MyCode.config.CaiCaiPath + "AllQueryResponseIdMap.txt"
    RIdfile = MyCode.config.CaiCaiDataPath + "AllResponsesWithID.txt"
    RnIdfile = MyCode.config.CaiCaiPath + "AllResponsesWithID.txt"

    Q_sentences = ReadFile.readTXTFile(QIdfile)
    QR_map = []
    OldMapQid = {}
    MapRId = {}
    MapQid = {}
    r_id = 1
    q_id = 1
    OldRIdSentences = ReadFile.readTXTFile(RIdfile)
    R_sens = []
    for line in OldRIdSentences:
        R_sens.append(line.strip().split("\t")[1])

    R_sens = list(set(R_sens))

    for qr_sentence in Q_R_sentences:
        csentence = []
        exist = False
        for sentence in Q_sentences:
            csentence = sentence.strip().split('\t')
            line = csentence[1].replace(' ','')
            if line == qr_sentence[0]:
                exist = True
                break
        if exist:
            sq_id = csentence[0]
            OldMapQid.setdefault(sq_id,'')
        else:
            sq_id = "CAICAI_Q_"+str(time.strftime("%Y%m%d%H%M", time.localtime()))+"%05d"%q_id
            q_id += 1
            MapQid.setdefault(sq_id,qr_sentence[1].replace(' ',''))
        for i in xrange(2, 5):
            if qr_sentence[i] in R_sens:
                continue
            if qr_sentence[i] != '' and len(qr_sentence[i]) > 2:
                print qr_sentence[i]
                sr_id = 'CAICAI_R_'+str(time.strftime("%Y%m%d%H%M", time.localtime()))+'%05d' % r_id
                QR_map.append((sq_id, sr_id))
                MapRId.setdefault(sr_id,qr_sentence[i])
                r_id += 1

    fileEnd = MyCode.config.CaiCaiPath + 'AllQueriesWithIDfinished.txt'
    # rewrite the Questions file
    with open(fileEnd,'w') as fp:
        # print len(OldMapQid.keys())
        for sen in Q_sentences:
            lines = sen.split('\t')[0]
            if OldMapQid.has_key(lines):
                fp.write(sen[:-2]+',"client_id": "c_00000007"}\n')
            else:
                fp.write(sen)
    # write the results to file
    with open(QnIdfile,'w') as fp:
        MapQid = sorted(MapQid.iteritems(),key=lambda asd:asd[0],reverse=False)
        for id in  MapQid:
            fp.write(id[0]+'\t'+id[1]+"\n")
    with open(mapnQRfile,'w') as fp:
        sen = ReadFile.readTXTFile(mapQRfile)
        for s in sen:
            lines = s.split('\t')
            print lines
            QR_map.append((lines[0],lines[1][:-1]))
        QR_map = list(set(QR_map))
        for qr in sorted(QR_map,key=lambda asd:asd[0],reverse=False):
            fp.write(qr[0]+'\t'+qr[1]+'\n')

    with open(RnIdfile,'w') as fp:
        MapRId = sorted(MapRId.iteritems(),key=lambda asd:asd[0],reverse=False)
        for id in MapRId:
            fp.write(id[0]+'\t'+id[1].strip()+'\t{"client_id": "c_00000007"}\n')