Example #1
import os
import json

def get(num):
    # inpath / outpath and the thulac tagger instance are module-level globals
    fileList = os.listdir(inpath)
    fout = open(outpath, 'a')
    n = 0
    for f in fileList:
        with open(os.path.join(inpath, f), 'r') as fin:
            for line in fin:
                line = json.loads(line)
                # keep only documents whose 'crit' code is 16 or 19
                if line['meta']['crit'] not in (16, 19):
                    continue
                ans = []
                for s in line['content']:
                    sentence = ''.join(s)
                    sentence = thulac.cut(sentence)  # list of [word, tag] pairs
                    ans.append(sentence)
                print(json.dumps(ans, ensure_ascii=False), file=fout)
                n += 1
                if n == num:
                    fout.close()
                    return
    fout.close()
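
The function above assumes a few module-level names that are not shown: inpath, outpath, and a thulac tagger instance. A minimal sketch of that setup (the paths are placeholders, not from the original):

import thulac as thulac_module

inpath = 'data/in/'       # directory of JSON-lines files (placeholder)
outpath = 'data/out.txt'  # segmented output file (placeholder)

# The code calls thulac.cut(...), so 'thulac' must be a tagger instance,
# not the module; cut() returns a list of [word, tag] pairs.
thulac = thulac_module.thulac()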
Example #2
	def generate_sentence(self, inputs):
		# segment the seed text; thulac is a tagger instance, token[0] is the word
		inputs = [token[0] for token in thulac.cut(inputs)]
		inputs = [self.sentence_start_token] + inputs + [self.sentence_end_token]
		inputs = self.word2index(inputs)

		# Seed the new sentence with the full input (including its end token)
		new_sentence = inputs
		# Repeat until an end token is sampled (the break below)
		while not new_sentence[-1] == self.words_index[self.unknown_token]:
			next_word_probs = self.predict_next_prob(new_sentence)
			sampled_word = self.words_index[self.unknown_token]
			# We don't want to sample unknown words
			while sampled_word == self.words_index[self.unknown_token]:
				samples = np.random.multinomial(1, next_word_probs[-1])
				sampled_word = np.argmax(samples)

			if sampled_word == self.words_index[self.sentence_end_token]:
				break

			new_sentence.append(sampled_word)

		sentence_str = [self.words[x] for x in new_sentence[1:]]
		sentence_str = "".join(sentence_str)
		# The seed's end token splits the string into (input, generated) parts
		sentence_str = sentence_str.split(self.sentence_end_token)

		return sentence_str[1]
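
The inner loop above performs a single categorical draw: np.random.multinomial(1, p) returns a one-hot vector, and np.argmax recovers the sampled index. A standalone illustration (the distribution is made up):

import numpy as np

p = np.array([0.1, 0.6, 0.3])           # made-up next-word distribution
one_hot = np.random.multinomial(1, p)   # one-hot vector, e.g. [0, 1, 0]
sampled_word = int(np.argmax(one_hot))  # index of the word that was drawn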
Example #3

    def preprocessing(self, discourse):
        for paragraph in discourse:
            for sentence in paragraph.iterfind(
                    filter=node_type_filter(Sentence)):
                if self.ctb and (sentence.sid is not None) and (
                        sentence.sid in self.ctb):
                    # use the gold CTB parse: height-2 subtrees are
                    # preterminals, and "-NONE-" labels mark empty traces
                    parse = self.ctb[sentence.sid]
                    pairs = [(node[0], node.label())
                             for node in parse.subtrees()
                             if node.height() == 2 and node.label() != "-NONE-"]
                    words, tags = list(zip(*pairs))
                else:
                    # fall back to THULAC segmentation and POS tagging
                    words, tags = list(zip(*thulac.cut(sentence.text)))
                setattr(sentence, "words", list(words))
                setattr(sentence, "tags", list(tags))

                # align each EDU with the segmentation via character offsets
                offset = 0
                for textnode in sentence.iterfind(
                        filter=node_type_filter([TEXT, Connective, EDU]),
                        terminal=node_type_filter([TEXT, Connective, EDU])):
                    if isinstance(textnode, EDU):
                        edu_words = []
                        edu_tags = []
                        cur = 0
                        for word, tag in zip(sentence.words, sentence.tags):
                            # keep words whose char span lies inside this EDU
                            if offset <= cur < cur + len(word) <= offset + len(
                                    textnode.text):
                                edu_words.append(word)
                                edu_tags.append(tag)
                            cur += len(word)
                        setattr(textnode, "words", edu_words)
                        setattr(textnode, "tags", edu_tags)
                    offset += len(textnode.text)
        return discourse
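
The zip(*thulac.cut(...)) idiom above transposes THULAC's list of [word, tag] pairs into parallel word and tag sequences; a minimal standalone sketch:

import thulac

thu = thulac.thulac()            # loads the default segmentation/POS models
pairs = thu.cut("我爱北京天安门")   # roughly [['我', 'r'], ['爱', 'v'], ['北京', 'ns'], ['天安门', 'ns']]
words, tags = zip(*pairs)        # ('我', '爱', '北京', '天安门') and ('r', 'v', 'ns', 'ns')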
Example #4
with open("BosonNLP_sentiment_score.txt", "r") as f:
    for line in f.readlines():
        tmp = line.split()
        if len(tmp) != 0:
            dict.append(tmp[0])
            emo_dict[tmp[0]] = tmp[1]

#with open("dict", "w") as g:
#    for word in dict:
#        g.write(word+"\n")

thulac = thulac.thulac(user_dict='dict')

dom = xml.dom.minidom.parse('ipad.xml')
root = dom.documentElement
sentence = root.getElementsByTagName('sentence')
for line in sentence:
    score = 0
    tmp = {}
    #tmp['id'] = line.getAttribute("id")
    tmp['opinionated'] = line.getAttribute("opinionated")
    list = thulac.cut(line.firstChild.data)
    for word in list:
        score += float(emo_dict.get(word[0], 0))
    score /= len(list)
    tmp['score'] = score
    sen_result.append(tmp)

print(sen_result)
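
THULAC's user_dict argument points to a plain-text file with one word per line; the commented-out block above is what writes it from the BosonNLP lexicon, so the tagger keeps those entries as single tokens. A quick way to check the effect (the word is only illustrative):

import thulac

with open('dict', 'w') as g:
    g.write("蓝瘦香菇\n")  # a term the default model might otherwise split

thu = thulac.thulac(user_dict='dict')
print(thu.cut("今天蓝瘦香菇"))  # the dictionary entry comes back as one token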
Example #5
    def __call__(self, batch, *args, **kwargs):
        # thu1: a thulac tagger instance created elsewhere (segmentation-only,
        # judging by the way the space-separated output is split into tokens)
        if len(batch) > 0 and isinstance(batch[0], str):
            # cut(..., text=True) returns one space-separated string per utterance
            batch = [thu1.cut(utt, text=True).split(" ") for utt in batch]
        return batch
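
The split(" ") above only yields clean tokens if the tagger was built with seg_only=True; with POS tagging enabled, cut(..., text=True) returns word_tag pairs instead. A sketch of the assumed setup:

import thulac

thu1 = thulac.thulac(seg_only=True)  # segmentation only, no POS tags

print(thu1.cut("今天天气不错", text=True))             # e.g. "今天 天气 不错"
print(thu1.cut("今天天气不错", text=True).split(" "))  # ['今天', '天气', '不错']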