def get(num):
    global cut
    fileList = os.listdir(inpath)
    fout = open(outpath, 'a')
    n = 0
    for f in fileList:
        fin = open(inpath + f, 'r')
        line = fin.readline()
        while line:
            line = json.loads(line)
            ans = []
            # Only keep records whose meta 'crit' field is 16 or 19.
            if line['meta']['crit'] != 16 and line['meta']['crit'] != 19:
                line = fin.readline()
                continue
            # Segment every sentence of the record with the thulac instance.
            for s in line['content']:
                sentence = ''.join(s)
                sentence = thulac.cut(sentence)
                ans.append(sentence)
            print(json.dumps(ans, ensure_ascii=False), file=fout)
            n += 1
            if n == num:
                return
            line = fin.readline()
        fin.close()
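
The function relies on module-level inpath, outpath and a thulac segmenter instance that are not part of the excerpt. A minimal sketch of that assumed setup (the paths and the seg_only=True option are placeholders, not taken from the source):

import json
import os
import thulac

inpath = 'data/raw/'                    # assumed input directory
outpath = 'data/segmented.txt'          # assumed output file
thulac = thulac.thulac(seg_only=True)   # assumed segmenter configuration

get(100)  # segment the first 100 matching records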

def generate_sentence(self, inputs):
    # thulac.cut returns [word, tag] pairs; keep only the word of each pair.
    inputs = [token[0] for token in thulac.cut(inputs)]
    inputs = [self.sentence_start_token] + inputs + [self.sentence_end_token]
    inputs = self.word2index(inputs)
    # We start the sentence with the start token
    new_sentence = inputs
    # Repeat until we get an end token
    while not new_sentence[-1] == self.words_index[self.unknown_token]:
        next_word_probs = self.predict_next_prob(new_sentence)
        sampled_word = self.words_index[self.unknown_token]
        # We don't want to sample unknown words
        while sampled_word == self.words_index[self.unknown_token]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        if sampled_word == self.words_index[self.sentence_end_token]:
            break
        new_sentence.append(sampled_word)
    sentence_str = [self.words[x] for x in new_sentence[1:]]
    sentence_str = "".join(sentence_str)
    sentence_str = sentence_str.split(self.sentence_end_token)
    return sentence_str[1]
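
The inner loop draws one word index from the model's next-word distribution; np.random.multinomial(1, p) followed by np.argmax is simply a one-hot categorical sample, as in this standalone sketch:

import numpy as np

p = np.array([0.1, 0.7, 0.2])        # next-word probabilities (must sum to 1)
draw = np.random.multinomial(1, p)   # one-hot draw, e.g. array([0, 1, 0])
index = int(np.argmax(draw))         # index of the sampled word
print(index)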

def preprocessing(self, discourse):
    for paragraph in discourse:
        for sentence in paragraph.iterfind(filter=node_type_filter(Sentence)):
            # Prefer the gold CTB parse when one is available for this sentence;
            # otherwise fall back to thulac segmentation and tagging.
            if self.ctb and (sentence.sid is not None) and (sentence.sid in self.ctb):
                parse = self.ctb[sentence.sid]
                pairs = [(node[0], node.label()) for node in parse.subtrees()
                         if node.height() == 2 and node.label() != "-NONE-"]
                words, tags = list(zip(*pairs))
            else:
                words, tags = list(zip(*thulac.cut(sentence.text)))
            setattr(sentence, "words", list(words))
            setattr(sentence, "tags", list(tags))
            offset = 0
            for textnode in sentence.iterfind(
                    filter=node_type_filter([TEXT, Connective, EDU]),
                    terminal=node_type_filter([TEXT, Connective, EDU])):
                if isinstance(textnode, EDU):
                    edu_words = []
                    edu_tags = []
                    cur = 0
                    # Keep the words whose character span falls inside this EDU.
                    for word, tag in zip(sentence.words, sentence.tags):
                        if offset <= cur < cur + len(word) <= offset + len(textnode.text):
                            edu_words.append(word)
                            edu_tags.append(tag)
                        cur += len(word)
                    setattr(textnode, "words", edu_words)
                    setattr(textnode, "tags", edu_tags)
                offset += len(textnode.text)
    return discourse
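
In the fallback branch, thulac.cut returns a list of [word, tag] pairs, so zip(*...) unpacks them into parallel word and tag sequences. A standalone illustration (the thu name and the example output are illustrative, not from the source):

import thulac

thu = thulac.thulac()
pairs = thu.cut("我爱北京")       # e.g. [['我', 'r'], ['爱', 'v'], ['北京', 'ns']]
words, tags = list(zip(*pairs))   # ('我', '爱', '北京'), ('r', 'v', 'ns')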

import xml.dom.minidom

import thulac

# Assumed initializations; the excerpt uses these names without defining them.
dict = []        # lexicon words (shadows the builtin name, kept from the original)
emo_dict = {}    # word -> sentiment score
sen_result = []

# Load the BosonNLP sentiment lexicon: one "word score" pair per line.
with open("BosonNLP_sentiment_score.txt", "r") as f:
    for line in f.readlines():
        tmp = line.split()
        if len(tmp) != 0:
            dict.append(tmp[0])
            emo_dict[tmp[0]] = tmp[1]

#with open("dict", "w") as g:
#    for word in dict:
#        g.write(word+"\n")

thulac = thulac.thulac(user_dict='dict')

dom = xml.dom.minidom.parse('ipad.xml')
root = dom.documentElement
sentence = root.getElementsByTagName('sentence')
for line in sentence:
    score = 0
    tmp = {}
    #tmp['id'] = line.getAttribute("id")
    tmp['opinionated'] = line.getAttribute("opinionated")
    # Segment the sentence and average the lexicon scores of its words.
    list = thulac.cut(line.firstChild.data)
    for word in list:
        score += float(emo_dict.get(word[0], 0))
    score /= len(list)
    tmp['score'] = score
    sen_result.append(tmp)
print(sen_result)
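
The loader above expects each line of BosonNLP_sentiment_score.txt to be a word followed by its numeric sentiment score, separated by whitespace. A quick sanity check of that assumed format:

# Illustrative check of the assumed "word score" line format.
with open("BosonNLP_sentiment_score.txt", "r") as f:
    word, score = f.readline().split()
    print(word, float(score))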

def __call__(self, batch, *args, **kwargs):
    if len(batch) > 0 and isinstance(batch[0], str):
        # s2 = thu1.cut(s1, text=True)
        # print(model([s2.split(" ")]))
        batch = [thu1.cut(utt, text=True).split(" ") for utt in batch]
    return batch
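
thu1 is assumed to be a module-level thulac.thulac instance; with text=True the cut call returns a single space-separated string, which the method splits back into tokens. A minimal sketch of that assumption (seg_only=True is a guess, not from the excerpt):

import thulac

thu1 = thulac.thulac(seg_only=True)   # assumed setup; seg_only drops POS tags
tokens = thu1.cut("今天天气不错", text=True).split(" ")
print(tokens)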