def anno_mining_token(self, alia_base_emo=True) -> List[Example]:
    """Annotate suspicious mined span tokens via ``SO_PMI`` and ``Doc_Distance``.

    ``Doc_Distance`` is defined as::

        doc_dist(w) = NDoc_pos(w) - NDoc_neg(w)

    Parameters
    ----------
    alia_base_emo : bool
        Whether to merge the base emotion vocabularies into the alias
        emotion vocabularies before scoring.

    Returns
    -------
    List[Example]
        Candidate tokens whose combined ``so_pmi * doc_dist`` score is
        strictly positive; also stored on ``self.new_tks``.
    """
    alter_tks = [x[0] for x in self.alter_tks]
    pos_words = Vocab.gene_from_list(self.alia_pos_words, Vocab().postive_name, 1)
    neg_words = Vocab.gene_from_list(self.alia_neg_words, Vocab().negtive_name, -1)
    if alia_base_emo:
        # Guard against missing base vocabs: both default to None in
        # __init__, and `vocab += None` would raise here.
        if self.base_pos_words is not None:
            pos_words += self.base_pos_words
        if self.base_neg_words is not None:
            neg_words += self.base_neg_words
    emo_vocab = pos_words + neg_words
    # drop candidates already present in the emotion vocabulary
    alter_tks = [tk for tk in alter_tks if tk not in emo_vocab.tk2idx]
    # build the matrices used by the PMI computation
    emo_vocab += Vocab.gene_from_list(alter_tks, name=Vocab().alters_name, score=0)
    emo_mat = token_emotion_mat(emo_vocab)
    label_mat = doc_label_mat(self.doc_labels)
    doc_mat = doc_onehot_mat(self.doc_tokens, emo_vocab)
    alter_idx = emo_vocab.get_group(emo_vocab.alters_name)
    # so_pmi scores per candidate token
    so_pmi_scores_obj = pair_pmi(doc_mat, emo_mat, emo_vocab)
    so_pmi_scores = np.asarray([exam.label for exam in so_pmi_scores_obj])
    # doc_distance: per candidate, #docs with positive label minus #docs
    # with negative label (label_mat presumably holds +1/-1 — TODO confirm)
    doc_dist = np.sum(doc_mat[alter_idx] * label_mat, axis=1)
    pmi_dist_scores = so_pmi_scores * doc_dist
    # only candidates whose combined score is greater than 0 are selected
    res_idx = np.where(pmi_dist_scores > 0)[0]
    res_exam = [so_pmi_scores_obj[idx] for idx in res_idx]
    print(f"mining new span token {len(res_exam)}")
    self.new_tks = res_exam
    return res_exam
def __init__(self, stop_words: Union[List[Text], Vocab] = None,
             base_pos_words: Union[List[Text], Vocab] = None,
             base_neg_words: Union[List[Text], Vocab] = None):
    """Normalize and store the stop-word and base emotion vocabularies.

    Parameters
    ----------
    stop_words : Union[List[Text], Vocab], optional
        Stop words; a plain list is wrapped into a ``Vocab`` with score 0.
    base_pos_words : Union[List[Text], Vocab], optional
        Base positive words; a plain list becomes a ``Vocab`` with score 1.
    base_neg_words : Union[List[Text], Vocab], optional
        Base negative words; a plain list becomes a ``Vocab`` with score -1.
    """
    # Plain lists are promoted to Vocab; Vocab/None inputs pass through.
    if isinstance(stop_words, List):
        stop_words = Vocab.gene_from_list(stop_words, score=0)
    self.stop_words = stop_words

    if isinstance(base_pos_words, List):
        base_pos_words = Vocab.gene_from_list(
            base_pos_words, name=Vocab().postive_name, score=1)
    self.base_pos_words = base_pos_words

    if isinstance(base_neg_words, List):
        base_neg_words = Vocab.gene_from_list(
            base_neg_words, name=Vocab().negtive_name, score=-1)
    self.base_neg_words = base_neg_words

    # populated later by the seed-word mining step
    self.seedwords = None
def __init__(
    self,
    examples: List[Example],
    seed_tokens: List[Example],
    extreme_words: Union[List[Text], Vocab],
    deny_words: Union[List[Text], Vocab],
    base_pos_words: Union[List[Text], Vocab] = None,
    base_neg_words: Union[List[Text], Vocab] = None,
):
    """Collect documents, seed tokens and the emotion vocabularies.

    Parameters
    ----------
    examples : List[Example]
        Each example exposes ``tokens`` (via ``.get``) and a ``label``.
    seed_tokens : List[Example]
        Previously mined seed tokens.
    extreme_words : Union[List[Text], Vocab]
        Extreme (intensity) words; a plain list becomes a ``Vocab`` with score 2.
    deny_words : Union[List[Text], Vocab]
        Deny (negation) words; a plain list becomes a ``Vocab``.
    base_pos_words : Union[List[Text], Vocab], optional
        Base positive words, if available.
    base_neg_words : Union[List[Text], Vocab], optional
        Base negative words, if available.
    """
    self.doc_tokens = [example.get("tokens") for example in examples]
    self.doc_labels = [example.label for example in examples]
    self.doc_size = len(self.doc_tokens)
    self.seed_tokens = seed_tokens
    # split the seed tokens into alias positive / negative word lists
    self.alia_pos_words, self.alia_neg_words = self._alia_emo_words()

    # plain lists are promoted to Vocab; Vocab inputs pass through
    if isinstance(extreme_words, List):
        extreme_words = Vocab.gene_from_list(extreme_words, score=2)
    self.extreme_words = extreme_words
    if isinstance(deny_words, List):
        deny_words = Vocab.gene_from_list(deny_words)
    self.deny_words = deny_words
    self.span_words = self.extreme_words + self.deny_words

    # optional base emotion vocabularies
    if isinstance(base_pos_words, List):
        base_pos_words = Vocab.gene_from_list(
            base_pos_words, name=Vocab().postive_name, score=1)
    self.base_pos_words = base_pos_words
    if isinstance(base_neg_words, List):
        base_neg_words = Vocab.gene_from_list(
            base_neg_words, name=Vocab().negtive_name, score=-1)
    self.base_neg_words = base_neg_words

    # filled in by the mining pipeline
    self.alter_tks = None
    self.new_tks = None
def anno_seed_word(self, doc_tokens: List[List[Text]],
                   seed_words: List[Text]) -> List[Example]:
    """Auto-annotate the selected seed words through ``SO-PMI``.

    The score is computed as::

        so_pmi(word) = mean(PMI(word, Pw)) - mean(PMI(word, Nw))

    and interpreted as: ``> 0`` positive, ``== 0`` neutral, ``< 0`` negative.

    Parameters
    ----------
    doc_tokens : List[List[Text]]
        Token list for each document.
    seed_words : List[Text]
        Candidate seed words to annotate.
    """
    # candidate words enter the vocab as an "alters" group with score 0
    candidates = Vocab.gene_from_list(seed_words, Vocab().alters_name, 0)
    emo_vocab = self.base_pos_words + self.base_neg_words + candidates
    doc_mat = doc_onehot_mat(doc_tokens, emo_vocab)
    emo_mat = token_emotion_mat(emo_vocab)
    return pair_pmi(doc_mat, emo_mat, emo_vocab)
# seed_words = [Example(text = "哈哈哈", label = 1), Example(text = "卧槽", label = -1)] # # 1.2 field new word mining # new_word_op = spanNewWordMining(dataset, # seed_words, # extreme_word_dict, # deny_words_dict, # base_posword_dict, # base_negword_dict) # new_word_op.run(min_window = 2, max_window = 3, alia_base_emo = False) # ================== # test for sentence score from src.sentence_score.sentence_score import * pos_emo_dict = Vocab.gene_from_list(base_posword_dict, name=Vocab.postive_name, score=1) neg_emo_dict = Vocab.gene_from_list(base_negword_dict, name=Vocab.negtive_name, score=-1) deny_emo_dict = Vocab.gene_from_list(deny_words_dict, score=0) ext_emo_dict = extreme_word_dict emo_dict = pos_emo_dict + neg_emo_dict score_op = totalSentenceScore(tok_method=DenyExtremeTokenScore( emo_dict=emo_dict, ext_dict=ext_emo_dict, deny_dict=deny_emo_dict), seq_methods=[ transitionSentenceScore(), hypothesisSentenceScore(), tailpuncSentenceScore() ])