def wordlabel_vector(self, wordlabels):
    # Encode one tweet as (a) an idf-weighted one-hot vector over in-dict words
    # and (b) a POS-tag histogram for words missing from the dictionary.
    added_word_dict = dict()
    word_vector = np.zeros(self.worddict.vocabulary_size(), dtype=np.float32)
    pos_vector = np.zeros(self.posdict.vocabulary_size(), dtype=np.float32)
    for wordlabel in wordlabels:
        word = wordlabel[0].lower().strip("#") if self.capignore else wordlabel[0]
        # word = get_root_word(word) if wordlabel[2] in self.verb else word
        if not (pu.is_valid_keyword(word) and self.is_valid_wordlabel(wordlabel)):
            continue
        if word in added_word_dict:
            continue
        added_word_dict[word] = True
        if not self.worddict.is_word_in_dict(word):
            pos_tag = wordlabel[2]
            pos_vector[self.posdict.word2id(pos_tag)] += 1
        else:
            wordid = self.worddict.word_2_id(word)
            word_vector[wordid] = self.worddict.dictionary[word]['idf']
    added_word = sorted(added_word_dict.keys())
    entity_num = sum(1 for w in wordlabels if not self.is_valid_wordlabel(w))
    return word_vector, added_word, entity_num
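# A self-contained sketch of the encoding above: an idf-weighted one-hot vector
# over in-vocabulary words plus a POS histogram for out-of-vocabulary words.
# The `vocab_idf` and `pos_ids` arguments are hypothetical stand-ins for
# self.worddict / self.posdict, which are not shown in this file.
import numpy as np

def sketch_wordlabel_vector(wordlabels, vocab_idf, pos_ids):
    word_vector = np.zeros(len(vocab_idf), dtype=np.float32)
    pos_vector = np.zeros(len(pos_ids), dtype=np.float32)
    word_ids = {w: i for i, w in enumerate(sorted(vocab_idf))}
    seen = set()
    for word, _, pos_tag in wordlabels:
        word = word.lower().strip('#')
        if not word or word in seen:
            continue
        seen.add(word)
        if word in vocab_idf:
            word_vector[word_ids[word]] = vocab_idf[word]
        elif pos_tag in pos_ids:
            pos_vector[pos_ids[pos_tag]] += 1
    return word_vector, pos_vector

# Usage: sketch_wordlabel_vector([('Quake', None, 'N'), ('#nepal', None, '#')],
#                                {'quake': 2.3}, {'N': 0, '#': 1})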
def clean_text(self, text):
    # Lower-case each whitespace token and drop invalid keywords; iterate in
    # reverse so that deletions do not shift the indices still to be visited.
    tokens = text.split()
    for i in range(len(tokens) - 1, -1, -1):
        tokens[i] = tokens[i].lower().strip()
        if not pu.is_valid_keyword(tokens[i]):
            del tokens[i]
    return tokens
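# The reverse-index loop above is equivalent to a single filtering pass. A
# self-contained sketch, with a hypothetical stub in place of
# pu.is_valid_keyword (the real predicate is defined elsewhere):
import re

def _is_valid_keyword_stub(token):
    # Hypothetical rule: two or more word characters, optional leading '#'.
    return bool(re.fullmatch(r"#?\w{2,}", token))

def clean_text_compact(text):
    tokens = (t.lower().strip() for t in text.split())
    return [t for t in tokens if _is_valid_keyword_stub(t)]

# clean_text_compact("A #quake hits the city !") -> ['#quake', 'hits', 'the', 'city']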
def get_semantic_tokens(file_list):
    # One IdFreqDict per semantic label: proper noun, common noun, verb, hashtag.
    pos_type_info = {
        ark.prop_label: {K_IFD: IdFreqDict()},
        ark.comm_label: {K_IFD: IdFreqDict()},
        ark.verb_label: {K_IFD: IdFreqDict()},
        ark.hstg_label: {K_IFD: IdFreqDict()},
    }
    total_doc_num = 0
    for file in file_list:
        twarr = ark.twarr_ark(fu.load_array(file))
        total_doc_num += len(twarr)
        pos_tokens = au.merge_array([tw[tk.key_ark] for tw in twarr])
        for pos_token in pos_tokens:
            word = pos_token[0].strip().lower()
            if len(word) <= 2 or not pu.is_valid_keyword(word):
                continue
            real_label = ark.pos_token2semantic_label(pos_token)
            if real_label:
                pos_type_info[real_label][K_IFD].count_word(word)
    return pos_type_info, total_doc_num
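# The same per-label counting pattern as get_semantic_tokens, in a
# self-contained sketch: collections.Counter stands in for IdFreqDict, and
# plain strings stand in for the ark.*_label constants.
from collections import Counter

def sketch_semantic_counts(tagged_docs):
    # tagged_docs: a list of documents, each a list of (word, label) pairs.
    buckets = {label: Counter() for label in ('proper', 'common', 'verb', 'hashtag')}
    for doc in tagged_docs:
        for word, label in doc:
            word = word.strip().lower()
            if len(word) <= 2 or label not in buckets:
                continue
            buckets[label][word] += 1
    return buckets, len(tagged_docs)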
def tokenize(self):
    # Regex-tokenize the raw tweet text, keep valid non-stop-word tokens,
    # and accumulate their frequencies.
    # tokens = (t.text.lower() for t in self.tw[tk.key_spacy])
    tokens = pu.findall(pu.tokenize_pattern, self.tw[tk.key_text].lower())
    tokens = [t.strip() for t in tokens
              if pu.is_valid_keyword(t) and not pu.is_stop_word(t)]
    for token in tokens:
        self.tokens.count_word(token)
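# A self-contained sketch of the regex tokenization path above; the pattern
# and stop list are hypothetical stand-ins for pu.tokenize_pattern and
# pu.is_stop_word, whose definitions are not in this file.
import re
from collections import Counter

TOKEN_PATTERN = re.compile(r"[#@]?\w[\w'-]*")      # assumed pattern
STOP_WORDS = {'the', 'a', 'an', 'of', 'to', 'in'}  # assumed stop list

def sketch_tokenize(text):
    counts = Counter()
    for token in TOKEN_PATTERN.findall(text.lower()):
        token = token.strip()
        if len(token) > 1 and token not in STOP_WORDS:
            counts[token] += 1
    return counts

# sketch_tokenize("The quake hits #Nepal") -> Counter({'quake': 1, 'hits': 1, '#nepal': 1})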
def categorize(self, doc):
    # Bucket each spaCy token by its POS tag; hashtags get their own bucket.
    for token in doc:
        word = token.text.strip().lower()
        token_tag = token.pos_
        if not pu.is_valid_keyword(word):
            continue
        if word.startswith('#'):
            self.type_ifd_dict[su.pos_hstg].count_word(word)
        elif token_tag in TokenSet.NORM_SET:
            self.type_ifd_dict[token_tag].count_word(word)
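# categorize only needs .text and .pos_ from each spaCy token, so the logic can
# be exercised without spaCy. A sketch with a namedtuple standing in for
# spacy.tokens.Token and NORM_TAGS for TokenSet.NORM_SET (both assumptions):
from collections import Counter, namedtuple

FakeToken = namedtuple('FakeToken', ['text', 'pos_'])
NORM_TAGS = {'NOUN', 'PROPN', 'VERB'}  # assumed contents of NORM_SET

def sketch_categorize(doc):
    type_counts = {}
    for token in doc:
        word = token.text.strip().lower()
        if len(word) < 2:
            continue
        bucket = 'hashtag' if word.startswith('#') else token.pos_
        if bucket == 'hashtag' or bucket in NORM_TAGS:
            type_counts.setdefault(bucket, Counter())[word] += 1
    return type_counts

# sketch_categorize([FakeToken('#Nepal', 'X'), FakeToken('quake', 'NOUN')])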
def expand_dict_and_count_df_from_wordlabel(self, wordlabels):
    added_word_dict = dict()
    for wordlabel in wordlabels:
        word = wordlabel[0].lower().strip("#") if self.capignore else wordlabel[0]
        # word = get_root_word(word) if wordlabel[2] in self.verb else word
        if not (pu.is_valid_keyword(word) and self.is_valid_wordlabel(wordlabel)):
            continue
        if word in added_word_dict:
            continue
        added_word_dict[word] = True
        # "word" is neither an entity, an invalid keyword, nor a duplicate by now
        self.worddict.expand_dict_from_word(word)
        word_info = self.worddict.dictionary[word]
        word_info['df'] = word_info.get('df', 0) + 1
    # Document frequency is per document, so the counter advances once per call.
    self.doc_num += 1
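# Document frequency feeds the 'idf' values read back in wordlabel_vector. A
# self-contained sketch of the same bookkeeping over plain token lists; the
# log-based idf formula is an assumption, not taken from this file.
import math

def sketch_count_df(docs):
    dictionary, doc_num = {}, 0
    for doc in docs:
        doc_num += 1                  # one increment per document
        for word in set(doc):         # each word counted once per document
            entry = dictionary.setdefault(word, {'df': 0})
            entry['df'] += 1
    for entry in dictionary.values():
        entry['idf'] = math.log(doc_num / entry['df'])  # assumed idf definition
    return dictionary, doc_num

# sketch_count_df([['quake', 'hits'], ['quake']])  # 'quake' df=2, 'hits' df=1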
def is_valid_keyword(word):
    # Thin wrapper delegating keyword validation to the pattern-utils module.
    return pu.is_valid_keyword(word)