def wordlabel_vector(self, wordlabels):
     """Build an idf-weighted bag-of-words vector from POS-labelled tokens.

     Each wordlabel is indexed as (word, ?, pos_tag); only index 0 and 2 are
     read here. Words are lower-cased and stripped of '#' when
     ``self.capignore`` is set, filtered through ``pu.is_valid_keyword`` and
     ``self.is_valid_wordlabel``, and de-duplicated.

     Returns:
         word_vector: float32 array of size worddict.vocabulary_size(); each
             in-dictionary word's slot holds that word's stored idf value.
         added_word: sorted list of the distinct words that passed filtering.
         entity_num: count of wordlabels rejected by is_valid_wordlabel
             (presumably entities — TODO confirm against caller).
     """
     added_word_dict = dict()
     word_vector = np.zeros(self.worddict.vocabulary_size(), dtype=np.float32)
     pos_vector = np.zeros(self.posdict.vocabulary_size(), dtype=np.float32)
     for wordlabel in wordlabels:
         word = wordlabel[0].lower().strip(
             "#") if self.capignore else wordlabel[0]
         if not (pu.is_valid_keyword(word)
                 and self.is_valid_wordlabel(wordlabel)):
             continue
         if word in added_word_dict:
             continue
         added_word_dict[word] = True
         if not self.worddict.is_word_in_dict(word):
             # pos_vector is never returned, but posdict.word2id may expand
             # the pos dictionary as a side effect — kept for that reason.
             # NOTE(review): confirm whether pos_vector is intentionally unused.
             pos_tag = wordlabel[2]
             pos_vector[self.posdict.word2id(pos_tag)] += 1
         else:
             wordid = self.worddict.word_2_id(word)
             word_vector[wordid] = self.worddict.dictionary[word]['idf']
     added_word = sorted(added_word_dict.keys())
     # Original built and sorted a list of 1s just to take its length;
     # a direct count yields the identical value without the extra work.
     entity_num = sum(1 for w in wordlabels if not self.is_valid_wordlabel(w))
     return word_vector, added_word, entity_num
# Example #2
 def clean_text(self, text):
     """Split *text* on whitespace, lower-case and strip each token, and
     return only the tokens that pass ``pu.is_valid_keyword``."""
     normalized = (raw.lower().strip() for raw in text.split())
     return [token for token in normalized if pu.is_valid_keyword(token)]
# Example #3
def get_semantic_tokens(file_list):
    """Accumulate per-semantic-label word frequencies over tweet files.

    Loads each file, runs ARK POS tagging over its tweet array, and counts
    every valid keyword longer than two characters into the IdFreqDict of
    its semantic label.

    Returns:
        pos_type_info: dict mapping each ARK semantic label to {K_IFD: IdFreqDict}.
        total_doc_num: total number of tweets seen across all files.
    """
    semantic_labels = (ark.prop_label, ark.comm_label,
                       ark.verb_label, ark.hstg_label)
    pos_type_info = {label: {K_IFD: IdFreqDict()} for label in semantic_labels}
    total_doc_num = 0
    for file_path in file_list:
        twarr = ark.twarr_ark(fu.load_array(file_path))
        total_doc_num += len(twarr)
        for token_info in au.merge_array([tw[tk.key_ark] for tw in twarr]):
            word = token_info[0].strip().lower()
            # Very short or invalid tokens carry no semantic signal.
            if len(word) <= 2 or not pu.is_valid_keyword(word):
                continue
            label = ark.pos_token2semantic_label(token_info)
            if label:
                pos_type_info[label][K_IFD].count_word(word)
    return pos_type_info, total_doc_num
# Example #4
 def tokenize(self):
     """Tokenize this tweet's text and tally each valid, non-stopword token
     into ``self.tokens``."""
     raw_tokens = pu.findall(pu.tokenize_pattern, self.tw[tk.key_text].lower())
     for raw in raw_tokens:
         if pu.is_valid_keyword(raw) and not pu.is_stop_word(raw):
             self.tokens.count_word(raw.strip())
 def categorize(self, doc):
     """Count each valid token of *doc* into the frequency dict for its POS
     category; hashtags go into the dedicated hashtag bucket."""
     for token in doc:
         word = token.text.strip().lower()
         if not pu.is_valid_keyword(word):
             continue
         if word.startswith('#'):
             self.type_ifd_dict[su.pos_hstg].count_word(word)
         else:
             tag = token.pos_
             if tag in TokenSet.NORM_SET:
                 self.type_ifd_dict[tag].count_word(word)
 def expand_dict_and_count_df_from_wordlabel(self, wordlabels):
     """Add each distinct valid word of *wordlabels* to the word dictionary,
     increment its document frequency, and bump the document counter."""
     seen = dict()
     for wordlabel in wordlabels:
         word = wordlabel[0].lower().strip(
             "#") if self.capignore else wordlabel[0]
         # Skip invalid keywords, entity-like labels, and duplicates.
         if not pu.is_valid_keyword(word) or not self.is_valid_wordlabel(wordlabel):
             continue
         if word in seen:
             continue
         seen[word] = True
         self.worddict.expand_dict_from_word(word)
         entry = self.worddict.dictionary[word]
         # Each distinct word counts once per document toward its df.
         entry['df'] = entry.get('df', 0) + 1
     self.doc_num += 1
def is_valid_keyword(word):
    """Thin module-level wrapper delegating keyword validation to ``pu``."""
    return pu.is_valid_keyword(word)