def update_tf(id_):
    # Fetch the note body via GraphQL and strip it down to plain text.
    result = gql_client.execute(note_from_id, variable_values={"id": id_})
    note = result["note"]
    html = note["contentHtml"]
    text = html_text.extract_text(html)

    tokenizer = WordTokenizer("sudachi", mode="C", with_postag=True)
    words = tokenizer.tokenize(text)

    # Keep nouns only; drop short hiragana-only tokens and pure numbers.
    hiragana_re = re.compile("[\u3041-\u309F]+")
    number_re = re.compile("[0-9,.]+")
    filtered_words = list(
        filter(
            # lambda x: len(x) > 3 or not hiragana_re.fullmatch(x),
            lambda x: (len(x) > 3 if hiragana_re.fullmatch(x) else len(x) > 1)
            and not number_re.fullmatch(x),
            # map(lambda x: x.normalized_form, filter(lambda x: x.postag in ["名詞", "動詞"], words)),
            map(lambda x: x.surface, filter(lambda x: x.postag in ["名詞"], words)),
        ))

    # Relative term frequency of each remaining word.
    num_words = len(filtered_words)
    word_count = Counter(filtered_words)
    word_freq_list = list(
        map(lambda k: (k, word_count[k] / num_words), word_count))
    word_freq = dict(word_freq_list)

    # Persist the frequencies as a TSV in the private bucket and return them.
    tf_tsv_key = get_tf_tsv_key(id_)
    tf_tsv = "\n".join(map(lambda x: "\t".join(map(str, x)), word_freq_list))
    private_bucket.put_object(Body=tf_tsv.encode("utf-8"), Key=tf_tsv_key)

    return word_freq
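# A minimal usage sketch (not part of the original module) showing how the
# term-frequency dict returned by update_tf could be consumed, e.g. to list the
# top-10 terms of a note. The note id "note-123" is a hypothetical placeholder.
word_freq = update_tf("note-123")
top_terms = sorted(word_freq.items(), key=lambda kv: kv[1], reverse=True)[:10]
for term, freq in top_terms:
    print(f"{term}\t{freq:.4f}")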
async def batch_tokenize(params: TokenizeParameter, request: Request):
    if params.texts is None:
        raise HTTPException(status_code=400, detail="texts is required.")

    # Reuse a cached tokenizer when one exists for these parameters.
    cache_key = generate_cache_key(params)
    if cache_key in request.app.state.cache:
        logging.info(f"Hit cache: {cache_key}")
        tokenizer = request.app.state.cache[cache_key]
    else:
        logging.info(f"Create tokenizer: {cache_key}")
        try:
            tokenizer = WordTokenizer(
                tokenizer=params.tokenizer,
                user_dictionary_path=params.user_dictionary_path,
                system_dictionary_path=params.system_dictionary_path,
                model_path=params.model_path,
                mode=params.mode,
                dictionary_format=params.dictionary_format,
            )
            request.app.state.cache[cache_key] = tokenizer
        except Exception:
            raise HTTPException(status_code=400,
                                detail="failed to initialize tokenizer")

    tokens_list = [[token.dict() for token in tokenizer.tokenize(text)]
                   for text in params.texts]
    return {"tokens_list": tokens_list}
def tokenize(params: TokenizeParameter, request: Request):
    if params.texts is not None:
        message = (
            "The parameter `texts` is no longer accepted by /api/v1/tokenize."
            " Please use /api/v1/batch_tokenize instead.")
        raise HTTPException(status_code=400, detail=message)

    if params.text is None:
        raise HTTPException(status_code=400, detail="text or texts is required.")

    # Reuse a cached tokenizer when one exists for these parameters.
    cache_key = generate_cache_key(params)
    if cache_key in request.app.state.cache:
        logging.info(f"Hit cache: {cache_key}")
        tokenizer = request.app.state.cache[cache_key]
    else:
        logging.info(f"Create tokenizer: {cache_key}")
        try:
            tokenizer = WordTokenizer(
                tokenizer=params.tokenizer,
                user_dictionary_path=params.user_dictionary_path,
                system_dictionary_path=params.system_dictionary_path,
                model_path=params.model_path,
                mode=params.mode,
                dictionary_format=params.dictionary_format,
            )
            request.app.state.cache[cache_key] = tokenizer
        except Exception:
            raise HTTPException(status_code=400,
                                detail="failed to initialize tokenizer")

    tokens = [token.dict() for token in tokenizer.tokenize(params.text)]
    return {"tokens": tokens}
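# A hedged client-side sketch for exercising the two endpoints above. It assumes
# the FastAPI app is served locally on port 8000 and that the handlers are
# mounted at /api/v1/tokenize and /api/v1/batch_tokenize (as the error message
# above suggests); adjust host and paths to your deployment.
import requests

single = requests.post(
    "http://localhost:8000/api/v1/tokenize",
    json={"tokenizer": "mecab", "text": "私は猫だ"},
)
print(single.json())  # {"tokens": [...]}

batch = requests.post(
    "http://localhost:8000/api/v1/batch_tokenize",
    json={"tokenizer": "mecab", "texts": ["私は猫だ", "東京特許許可局"]},
)
print(batch.json())  # {"tokens_list": [[...], [...]]}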
def test_tokenize(raw_texts: List[str], tokenizer_params: Dict):
    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [
        Token.from_dict(token_param)
        for token_param in read_lines(tokenizer_name)[0]
    ]
    result = tokenizer.tokenize(raw_texts[0])
    assert expect == result
def test_tokenize_with_character(raw_texts: List[str], tokenizer_params: Dict):
    if tokenizer_params["tokenizer"] == "kytea" and sys.version_info < (3, 7):
        pytest.skip("KyTea doesn't work in Python3.6")

    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [
        Token.from_dict(token_param)
        for token_param in read_lines(tokenizer_name)[0]
    ]
    result = tokenizer.tokenize(raw_texts[0])
    assert expect == result
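# A hypothetical conftest.py sketch for the two tests above. The fixture names
# match the test signatures (raw_texts, tokenizer_params), but the concrete
# values are illustrative assumptions, not the project's actual fixtures or the
# data files that read_lines expects.
import pytest

@pytest.fixture
def raw_texts():
    return ["吾輩は猫である"]

@pytest.fixture(params=[
    {"tokenizer": "whitespace"},
    {"tokenizer": "mecab"},
    {"tokenizer": "sudachi", "mode": "A"},
])
def tokenizer_params(request):
    return request.param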
def preprocess_data(df):
    # normalize text, encode labels, and split into train/test
    df['text'] = df['text'].apply(neologdn.normalize)
    le = preprocessing.LabelEncoder()
    df['label'] = le.fit_transform(df['label'])
    df_train, df_test, y_train, y_test = train_test_split(
        df,
        df['label'].values,
        test_size=0.2,
        random_state=42,
        stratify=df['label'])

    # tokenize
    tokenizer = WordTokenizer('MeCab')
    docs_train = np.array([
        ' '.join(map(str, tokenizer.tokenize(text)))
        for text in df_train['text']
    ])
    docs_test = np.array([
        ' '.join(map(str, tokenizer.tokenize(text)))
        for text in df_test['text']
    ])

    # tfidf: don't use df_test for fitting
    count_vec = CountVectorizer(min_df=2, max_features=20000, ngram_range=(1, 3))
    bags_train = count_vec.fit_transform(docs_train)
    bags_test = count_vec.transform(docs_test)

    tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
    tf_idf_train = tfidf.fit_transform(bags_train)
    tf_idf_test = tfidf.transform(bags_test)

    X_train = pd.DataFrame(tf_idf_train.toarray())
    X_test = pd.DataFrame(tf_idf_test.toarray())
    return (X_train.reset_index(drop=True), X_test.reset_index(drop=True),
            y_train, y_test)
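# A minimal sketch of training a classifier on the TF-IDF features returned by
# preprocess_data. It assumes df is the livedoor DataFrame loaded elsewhere
# (e.g. via load_pandas_df); LogisticRegression is an illustrative choice, not
# part of the original pipeline.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = preprocess_data(df)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print(accuracy_score(y_test, clf.predict(X_test)))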
def tokenize(params: TokenizeParameter, request: Request):
    if params.text is not None:
        texts = [params.text]
    elif params.texts is not None:
        texts = params.texts
    else:
        raise HTTPException(status_code=400, detail="text or texts is required.")

    mode = params.mode.lower()
    model_path = ("data/model.spm"
                  if params.tokenizer.lower() == "sentencepiece" else None)  # NOQA

    # Cache tokenizers keyed by tokenizer name, model path, and mode.
    signature = f"{params.tokenizer}.{model_path}.{mode}"
    if signature in request.app.tokenizers:
        logging.info(f"Hit cache: {signature}")
        tokenizer = request.app.tokenizers[signature]
    else:
        logging.info(f"Create tokenizer: {signature}")
        try:
            tokenizer = WordTokenizer(
                tokenizer=params.tokenizer,
                with_postag=True,
                model_path=model_path,
                mode=mode,
            )
            request.app.tokenizers[signature] = tokenizer
        except Exception:
            raise HTTPException(status_code=400,
                                detail="failed to initialize tokenizer")

    results = [[{
        "surface": t.surface,
        "part_of_speech": t.postag
    } for t in tokenizer.tokenize(text)] for text in texts]
    return {"tokens": results}
" has been trained with n_features=%d" % ( n_features, expected_n_features)) X = X * self._idf_diag return X if __name__ == '__main__': df = load_pandas_df(nrows=10) # Normalization df['text'] = df['text'].apply(neologdn.normalize) tokenizer = WordTokenizer('MeCab') docs = np.array([ ' '.join(map(str, tokenizer.tokenize(text))) for text in df['text'] ]) print(docs.shape) # (10,) count_vec = CountVectorizer(min_df=2, max_features=20000, ngram_range=(1, 3)) bags = count_vec.fit_transform(docs) print(bags.toarray().shape) print(bags.toarray()) """ (10, 445) [[1 0 1 ... 0 0 0] [1 0 0 ... 0 0 0]
try:
    _tokenizer = WordTokenizer("Sudachi", mode="A", with_postag=True)
    word_tokenizers.append(_tokenizer)
except (ImportError, KeyError):
    print("Skip: ", "Sudachi")

print("Finish creating word tokenizers")
print()

# ref: https://ja.wikipedia.org/wiki/東京特許許可局
document = "東京特許許可局(とうきょうとっきょきょかきょく) 日本語の早口言葉。"  # NOQA
document += "なお実際に特許に関する行政を行うのは特許庁であり、過去にこのような役所が存在したことは一度も無い。"  # NOQA
print("Given document: {}".format(document))

sentences = sentence_tokenizer.tokenize(document)
for sentence_id, sentence in enumerate(sentences):
    print("#{}: {}".format(sentence_id, sentence))

    for word_tokenizer in word_tokenizers:
        print("Tokenizer: {}".format(word_tokenizer.name))
        result = word_tokenizer.tokenize(sentence)
        result = [str(r) for r in result]
        print(" ".join(result))
    print()

word_tokenizer = WordTokenizer("whitespace")
sentence = "私 は 猫 だ ニャン"
print(word_tokenizer.tokenize(sentence))
from konoha import WordTokenizer

from utils_nlp.dataset.livedoor import load_pandas_df

if __name__ == '__main__':
    df = load_pandas_df(nrows=10)
    text = df['text'][0][:30]
    print(text)
    # 友人代表のスピーチ、独女はどうこなしている?もうすぐジューン

    tokenizer_m = WordTokenizer('MeCab')
    print(tokenizer_m.tokenize(text))
    # [友人, 代表, の, スピーチ, 、, 独, 女, は, どう, こなし, て, いる, ?, もうすぐ, ジューン]

    tokenizer_s = WordTokenizer('Sudachi', mode='A', with_postag=True)
    print(tokenizer_s.tokenize(text))
    # [友人 (名詞), 代表 (名詞), の (助詞), スピーチ (名詞), 、 (補助記号), 独女 (名詞), は (助詞), どう (副詞), こなし (動詞), て (助詞), いる (動詞), ? (補助記号), もう (副詞), すぐ (副詞), ジューン (名詞)]

    df['sep_text'] = [tokenizer_m.tokenize(text) for text in df['text']]
    print(df.head())
from konoha import WordTokenizer
import neologdn
import numpy as np

from utils_nlp.dataset.livedoor import load_pandas_df
from utils_nlp.features import scdv
from utils_nlp.models.pretrained_embeddings.word2vec import load_pretrained_vectors

if __name__ == '__main__':
    df = load_pandas_df(nrows=10)

    # Normalization
    df['text'] = df['text'].apply(neologdn.normalize)

    # Materialize each document as a list of token strings (a bare map object
    # would not be usable downstream).
    tokenizer = WordTokenizer('MeCab')
    docs = np.array(
        [list(map(str, tokenizer.tokenize(text))) for text in df['text']])
    print(docs.shape)  # (10,)

    word_vec = load_pretrained_vectors('data')
    scdv = scdv.create(docs, word_vec, n_components=10)
    print(scdv.shape)  # (10, 3000)