Example 1
def update_tf(id_):
    result = gql_client.execute(note_from_id, variable_values={"id": id_})
    note = result["note"]
    html = note["contentHtml"]
    text = html_text.extract_text(html)
    tokenizer = WordTokenizer("sudachi", mode="C", with_postag=True)
    words = tokenizer.tokenize(text)
    hiragana_re = re.compile("[\u3041-\u309F]+")
    number_re = re.compile("[0-9,.]+")
    filtered_words = list(
        filter(
            # lambda x: len(x) > 3 or not hiragana_re.fullmatch(x),
            lambda x: (len(x) > 3 if hiragana_re.fullmatch(x) else len(x) > 1)
            and not number_re.fullmatch(x),
            # map(lambda x: x.normalized_form, filter(lambda x: x.postag in ["名詞", "動詞"], words)),
            map(lambda x: x.surface, filter(lambda x: x.postag in ["名詞"],
                                            words)),
        ))
    # Relative term frequency of each remaining word.
    num_words = len(filtered_words)
    word_count = Counter(filtered_words)
    word_freq_list = list(
        map(lambda k: (k, word_count[k] / num_words), word_count))
    word_freq = dict(word_freq_list)
    # Persist the frequencies as word<TAB>frequency lines in the private bucket.
    tf_tsv_key = get_tf_tsv_key(id_)
    tf_tsv = "\n".join(map(lambda x: "\t".join(map(str, x)), word_freq_list))
    private_bucket.put_object(Body=tf_tsv.encode("utf-8"), Key=tf_tsv_key)
    return word_freq
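This function leans on several module-level objects the excerpt does not show: gql_client, note_from_id, private_bucket, and get_tf_tsv_key. The following is only a rough sketch of how they might be wired; the query shape, bucket name, and key layout are all assumptions.

# Guess at the elided module-level setup that update_tf relies on; the real
# GraphQL query, bucket, and key layout will differ.
import re
from collections import Counter

import boto3
import html_text
from gql import Client, gql
from gql.transport.requests import RequestsHTTPTransport
from konoha import WordTokenizer

gql_client = Client(
    transport=RequestsHTTPTransport(url="https://example.com/graphql"))
note_from_id = gql("query ($id: ID!) { note(id: $id) { contentHtml } }")
private_bucket = boto3.resource("s3").Bucket("some-private-bucket")


def get_tf_tsv_key(id_):
    # Hypothetical key layout for the per-note term-frequency TSV.
    return "tf/{}.tsv".format(id_)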
Example 2
async def batch_tokenize(params: TokenizeParameter, request: Request):
    if params.texts is None:
        raise HTTPException(status_code=400, detail="texts is required.")

    cache_key = generate_cache_key(params)
    if cache_key in request.app.state.cache:
        logging.info(f"Hit cache: {cache_key}")
        tokenizer = request.app.state.cache[cache_key]
    else:
        logging.info(f"Create tokenizer: {cache_key}")
        try:
            tokenizer = WordTokenizer(
                tokenizer=params.tokenizer,
                user_dictionary_path=params.user_dictionary_path,
                system_dictionary_path=params.system_dictionary_path,
                model_path=params.model_path,
                mode=params.mode,
                dictionary_format=params.dictionary_format,
            )
            request.app.state.cache[cache_key] = tokenizer
        except Exception:
            raise HTTPException(status_code=400,
                                detail="fail to initialize tokenizer")

    tokens_list = [[token.dict() for token in tokenizer.tokenize(text)]
                   for text in params.texts]
    return {"tokens_list": tokens_list}
Example 3
def tokenize(params: TokenizeParameter, request: Request):
    if params.texts is not None:
        message = (
            "A parameter `texts` is now unacceptable for /api/v1/tokenize."
            " Please use /api/v1/batch_tokenize instead.")
        raise HTTPException(status_code=400, detail=message)

    if params.text is None:
        raise HTTPException(status_code=400,
                            detail="text or texts is required.")

    cache_key = generate_cache_key(params)
    if cache_key in request.app.state.cache:
        logging.info(f"Hit cache: {cache_key}")
        tokenizer = request.app.state.cache[cache_key]
    else:
        logging.info(f"Create tokenizer: {cache_key}")
        try:
            tokenizer = WordTokenizer(
                tokenizer=params.tokenizer,
                user_dictionary_path=params.user_dictionary_path,
                system_dictionary_path=params.system_dictionary_path,
                model_path=params.model_path,
                mode=params.mode,
                dictionary_format=params.dictionary_format,
            )
            request.app.state.cache[cache_key] = tokenizer
        except Exception:
            raise HTTPException(status_code=400,
                                detail="fail to initialize tokenizer")

    tokens = [token.dict() for token in tokenizer.tokenize(params.text)]
    return {"tokens": tokens}
Example 4
def test_tokenize(raw_texts: List[str], tokenizer_params: Dict):
    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [
        Token.from_dict(token_param)
        for token_param in read_lines(tokenizer_name)[0]
    ]
    result = tokenizer.tokenize(raw_texts[0])
    assert expect == result
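The fixtures raw_texts and tokenizer_params, and the helper read_lines, come from elsewhere in the test suite. A hypothetical conftest-style sketch of what they might provide; the sample text, parameter sets, and file path are invented for illustration:

import json
from typing import Dict, List

import pytest


@pytest.fixture
def raw_texts() -> List[str]:
    return ["吾輩は猫である"]


@pytest.fixture(params=[{"tokenizer": "mecab"}, {"tokenizer": "janome"}])
def tokenizer_params(request) -> Dict:
    return request.param


def read_lines(tokenizer_name: str):
    # One JSON array of token dicts per line, e.g. data/mecab.json (assumed path).
    with open("data/{}.json".format(tokenizer_name)) as f:
        return [json.loads(line) for line in f]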
Example 5
def test_tokenize_with_character(raw_texts: List[str], tokenizer_params: Dict):
    if tokenizer_params["tokenizer"] == "kytea" and sys.version_info < (3, 7):
        pytest.skip("KyTea doesn't work in Python3.6")

    tokenizer_name = tokenizer_params["tokenizer"]
    tokenizer = WordTokenizer(**tokenizer_params)
    expect = [
        Token.from_dict(token_param)
        for token_param in read_lines(tokenizer_name)[0]
    ]
    result = tokenizer.tokenize(raw_texts[0])
    assert expect == result
Example 6
import neologdn
import numpy as np
import pandas as pd
from konoha import WordTokenizer
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split


def preprocess_data(df):
    # normalize text and encode labels
    df['text'] = df['text'].apply(neologdn.normalize)
    le = preprocessing.LabelEncoder()
    df['label'] = le.fit_transform(df['label'])

    # split into train/test sets, stratified on the label
    df_train, df_test, y_train, y_test = train_test_split(df,
                                                          df['label'].values,
                                                          test_size=0.2,
                                                          random_state=42,
                                                          stratify=df['label'])

    # tokenize
    tokenizer = WordTokenizer('MeCab')
    docs_train = np.array([
        ' '.join(map(str, tokenizer.tokenize(text)))
        for text in df_train['text']
    ])
    docs_test = np.array([
        ' '.join(map(str, tokenizer.tokenize(text)))
        for text in df_test['text']
    ])

    # tfidf: Don't use df_test for fitting
    count_vec = CountVectorizer(min_df=2,
                                max_features=20000,
                                ngram_range=(1, 3))
    bags_train = count_vec.fit_transform(docs_train)
    bags_test = count_vec.transform(docs_test)

    tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
    tf_idf_train = tfidf.fit_transform(bags_train)
    tf_idf_test = tfidf.transform(bags_test)

    X_train = pd.DataFrame(tf_idf_train.toarray())
    X_test = pd.DataFrame(tf_idf_test.toarray())

    return (X_train.reset_index(drop=True), X_test.reset_index(drop=True),
            y_train, y_test)
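A possible end-to-end use of preprocess_data, assuming load_pandas_df (used in the later examples) returns a DataFrame with text and label columns; the classifier choice here is only illustrative:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from utils_nlp.dataset.livedoor import load_pandas_df

df = load_pandas_df(nrows=1000)
X_train, X_test, y_train, y_test = preprocess_data(df)

# Fit a simple baseline classifier on the TF-IDF features.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print(accuracy_score(y_test, clf.predict(X_test)))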
Example 7
def tokenize(params: TokenizeParameter, request: Request):
    if params.text is not None:
        texts = [params.text]
    elif params.texts is not None:
        texts = params.texts
    else:
        raise HTTPException(status_code=400,
                            detail="text or texts is required.")

    mode = params.mode.lower()
    model_path = ("data/model.spm"
                  if params.tokenizer.lower() == "sentencepiece" else None)

    signature = f"{params.tokenizer}.{model_path}.{mode}"
    if signature in request.app.tokenizers:
        logging.info(f"Hit cache: {signature}")
        tokenizer = request.app.tokenizers[signature]
    else:
        logging.info(f"Create tokenizer: {signature}")
        try:
            tokenizer = WordTokenizer(
                tokenizer=params.tokenizer,
                with_postag=True,
                model_path=model_path,
                mode=mode,
            )
            request.app.tokenizers[signature] = tokenizer
        except Exception:
            raise HTTPException(status_code=400,
                                detail="fail to initialize tokenizer")

    results = [[{
        "surface": t.surface,
        "part_of_speech": t.postag
    } for t in tokenizer.tokenize(text)] for text in texts]

    return {"tokens": results}
Example 8
                                 " has been trained with n_features=%d" % (
                                     n_features, expected_n_features))
            X = X * self._idf_diag

        return X


if __name__ == '__main__':
    df = load_pandas_df(nrows=10)

    # Normalization
    df['text'] = df['text'].apply(neologdn.normalize)

    tokenizer = WordTokenizer('MeCab')
    docs = np.array([
        ' '.join(map(str, tokenizer.tokenize(text))) for text in df['text']
    ])
    print(docs.shape)
    # (10,)

    count_vec = CountVectorizer(min_df=2,
                                max_features=20000,
                                ngram_range=(1, 3))
    bags = count_vec.fit_transform(docs)

    print(bags.toarray().shape)
    print(bags.toarray())
    """
    (10, 445)
    [[1 0 1 ... 0 0 0]
    [1 0 0 ... 0 0 0]
Example 9
    try:
        _tokenizer = WordTokenizer("Sudachi", mode="A", with_postag=True)
        word_tokenizers.append(_tokenizer)

    except (ImportError, KeyError):
        print("Skip: ", "Sudachi")

    print("Finish creating word tokenizers")
    print()

    # ref: https://ja.wikipedia.org/wiki/東京特許許可局
    document = "東京特許許可局(とうきょうとっきょきょかきょく) 日本語の早口言葉。"  # NOQA
    document += "なお実際に特許に関する行政を行うのは特許庁であり、過去にこのような役所が存在したことは一度も無い。"  # NOQA
    print("Given document: {}".format(document))

    sentences = sentence_tokenizer.tokenize(document)
    for sentence_id, sentence in enumerate(sentences):
        print("#{}: {}".format(sentence_id, sentence))

        for word_tokenizer in word_tokenizers:
            print("Tokenizer: {}".format(word_tokenizer.name))
            result = word_tokenizer.tokenize(sentence)
            result = [str(r) for r in result]
            print(" ".join(result))

        print()

    word_tokenizer = WordTokenizer("whitespace")
    sentence = "私 は 猫 だ ニャン"
    print(word_tokenizer.tokenize(sentence))
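This fragment starts mid-script: sentence_tokenizer and word_tokenizers are built in the elided lines above it. A guess at that setup, mirroring the Sudachi block that survives in the excerpt; the exact tokenizer list is an assumption:

from konoha import SentenceTokenizer, WordTokenizer

sentence_tokenizer = SentenceTokenizer()
word_tokenizers = []

# Try a few word tokenizers and skip any whose backend is not installed.
for name in ["MeCab", "KyTea", "Janome"]:
    try:
        word_tokenizers.append(WordTokenizer(name, with_postag=True))
    except (ImportError, KeyError):
        print("Skip: ", name)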
Example 10
from konoha import WordTokenizer

from utils_nlp.dataset.livedoor import load_pandas_df

if __name__ == '__main__':
    df = load_pandas_df(nrows=10)
    text = df['text'][0][:30]
    print(text)
    # 友人代表のスピーチ、独女はどうこなしている?もうすぐジューン

    tokenizer_m = WordTokenizer('MeCab')
    print(tokenizer_m.tokenize(text))
    # [友人, 代表, の, スピーチ, 、, 独, 女, は, どう, こなし, て, いる, ?, もうすぐ, ジューン]

    tokenizer_s = WordTokenizer('Sudachi', mode='A', with_postag=True)
    print(tokenizer_s.tokenize(text))
    # [友人 (名詞), 代表 (名詞), の (助詞), スピーチ (名詞), 、 (補助記号), 独女 (名詞), は (助詞), どう (副詞), こなし (動詞), て (助詞), いる (動詞), ? (補助記号), もう (副詞), すぐ (副詞), ジューン (名詞)]

    df['sep_text'] = [tokenizer_m.tokenize(text) for text in df['text']]
    print(df.head())
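Note that tokenize returns konoha Token objects, so the sep_text column above holds lists of Tokens. If plain strings are preferred in the DataFrame, the surfaces can be extracted explicitly:

# Optional: store token surfaces as plain strings instead of Token objects.
df['sep_text'] = [[token.surface for token in tokenizer_m.tokenize(text)]
                  for text in df['text']]
print(df.head())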
Example 11
from konoha import WordTokenizer
import neologdn
import numpy as np

from utils_nlp.dataset.livedoor import load_pandas_df
from utils_nlp.features import scdv
from utils_nlp.models.pretrained_embeddings.word2vec import load_pretrained_vectors

if __name__ == '__main__':
    df = load_pandas_df(nrows=10)

    # Normalization
    df['text'] = df['text'].apply(neologdn.normalize)

    tokenizer = WordTokenizer('MeCab')
    # Materialize the tokens as strings so each document can be iterated more
    # than once downstream (a lazy map would be exhausted after a single pass).
    docs = np.array([[str(token) for token in tokenizer.tokenize(text)]
                     for text in df['text']],
                    dtype=object)
    print(docs.shape)
    # (10,)

    word_vec = load_pretrained_vectors('data')
    scdv = scdv.create(docs, word_vec, n_components=10)
    print(scdv.shape)
    # (10, 3000)