Python FastText._FastText 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: fasttext

클래스/타입: FastText._FastText

hotexamples.com에서의 예제들: 6

Python FastText._FastText - 예제 6개가 발견되었습니다. 아래는 오픈소스 프로젝트에서 추출한 Python fasttext.FastText._FastText의 실제 사용 예제 가운데 평가가 가장 높은 것들입니다. 예제를 평가하면 예제 품질 향상에 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

get_dimension(2)

get_input_matrix(1)

get_sentence_vector(1)

get_word_vector(1)

predict(1)

test(1)

예제 #1

파일 보기

 def build_ft_embeddings(self, pretrained: fasttext.FastText._FastText):
     """Fill ``self.embeddings`` with one pretrained vector per vocabulary index.

     Every index in ``range(len(self))`` is mapped to its word through
     ``self.index2word`` and looked up with ``pretrained.get_word_vector``.
     The vectors are stacked row-wise, so row ``i`` belongs to index ``i``.

     :param pretrained: model providing ``get_word_vector`` and
         ``get_dimension``.
     """
     # Progress is reported through tqdm while the rows are collected.
     rows = [
         pretrained.get_word_vector(self.index2word(position))
         for position in tqdm(range(len(self)))
     ]
     self.embeddings = np.vstack(rows)
     # Sanity check: one row per index, one column per embedding dimension.
     expected_shape = (len(self), pretrained.get_dimension())
     assert self.embeddings.shape == expected_shape

예제 #2

파일 보기

파일: expand_vector.py 프로젝트: homemade-hedgehog/fake_Sparse_Composite_Document_Vectors

def expand_token_vector(token: str, model_lda_gensim: LdaMulticore,
                        model_fast_text: fasttext.FastText._FastText,
                        dictionary_lda: Dictionary) -> np.ndarray:
    """
    Expand a token's word embedding with its LDA topic vector.

    For the theory behind SCDV and a walkthrough, see:
    [original](https://dheeraj7596.github.io/SDV/)
    [experiment write-up](https://qiita.com/fufufukakaka/items/a7316273908a7c400868)

    Background (translated from the original author's notes):
    Distributed word representations made it possible to obtain vectors that
    capture the meaning of words. That is only the word level, however;
    obtaining vectors that capture the meaning of whole documents was still
    an unsettled problem as of 2019.
    In 2018 BERT, described by the author as a sentence vectorization method
    built on two bidirectional LSTMs, became hugely popular, but LSTMs tend
    to forget the beginning of long sequences.
    NOTE(review): BERT is Transformer-based; the description above looks
    closer to ELMo - confirm before relying on it.
    In addition, when no labelled document collection is available it is
    unclear what the training objective should be.
    SCDV was published around 2017 and was overshadowed by BERT before it
    gained much traction. It can produce sentence vectors without labelled
    documents.
    SCDV expands word embeddings by weighting them with their cluster
    membership in the embedding space, but it does so without taking the
    topical features of documents into account, which is unsatisfying.
    Here the same idea is carried out with LDA, which can account for
    document topics (this is the author's own reasoning, so treat it with
    some suspicion). The same line of work continues in Gaussian LDA and
    similar models, which may be worth using instead.

    :param token: token to expand
    :param model_lda_gensim: LdaMulticore, the LDA model
    :param model_fast_text: fasttext word embedding model
    :param dictionary_lda: Dictionary, the token -> token_id mapping used
        when the LDA model was built
    :return: 1-D array holding the flattened outer product of the token's
        LDA vector and its fastText vector (LDA index varies slowest)
    """
    vector_fast = model_fast_text[token]
    vector_lda = lda_token2vector(token=token,
                                  model_lda_gensim=model_lda_gensim,
                                  dictionary_lda=dictionary_lda)
    # Outer product: every LDA component scales a full copy of the word
    # embedding; flattening concatenates those scaled copies.
    return np.outer(vector_lda, vector_fast).flatten()

예제 #3

파일 보기

 def __init__(
     self,
     fasttextmodel: fasttext.FastText._FastText,
     special_tokens: Optional[Iterable[str]] = None,
 ):
     """Build an embedding table from a fastText model's input matrix.

     Two rows are appended after the fastText rows: an all-zero padding row
     at index ``vocab_size`` and a shared row for special tokens at index
     ``vocab_size + 1``.

     :param fasttextmodel: model whose ``get_input_matrix()`` provides the
         initial weights.
     :param special_tokens: tokens that share the extra special embedding;
         ``None`` means no special tokens.
     """
     super().__init__()
     self.fasttextmodel = fasttextmodel
     pretrained = torch.from_numpy(fasttextmodel.get_input_matrix())
     # `vocab_size` is the number of rows of the fastText input matrix. The embedding table
     # built below has two more rows than that: one for padding (all zeros) and one for the
     # special (root) tokens.
     n_rows, n_columns = pretrained.shape
     self.vocab_size: Final[int] = n_rows
     self.embedding_size: Final[int] = n_columns
     # The special-token row takes, for each dimension, the value found at that dimension in
     # a randomly drawn fastText row. The original author flagged this choice as not fully
     # thought through.
     sampled_rows = torch.randint(high=self.vocab_size, size=(self.embedding_size, ))
     dimension_ids = torch.arange(self.embedding_size)
     root_embedding = pretrained[sampled_rows, dimension_ids].unsqueeze(0)
     padding_embedding = torch.zeros((1, self.embedding_size))
     table = torch.cat(
         (pretrained, padding_embedding, root_embedding),
         dim=0,
     ).to(torch.float)
     # NOTE(review): `from_pretrained` is called without `freeze=`; its documented default
     # is `freeze=True`, which presumably overrides this flag on the resulting parameter.
     # Confirm whether the embeddings were meant to be trainable.
     table.requires_grad = True
     self.embeddings = nn.Embedding.from_pretrained(table, padding_idx=self.vocab_size)
     if special_tokens is None:
         self.special_tokens: Set = set()
     else:
         self.special_tokens: Set = set(special_tokens)
     self.special_tokens_idx: Final[int] = self.vocab_size + 1
     self.pad_idx: Final[int] = self.embeddings.padding_idx

예제 #4

파일 보기

파일: app.py 프로젝트: lin-justin/humor-app

def predict_humor(model: fasttext.FastText._FastText, sentence: str) -> str:
    """Classify ``sentence`` and return a display string with the confidence.

    The first label returned by ``model.predict`` has its ``__label__``
    prefix removed and is title-cased; for ``not_humorous`` the underscore
    is first replaced by a space. The first probability is shown as a
    percentage with two decimals.
    """
    result = model.predict(sentence)
    raw_label = result[0][0]
    name = raw_label.split("__label__")[1]
    # Only this one label gets its underscore turned into a space.
    display = (name.replace("_", " ") if name == "not_humorous" else name).title()
    probability = result[1][0]
    template = "{} ({:.2f}% confident)"
    return template.format(display, probability * 100)

예제 #5

파일 보기

파일: model.py 프로젝트: vibhusingh/machine-learning

def score(model: fasttext.FastText._FastText,
          fasttext_file_path: str,
          k: int = 1,
          round_digits: int = 3) -> Tuple[int, float, float, float]:
    """
    Computes the model evaluation score including precision/recall/f1 at k
    for the input file.

    Parameters
    ----------
    model : _FastText
        Trained fasttext model.

    fasttext_file_path : str
        Path to the text file in the fasttext format.

    k : int, default 1
        Ranking metrics precision/recall/f1 are evaluated for top k prediction.

    round_digits : int, default 3
        Round decimal points for the metrics returned.

    Returns
    -------
    num_records : int
        Number of records in the file.

    precision_at_k : float

    recall_at_k : float

    f1_at_k : float
        Harmonic mean of precision and recall, computed from the unrounded
        values. It is 0.0 when precision and recall are both 0.
    """

    num_records, precision_at_k, recall_at_k = model.test(
        fasttext_file_path, k)

    # When precision and recall are both 0 the harmonic mean would divide by
    # zero; report an F1 of 0.0 instead of raising ZeroDivisionError.
    denominator = precision_at_k + recall_at_k
    if denominator == 0:
        f1_at_k = 0.0
    else:
        f1_at_k = 2 * (precision_at_k * recall_at_k) / denominator

    precision_at_k = round(precision_at_k, round_digits)
    recall_at_k = round(recall_at_k, round_digits)
    f1_at_k = round(f1_at_k, round_digits)
    return num_records, precision_at_k, recall_at_k, f1_at_k

예제 #6

파일 보기

def _get_sentence_vectors(model: fasttext.FastText._FastText, doc_path: str):
    """Return the mean of the sentence vectors of every line in a document.

    The file at ``doc_path`` is read in full and split on ``'\\n'``; each
    piece is passed to ``model.get_sentence_vector`` and the results are
    averaged along axis 0.

    ``mean`` is resolved from the enclosing module - presumably
    ``numpy.mean``, given the ``axis`` keyword; confirm against the file's
    imports.

    :param model: model providing ``get_sentence_vector``.
    :param doc_path: path of the text file to read.
    :return: the averaged vector.
    """
    # The context manager closes the handle even if reading raises; the
    # previous bare open() left it to the garbage collector.
    with open(doc_path, 'r') as doc_file:
        text = doc_file.read()
    vector = mean(
        [model.get_sentence_vector(sentence) for sentence in text.split('\n')],
        axis=0)
    return vector