Example #1
def generate_fasttext(corpus, text_filepath, emb_path, cbow=False, min_count=2,
                      minn=3, maxn=5, dim=200, epochs=5, lr=.1, neg=5, ws=5):
    # Make sure the output directories exist
    os.makedirs(emb_path, exist_ok=True)
    os.makedirs(text_filepath, exist_ok=True)

    # Accept either raw strings or pre-tokenized sentences
    if isinstance(corpus[0], list):
        corpus = [" ".join(i) for i in corpus]

    # fastText trains from a plain-text file, one sentence per line
    df = pd.DataFrame()
    df['text'] = corpus
    df.to_csv(os.path.join(text_filepath, 'file.txt'), header=False, index=False)

    # train_unsupervised takes the input file plus the model type ('cbow' or 'skipgram');
    # the trained model is written out explicitly with save_model()
    model = fasttext.train_unsupervised(os.path.join(text_filepath, 'file.txt'),
                                        model='cbow' if cbow else 'skipgram',
                                        minCount=min_count, minn=minn, maxn=maxn,
                                        dim=dim, epoch=epochs, lr=lr, ws=ws, neg=neg)
    model.save_model(os.path.join(emb_path, 'ft.bin'))
    return model
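A possible way to call this helper and query the saved vectors afterwards; the corpus and the ./texts and ./embeddings paths below are illustrative only (a real corpus should be much larger):

import fasttext

corpus = ["first training sentence", "another training sentence"] * 50
model = generate_fasttext(corpus, "./texts", "./embeddings", cbow=False, dim=100, epochs=3, min_count=1)

# The saved model can also be reloaded later and queried for vectors
model = fasttext.load_model("./embeddings/ft.bin")
vector = model.get_word_vector("sentence")            # 100-dimensional vector
print(model.get_nearest_neighbors("sentence", k=5))   # list of (score, word) pairs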
Example #2
def learn_embeddings(mode, sentences, dimensions, window_size, workers, iter,
                     ind):
    """
    Jointly Learn word-level and fact-level embeddings by optimizing the Language Model.

    :param ind: the index for each fact
    :param mode: the chosen language model
    :param sentences: the sequence sampled by node2vec
    :param dimensions: the number of dimensions
    :param window_size: the size of window in language model
    :param workers: the number of parallel threads
    :param iter: the number of epochs in SGD.
    :return: the word-level (model_W) and the fact-level (model_S) model
    """
    np.savetxt(sen_file_path, np.array(sentences), fmt="%s", newline="\n")

    if mode == "skipgram":
        print("                    +++Learning Word-level Embeddings++++")
        wm = ft.train_unsupervised(sen_file_path, model=mode, dim=dimensions)
        wm.save_model(word_model_path + "_" + mode + ".bin")

        print("                    +++Learning Fact-level Embeddings++++")
        sent = list(list(map(str, s)) for s in sentences)
        fm = Word2Vec(sent,
                      size=dimensions,
                      window=window_size,
                      min_count=0,
                      sg=1,
                      workers=workers,
                      iter=iter)
        fm.wv.save_word2vec_format(fact_embedding_path, binary=False)
        fm.save(fact_model_path + "_" + mode + ".bin")

        # Turn fact into corresponding nodes
        semantic_to_fact(ind, dimensions)

        return wm

    if mode == "cbow":
        print("                    +++Learning Word-level Embeddings++++")
        wm = ft.train_unsupervised(sen_file_path, model=mode, dim=dimensions)
        wm.save_model(word_model_path + "_" + mode + ".bin")

        print("                    +++Learning Fact-level Embeddings++++")
        sent = list(list(map(str, s)) for s in sentences)
        fm = Word2Vec(sent,
                      size=dimensions,
                      window=window_size,
                      min_count=0,
                      sg=0,
                      workers=workers,
                      iter=iter)
        fm.wv.save_word2vec_format(fact_embedding_path, binary=False)
        fm.save(fact_model_path + "_" + mode + ".bin")

        # Turn fact into corresponding nodes
        semantic_to_fact(ind, dimensions)

        return wm
Example #3
def train_fasttext_model():
    model = ft.train_unsupervised('./twitter_corpora/corpora.txt',
                                  model='skipgram',
                                  dim=45)
    model.save_model('./fasttext/sk_fasttext.bin')

    model = ft.train_unsupervised('./twitter_corpora/corpora.txt',
                                  model='cbow',
                                  dim=45)
    model.save_model('./fasttext/cbow_fasttext.bin')
Example #4
 def finetune_model(self, model_type, overwrite=False):
     """
     Method that trains an unsupervised fasttext model on our dataset for the given
     video metadata type and stores it so that it can be used during the training of
     the Pseudoscience Classifier for extracting the embeddings from the input features
     :param model_type: 'video_snippet', 'video_tags', 'video_transcript', or 'video_comments'
     :param overwrite: whether to retrain and overwrite existing saved fastText model (if exists)
     :return:
     """
     # Create fastText input data filename
     fasttext_model_filename = '{0}/fasttext_model_{1}.bin'.format(
         self.FEATURE_ENGINEERING_MODELS_DIR, model_type)
     if not os.path.isfile(fasttext_model_filename) or overwrite:
         # Train unsupervised fastText model
         model = fasttext.train_unsupervised(
             input='{0}/{1}_train_data.txt'.format(self.DATA_DIR,
                                                   model_type),
             pretrainedVectors='wiki-news-300d-1M.vec',
             dim=300,
             minn=2,
             maxn=5,
             thread=multiprocessing.cpu_count() - 1,  # run in multiple cores
             verbose=2)
         # Save trained model
         model.save_model(fasttext_model_filename)
     return
Example #5
def embeddings_from_docs(
    in_path,
    out_path,
    fasttext_path=None,
    word_vec_dim=300,
    min_count=5,
    n_epoch=20,
    minn=3,
    maxn=5,
    lr=0.05,
):
    # Read in docs
    with open(in_path, "rb") as f:
        docs = pickle.load(f)

    # Write docs to temporary *.txt file for fasttext to train on
    with open("tmp.txt", "w", encoding="utf-8") as f:
        for doc in docs:
            # End each document with a newline so consecutive documents
            # do not run together on a single line
            f.write("\n".join(
                [" ".join([word for word in sen]) for sen in doc.sentences]) + "\n")

    # Train word embeddings
    model = fasttext.train_unsupervised(
        "tmp.txt",
        dim=word_vec_dim,
        minCount=min_count,
        epoch=n_epoch,
        minn=minn,
        maxn=maxn,
        lr=lr,
    )

    model.save_model(out_path)
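A possible invocation, assuming the pickle at in_path holds document objects whose .sentences attribute is a list of token lists (file names here are illustrative):

# Hypothetical files: docs.pkl is the pickled corpus, word_vectors.bin the output model
embeddings_from_docs("docs.pkl", "word_vectors.bin", word_vec_dim=100, n_epoch=5)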
Example #6
    def train(self):
        """
        update the language model
        """
        self.model = None  # remove the old model (for saving memory)

        current_time = datetime.datetime.now()
        file_name = "fasttext_{hash_code}_{year}_{month}_{day}".format(hash_code=abs(hash(current_time)),
                                                                       year=current_time.year,
                                                                       month=current_time.month,
                                                                       day=current_time.day)
        tmp_path = os.path.join(self.tmp_dir, file_name)

        # make corpus
        logger.info("Starting to build corpus for training, tmp file: {}".format(tmp_path))
        with open(tmp_path, "w", encoding="utf-8") as f:
            for doc in self.db[self.collection].find({self.abstract_entry: {"$exists": True, "$ne": None}}):
                tokens = PreTokenize.tokenize(doc.get(self.abstract_entry, ""), True)
                if tokens:
                    f.write(" ".join(tokens)+"\n")

        logger.info("Training the model -- Arguments: {}".format(self.training_args))
        model = fasttext.train_unsupervised(input=tmp_path, **self.training_args)
        model.save_model(self.model_path)
        self.model = model  # load new model

        # delete the tmp file
        os.remove(tmp_path)
        logger.info("Successfully save the new model and remove tmp file")
        self.db.metadata.update_one(
            {"data": "last_word_embedding_trained"}, {"$set": {"datetime": datetime.datetime.now()}}
        )
Example #7
def train_word_vectors(input: WordVectorTrainingInput) -> WordVectorTrainingOutput:
    """Trains word vectors via [FastText](https://fasttext.cc) based on a provided text."""

    with NamedTemporaryFile(suffix=".txt", mode="w", encoding="utf-8") as f:
        f.write(input.text)
        f.seek(0)

        model = fasttext.train_unsupervised(
            f.name,
            model=input.model.value,
            lr=input.learning_rate,
            dim=input.dimension,
            epoch=input.epoch,
            minCount=input.min_count,
            loss=input.loss_function,
            thread=1,  # only train with one thread to not block other demos
        )

        with NamedTemporaryFile(suffix=".vec", mode="w+b") as vec_file:
            words = model.get_words()
            for word in words:
                vec_file.write(
                    str.encode(
                        word
                        + "".join(" " + str(vi) for vi in model.get_word_vector(word))
                        + "\n"
                    )
                )
            vec_file.seek(0)
            return WordVectorTrainingOutput(vector_file=vec_file.read())
Example #8
def create_model(texts):
    temp_file = "temp.txt"
    with open(temp_file, "w") as f:
        f.write(texts.str.cat(sep='\n'))
    model = fasttext.train_unsupervised(temp_file, minn=2, maxn=5, dim=100)
    os.remove(temp_file)
    return model
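A possible way to call create_model with a pandas Series (the texts are illustrative; note that fastText's default minCount of 5 requires every kept word to occur at least five times):

import pandas as pd

texts = pd.Series(["an example sentence"] * 100)
model = create_model(texts)
print(model.get_word_vector("example")[:5])   # first few components of the 100-dim vector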
Example #9
def create_fasttext_embedding_matrix(
        file_path: str, vocab: typing.Dict[str, int],
        embedding_dim: int) -> np.ndarray:
    """Train a fasttext model and return the embeddings."""

    model_path = os.path.join(SHARED_PATH, 'embedding_models',
                              f'fasttext_model_dim_{embedding_dim}.bin')

    if os.path.exists(model_path):
        logger.info('Loading fasttext embeddings...')
        model = fasttext.load_model(model_path)
    else:
        logger.info('Training fasttext embeddings...')
        model = fasttext.train_unsupervised(file_path,
                                            model='skipgram',
                                            dim=embedding_dim)
        model.save_model(model_path)

    embedding_matrix = np.zeros((len(vocab), model.get_dimension()))
    for word in vocab.keys():
        idx = vocab[word]
        if word in model.words:
            embedding_matrix[idx] = model[word]
        else:
            pass  # If word embedding is unknown, vector of zeros

    return embedding_matrix
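A possible call, assuming SHARED_PATH and logger are configured elsewhere in the module and corpus.txt already exists (names here are illustrative):

vocab = {"<pad>": 0, "hello": 1, "world": 2}
embedding_matrix = create_fasttext_embedding_matrix("corpus.txt", vocab, embedding_dim=100)
print(embedding_matrix.shape)   # (len(vocab), 100)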
Example #10
def train_model():
    model = fasttext.train_unsupervised(input="wiki_cut_word.txt",
                                        model="skipgram",
                                        ws=6,
                                        minn=2,
                                        thread=12)
    model.save_model("fasttext.wiki.model.bin")
Example #11
def train_model(input_filename):
    model = fasttext.train_unsupervised(input_filename,
                                        model='skipgram',
                                        maxn=0,
                                        dim=100,
                                        ws=5)
    return model
Example #12
def train_facebook_fasttext_embedding(data,
                                      emb_nm,
                                      minn=3,
                                      maxn=6,
                                      dim=100,
                                      epoch=5,
                                      lr=0.05,
                                      thread=4,
                                      max_vocab_size=200000):
    # unsupervised training with custom parameters
    emb = fasttext.train_unsupervised(data,
                                      minn=minn,
                                      maxn=maxn,
                                      dim=dim,
                                      epoch=epoch,
                                      lr=lr,
                                      thread=thread)

    # we only select the vocab_size most frequent terms
    # TODO this should probably be emb.words = [:max_vocab_size]. Use Gensim to change format and reduce size
    # TODO ref: https://medium.com/@vasnetsov93/shrinking-fasttext-embeddings-so-that-it-fits-google-colab-cd59ab75959e
    # del emb.words[max_vocab_size:]

    # saving trained model
    emb.save_model(emb_nm)
Example #13
def create_fasttext_model(labels):
    """Runs Fastettext unsupervised to create a good model based on training set"""
    create_text_file(labels)
    model = fasttext.train_unsupervised('data_raw.txt',
                                        model='skipgram',
                                        dim=15)
    model.save_model("model_text_raw.bin")
Example #14
def get_model(model_path: str, train_data_path: str):
    try:
        model = fasttext.load_model(model_path)
    except ValueError:
        model = fasttext.train_unsupervised(train_data_path, model='skipgram')
        model.save_model(model_path)
    return model
Example #15
def fasttext_train_unsupervised(bpe_file,
                                nwords,
                                outf,
                                dim=300,
                                minCount=1,
                                minn=10,
                                maxn=10):
    words = set()
    with open(bpe_file, 'r') as f:
        for line in f:
            for w in line.split():
                if w not in words:
                    words.add(w)
    print('training word vecs by fasttext')
    model = fasttext.train_unsupervised(bpe_file,
                                        dim=dim,
                                        minCount=minCount,
                                        minn=minn,
                                        maxn=maxn)
    print('OK! training finished')

    words_vec = np.zeros((nwords, dim))
    for i in range(nwords):
        if str(i) in words:
            words_vec[i, :] = model.get_word_vector(str(i))

    np.savetxt(outf, words_vec, delimiter=',')
    print('OK! model saved to %s ' % (outf))
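A possible invocation, assuming the BPE file contains whitespace-separated integer token ids, one sequence per line (file name and vocabulary size are illustrative):

fasttext_train_unsupervised("corpus.bpe.txt", nwords=32000, outf="bpe_vectors.csv", dim=300)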
Example #16
 def fit(self, config):
     if self.pretrained:
         path = hydra.utils.to_absolute_path(config.word.embedding)
         self.model = fasttext.load_model(path)
     else:
         path = hydra.utils.to_absolute_path(config.data.train_path)
         self.model = fasttext.train_unsupervised(path, dim=self.dimensions)
Example #17
File: data_cli.py Project: j5bd/q
def train_fasttext(hf_dataset, output_dir):
    """

    Run with: $ ./data_cli.py train_fasttext paperswithcode_aspects ./output

    :return:
    """

    tokens_fp = os.path.join(output_dir, 'tokens.txt')
    fasttext_bin_fp = os.path.join(output_dir, 'fasttext.bin')
    fasttext_w2v_fp = os.path.join(output_dir, 'fasttext.w2v.txt')

    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')

    logger.info(f'Documents loaded: {len(docs_ds):,}')

    # Tokenized text
    doc_delimiter = '\n'
    token_delimiter = ' '
    tokens_count = 0

    with open(tokens_fp, 'w') as f:
        for doc in docs_ds:
            # Extract plain text
            text = doc['title'] + ': ' + doc['abstract']

            for token in gensim.utils.simple_preprocess(text,
                                                        min_len=2,
                                                        max_len=15):
                f.write(token + token_delimiter)
                tokens_count += 1
            f.write(doc_delimiter)

    logger.info(f'Total tokens: {tokens_count:,}')

    # Train actual fasttext model
    logger.info('Training fasttext model...')

    model = fasttext.train_unsupervised(
        tokens_fp,
        model='skipgram',
        lr=0.05,  # learning rate [0.05]
        dim=300,  # size of word vectors [100]
        ws=5,  # size of the context window [5]
        epoch=5  # number of epochs [5]
        # thread            # number of threads [number of cpus]
    )
    model.save_model(fasttext_bin_fp)

    del model

    ft_model = FastText.load_fasttext_format(fasttext_bin_fp)
    ft_model.wv.save_word2vec_format(fasttext_w2v_fp)

    logger.info(f'Output saved to: {fasttext_w2v_fp}')

    logger.info('Done')
Example #18
def train_w2v(sentences, model='skipgram', dim=200, min_count=20, lr=0.015, ws=7, minn=3, maxn=6, epoch=20):
    """train word2vec ( via ``fasttext.unsupervised`` ).

    Args:
        sentences (list-like): list of raw sentences.
        model (str): model name (options are: 'skipgram' and 'cbow').
        dim (int): embedding size. default is 200.
        min_count (int): filter words with less than ``min_count`` occurrences.
        lr (float): learning rate.
        ws (int): window-size.
        minn (int): subword min length (default: 3-char).
        maxn (int): subword max length (default: 6-char).
        epoch (int): num of training epochs.

    Returns:
        ``fasttext.FastText._FastText``
    """
    with tempfile.NamedTemporaryFile(mode='w', prefix='corpus-', suffix='.txt') as f:
        for raw_sentence in sentences:
            f.write(raw_sentence)
            f.write('\n')

        return fasttext.train_unsupervised(input=f.name,
                                           model=model,
                                           dim=dim,
                                           minCount=min_count,
                                           lr=lr,
                                           epoch=epoch,
                                           ws=ws,
                                           minn=minn,
                                           maxn=maxn)
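A possible call (the sentences are illustrative; a real corpus should be much larger):

sentences = ["the first raw sentence", "another raw sentence"] * 50
model = train_w2v(sentences, model='cbow', dim=100, min_count=1, epoch=5)
vector = model.get_word_vector("sentence")   # 100-dimensional vector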
Example #19
	def format_data_BRAND(self,blog_file,data_file,data_vec_file):
		
		data = load_data(data_file)
		n=data['y_h'].shape[0]
		# print n
		# return 
		# only_txt = {i:data[i]['txt'] for i in data['data'].keys()}
		# self.dict_to_txt(only_txt, blog_file)		
		model = fasttext.train_unsupervised(blog_file, model='skipgram')
		data_vec={'y':np.zeros(n),'c':np.zeros(n)}
		x=[]
		for tid,y_h in zip(data['data'].keys(),data['y_h']):
			blog = data['data'][tid]['txt'].replace('\n',' ').decode('utf-8')
			print(blog)
			print('****************************************************')
			x.append(model.get_sentence_vector(blog).flatten() )
			# print y_h
			data_vec['y'][int(tid)] = np.mean(y_h)
			data_vec['c'][int(tid)] = np.mean((y_h-np.mean(y_h))**2)*0.01
		# return 
		plt.plot(data_vec['y'],label='y')
		plt.plot(data_vec['c'],label='c')
		plt.legend()
		plt.show()
		data_vec['x']=np.array(x)

		save(data_vec, data_vec_file)
Example #20
def get_parameter_value_with_results(i, param, param_values, params_wordembeddings, params_training, tune, X_test, y_test):
    print(str(i))
    model_name = "test_" + param + "_" + str(i)
    # bin_path = "word_vectors/fasttext/" + model_name + ".bin" 
    vec_path = "word_vectors/fasttext/" + model_name + ".vec" 
    if tune == "wordembeddings": ####### tuning parameter for fasttext WORD EMBEDDING
        params_wordembeddings[param] = param_values[i]
    embeddings = fasttext.train_unsupervised(input='data.txt', model='skipgram', **params_wordembeddings) 
    # embeddings.save_model(bin_path)
    # embeddings = load_model(bin_path)
    ### convert from fasttext embeddings (would be saved as .bin) to .vec,
    #   in order to use the embeddings .vec file as pretrainedVectors for fasttext text classification
    from_bin_to_vec(embeddings, vec_path)
    if tune == "training": ####### tuning parameter for fasttext TRAINING
        params_training[param] = param_values[i]
    # dimension of embeddings has to fit with dimension of look-up table (embeddings) in training model
    params_training["dim"] = embeddings.get_dimension()
    trained_model = fasttext.train_supervised(input=train_file, pretrainedVectors= vec_path, **params_training)
    ### find and apply optimal (threshold) cutoff point
    # get scores, i.e. list of probabilities for being labeled positive on set X_test
    y_scores = get_prediction_scores(trained_model,X_test)
    # find optimal probability threshold
    opt_threshold = find_optimal_cutoff(y_test, y_scores)
    # apply optimal threshold to the prediction probability and get label predictions
    y_pred = get_predictions(opt_threshold, y_scores) 
    ################## Evaluation
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, y_pred)
    auprc = metrics.average_precision_score(y_test, y_pred)
    return [accuracy, precision, recall, auc, auprc]
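The from_bin_to_vec helper is not shown here; a minimal sketch of what such a conversion could look like, assuming the standard .vec layout expected by pretrainedVectors (a "word_count dimension" header followed by one "word v1 v2 ..." line per word):

def from_bin_to_vec(embeddings, vec_path):
    """Write a trained fastText model's vectors in .vec (word2vec text) format."""
    words = embeddings.get_words()
    with open(vec_path, "w", encoding="utf-8") as f:
        # Header: number of words and vector dimension
        f.write("%d %d\n" % (len(words), embeddings.get_dimension()))
        for word in words:
            vector = embeddings.get_word_vector(word)
            f.write(word + " " + " ".join(str(v) for v in vector) + "\n")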
Example #21
def train_unsupervised(args):

    # https://fasttext.cc/docs/en/unsupervised-tutorial.html
    model = fasttext.train_unsupervised(args.input,
                                        lr=args.lr,
                                        minCount=args.min_count,
                                        epoch=args.epoch,
                                        minn=args.minn,
                                        maxn=args.maxn,
                                        dim=args.dim,
                                        ws=args.ws)

    if not os.path.isdir(args.output_dir):
        print(f'Creating output directory: {args.output_dir}')
        os.makedirs(args.output_dir)

    model_fname = os.path.join(args.output_dir, 'model.bin')
    print(f'Saving model to: {model_fname}')
    model.save_model(model_fname)

    vec_fname = os.path.join(args.output_dir, f'word-vectors-{args.dim}d.txt')
    print(f'Saving word vectors to: {vec_fname}')
    bin_to_vec(model, vec_fname)

    count_fname = os.path.join(args.output_dir, f'word-counts.txt')
    print(f'Saving word count to: {count_fname}')
    bin_to_word_count(model, count_fname)
Example #22
def train_fasttext(data_dir='./data',
                   dim=300,
                   epoch=5,
                   ft_model='skipgram',
                   ft_lr=0.05,
                   ft_window=5):

    data_dir = Path(data_dir)

    import fasttext

    model = fasttext.train_unsupervised(
        str(data_dir / 'ocb_and_wikisource.w2v_tokens.txt'),
        model=ft_model,
        lr=ft_lr,  # learning rate [0.05]
        dim=dim,  # size of word vectors [100]
        ws=ft_window,  # size of the context window [5]
        epoch=epoch  # number of epochs [5]
        # thread            # number of threads [number of cpus]
    )
    model.save_model(str(data_dir / 'ocb_and_wikisource.fasttext.bin'))

    from gensim.models.wrappers import FastText

    ft_model = FastText.load_fasttext_format(
        str(data_dir / 'ocb_and_wikisource.fasttext.bin'))

    ft_model.wv.save_word2vec_format(data_dir /
                                     'ocb_and_wikisource.fasttext.w2v.txt')

    logger.info('done')
Example #23
    def w2v_train(self, documents_input, w2v_model_output):  # pre-train word vectors and save the model
        print(
            time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) +
            ' : create word-segment without label txt')
        documents_cut = 'cache/msg_seg_without_label.txt'
        self.DP.file_cut_words(documents_input, documents_cut, mode='vec')

        print(
            time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) +
            ' : w2v train start')
        # Train a skipgram model and write the word vectors to w2v_model_output: lr is the learning rate, dim the vector dimension, min_count the minimum word frequency
        model = fasttext.train_unsupervised(documents_cut,
                                            model='skipgram',
                                            lr=0.05,
                                            dim=self.dim,
                                            loss=self.loss,
                                            wordNgrams=self.word_ngrams,
                                            minCount=self.min_count)
        model.save_model(w2v_model_output)

        # os.remove(documents_cut)
        print(
            time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) +
            ' : w2v train done')

        return model
Example #24
    def test_as_array_produces_token_array() -> None:

        with tempfile.TemporaryDirectory() as tempdir:
            dataset_filename = f"{tempdir}/dataset.txt"
            pretrained_filename = f"{tempdir}/fasttext.model"

            with open(dataset_filename, "w") as fp:
                fp.write("\n".join(
                    ["this is a first sentence", "this is a second sentence"]))

            model = fasttext.train_unsupervised(
                dataset_filename,
                model="skipgram",
                dim=10,
                minCount=1,
            )
            model.save_model(pretrained_filename)

            indexer = FastTextTokenIndexer(
                pretrained_filename=pretrained_filename)
            tokens = [
                Token(word) for word in "this is a test sentence".split()
            ]
            field = TextField(tokens, token_indexers={"tokens": indexer})

            vocab = Vocabulary()
            field.index(vocab)

            array_dict = indexer.tokens_to_indices(tokens, vocab)
            assert len(array_dict["tokens"]) == 5
            assert len(array_dict["tokens"][0]) == 10
Example #25
 def generate_embedding(self):
     classifier = fasttext.train_unsupervised(input=self.train_file,
                                              dim=self.vec_dim,
                                              epoch=self.epoch,
                                              minCount=10,
                                              thread=10)
     return self.get_res(classifier)
Example #26
 def train_fasttext(self, data, model_name, epoch):
     if self.is_train:
         model = fasttext.train_unsupervised(data,
                                             model='skipgram',
                                             minCount=1,
                                             epoch=epoch)
         model.save_model(model_name)
Example #27
    def build(data, size, mincount, path):
        """
        Builds fastText vectors from a file.

        Args:
            data: path to input data file
            size: number of vector dimensions
            mincount: minimum number of occurrences required to register a token
            path: path to output file
        """

        # Train on data file using largest dimension size
        model = fasttext.train_unsupervised(data, dim=size, minCount=mincount)

        # Output file path
        print("Building %d dimension model" % size)

        # Output vectors in vec/txt format
        with open(path + ".txt", "w") as output:
            words = model.get_words()
            output.write("%d %d\n" % (len(words), model.get_dimension()))

            for word in words:
                # Skip end of line token
                if word != "</s>":
                    vector = model.get_word_vector(word)
                    data = ""
                    for v in vector:
                        data += " " + str(v)

                    output.write(word + data + "\n")

        # Build magnitude vectors database
        print("Converting vectors to magnitude format")
        converter.convert(path + ".txt", path + ".magnitude", subword=True)
Example #28
def generate_fasttext_skipgram(data_file, train_iter, emb_size, output_file):
    model = fasttext.train_unsupervised(input=data_file, model='skipgram', dim=emb_size, minCount=5, verbose=2,
                                        thread=8)
    model_output = output_file.replace(".txt", ".bin")
    text_output = output_file
    model.save_model(model_output)
    fasttext_to_text.export_to_file(model_output, text_output)
Example #29
 def __generate_embeddings(self, file_path):
     self.printer.print('generating fasttext term embeddings')
     tmp_file = os.path.join(self.args.local_dir, 'tmp')
     with open(tmp_file, 'w', encoding='utf8') as f_out:
         with open(os.path.join(self.args.local_dir,
                                self.args.file_in_qs_train),
                   'rt',
                   encoding='utf8') as f_in:
             reader = csv.reader(f_in, delimiter='\t')
             for [_, q] in reader:
                 f_out.write(q)
                 f_out.write('\n')
         with open(os.path.join(self.args.local_dir,
                                self.args.file_in_docs),
                   'rt',
                   encoding='utf8') as f_in:
             reader = csv.reader(f_in, delimiter='\t')
             for row in reader:
                 f_out.write('\n'.join(row[1:]))
                 f_out.write('\n')
     self.printer.print('training fasttext term embeddings')
     embeddings = fasttext.train_unsupervised(
         tmp_file,
         model='skipgram',
         dim=self.args.num_hidden_nodes // 2,
         bucket=10000,
         minCount=100,
         minn=1,
         maxn=0,
         ws=10,
         epoch=5)
     embeddings.save_model(file_path)
     os.remove(tmp_file)
Example #30
def train_fasttext(corpus, cut_func, vocabulary, embedding_dim=300):
    corpus = [' '.join(cut_func(sentence)) for sentence in corpus]
    corpus_file_path = 'fasttext_tmp_corpus.txt'
    with open(corpus_file_path, 'w', encoding='utf8') as writer:
        for sentence in corpus:
            writer.write(sentence + '\n')

    model = train_unsupervised(input=corpus_file_path,
                               model='skipgram',
                               epoch=10,
                               minCount=1,
                               wordNgrams=3,
                               dim=embedding_dim)

    model_vocab = model.get_words()

    emb = np.zeros(shape=(len(vocabulary) + 1, embedding_dim), dtype='float32')
    nb_unk = 0
    for w, i in vocabulary.items():
        if w not in model_vocab:
            nb_unk += 1
            emb[i, :] = np.random.normal(0, 0.05, embedding_dim)
        else:
            emb[i, :] = model.get_word_vector(w)
    print(
        'Logging Info - Fasttext Embedding matrix created: {}, unknown tokens: {}'
        .format(emb.shape, nb_unk))
    os.remove(corpus_file_path)
    return emb
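A possible call, assuming a plain whitespace cutter (the corpus and vocabulary here are illustrative):

corpus = ["first example sentence", "second example sentence"]
vocab = {"first": 1, "example": 2, "sentence": 3}
emb_matrix = train_fasttext(corpus, cut_func=lambda s: s.split(), vocabulary=vocab, embedding_dim=300)
print(emb_matrix.shape)   # (len(vocab) + 1, 300)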