Example #1
def fasttext_proc(params):
    line_sentence = LineSentence(config.words_file)
    model = fasttext.FastText(line_sentence, size=params['vector_dim'], window=params['window_size'],
                              min_count=params['min_frequency'], workers=params['workers'], sg=params['use_skip_gram'],
                              hs=params['use_hierarchical_softmax'], negative=params['negative_size'],
                              iter=params['pre_proc_epochs'])
    return model
def train_word2vec(x):
    # Train the word-embedding (word-to-vector) model
    #model = word2vec.Word2Vec(x, size=500, window=5, min_count=5, workers=12, iter=10, sg=1)
    model = fasttext.FastText(min_count=5, size=500)
    model.build_vocab(x)
    # use the counted corpus size rather than a hard-coded total_examples
    model.train(x, total_examples=model.corpus_count, epochs=10)
    return model
def train_w2v(config):
    try:
        print("word2vec train start")
        update_flag = False
        model = fasttext.FastText(size=300, window=5, min_count=1, workers=4)

        with open(config.pos_path) as f:
            for line in f.readlines():
                if not update_flag:
                    model.build_vocab([line.split(' ')], update=False)
                    update_flag = True
                else:
                    model.build_vocab([line.split(' ')], update=True)

        with open(config.pos_path) as f:

            for line in f.readlines():
                for _ in range(100):
                    # train() expects an iterable of sentences, so wrap the token list
                    model.train([line.split(' ')],
                                total_examples=model.corpus_count,
                                epochs=model.epochs)

        os.makedirs(config.embedding_model_path, exist_ok=True)
        model.save(''.join([config.embedding_model_path, '/', 'model']))
        return model

    except Exception as e:
        print("error on train w2v : {0}".format(e))
    finally:
        print("word2vec train done")
Example #4
def trainModel_fasttext(train_sen, model_output):
    # sentences = GetSentences(file_input)
    # yield sentences
    # sentences = list(sentences)
    word2vec_model = fasttext.FastText(train_sen,
                                       sg=SG,
                                       min_count=MIN_COUNT,
                                       workers=CPU_NUM,
                                       size=VEC_SIZE,
                                       window=CONTEXT_WINDOW)
    word2vec_model.save(model_output)
Example #5
    def create_new_model(cls,
                         corpus_path,
                         pmodel_name,
                         epochs=5,
                         pmin_count=10,
                         psize=150,
                         installdir=''):
        """ Creates and trains (and optionally saves) a model using gensim's implementation 
        of the fastText algorithm, and then loads the KeyedVectors associated with that model.
        
        For CREATION/first time training only. To continue training an already existing
        model, use update_model().

        Parameters
        -----------
        corpus_path (str) - path to the corpus you wish to train the model with
        
        pmodel_name (str) - the name to be assigned to the model when saved. Must be unique,
        or an error will be raised to avoid overwriting an existing model

        epochs (int, optional) - Number of times to iterate over training corpus during training

        pmin_count (int, optional) - Minimum frequency for a word to be used in training

        psize (int, optional) - Size of vectors for training

        Returns
        -----------
        True if the model was created and trained, False if it could not be created

        Throws
        -----------
        FileNotFoundError - If corpus_path is not found
        RuntimeError - If an already existing model makes it past the first if statement;
        build_vocab raises RuntimeError when rebuilding an existing vocabulary without
        update=True (see update_model)
        """
        if installdir != '':
            model_path = installdir + IKFastTextModeling.__PATH_PREFIX__
        else:
            # assumption: fall back to the bare prefix so model_path is always defined
            model_path = IKFastTextModeling.__PATH_PREFIX__

        if pmodel_name[-4:] != '.bin':
            pmodel_name = pmodel_name + '.bin'

        if os.path.exists(os.path.join(model_path, pmodel_name)):
            raise FileExistsError(
                "Model named {} already exists, model could not be created".
                format(pmodel_name[:-4]))

        model = ft.FastText(vector_size=psize, sg=1, min_count=pmin_count)

        super().create_new_model(corpus_path, model, epochs)

        ft.save_facebook_model(model,
                               path=os.path.join(model_path, pmodel_name))
        return True
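update_model() is referenced above but not shown. A minimal sketch of what continued training could look like, in the same gensim 4.x style this example uses (the path and sentences are placeholders):

from gensim.models import fasttext as ft

model = ft.load_facebook_model('models/my_model.bin')  # placeholder path
new_sentences = [['new', 'tokens'], ['more', 'text']]  # placeholder corpus
model.build_vocab(new_sentences, update=True)  # update=True avoids the RuntimeError noted above
model.train(new_sentences,
            total_examples=len(new_sentences),
            epochs=model.epochs)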
    def train_fasttext(self, data):
        self.logger.info('train fasttext....')
        self.logger.info(f'word vector size is: {self.size}')

        self.model = fasttext.FastText(data,
                                       sg=self.sg,
                                       iter=self.iter,
                                       seed=self.seed,
                                       size=self.size,
                                       window=self.window,
                                       workers=self.workers,
                                       min_count=self.min_count,
                                       word_ngrams=self.word_ngrams)
Example #7
def getFastTextModel(train='', load='', modelname='', min_word=200):
    if train != '':
        # train model
        print(train[:10])
        model = fasttext.FastText(sentences=train, min_count=min_word)
        model.save('word_embeddings/fasttext/models/' + modelname +
                   '.model.bin')

        # pickle the entire model to load and resume training later
        return model
    elif load != '':
        model = fasttext.FastText.load('word_embeddings/fasttext/models/' +
                                       load)
        return model
Example #8
 def word2vec(self):
     sentences = word2vec.LineSentence(self.ast_file)
     model = fasttext.FastText(sentences,
                               size=para.chunk_len - 1,
                               window=3,
                               min_count=1,
                               iter=10,
                               min_n=3,
                               max_n=6,
                               word_ngrams=0,
                               max_vocab_size=932)
     # model = word2vec.Word2Vec(sentences, size=self.astdim)
     model.save(u"ast.model")
     return model
Example #9
def fasttext_model_gensim(sentences):

    # Create a fastText model with gensim
    model = fasttext_gensim.FastText(sentences,
                                     size=200,
                                     window=6,
                                     min_count=1,
                                     iter=10,
                                     min_n=3,
                                     max_n=6)
    print(model.wv["体育"])  # 词向量获得的方式
    print(model["体育"])  # 词向量获得的方式
    print(model.wv.word_vec("体育"))  # 词向量获得的方式
    model.save("./fastText1.kpt")
Example #10
def model_fasttext(text, params):
    """
    generate a fasttext model from a text (list of sentences)

    :param text: text, as a list of sentences (strings)
    :param params: dictionary of keyword arguments for the fastText model
    :return: trained encoder model for fasttext
    """
    train_text = [clean_text(s).split() for s in text]
    model = fasttext.FastText(**params)
    model.build_vocab(train_text)
    model.train(train_text,
                total_examples=model.corpus_count,
                epochs=model.iter)
    return model
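Because params is unpacked straight into the constructor, its keys must match gensim's own keyword names, and the model.iter read above implies the pre-4.0 API. A hedged usage sketch (the values are illustrative; clean_text comes from the snippet's surrounding module):

params = {'size': 100, 'window': 5, 'min_count': 1, 'iter': 10, 'sg': 1}
model = model_fasttext(["the quick brown fox jumps", "over the lazy dog"], params)
print(model.wv['fox'])  # vector for an in-vocabulary word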
Example #11
    def fit(self, documents=None):
        self.documents = documents
        _resumes_words_list = self.__get_sentences_tokens(self.documents)

        self.model_tfidf = TfidfVectorizer()
        self.model_tfidf.fit(self.documents)

        self.model_word2vec = fasttext.FastText(_resumes_words_list,
                                                negative=5,
                                                workers=4,
                                                iter=self.iter,
                                                min_count=self.min_count)
        self.word_vectors = self.model_word2vec.wv.syn0  # syn0 is the pre-gensim-4 name for wv.vectors

        self.model_cluster = GaussianMixture(n_components=self.n_components)
        self.model_cluster.fit(self.word_vectors)
 def train_wv(self, merge_seg_data_fpath):
     '''
     Train word vectors.
     :param merge_seg_data_fpath: str, path to the segmented training data
     :return:
     '''
     # Train the word vectors
     if 'word2vec' == self.wv_type:
         self.wv_model = word2vec.Word2Vec(
             LineSentence(merge_seg_data_fpath),
             min_count=self.wv_config['min_count'],
             size=self.wv_config['size'])
     elif 'fasttext' == self.wv_type:
         self.wv_model = fasttext.FastText(
             LineSentence(merge_seg_data_fpath),
             min_count=self.wv_config['min_count'],
             size=self.wv_config['size'])
Example #13
    def train(self):
        self.check_no_data()

        pre_trained_model = fasttext.FastText(seed=self.seed,
                                              sg=self.sg,
                                              alpha=self.alpha,
                                              size=self.size,
                                              window=self.window,
                                              min_count=self.min_count,
                                              min_n=self.min_n,
                                              max_n=self.max_n,
                                              iter=self.iter)
        pre_trained_model.build_vocab(sentences=self.sentences)
        pre_trained_model.train(sentences=self.sentences,
                                total_examples=pre_trained_model.corpus_count,
                                epochs=5)

        self.model = pre_trained_model
def fasttext_embeddings(Y, notes_file, embedding_size, min_count, n_iter):
    modelname = "processed_%s.fasttext" % (Y)
    sentences = ProcessedIter(Y, notes_file)

    model = fasttext.FastText(size=embedding_size,
                              min_count=min_count,
                              iter=n_iter)
    print("building fasttext vocab on %s..." % (notes_file))

    model.build_vocab(sentences)
    print("training...")
    model.train(sentences,
                total_examples=model.corpus_count,
                epochs=model.iter)
    out_file = '/'.join(notes_file.split('/')[:-1] + [modelname])
    print("writing embeddings to %s" % (out_file))
    model.save(out_file)
    return out_file
Example #15
def train_fast_text(data_paths="",
                    data=None,
                    model_paths="",
                    model_save_path="",
                    epochs=1,
                    option="create"):
    model = None
    start = time.time()
    if option == "load":
        print('Loading FastText model...')
        model = fasttext.FastText.load(model_paths)
    else:
        print("Paths reads", len(data_paths))
        if option == "create":
            print('Creating FastText model...')
            model = fasttext.FastText(size=300,
                                      window=10,
                                      sg=1,
                                      sample=1e-5,
                                      workers=multiprocessing.cpu_count(),
                                      callbacks=[BatchLogger(model_save_path)])

            model.build_vocab(NextSentMem(data))
            print("Vocabulary is builded!",
                  time.time() - start, len(model.wv.vocab))
            model.train(NextSentMem(data),
                        epochs=epochs,
                        total_examples=model.corpus_count,
                        compute_loss=True,
                        report_delay=1.0,
                        callbacks=[BatchLogger(model_save_path)])
            model.save(model_save_path.format("Base"))
        elif option == "retrain":
            print('Retraining FastText model...')
            model = fasttext.FastText.load(model_paths)
            model.train(NextSentMem(data),
                        epochs=epochs,
                        total_examples=model.corpus_count,
                        compute_loss=True,
                        report_delay=1.0,
                        callbacks=[BatchLogger(model_save_path)])
            model.save(model_save_path.format("Ret"))
    print("Process ended!", time.time() - start)
    return model
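BatchLogger is a project-specific callback that is not shown here. A minimal sketch of what such a callback might look like, assuming it checkpoints the model each epoch via the model_save_path template (this reconstruction is an assumption, not the original class):

from gensim.models.callbacks import CallbackAny2Vec

class BatchLogger(CallbackAny2Vec):
    """Hypothetical reconstruction: save a checkpoint at the end of every epoch."""

    def __init__(self, save_path_template):
        self.save_path_template = save_path_template
        self.epoch = 0

    def on_epoch_end(self, model):
        self.epoch += 1
        # e.g. 'models/ft_{}.model' -> 'models/ft_epoch3.model'
        model.save(self.save_path_template.format("epoch%d" % self.epoch))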
Example #16
def train_unsup():
    print("start train gensim fasttext unsup model")
    model = fasttext.FastText(size=256,
                              window=5,
                              min_count=1,
                              word_ngrams=0,
                              workers=8)
    # scan over corpus to build the vocabulary
    model.build_vocab(corpus_file=fileConfig.dir_fasttext +
                      fileConfig.file_fasttext_unsup_train_data)
    total_words = model.corpus_total_words  # number of words in the corpus
    print('train...')
    model.train(corpus_file=fileConfig.dir_fasttext +
                fileConfig.file_fasttext_unsup_train_data,
                total_words=total_words,
                epochs=3)
    model.save(fileConfig.dir_fasttext +
               fileConfig.file_fasttext_gensim_unsup_model)
    print("success train gensim fasttext unsup model")
 def get_model(self,
               hs=1,
               negative=5,
               cbow_mean=0,
               iter=10,
               size=100,
               min_count=5,
               max_vocab_size=1000000,
               workers=3,
               articles_to_learn=1000,
               randomTrain=False):
     dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
     if self.dev_mode:
         # Gets all files from the folder at this location.
         sentences1 = MySentences(dir_path + '/DataSet')
     else:
         print("Training model; be aware this runs on a real training set, so it might take a while")
         # Build training data from a large sample, using articles_to_learn articles.
         sentences1 = ZippedSentences(dir_path + '/RealDataSet/wiki_flat.zip',
                                      articles_to_learn, randomTrain)
     Fast_Text_model = fasttext.FastText(
         sentences=sentences1,  # sentences to train from
         sg=1,  # 0 for CBOW, 1 for skip-gram
         hs=hs,  # 1 for hierarchical softmax; 0 plus a non-zero negative argument means negative sampling
         negative=negative,  # 0 disables negative sampling; otherwise how many noise words to draw (usually 5-20)
         cbow_mean=cbow_mean,  # 0 for sum of context vectors, 1 for mean; only used with CBOW
         iter=iter,  # number of epochs
         size=size,  # feature vector dimensionality
         min_count=min_count,  # minimum frequency of words required
         max_vocab_size=max_vocab_size,  # RAM cap; 10 million words need approx. 1GB. None = unlimited
         workers=workers,  # how many threads are started for training
     )
     self.model = Fast_Text_model
     return Fast_Text_model
Example #18
def main():
    corpus_file = "data.txt"
    iter_count = 1

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    f = open("data/%s" % corpus_file, "r")
    text = f.read()
    sentences = [s.split(" ") for s in text.split("\n")]

    for i in range(1, 2):
        model = fasttext.FastText(min_count=1,
                                  seed=1,
                                  workers=1,
                                  iter=iter_count)
        model.build_vocab(sentences)
        model.train(sentences,
                    total_examples=model.corpus_count,
                    epochs=model.iter)
        model.save("model/fasttext_gensim_iter=100_%s.model" % i)
Example #19
 def get_model(self):
     # Extract articles_to_learn articles from the training set.
     zips = ZippedSentences('wiki_flat.zip', self.articles_to_learn)
     Fast_Text_model = fasttext.FastText(
         sentences=zips,  # sentences to train from
         sg=1,  # 0 for CBOW, 1 for skip-gram
         hs=1,  # 1 for hierarchical softmax; 0 plus a non-zero negative argument means negative sampling
         negative=1,  # 0 disables negative sampling; otherwise how many noise words to draw (usually 5-20)
         iter=10,  # number of epochs
         size=100,  # feature vector dimensionality
         min_count=5,  # minimum frequency of words required
         max_vocab_size=None,  # RAM cap; 10 million words need approx. 1GB. None = unlimited
         workers=3,  # how many threads are started for training
         min_n=3,  # minimum char n-gram length (boundary markers are added to each word)
         max_n=6,  # maximum char n-gram length
         word_ngrams=1  # 1 means using char n-grams, 0 is equivalent to word2vec
     )
     return Fast_Text_model
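With word_ngrams=1 and min_n=3/max_n=6, the resulting model can synthesize vectors for words it never saw in training. A short hedged demo (the owning class and query word are placeholders):

ft_model = WikiTrainer().get_model()  # hypothetical owner of get_model()
vec = ft_model.wv['unseenword']  # no KeyError: assembled from char n-grams
print(vec.shape)  # (100,) given size=100 above
print('unseenword' in ft_model.wv.vocab)  # False: the full word was never seen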
Example #20
def fastText_train(train_file, save_model_name):
    '''
    Train a word-vector model (fastText version).
    INPUT  -> path to the training corpus, name under which to save the model
    '''
    corpus_path = FILE_DIR + '/' + train_file
    model_path = FILE_DIR + '/' + save_model_name + '.bin'

    corpus_file = datapath(corpus_path)
    model = fasttext.FastText(
        corpus_file=corpus_file,  # training corpus; pass by keyword, since the first positional parameter is sentences
        sg=1,  # 1 for skip-gram (sensitive to rare words), 0 for CBOW
        size=150,  # output vector dimensionality, usually 100-200 (too small risks mapping collisions, too large wastes memory)
        window=5,  # max distance between the current word and the target word (n words before, n words after)
        min_count=1,  # words occurring fewer than n times are ignored; the default is 5
        alpha=0.025,  # learning rate
        workers=4,  # number of training threads; only effective when Cython is installed
        iter=5,  # training epochs; the default is 5
    )

    # model.save(FILE_DIR+'/'+save_model_name)
    # Save the vectors in binary word2vec format for reuse (this drops the subword n-grams)
    model.wv.save_word2vec_format(model_path, binary=True)
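Vectors saved in word2vec format can be reloaded with KeyedVectors. A small sketch using model_path from above; note that only full-word vectors survive this format, so fastText's subword handling of unseen words is lost:

from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format(model_path, binary=True)
print(wv.most_similar('中国', topn=5))  # placeholder query word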
Example #21
def ft(model_name, iter_count):
    """
    fasttext
    """

    print("prepare data.")
    os.chdir("data")
    set_data(mode="word")
    corpus_file = "tmp.txt"
    f = open("%s" % corpus_file, "r", encoding="utf-8")
    text = f.read()
    sentences = [s.split(" ") for s in text.split("\n")]

    print("train model.")
    # Fixing the seed has no effect unless workers=1 (per the gensim docs)
    model = fasttext.FastText(min_count=1, seed=1, workers=1, iter=iter_count)
    model.build_vocab(sentences)
    model.train(sentences,
                total_examples=model.corpus_count,
                epochs=model.iter)

    print("save model.")
    os.chdir("..")
    model.save("model/%s" % model_name)
Example #22
 def result(self):
     return fasttext.FastText(sentences=self._sentence_gen).wv
Example #23
for line in f_5:
    line = line.rstrip()
    sentences.append(tokenizer.tokenize(line))

print("Finished File 5")

for line in f_6:
    line = line.rstrip()
    sentences.append(tokenizer.tokenize(line))

print("Finished File 6")

#count tokens across all sentences
token_count = sum([len(sentence) for sentence in sentences])
print("The Sinhala corpus contains {0:,} tokens".format(token_count))

#define fasttext model. Use the sg argument (0 or 1) to choose between CBOW and skip-gram
model = fasttext.FastText(size=300, window=10, min_count=1, workers=8, sg=1)

model.build_vocab(sentences=sentences)

#train fasttext model
model.train(sentences=sentences, total_examples=len(sentences), epochs=50)

#save model
if not os.path.exists("trained_fasttext_300_nsw"):
    os.makedirs("trained_fasttext_300_nsw")

model.save(os.path.join("trained_fasttext_300_nsw", "fasttext_100_nsw.w2v"))
Example #24
def train_fasttext_model(infile_name,
                         outfile_name=None,
                         dim=100,
                         ws=4,
                         min_count=3,
                         n_jobs=1,
                         minn=1,
                         maxn=2,
                         method='cbow',
                         epoch=30):
    """
    train a fasttext (Parallel2vec) model on a corpus file extracted from molecules

    - parameters in FastText
    https://fasttext.cc/docs/en/options.html

    - parameters in gensim
    https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/FastText_Tutorial.ipynb

    - parameters of fasttext in gensim vs original FastText

        sg=0 means using 'cbow' model,
        size means dim, window means ws, iter means epoch,
        min_count means minCount, min_n means minn, max_n means maxn

    :param infile_name: Path to the file on disk, a file that contains sentences(one line = one sentence).
           Words must be already preprocessed and separated by whitespace.
    :param outfile_name:
    :param dim: size of word vectors [100]
    :param ws: size of the context window [4]
    :param min_count: minimal number of word occurrences [3]
    :param n_jobs:
    :param minn: min length of char ngram [1]
    :param maxn: max length of char ngram [2]
    :param method: skip-gram / cbow [cbow]
    :param epoch: number of epochs [30]
    :return: fasttext model
    """

    if method.lower() == 'skip-gram':
        sg = 1
    elif method.lower() == 'cbow':
        sg = 0
    else:
        raise ValueError('skip-gram or cbow are only valid options')

    start = timeit.default_timer()
    model = fasttext.FastText(sg=sg,
                              size=dim,
                              window=ws,
                              min_count=min_count,
                              min_n=minn,
                              max_n=maxn,
                              workers=n_jobs)
    # model = word2vec.Word2Vec(corpus, size=vector_size, window=window, min_count=min_count, workers=n_jobs, sg=sg,
    #                           **kwargs)
    # corpus = word2vec.LineSentence(infile_name)
    print('>>> Start to read molecular sentences...')
    model.build_vocab(corpus_file=infile_name)
    print('Count of molecular sentences: {}, count of unique fragment: {}'.
          format(model.corpus_count, len(model.wv.vocab)))
    print('>>> Start training the model...')
    abc = model.train(corpus_file=infile_name,
                      total_examples=model.corpus_count,
                      epochs=epoch,
                      total_words=len(model.wv.vocab))
    print('return values of model training: {}'.format(abc))
    if outfile_name:
        # fname = get_tmpfile("fasttext.model")
        model.save(outfile_name)

    stop = timeit.default_timer()
    print('Runtime: ', round((stop - start) / 60, 2), ' minutes')
    return model
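A hedged call site for the function above; the corpus path and output name are placeholders:

model = train_fasttext_model('data/mol_corpus.txt',  # placeholder: one sentence per line
                             outfile_name='mol_fasttext.model',
                             dim=100, ws=4, min_count=3,
                             method='cbow', epoch=30)
print(len(model.wv.vocab), 'fragments in vocabulary')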
Example #25
from gensim.models import word2vec
from gensim.models import fasttext
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences = word2vec.Text8Corpus('text8/text8')

model = fasttext.FastText(sentences)

model.save('FT8/fasttext.model')
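Once saved, the model can be reloaded and queried. A minimal sketch (the query words are illustrative, but text8 does contain common English):

from gensim.models import fasttext

model = fasttext.FastText.load('FT8/fasttext.model')
print(model.wv.most_similar('king', topn=5))  # nearest neighbours by cosine similarity
print(model.wv['kingness'])  # OOV vector assembled from char n-grams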