Example #1
def train_fasttext_fashionrec(dimensionality, context, train_model, epochs):
    """ Train with FastText on IG corpora"""
    total_count, vocab_size = corpus_stats("data/clean2_corpus.txt")
    print("total word count: {}, vocabulary size: {}".format(
        total_count, vocab_size))
    start_time = datetime.now()
    model = train_unsupervised(input=os.path.join("data/clean2_corpus.txt"),
                               dim=dimensionality,
                               ws=context,
                               model=train_model,
                               epoch=epochs)
    time_elapsed = datetime.now() - start_time
    output_path = "trained/fasttext_fashion_dim" + str(
        dimensionality) + "_c" + str(context) + "_" + str(train_model)
    model.save_model(output_path + ".bin")
    save_fasttext_bin_to_vec(load_model(output_path + ".bin"),
                             output_path + ".vec")
    fileName = "results/training/fasttext_fashion_epoch" + str(
        epochs) + "_d" + str(dimensionality) + "_c" + str(context) + "_" + str(
            train_model) + ".txt"
    notes = "FastText FashionData, " + str(epochs) + " epochs, " + str(
        dimensionality) + " dim, " + str(context) + " context, " + str(
            train_model) + " train mode\n" + "Training time: " + str(
                time_elapsed)
    save_to_file(fileName, notes)
Example #2
 def train_fasttext_skipgram(self, corpus_path,
                             output_path,
                             **kwargs):
     """
     input          training file path (required)
     output         output file path (required)
     lr             learning rate [0.05]
     lrUpdateRate   change the rate of updates for the learning rate [100]
     dim            size of word vectors [100]
     ws             size of the context window [5]
     epoch          number of epochs [5]
     minCount       minimal number of word occurrences [5]
     minCountLabel  minimal number of label occurrences [0]
     neg            number of negatives sampled [5]
     wordNgrams     max length of word ngram [1]
     loss           loss function {ns, hs, softmax} [ns]
     bucket         number of buckets [2000000]
     minn           min length of char ngram [3]
     maxn           max length of char ngram [6]
     thread         number of threads [12]
     t              sampling threshold [0.0001]
     """
     print("Training Fasttext model using Skipgram method")
     self.fasttext_model = fasttext.train_unsupervised(corpus_path, model='skipgram', **kwargs)
     self.fasttext_model.save_model(path=output_path)
     print("Model saved!")
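# A minimal usage sketch, added for illustration (not from the original source):
# the wrapper above forwards **kwargs to fasttext.train_unsupervised, so the
# hyperparameters listed in the docstring can be passed directly. The corpus and
# output paths below are assumptions.
import fasttext
sketch_model = fasttext.train_unsupervised("data/corpus.txt", model='skipgram',
                                           lr=0.05, dim=100, ws=5, epoch=5,
                                           minCount=5, neg=5, loss='ns')
sketch_model.save_model("models/ft_skipgram.bin")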
Example #3
def buildW2VDict(method='FastText'):
    w2vdict = {}  # each value should be a (D,) ndarray
    w2vmodel = None
    if method == 'FastText':  # https://fasttext.cc/docs/en/english-vectors.html  (download word vectors)
        w2vmodel = train_unsupervised(embedding_txt_file,
                                      model='skipgram',
                                      lr=0.05,
                                      dim=D,
                                      ws=2,
                                      epoch=5,
                                      minCount=2,
                                      minCountLabel=0,
                                      minn=3,
                                      maxn=6,
                                      neg=5,
                                      wordNgrams=3,
                                      loss='ns',
                                      bucket=2000000,
                                      thread=5,
                                      lrUpdateRate=100,
                                      t=0.0001,
                                      label='__label__',
                                      verbose=2,
                                      pretrainedVectors='')
        #w2vmodel.save_model(ftmodel_file)
        print('fastText word vector model trained')
        # Turn it into dict
        (word, freq) = w2vmodel.get_words(include_freq=True)
        for w, f in zip(word, freq):
            w2vdict[w] = w2vmodel.get_word_vector(w)
    print('w2vdict built.')
    return w2vdict, w2vmodel
Example #4
def train_fasttext(corpus, cut_func, vocabulary, embedding_dim=300):
    corpus = [' '.join(cut_func(sentence)) for sentence in corpus]
    corpus_file_path = 'fasttext_tmp_corpus.txt'
    with open(corpus_file_path, 'w', encoding='utf8') as writer:
        for sentence in corpus:
            writer.write(sentence + '\n')

    model = train_unsupervised(input=corpus_file_path,
                               model='skipgram',
                               epoch=10,
                               minCount=1,
                               wordNgrams=3,
                               dim=embedding_dim)

    model_vocab = set(model.get_words())  # use a set for fast membership tests

    emb = np.zeros(shape=(len(vocabulary) + 1, embedding_dim), dtype='float32')
    nb_unk = 0
    for w, i in vocabulary.items():
        if w not in model_vocab:
            nb_unk += 1
            emb[i, :] = np.random.normal(0, 0.05, embedding_dim)
        else:
            emb[i, :] = model.get_word_vector(w)
    print(
        'Logging Info - Fasttext Embedding matrix created: {}, unknown tokens: {}'
        .format(emb.shape, nb_unk))
    os.remove(corpus_file_path)
    return emb
Example #5
def train(corpus, model_file):
    model = fastText.train_unsupervised(input=corpus,
                                        model='skipgram',
                                        dim=100,
                                        epoch=50,
                                        bucket=200000,
                                        wordNgrams=2)
    model.save_model(model_file)
Example #6
def get_embedding_model(file, model_path, **kwargs):
    # Load the word-vector model if it exists, otherwise train and cache it
    if os.path.exists(model_path):
        model = ft.load_model(model_path)
    else:
        model = ft.train_unsupervised(file, **kwargs)
        model.save_model(model_path)
    return model
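# Illustrative usage sketch (added; the paths and hyperparameters are assumptions,
# not from the original snippet): trains on the first call, reuses the cached
# .bin on later calls.
emb_model = get_embedding_model("data/corpus.txt", "models/embedding.bin",
                                model='skipgram', dim=100, epoch=5)
vec = emb_model.get_word_vector("example")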
Example #7
def build_unsupervised_model(data, kwargs):
    kwargs = default_kwargs(kwargs)
    with tempfile.NamedTemporaryFile(delete=False) as tmpf:
        for line in data:
            tmpf.write((line + "\n").encode("UTF-8"))
        tmpf.flush()
        model = train_unsupervised(input=tmpf.name, **kwargs)
    return model
Example #8
 def __init__(self, pretrain=True, update=True, fasttext_corpus_path=None):
     if pretrain:
         self.model = fastText.load_model('resource/cc.en.300.bin')
         return
     model_path = re.sub(r'\.txt$', '.model', fasttext_corpus_path)
     if update or not os.path.exists(model_path):
         self.model = fastText.train_unsupervised(input=fasttext_corpus_path, model='skipgram')
         self.model.save_model(model_path)
     else:
         self.model = fastText.load_model(model_path)
Example #9
 def train_embeddings(self, path_to_songs):
     """
     Train fastText embeddings based on input file
     :param path_to_songs:
     :return:
     """
     self.model_ft = fastText.train_unsupervised(
         input=path_to_songs,
         model='skipgram',
     )
     self.model_ft.save_model(self.model_path + '1.bin')
Example #10
def train_skipgram(processed_path):
    print("Train fastText...")
    model = train_unsupervised(input=processed_path + "/train_no_mark.txt",
                               model='skipgram',
                               dim=100,
                               minCount=5,
                               ws=10,
                               lrUpdateRate=1000,
                               epoch=50,
                               thread=60)
    model.save_model(processed_path + '/fasttext.bin')
Example #11
def unsupervised_trainer_for_germeval():
    """
    This method tries to train fasttext on db+germeval combined data
    :return:
    """
    input_file = os.path.join(*[os.path.curdir, 'dataset', 'combined_db_tweets_and_germeval_train.txt'])
    output_file = os.path.join(*[os.path.curdir, 'dataset', 'germeval_100.de.bin'])
    # with open(input_file, 'r') as f:
    #     for line in f:
    #         print(fastText.tokenize(line))
    germeval_trained = fastText.train_unsupervised(input=input_file, lr=0.05, dim=100, epoch=100)
    germeval_trained.save_model(path=output_file)
Example #12
def train_fasttext(data: pandas.DataFrame, config: Optional[Mapping[str, Any]] = None) -> None:
    """
    Train fasttext model on the given dataset of code identifiers.

    :param data: Dataframe with columns Columns.Split and Columns.Frequency.
    :param config: Parameters for training the model, options:
                   size: Number of identifiers to pick from the given data to train fasttext on.
                   corrupt: Value indicating whether to make random artificial typos in \
                            the training data. Identifiers are corrupted with `typo_probability`.
                   typo_probability: Token corruption probability if `corrupt == True`.
                   add_typo_probability: Probability of a second corruption in a corrupted token. \
                                         Used if `corrupt == True`.
                   path: Path where to store the trained fasttext model.
                   dim: Number of dimensions for embeddings in the new model.
                   bucket: Number of hash buckets to keep in the fasttext model: \
                           the less there are, the more compact the model gets.
                   adjust_frequencies: Whether to divide frequencies by the number of tokens in \
                                       the identifiers. Needs to be done when the result of the \
                                       `prepare` function is used as data to have a true \
                                       identifiers distribution.
    """
    try:
        import fastText
    except ImportError:
        sys.exit("Please install fastText. "
                 "Run `pip3 install git+https://github.com/facebookresearch/fastText"
                 "@51e6738d734286251b6ad02e4fdbbcfe5b679382`")
    log = logging.getLogger("train_fasttext")
    if config is None:
        config = {}
    config = merge_dicts(DEFAULT_CORRECTOR_CONFIG["fasttext"], config)
    tokens_number = data[Columns.Split].apply(lambda x: len(str(x).split()))
    if config["adjust_frequencies"]:
        weights = data[Columns.Frequency] / tokens_number
    else:
        weights = data[Columns.Frequency]
    train_data = data[tokens_number > 1].sample(config["size"], weights=weights, replace=True)
    if config["corrupt"]:
        train_data = corrupt_tokens_in_df(train_data, config["typo_probability"],
                                          config["add_typo_probability"])
    with tempfile.NamedTemporaryFile() as ids_file:
        with open(ids_file.name, "w") as f:
            for token_split in train_data[Columns.Split]:
                f.write(token_split + "\n")
        log.info("Training fasttext model...")
        model = fastText.train_unsupervised(ids_file.name, minCount=1, epoch=10,
                                            dim=config["dim"],
                                            bucket=config["bucket"])
    model.save_model(config["path"])
    log.info("fasttext model is saved to %s", config["path"])
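# Hedged usage sketch (added): the toy DataFrame and config values below are
# illustrative; only the column names and config keys come from the docstring above.
toy_data = pandas.DataFrame({
    Columns.Split: ["read file path", "parse json dump", "write output file"],
    Columns.Frequency: [10, 5, 3],
})
train_fasttext(toy_data, config={"size": 3, "corrupt": False, "dim": 50,
                                 "bucket": 100000, "path": "identifiers_ft.bin",
                                 "adjust_frequencies": True})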
Example #13
 def build_local_embeddings(self, corpus, path_to_resulting_embeddings):
     if not os.path.isdir('temp/'):
         os.mkdir('temp/')
     with open(
             'temp/corpus.txt',
             'w',
             encoding='utf-8',
     ) as f:
         f.writelines(corpus)
     ft = fastText.train_unsupervised('temp/corpus.txt', minCount=1)
     ft.save_model('temp/ft.bin')
     del ft
     self.compress_embeddings(corpus, path_to_resulting_embeddings, 'pca',
                              'temp/ft.bin')
     shutil.rmtree('temp/')
Example #14
def train_w2v_model(data):
    """
    Train a w2v (skipgram) model using fasttext package

    Args:
        data: A path to the training data (String)
    """
    logger.info('Fasttext embeddings training...')
    try:
        model = train_unsupervised(input=data,
                                   model='skipgram',
                                   epoch=100,
                                   minCount=1,
                                   dim=100)
        model.save_model(str(path.join(data_dir, 'W2V_Models/model.bin')))
    except Exception as e:
        logger.error('Error: %s', str(e))
Example #15
    def search_hyperparameters(self,
                               nb_trials: int,
                               input_path: str,
                               props: str = 'w+l+t+m+n'):

        for trial in range(nb_trials):
            parameters = self.tuner.propose()
            pprint(parameters)
            ''' Construct and train the model '''
            model = train_unsupervised(input=input_path,
                                       props=props,
                                       **parameters)
            ''' Track results '''
            score = fasttexteval.evaluate_model(
                model=model,
                word_pairs=self.train_word_pairs,
                gold_similarity=self.train_similarity)
            self.tuner.add(parameters, score)
            print(f'Evaluation score: {score}')
Example #16
    def test_unsup1(self):
        train, test, output = self.build_paths("fil9", "rw/rw.txt", "fil9")

        model = train_unsupervised(
            input=train,
            model="skipgram",
            lr=0.025,
            dim=100,
            ws=5,
            epoch=1,
            minCount=5,
            neg=5,
            loss="ns",
            bucket=2000000,
            minn=3,
            maxn=6,
            t=1e-4,
            lrUpdateRate=100,
            thread=self.num_thread(),
        )
        model.save_model(output)

        path_size = self.get_path_size(output)
        vectors = {}
        with open(test, 'r') as test_f:
            for line in test_f:
                query0 = line.split()[0].strip()
                query1 = line.split()[1].strip()
                vector0 = model.get_word_vector(query0)
                vector1 = model.get_word_vector(query1)
                vectors[query0] = vector0
                vectors[query1] = vector1
        dataset, correlation, oov = compute_similarity(None, test, vectors)
        correlation = np.around(correlation)

        self.assertTrue(
            correlation >= 41, "Correlation: Want: 41 Is: " + str(correlation)
        )
        self.assertEqual(oov, 0.0, "Oov: Want: 0 Is: " + str(oov))
        self.assertEqual(
            path_size, 978480868, "Size: Want: 978480868 Is: " + str(path_size)
        )
Example #17
def train_embeddings(vocab,
                     corpus,
                     output_file,
                     epoch=500,
                     dim=100,
                     min_count=1):
    ft = train_unsupervised('./data/' + corpus,
                            epoch=epoch,
                            dim=dim,
                            minCount=min_count)

    with open('./data/' + vocab, 'r', encoding='utf8') as f:
        words = f.read().splitlines()
    with open('./data/' + output_file, 'w', encoding='utf8') as f:
        f.write('%d %d\n' % (len(words), dim))
        for word in words:
            vec = ft.get_word_vector(word)
            out = word + ' ' + ' '.join(
                [str(x) for x in np.around(vec, decimals=4)]) + '\n'
            f.write(out)
Example #18
def fit():
    '''
    Train fastText on the corpus and save local_model.bin; the preprocessed and tokenized pre_data.txt is used to train sentence vectors.
    :return: fit result
    '''
    # Data preprocessing; returns id_url_df for joining results (id, url)
    id_url_df = data_process()
    print('fit: data preprocessing and tokenization done')

    # fastText fit
    model = train_unsupervised(
        # input=os.path.join(config.jb_path, 'pre_data.txt'),
        input=os.path.join(config.jb_path, 'del_dp_text_data.txt'),
        model='skipgram',
        epoch=10,
        dim=300,
        # pretrainedVectors="{}/wiki.zh.vec".format(config.model_path),
        minCount=1
    )

    model.save_model("{}/local_model.bin".format(config.model_path))
    print('local_model.bin  saved')

    vector_list = []
    with open(os.path.join('{}pre_data.txt'.format(config.jb_path))) as f:
        for line in f:
            line = line.replace('\n', '')
            vector = model.get_sentence_vector(line)
            vector_list.append(vector.tolist())

    # Assemble the pid -> vector dictionary
    pid = id_url_df['id'].values
    pid_list = pid.tolist()
    id_vec_dict = dict(itertools.zip_longest(pid_list, vector_list))

    # Persist pid/vector pairs as JSON
    with open('{}id_vec_dict.json'.format(config.model_path), 'w') as outfile:
        json.dump(id_vec_dict, outfile)
        print('id_vec_dict.json written')

    return 'fit success!!!'
Example #20
def train_unsup():
    # train_file = open(fileConfig.dir_fasttext + fileConfig.file_fasttext_train_data, mode='r', encoding='utf-8')
    # train_lines = []
    # for line in train_file:
    #     train_lines.append(line)
    print("start training unsupervised fastText model")
    model = fastText.train_unsupervised(
        input=fileConfig.dir_fasttext +
        fileConfig.file_fasttext_unsup_train_data,
        model=fasttextConfig.choose_model,
        dim=128,
        minCount=3,
        wordNgrams=7,
        minn=2,
        maxn=6,
        lr=0.1,
        thread=8,
        epoch=25,
        loss='hs')
    model.save_model(
        fileConfig.dir_fasttext +
        fileConfig.file_fasttext_model.format(fasttextConfig.choose_model))
Example #21
def train_fasttext(corpus,
                   vocabulary,
                   zero_init_indices=0,
                   rand_init_indices=1,
                   embedding_dim=300):
    """Use fasttext to train on corpus to obtain embedding

        Args:
            corpus: list of tokenized texts, corpus to train on
            vocabulary: dict, a mapping of words to indices
            zero_init_indices: int or a list, the indices which use zero initialization. These
                               indices usually represent the padding token.
            rand_init_indices: int or a list, the indices which use random initialization. These
                               indices usually represent other special tokens, such as the "unk" token.
            embedding_dim: int, dimensionality of embedding

        Returns: np.array, a word embedding matrix.

        """
    corpus_file_path = 'fasttext_tmp_corpus.txt'
    with open(corpus_file_path, 'w', encoding='utf8') as writer:
        for sentence in corpus:
            writer.write(' '.join(sentence) + '\n')

    model = train_unsupervised(input=corpus_file_path,
                               model='skipgram',
                               epoch=10,
                               minCount=1,
                               wordNgrams=3,
                               dim=embedding_dim)
    model_vocab = model.get_words()
    word_vectors = dict((w, model.get_word_vector(w)) for w in model_vocab)
    emb = filter_embeddings(word_vectors, embedding_dim, vocabulary,
                            zero_init_indices, rand_init_indices)
    os.remove(corpus_file_path)
    return emb
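# Toy usage sketch (added for illustration; the data below is made up): corpus is
# a list of tokenized sentences, vocabulary maps words to embedding-matrix rows,
# with low indices reserved for padding/"unk" as described in the docstring.
toy_corpus = [["the", "cat", "sat"], ["the", "dog", "ran"], ["a", "cat", "ran"]]
toy_vocab = {"the": 2, "cat": 3, "sat": 4, "dog": 5, "ran": 6, "a": 7}
emb_matrix = train_fasttext(toy_corpus, toy_vocab, embedding_dim=50)
print(emb_matrix.shape)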
Example #22
    mysim = []
    gold = []

    with open(data_path, 'rb') as fin:
        for line in fin:
            tline = line.split()
            word1 = tline[0].lower()
            word2 = tline[1].lower()

            v1 = model.get_word_vector(word1)
            v2 = model.get_word_vector(word2)
            d = similarity(v1, v2)
            mysim.append(d)
            gold.append(float(tline[2]))

    corr = stats.spearmanr(mysim, gold)
    dataset = os.path.basename(data_path)
    correlation = corr[0] * 100
    return dataset, correlation, 0


if __name__ == "__main__":
    model = train_unsupervised(
        input=os.path.join(os.getenv("DATADIR", ''), 'fil9'),
        model='skipgram',
    )
    model.save_model("fil9.bin")
    dataset, corr, oov = compute_similarity('rw.txt')
    print("{0:20s}: {1:2.0f}  (OOV: {2:2.0f}%)".format(dataset, corr, 0))
Example #23
        help="Output model filename",
    )

    args = parser.parse_args()

    input_filename = args.input
    output_filename = args.output

    model = train_unsupervised(
        input=input_filename,
        model=model,
        lr=lr,
        dim=dim,
        ws=ws,
        epoch=epoch,
        minCount=minCount,
        minCountLabel=minCountLabel,
        minn=minn,
        maxn=maxn,
        neg=neg,
        wordNgrams=wordNgrams,
        loss=loss,
        bucket=bucket,
        thread=thread,
        lrUpdateRate=lrUpdateRate,
        t=t,
        verbose=verbose,
    )

    model.save_model(output_filename)
Example #24
        return np.dot(v1, v2) / n1 / n2

    mysim = []
    gold = []

    with open(data_path, 'rb') as fin:
        for line in fin:
            tline = line.split()
            word1 = tline[0].lower()
            word2 = tline[1].lower()

            v1 = model.get_word_vector(word1)
            v2 = model.get_word_vector(word2)
            d = similarity(v1, v2)
            mysim.append(d)
            gold.append(float(tline[2]))

    corr = stats.spearmanr(mysim, gold)
    dataset = os.path.basename(data_path)
    correlation = corr[0] * 100
    return dataset, correlation, 0


if __name__ == "__main__":
    model = train_unsupervised(
        input=os.path.join(os.getenv("DATADIR", ''), 'fil9'),
        model='skipgram',
    )
    dataset, corr, oov = compute_similarity('rw.txt')
    print("{0:20s}: {1:2.0f}  (OOV: {2:2.0f}%)".format(dataset, corr, 0))
Example #25
if __name__ == '__main__':
    import fastText
    from data_utils.constants import ALL_TEXTS, WORD_VEC_PATH
    model = fastText.train_unsupervised(ALL_TEXTS,
                                        model='cbow',
                                        lr=0.05,
                                        dim=300,
                                        ws=5,
                                        epoch=50,
                                        minCount=5,
                                        maxn=0)
    model.save_model(WORD_VEC_PATH)
Example #26
print("CWD: {}".format(Path.cwd()))

print("Creating temporary file.")
temp_file = NamedTemporaryFile(mode="w+", delete=False)  # type: TextIOWrapper
print("Temporary file at: {}".format(temp_file.name))

print("Loading all programs' tokens")
connection = sqlite3.connect(str(database_path))
cursor = connection.cursor()
rows = cursor.execute("SELECT tokens FROM tagger").fetchall()

print("Loading programs into temporary file")
for sentence in rows:
    sentence = json.loads(sentence[0])
    for token in sentence:  # type: str
        temp_file.write(" " + token)

temp_file.seek(0)
print("Make fast-text model.")
model = fastText.train_unsupervised(
    input=temp_file.name,
    #lr=0.1,
    epoch=500,
    minCount=4,
    model="skipgram",
    thread=18)

# Save model
model.save_model(str(Path(storage_folder, "model.bin")))
Example #27
import fastText as ft

def make_wordsfile(document_list):
	"""
	Create the text data used to train fastText
	:param document_list: list of documents
	:return:
	"""
	model_text = ""
	for document in document_list["news"]:
		model_text += " ".join(document) + "\n"

	w = open("./model/for_papers/dataset.txt", "w")
	w.write(model_text)
	w.close()

model = ft.train_unsupervised(input="../model/for_papers/dataset.txt", model="skipgram", dim=200, ws=10,
                              minCount=20, loss="ns", neg=10, epoch=25, thread=40, wordNgrams=1, t=1e-3)
model.save_model("./model/for_papers/fastText.model")
# model = ft.train_unsupervised(input="./model/for_papers/dataset.txt", model="skipgram", dim=200, ws=10,
#                               minCount=20, loss="ns", neg=10, epoch=25, thread=40, wordNgrams=1, minn=3, maxn=6, t=1e-3)
# model = ft.train_unsupervised(input="./data/fastText2013to2017_dataSet_2.txt", model="skipgram", dim=200, ws=10,
#                               minCount=20, loss="ns", neg=10, epoch=25, thread=40, wordNgrams=2)
# # model = ft.train_unsupervised(input="./data/fastText2013to2017_dataSet.txt", model="skipgram", dim=100, ws=10, minCount=20, epoch=25, thread=4, wordNgrams=2)
# model.save("./model/fastText2013to2017.model")
Example #28
import fastText

output = fastText.train_unsupervised("walk.txt", model='skipgram', lr=0.005, dim=100, ws=15,
                                     epoch=20, minCount=0, minCountLabel=0, minn=0, maxn=0,
                                     neg=10, wordNgrams=1, loss='ns', bucket=2000000, thread=4,
                                     lrUpdateRate=100, t=0.0001, label='__label__', verbose=2,
                                     pretrainedVectors='')
words = output.get_words()
output.save_model("embeddings_weights.bin")
f = open("embeddings2_weights.txt","w+")
for word in words:
	v = output.get_word_vector(word)
	vstr = ""
	for vi in v:
		vstr += " " + str(vi)
	f.write(word + vstr+ "\n")
f.close()

Example #29
# Prediction
model = load_model("./cooking.bin")
print(model.predict("Which baking dish is best to bake a banana bread ?"))
print(model.predict("Why not put knives in the dishwasher?"))
# (('__label__baking',), array([0.35784602]))
# (('__label__equipment',), array([0.39477548]))

model = load_model("./cooking.ftz")
print(model.predict("Which baking dish is best to bake a banana bread ?"))
print(model.predict("Why not put knives in the dishwasher?"))
# (('__label__bread',), array([0.32475984]))
# (('__label__equipment',), array([0.49320737]))

# Unsupervised training
model = train_unsupervised(input=train_data, model='skipgram')
model.save_model("cooking_uns.bin")
# Read 0M words
# Number of words:  2408
# Number of labels: 735
# Progress: 100.0% words/sec/thread:   82933 lr:  0.000000 loss:  2.764836 ETA:   0h 0m

# Inspect word vectors
model = load_model("cooking_uns.bin")
print("banana:", model.get_word_vector("banana"))
print("apple:", model.get_word_vector("apple"))
# The full vectors are long; the format looks like:
# banana: [-1.87938347e-01 -4.34164740e-02  1.01463743e-01 -9.05684754e-02 ...]
# apple: [-1.83095217e-01 -4.92684692e-02  1.06943615e-01 -8.55036154e-02 ...]

# Inspect label frequencies
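# (Added sketch, assuming the standard fastText Python API) one way to read the
# labels and their frequencies from the model's dictionary:
labels, freqs = model.get_labels(include_freq=True)
for label, freq in list(zip(labels, freqs))[:5]:
    print(label, freq)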
Example #30
import os
import sys
import pandas as pd
import util.fast_text as ft

from fastText import train_unsupervised

if __name__ == "__main__":
    dataset = "data/unsupervised/seqlist_data_set.tsv"
    model_name = "model/rnasequences2vec.bin"

    print("Train...", dataset)
    model = train_unsupervised(input=dataset,
                               model='cbow',
                               lr=0.01,
                               dim=200,
                               wordNgrams=4,
                               minCount=1,
                               epoch=10)

    print("Save...")
    model.save_model(model_name)

    print("Create .vec file...")
    ft.bin_to_vec(model_name)