Example #1
import csv

import fastText.FastText as ff

# (the opening of this snippet was truncated; `out_path` stands in for the
#  elided output file path)
with open(out_path, 'w', encoding='utf_8') as t:
    with open(path, 'r', encoding='utf_8') as f:
        lines = csv.reader(f, delimiter='\t')
        for line in lines:
            target = line[2]
            content = line[1]
            t.write(content + '\t' + '_label_' + target + '\n')

# Train the model
classifier = ff.train_supervised(
    r'data\Chinese\Chinese fasttext data\seg_train', label='_label_')
# Save the model
classifier.save_model(
    r'data\Chinese\Chinese fasttext data\fastText_model1')
# Load the model
classifier = ff.load_model(
    r'data\Chinese\Chinese fasttext data\fastText_model1')
# Test the model
correct = 0
total_count = 0
with open(r'data\Chinese\Chinese fasttext data\seg_test',
          'r',
          encoding='utf_8') as t:
    lines = t.readlines()
    total_count = len(lines)
    print(total_count)
    for line in lines:
        txt = line.split('\t')[0]  # split the line on the field delimiter
        txt = txt.strip('\n')  # drop the trailing newline
        predict = classifier.predict(txt)
        if predict[0][0] == line.split('\t')[1].strip('\n'):
            correct += 1
print('accuracy:', correct / total_count)  # report overall accuracy
# (a new snippet begins here, truncated at the top; the head of this
#  train_supervised call is a reconstruction -- only thread and loss come
#  from the original)
model = train_supervised(input=train_data,
                         thread=1,
                         loss="softmax")
print_results(*model.test(test_data))

model.save_model(r"C:\Users\RY\Desktop\wikizhfasttext.bin")

model.quantize(input=train_data, qnorm=True, retrain=True, cutoff=100000)
print_results(*model.test(valid_data))
model.save_model(r"C:\Users\RY\Desktop\wikizhfasttext.ftz")

# Load the trained model

from fastText import FastText

model_path = r"C:\Users\RY\Desktop\wikizhfasttext.bin"
model = FastText.load_model(model_path)

# help(model)
# 1. model.get_dimension() Get the dimension (size) of a lookup vector (hidden layer).
# 2. model.get_input_matrix() Get a copy of the full input matrix of a Model. This only
#    works if the model is not quantized.
# 3. model.get_input_vector() Given an index, get the corresponding vector of the Input Matrix.
# 4. model.get_labels() Get the entire list of labels of the dictionary optionally
#    including the frequency of the individual labels. Unsupervised
#    models use words as labels, which is why get_labels
#    will call and return get_words for this type of
#    model.
# 5. model.get_line() Split a line of text into words and labels. Labels must start with
#    the prefix used to create the model (__label__ by default)
# 6. model.get_output_matrix() Get a copy of the full output matrix of a Model. This only
#    works if the model is not quantized.
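# A short sketch exercising the methods listed above on the loaded `model`
# (the sample line and label are made up):
print(model.get_dimension())         # size of a lookup vector
print(model.get_labels()[:5])        # first few labels in the dictionary
words, labels = model.get_line("今天 天气 很 好 __label__positive")
print(words, labels)                 # tokens vs. labels, split by the prefix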
print("你输入的内容是:",str_data)
outstr=''
import jieba
import re

jieba.load_userdict(os.path.join(filename,"dictionary.txt"))#载入词典

stopwords_file = os.path.join(filename,"stop_words.txt")

stop_f = open(stopwords_file,"r",encoding='utf-8')
stop_words = list()
for line in stop_f.readlines():
    line = line.strip()
    if not len(line):
        continue
    stop_words.append(line)
stop_f.close ()

str_data=re.sub(r'[A-Za-z0-9]|/d+', '', str_data)#去除英文与数字
str_data=jieba.cut(str_data,cut_all=False,HMM=True)

for word in str_data:
    if word not in stop_words:
        if word != '\t':
            outstr += word
            outstr += " "

classifier = ff.load_model(os.path.join(filename,'model/model1.model'))
test = classifier.predict(outstr,k=1,threshold=0.5)
print("输入商品的标签为:",str(test[0]).replace("__label__",''))
print("商品属于该标签的概率为:",test[1])
    def run(self, save_directory=None):
        """

        :param save_directory: filepath where we save the text embedding
        :type save_directory: str
        :return: A 3d-array matrix storing the text embedding.
            The matrix is of shape (nb_sentences, max_sentence_length, embedding_size)
        """
        if self.verbose:
            print("Loading FastText model...")
        model = FastText.load_model(MODEL_PATH)
        drug_embedding_path = utils.get_drug_embedding_path()

        sentences_list = []

        if self.drug_description_embedding:
            try:
                with open(drug_embedding_path, 'rb') as f:
                    drug_embeddings = pickle.load(f)
            except FileNotFoundError:
                self.drug_description_embedding = False
                print('Drugs will be embedded as "médicament".')

        for i, sentence in tqdm(enumerate(self.sentences),
                                desc='Embedding words for each sentence...',
                                disable=not self.verbose,
                                total=len(self.sentences)):
            sentence_embedding = []
            sentence = sentence.lower()
            splits = FastText.tokenize(sentence)
            for word in splits:
                # Skipping non-words
                if not re.match(r'(\w)+', word) or word in self.stop_words:
                    continue

                # Getting rid of the apostrophe and taking the following word
                apos_split = word.split("'")
                if len(apos_split) == 2:
                    _, word = apos_split
                    if not word:
                        continue

                # Dealing with the drug name
                if unidecode(word) in self.drug_names_set:
                    # TODO: try something more complex
                    if self.drug_description_embedding and drug_embeddings:
                        try:
                            emb_w = drug_embeddings[word]
                            sentence_embedding.append(emb_w)
                            continue
                        except KeyError:
                            word = DRUG_REPLACEMENT
                    else:
                        word = DRUG_REPLACEMENT

                # Correcting words
                if self.do_correction and not params.FR_DICT.check(word):
                    suggestions = params.FR_DICT.suggest(word)
                    if suggestions:
                        word = suggestions[0]

                # Embedding
                sentence_embedding.append(model.get_word_vector(word))
            if sentence_embedding:
                sentences_list.append(sentence_embedding)
            else:
                print("Warning: Found an empty sentence embedding. Ignoring.")
                # Caution: deleting by index while enumerating shifts the
                # indices of subsequent labels.
                del self.y[i]

        #     # Updating max_sentence_length
        #     sentence_length = len(sentence_embedding)
        #     if sentence_length > max_sentence_length:
        #         max_sentence_length = sentence_length
        #
        # # Padding sentence matrices with 0 vectors
        # text_embedding = []
        # for sentence_embedding in sentences_list:
        #     sentence_length = len(sentence_embedding)
        #     sentence_embedding.extend([np.zeros((utils.get_embedding_dim(),))] \
        #                               * (max_sentence_length - sentence_length))
        #     text_embedding.append(sentence_embedding)
        #
        # # Deleting list of sentences
        # del sentences_list

        embeddings = np.array(sentences_list, dtype=object)  # ragged: sentences keep their own lengths
        print("\nSaving text embedding of shape %s" % str(embeddings.shape))

        X_train, X_test, y_train, y_test = train_test_split(embeddings,
                                                            self.y,
                                                            test_size=0.2,
                                                            random_state=42)

        if save_directory:
            np.save(utils.get_X_train_path(save_directory), X_train)
            np.save(utils.get_X_test_path(save_directory), X_test)
            np.save(utils.get_y_train_path(save_directory), y_train)
            np.save(utils.get_y_test_path(save_directory), y_test)

        return embeddings
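# Usage sketch: the enclosing class is not shown in this snippet, so the name
# `TextEmbedder` and the constructor arguments below are assumptions.
if __name__ == '__main__':
    embedder = TextEmbedder(sentences=sentences, y=labels,
                            do_correction=False,
                            drug_description_embedding=True,
                            verbose=True)
    embeddings = embedder.run(save_directory="embeddings/")
    print(len(embeddings), "sentences embedded")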
Example #5
import numpy as np
import torch
import os
from textCNN_chinese.textcnn import get_wordlists
import fastText.FastText as ff
from textCNN_chinese.textcnn.test import parse_net_result
from textCNN_chinese.textcnn.model import textCNN
from textCNN_chinese.textcnn import sen2inds

classifier = ff.load_model("fasttext_save\\fastText_model")
testCsvFile = "data\Chinese\Chinese raw data\\re_seg_test.csv"
testFile = 'textCNN_chinese\model_save\\test.txt'
testDataVecFile = 'textCNN_chinese\model_save\\testdata_vec.txt'

word2ind, ind2word = get_wordlists.get_worddict()
label_w2n, label_n2w = sen2inds.read_labelFile('textCNN_chinese\model_save\label.txt')

textCNN_param = {
    'vocab_size': len(word2ind),
    'embed_dim': 50,
    'class_num': len(label_w2n),
    "kernel_num": 20,
    "kernel_size": [3, 4, 5],
    "dropout": 0.5,
}

datas = open(testFile, 'r', encoding='utf_8').read().split('\n')
datas = list(filter(None, datas))
net = textCNN(textCNN_param)
weightFile = r'textCNN_chinese\model_save\19071915_model_iter_99_loss_3.03.pkl'
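# The snippet stops after naming the weight file; a minimal sketch of loading
# those weights for inference (standard PyTorch calls; assumes the file holds
# a state dict saved with torch.save):
net.load_state_dict(torch.load(weightFile))
net.eval()  # disable dropout before scoring `datas`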
d = []
sum_test = 0  # count the lines in the test set
with open(test_path, 'r', encoding="utf-8") as f_test:
    for line_test in f_test:
        sum_test += 1

for d_count in range(0, sum_test):
    d.append({})

# Load the models and predict labels for the test file
for count in range(0, 1):  # adjust to the number of classifiers trained in the previous step
    prepare_list = globals()
    if __name__ == '__main__':
        for i in range(1, 6):  # load five models at a time
            prepare_list['classifier' + str(i)] = ff.load_model("data/bootstarp/model/model" + str(count * 5 + i) + ".model")
    line_nu = 0
    with open(test_product, "r", encoding="utf-8") as f_test:
        for line in f_test:
            line = line.strip()
            scores = d[line_nu]
            for i in range(1, 6):  # score with all five classifiers
                str_test = prepare_list['classifier' + str(i)].predict(line, k=5, threshold=0.0)
                for j in range(0, 5):
                    scores.setdefault(str_test[0][j], 0)
                    scores[str_test[0][j]] += str_test[1][j]
            d[line_nu] = scores
            line_nu += 1

# Combine the scores from all models to pick the final label for each line
# (the loop body was truncated in the original)
for count_update in d:
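    # Sketch of a plausible body (an assumption, not the original code):
    # pick the label with the highest summed probability.
    best_label = max(count_update, key=count_update.get)
    print(best_label)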
# (a new snippet begins here; its argparse setup was truncated and the flag
#  name below is a placeholder)
parser.add_argument('--output_root',
                    required=True,
                    help='root directory of output')
parser.add_argument('--no_cuda', action='store_true', help='do not use cuda')
parser.add_argument('--fusing_method',
                    type=str,
                    default='',
                    help='fusing_method')
args = parser.parse_args()

if not args.no_cuda and not torch.cuda.is_available():
    print('Warning: cuda is not available on this machine.')
    args.no_cuda = True

if __name__ == '__main__':
    print('Loading a pretrained fastText model...')
    word_embedding = FastText.load_model(args.fasttext_model)

    print('Loading a pretrained model...')

    txt_encoder = VisualSemanticEmbedding(args.embed_ndim)
    txt_encoder.load_state_dict(torch.load(args.text_embedding_model))
    txt_encoder = txt_encoder.txt_encoder

    G = Generator(use_vgg=args.use_vgg, fusing=args.fusing_method)
    G.load_state_dict(torch.load(args.generator_model))
    G.train(False)

    if not args.no_cuda:
        txt_encoder.cuda()
        G.cuda()
Example #8
import random

import numpy as np
import torch
from torch.autograd import Variable
import fastText.FastText as ff

from dmn_atis.DmnModel import DMN
from dmn_atis.DmnLoader import (label_to_index, word_to_index, flatten,
                                ATIS_data_load, prepare_sequence, pad_to_fact,
                                getBatch, pad_to_batch)

random.seed(1024)
np.random.seed(1024)
torch.manual_seed(1024)


HIDDEN_SIZE = 80
BATCH_SIZE = 64
NUM_EPISODE = 3

classifier = ff.load_model("D:\VScode\BERTProject-master\\fasttext_save\\fastText_re_cut word_cross_validation_model0")
testCsvFile = "D:\VScode\BERTProject-master\data\cross validation\\re_cut word_cross_validation_test0.csv"
testFile = "textCNN_chinese\model_save\\re_cut word_cross_validation_test0.txt"
testDataVecFile = 'textCNN_chinese\model_save\\re_cut word_cross_validation_testvec0.txt'

train_data=ATIS_data_load("D:\VScode\BERTProject-master\data\cross validation\\re_cut word_cross_validation_train0.csv", encoding = "utf-8")
fact, q, a = list(zip(*train_data))
vocab = ['在', '哪', '里', '办', '理']  # seed vocabulary ("where to apply")
for lis in fact:
    for seq in lis:
        for word in seq:
            if word not in vocab:
                vocab.append(word)
word2index,index2word = word_to_index(vocab)

labels = []
def load_fasttext(model_path):
    classifier = ff.load_model(model_path)
    return classifier
Example #10
    device_id = cfg["device_id"]
    torch.cuda.set_device(device_id)

if cfg["dataset"] == 'conala':
    save_path = save_path + "_conala"
elif cfg["dataset"] == 'codesearchnet':
    pass
else:
    print("Wrong Dataset Entered")
    exit()
print(f"Model = {cfg['model']}")

# Loading word embeddings
if use_bin:
    import fastText.FastText as ft
    ft_anno_vec = ft.load_model('conala/ft_models/anno_model.bin')
    ft_code_vec = ft.load_model('conala/ft_models/code_model.bin')
else:
    from keras.preprocessing import text, sequence


def prepare_sequence(seq, seq_len, to_ix):
    idxs_list = []

    for seq_elem in seq:
        idxs = []
        for w in seq_elem.split():
            try:
                idxs.append(to_ix[w])
            except KeyError:
                continue  # skip out-of-vocabulary words
        # (completion sketch -- the snippet was truncated here; padding with
        #  index 0 is an assumption)
        idxs = idxs[:seq_len] + [0] * max(0, seq_len - len(idxs))
        idxs_list.append(idxs)
    return idxs_list
Example #11
def fast_text_predict(model_file, txt):
    classifier = ff.load_model(model_file)  # load the trained model
    pre = classifier.predict(txt, k=10)  # top-10 predictions for this text
    return pre
    # (a new snippet begins here, inside a truncated preprocessing function;
    #  `words` is assumed to be a token list)
    # Remove stopwords, stem, and lowercase
    ps = PorterStemmer()
    filtered_words = [
        word for word in words if word not in stopwords.words('english')
    ]
    stemmed_words = [ps.stem(word) for word in filtered_words]
    lower_words = [word.lower() for word in stemmed_words]
    text = ' '.join(lower_words)

    return text


# Load the model
print("Loading the model")
model = fasttext.load_model(trained_model)
print("Model loaded")
title_row = [
    'rectype', 'sha', 'ins_del_count', 'issueid', 'actor', 'date', 'text',
    'similarity'
]


# Gets the aggregated word vector representation of the text
def GetTextVector(text):
    final_vector = np.zeros((300, ))
    words = text.split(' ')
    for word in words:
        if len(word) > CHAR_THRESHOLD:
            continue  # skip overly long tokens
        word_vector = model.get_word_vector(word)
        # (completion -- the snippet was truncated here; summing the word
        #  vectors is an assumption)
        final_vector += word_vector
    return final_vector
Example #13
def predict(text):
    model = fasttext.load_model('models/fastText/fasttext_train.model.bin')
    text = extractWords(text)
    word_list = " ".join(jieba.cut(text))
    label_predict = model.predict(word_list)
    return list(label_predict[0])  # convert the returned tuple of labels to a list
Example #14
def load_model_to_test():
    """
    Load the saved models and evaluate them on the test set.
    :return:
    """
    # Load the test data
    correct_labels = []
    texts = []
    with open("fasttext.test", "r", encoding="utf-8") as ft_test:
        for line in ft_test:
            correct_labels.append(line.strip().split(" , ")[0])
            texts.append(line.strip().split(" , ")[1])

    # Load the classification models
    for w in range(1, 2):
        all_macro_precision = []
        all_macro_recall = []
        all_macro_f1 = []
        all_micro_precision = []
        all_micro_recall = []
        all_micro_f1 = []
        for i in range(5, 51):
            classifier = ff.load_model("Model/model_w" + str(w) + "_e" +
                                       str(i))
            print("Model/model_w" + str(w) + "_e" + str(i))
            # Predict: batch predict returns (labels, probs); keep the top
            # label string per text
            predict_labels = [p[0] for p in classifier.predict(texts)[0]]
            # Tally the predictions
            evaluation_parameters = []
            label_to_name = load_label_name_map()[0]
            for label, name in label_to_name.items():
                evaluate_p = {}
                evaluate_p["name"] = name
                evaluate_p["nexample"] = len(texts)
                # Counters are reset for each label; k avoids shadowing the
                # epoch index i
                true_positive = 0
                false_positive = 0
                false_negative = 0
                for k in range(len(texts)):
                    # predicted in this class and actually in this class
                    if predict_labels[k] == label and correct_labels[k] == label:
                        true_positive += 1
                    # predicted in this class but actually not
                    elif predict_labels[k] == label and correct_labels[k] != label:
                        false_positive += 1
                    # predicted outside this class but actually in it
                    elif predict_labels[k] != label and correct_labels[k] == label:
                        false_negative += 1
                evaluate_p["true_positive"] = true_positive
                evaluate_p["false_positive"] = false_positive
                evaluate_p["false_negative"] = false_negative
                # Compute precision, recall, and F1
                precision = true_positive / (true_positive + false_positive)
                evaluate_p["precision"] = precision
                recall = true_positive / (true_positive + false_negative)
                evaluate_p["recall"] = recall
                f1 = 2 * precision * recall / (precision + recall)
                evaluate_p["f1"] = f1
                evaluation_parameters.append(evaluate_p)
                # print("Results for label %s:" % name)
                # print("test size: %d\tprecision: %f\trecall: %f\tF1: %f" % (len(texts), precision, recall, f1))
            # Compute the macro and micro averages
            sum_precision = 0
            sum_recall = 0
            sum_true_positive = 0
            sum_false_positive = 0
            sum_false_negative = 0
            for p in evaluation_parameters:
                sum_precision += p["precision"]
                sum_recall += p["recall"]
                sum_true_positive += p["true_positive"]
                sum_false_positive += p["false_positive"]
                sum_false_negative += p["false_negative"]
            n = len(evaluation_parameters)
            macro_precision = sum_precision / n
            all_macro_precision.append(macro_precision)
            macro_recall = sum_recall / n
            all_macro_recall.append(macro_recall)
            macro_f1 = 2 * macro_precision * macro_recall / (macro_precision +
                                                             macro_recall)
            all_macro_f1.append(macro_f1)
            print("Macro average -- test size: %d\tprecision: %f\trecall: %f\tF1: %f" %
                  (len(texts), macro_precision, macro_recall, macro_f1))
            # dividing the counts by n cancels out in the ratios below
            micro_true_positive = sum_true_positive / n
            micro_false_positive = sum_false_positive / n
            micro_false_negative = sum_false_negative / n
            micro_precision = micro_true_positive / (micro_true_positive +
                                                     micro_false_positive)
            all_micro_precision.append(micro_precision)
            micro_recall = micro_true_positive / (micro_true_positive +
                                                  micro_false_negative)
            all_micro_recall.append(micro_recall)
            micro_f1 = 2 * micro_precision * micro_recall / (micro_precision +
                                                             micro_recall)
            all_micro_f1.append(micro_f1)
            print("Micro average -- test size: %d\tprecision: %f\trecall: %f\tF1: %f" %
                  (len(texts), micro_precision, micro_recall, micro_f1))

        names = [i for i in range(5, 51)]
        ax1 = plt.subplot(311)
        plt.plot(names, all_macro_precision, label='macro-P')
        plt.plot(names, all_micro_precision, label='micro-P')
        plt.legend(loc='upper left')
        ax2 = plt.subplot(312, sharey=ax1)
        plt.plot(names, all_macro_recall, label='macro-R')
        plt.plot(names, all_micro_recall, label='micro-R')
        plt.legend(loc='upper left')
        plt.subplot(313, sharey=ax1)
        plt.plot(names, all_macro_f1, label='macro-F1')
        plt.plot(names, all_micro_f1, label='micro-F1')
        plt.legend(loc='upper left')
        plt.xlabel("training epochs (ngram=" + str(w) + ")")
        plt.savefig("./ngram" + str(w) + ".png")
        plt.show()
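# As a cross-check (assuming scikit-learn is available), the same macro and
# micro averages can be computed directly from the two label lists built above:
from sklearn.metrics import precision_recall_fscore_support
p, r, f1, _ = precision_recall_fscore_support(correct_labels, predict_labels,
                                              average='macro')
print("macro: P=%f R=%f F1=%f" % (p, r, f1))
p, r, f1, _ = precision_recall_fscore_support(correct_labels, predict_labels,
                                              average='micro')
print("micro: P=%f R=%f F1=%f" % (p, r, f1))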
Example #15
import re
from fastText import FastText

classifier = FastText.load_model("./fastText")

with open("./tmp_test_content", "r") as f:
    lines = f.readlines()
    lines = list(map(lambda x: x.rstrip("\n"), lines))

labels, probs = classifier.predict(lines)

labels = list(map(lambda x: re.sub("__label__", "", x[0]), labels))
print(labels)

probs = list(map(lambda x: str(x[0]), list(probs)))

res = list(zip(labels, probs))

res = list(map(lambda x: "\t".join(x), res))

res = list(zip(lines, res))
res = list(map(lambda x: "\t ".join(x), res))

res = "\n".join(res)

with open("res3", "w") as f:
    f.write(res)
Example #16
    def fit(self, X):
        # (reconstruction -- the snippet begins mid-method; assumes
        #  `from sklearn.feature_extraction.text import TfidfVectorizer`)
        tfidf = TfidfVectorizer()
        tfidf.fit(X)
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(lambda: max_idf)
        for w, i in tfidf.vocabulary_.items():
            self.word2weight[w] = tfidf.idf_[i]

    def transform(self, X):
        return np.array([
            np.mean([
                self.fasttext_model.get_word_vector(w) * self.word2weight[w]
                for w in words
            ] or [np.zeros(self.dim)],
                    axis=0) for words in X
        ])


if __name__ == '__main__':
    descriptions_df = pnd.read_csv(utils.get_drugs_indication_path(),
                                   encoding=params.UTF_8)
    fasttext_model = FastText.load_model(MODEL_PATH)
    tfidf_emb = TfidfEmbeddingVectorizer(fasttext_model,
                                         utils.get_embedding_dim())
    tfidf_emb.fit(descriptions_df.descriptions)
    embeddings = tfidf_emb.transform(descriptions_df.descriptions)

    embeddings_dict = {}
    for i in range(descriptions_df.shape[0]):
        embeddings_dict[descriptions_df.loc[i, 'drug_names']] = embeddings[i]

    with open(utils.get_drug_embedding_path(), 'wb') as f:
        pickle.dump(embeddings_dict, f)
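    # Usage sketch: reload the pickled dictionary and look up one drug embedding.
    with open(utils.get_drug_embedding_path(), 'rb') as f:
        loaded = pickle.load(f)
    some_drug = next(iter(loaded))
    print(some_drug, loaded[some_drug].shape)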
Example #17
from fastText import FastText

ft = FastText.load_model("lid.176.ftz")
text = 'OpenAI has moved most of its staff to a for-profit LLC. RIP "open-source". RIP "non-profit".'
labels = ft.predict(text)
print(labels)
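# Asking for k=3 shows the runner-up languages with their probabilities:
labels, probs = ft.predict(text, k=3)
for label, prob in zip(labels, probs):
    print(label.replace("__label__", ""), float(prob))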
Example #18
from fastText import FastText
from flask import Flask, request
import re

from loadFromJSONString import loadFromJSONString
from beautifyMessage import beautifyMessage

app = Flask(__name__)
classifier = FastText.load_model('slack_model.bin')
channelToWorkMap = {
    '__label__-00_announcements': True,
    '__label__-01_general': True,
    '__label__-07_community': False,
    '__label__-12_random-fun': False,
    '__label__unrestricted-chat': False
}
logFileName = 'NOTify.log'
isWorkHours = True

with open(logFileName, 'w') as logFile:
    logFile.write('')


def log(msg):
    print(msg)
    with open(logFileName, 'a') as log:
        log.write(msg + '\n')


freeTimeQueue = []
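# The route handlers were truncated from this snippet; a minimal sketch of how
# the pieces above could fit together (the endpoint name and payload field are
# assumptions, not the original code):
@app.route('/classify', methods=['POST'])
def classify():
    text = request.form.get('text', '')
    labels, probs = classifier.predict(text)
    is_work = channelToWorkMap.get(labels[0], False)
    log('predicted %s (work=%s)' % (labels[0], is_work))
    return {'channel': labels[0], 'work': is_work}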
Example #19
def load_model(model_path):
    # Load the model (the import lives inside the function so the Windows
    # build of the bindings is only required here)
    import fastText.FastText as ff
    classifier = ff.load_model(model_path)
    return classifier
Example #20
    def __init__(self, model=MODEL_FILE):
        self.model = FastText.load_model(model)
d = []  # one score dict per input line
sum_test = 0  # count the lines of the segmented data set
with open(os.path.join(filename, "trainF.tsv"), 'r',
          encoding="utf-8") as f_test:
    for line_test in f_test:
        sum_test += 1

for d_count in range(0, sum_test):
    d.append({})

# Load the models and predict labels for the data file
for count in range(0, 80):  # adjust to the number of classifiers trained in the previous step
    prepare_list = globals()
    if __name__ == '__main__':
        for i in range(1, 6):  # load five models at a time
            prepare_list['classifier' +
                         str(i)] = ff.load_model("data/bootstarp/model/model" +
                                                 str(count * 5 + i) + ".model")
    line_nu = 0
    with open(os.path.join(filename, "trainF.tsv"), "r",
              encoding="utf-8") as f_test:
        for line in f_test:
            line = line.strip()
            scores = d[line_nu]
            for i in range(1, 6):  # score with all five classifiers
                str_test = prepare_list['classifier' + str(i)].predict(
                    line, k=5, threshold=0.0)
                for j in range(0, 5):
                    scores.setdefault(str_test[0][j], 0)
                    scores[str_test[0][j]] += str_test[1][j]
            d[line_nu] = scores
            line_nu += 1
Example #22
# _*_coding:utf-8 _*_
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import fastText.FastText as fasttext

# Load the model
model = fasttext.load_model('models/fasttext_train.model.bin')

labels_right = []
texts = []
labels_predict = []
with open("data/test/fastText_test.txt", encoding="utf-8") as fr:
    for line in fr:
        line = line.rstrip()
        label_right = line.split("\t")[1]
        labels_right.append(label_right)
        text = line.split("\t")[0]
        texts.append(text)
        label_predict = model.predict(text)
        labels_predict.append(label_predict[0])
        print("Text: ")
        print(line)
        print("True label: ")
        print(label_right)
        print("Predicted label: ")
        print(label_predict[0])


predict_labels = []
for predict_label in labels_predict:
    predict_labels.append(predict_label[0])
Example #23
# Authors: 肖劲宇, 张红轩
# Purpose: intent analysis
# Date: 2019.6.26

import fastText.FastText as ff
import jieba

classifier = ff.load_model('data/try.model')
labels_right = []
texts = []
with open("data/test.txt", encoding="utf-8") as fr:
    for line in fr:
        line = line.rstrip()
        labels_right.append(line.split("\t")[1].replace("__label__", ""))
        texts.append(line.split("\t")[0])

# Batch predict returns (labels, probs); take the top label per text and
# strip the prefix so it matches labels_right
labels_predict = [e[0].replace("__label__", "")
                  for e in classifier.predict(texts)[0]]
text_labels = list(set(labels_right))
text_predict_labels = list(set(labels_predict))
print(text_predict_labels)
print(text_labels)

A = dict.fromkeys(text_labels, 0)  # correctly predicted count per class
B = dict.fromkeys(text_labels, 0)  # count per class in the test set
C = dict.fromkeys(text_predict_labels, 0)  # count per class in the predictions
for i in range(0, len(labels_right)):
    B[labels_right[i]] += 1
    C[labels_predict[i]] += 1
    if labels_right[i] == labels_predict[i]:
        A[labels_right[i]] += 1
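# Natural continuation (a sketch, not the original code): per-class precision
# is A/C and recall is A/B, guarding against classes that were never predicted.
for label in text_labels:
    precision = A[label] / C[label] if C.get(label) else 0.0
    recall = A[label] / B[label] if B[label] else 0.0
    print("%s\tP=%.4f\tR=%.4f" % (label, precision, recall))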
Example #24
#!/usr/bin/python
from fastText import FastText

model = FastText.load_model('./model/final_ns.bin')


def validate_keyword(key):
    """Return True if any word in `key` yields at least one known subword."""
    words = key.split(' ')

    # get_subwords returns (subword_strings, subword_indices); keep only the
    # subword string list for each word
    subword_lists = [model.get_subwords(word)[0] for word in words]

    # Build a keyword from the first subword of each non-empty list (this is
    # what the original remove-while-iterating loops effectively did)
    keyword = ''
    for subwords in subword_lists:
        if len(subwords) != 0:
            keyword = keyword + ' ' + str(subwords[0])
    return len(keyword) != 0


def predict_label(key):
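    # (sketch -- the body was truncated here; returning the top label is an
    #  assumption consistent with the examples above)
    labels, probs = model.predict(key)
    return labels[0].replace('__label__', '')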