def train_model(self):
    classify_model = None
    if os.path.exists(self.preTrained_vectors):
        logging.info("Pretrained word vectors found; loading them from disk for training...")
        classify_model = fasttext.supervised(
            self.fasttext_train_file,
            self.model_file[0:-4],
            lr=0.1,
            epoch=100,
            dim=self.fasttext_dim,
            bucket=50000000,
            loss='softmax',
            thread=56,
            min_count=3,
            word_ngrams=4,
            pretrained_vectors=self.preTrained_vectors,
            silent=False)
    else:
        logging.info("No pretrained word vectors found; training from scratch...")
        classify_model = fasttext.supervised(
            self.fasttext_train_file,
            self.model_file[0:-4],
            lr=0.1,
            epoch=100,
            dim=self.fasttext_dim,
            bucket=50000000,
            loss='softmax',
            thread=56,
            min_count=3,
            word_ngrams=4,
            silent=False)
    return classify_model

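# A hedged usage sketch (not part of the snippet above): fasttext writes
# <output>.bin next to the output prefix, so the trained classifier can be
# reloaded later instead of retraining. The path below is illustrative, and
# label_prefix must match whatever prefix the training file used
# ('__label__' is the wrapper's default).
import fasttext

model = fasttext.load_model('classify_model.bin', label_prefix='__label__')
print(model.predict(['some tokenized text'], k=1))
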
def train(attrs, path):
    for attr in attrs:
        print('Start training ---', attr)
        train_file = path + 'train-cut/' + attr + '_train.txt'
        model = path + 'model-cut/' + attr + '_model'
        fasttext.supervised(train_file, model, label_prefix='__label__')
        print('Training finished ---', attr)

def train(): fasttext.supervised("/tmp/xinlang.train", "/tmp/xinlang.model", label_prefix="__label__", lr=1, dim=200, word_ngrams=2, bucket=10000000, epoch=20)
def train_fasttext(data_path="./data/question/disease", model_path="./data/fasttext.model"):
    """
    This function is used to train the fasttext classifier
    :param data_path: string, the path of training data.
    :param model_path: string, the path to save the trained model of fasttext.
    :return:
    """
    path = preprocess.generate_train_text(data_path)
    fasttext.supervised(path, model_path, label_prefix="__label__")

def train(self, input_file, output_file):
    # TODO: access label prefix info from create_training_data.py
    # or create new sub-command to create training data
    fasttext.supervised(input_file, output_file, label_prefix='__LABEL__',
                        dim=300, min_count=1, thread=2, silent=0)

def train(self):
    print('--- starting training ---')
    if os.path.exists(self.pretrained_vec_file_name):
        print('found pretrained word vector')
        fasttext.supervised(
            self.training_file,
            self.model_file,
            pretrained_vectors=self.pretrained_vec_file_name,
            dim=self.dim)
    else:
        fasttext.supervised(self.training_file, self.model_file)
    print('--- finished training ---')

def train(data, output):
    """Train Rojak"""
    # TODO: access label prefix info from create_training_data.py
    # or create new sub-command to create training data
    # TODO: access training fasttext model using class wrapper
    fasttext.supervised(data, output, label_prefix='__LABEL__',
                        dim=300, min_count=1, thread=2, silent=0)

def fit(self, features, labels):
    """Trains the fasttext classifier on the provided data and outputs the results."""
    store_data_in_fasttext_file_format(
        os.path.join(self.model_dir, "train.txt"), features, labels)
    fasttext.supervised(os.path.join(self.model_dir, "train.txt"),
                        os.path.join(self.model_dir, "cv_model"),
                        label_prefix='__label__', bucket=2000000,
                        epoch=10, dim=300, lr=0.005)
    self.model = fasttext.load_model(
        os.path.join(self.model_dir, 'cv_model.bin'))
    return self

def fit(self): print("Henter inn tekst") self.trainFolder2fasttext() print("Starter trening") if self.wikiVec == True: print("Kjører test med forhåndstrente Embeddings") self.model = fasttext.supervised(input_file=self.tmp_ft_file_path, output='model', epoch=self.epochs, lr=self.learningRate, lr_update_rate=self.lrUpdate, loss=self.lossFunction, ws=self.wordWindow, pretrained_vectors = self.wikiPath) else: self.model = fasttext.supervised(input_file=self.tmp_ft_file_path, output='model', epoch=self.epochs, lr=self.learningRate, lr_update_rate=self.lrUpdate, loss=self.lossFunction, ws=self.wordWindow) os.remove(self.tmp_ft_file_path)
def test_train_classifier(self):
    # set params
    dim = 10
    lr = 0.005
    epoch = 1
    min_count = 1
    word_ngrams = 3
    bucket = 2000000
    thread = 4
    silent = 1
    label_prefix = '__label__'

    # Train the classifier
    model = ft.supervised(input_file, output, dim=dim, lr=lr, epoch=epoch,
                          min_count=min_count, word_ngrams=word_ngrams,
                          bucket=bucket, thread=thread, silent=silent,
                          label_prefix=label_prefix)

    # Make sure the model is generated correctly
    self.assertEqual(model.dim, dim)
    self.assertEqual(model.epoch, epoch)
    self.assertEqual(model.min_count, min_count)
    self.assertEqual(model.word_ngrams, word_ngrams)
    self.assertEqual(model.bucket, bucket)

    # Read labels from the input_file
    labels = read_labels_from_input(input_file, label_prefix)

    # Make sure labels are loaded correctly
    self.assertTrue(sorted(model.labels) == sorted(labels))

    # Make sure .bin and .vec are generated
    self.assertTrue(path.isfile(output + '.bin'))

def train_model(lines, filename='/tmp/model.train', output='model/model',
                dim=100, lr=0.1, epoch=6, min_count=1, word_ngrams=1,
                bucket=1000000, thread=4, silent=1, label_prefix='__label__',
                remove_after=False):
    save_file(lines, filename)
    mkdir_p(os.path.dirname(output))
    classifier = ft.supervised(filename, output, dim=dim, lr=lr, epoch=epoch,
                               min_count=min_count, word_ngrams=word_ngrams,
                               bucket=bucket, thread=thread, silent=silent,
                               label_prefix=label_prefix)
    if remove_after:
        os.remove(filename)
        os.remove(output + '.bin')
    return classifier

def fast_text(tweets, test_tweets):
    """
    DESCRIPTION:
        Applies the FastText algorithm
    INPUT:
        tweets: Dataframe of train tweets
        test_tweets: Dataframe of test tweets
    OUTPUT:
        labels: list of predicted labels of 1 or -1
    """
    tweets['sentiment'] = change_label(tweets['sentiment'])
    write_tweets_with_fasttext_labels(tweets)
    classifier = fasttext.supervised(FASTTEXT_TRAIN_FILE, FASTTEXT_MODEL,
                                     label_prefix='__label__',
                                     epoch=algorithm['params']['epochs'],
                                     dim=algorithm['params']['we_features'],
                                     ws=algorithm['params']['window_size'],
                                     lr=algorithm['params']['learning_rate'])
    test_tweets = transform_test_tweets(test_tweets)
    labels = classifier.predict(test_tweets)
    labels = transform_labels(labels)
    return labels

def train_fasttext(train_file):
    logging.info("start training FT model...")
    temp_ft_model = fasttext.supervised(train_file, TEMP_FT_FILE,
                                        label_prefix='__label__')
    logging.info('training ft finished!')
    return temp_ft_model

def train(self, txt_path, config=DEF_CONFIG):
    if self.mode == "skipgram":
        self.model = fasttext.skipgram(txt_path, self.model_path, **config)
    elif self.mode == "cbow":
        self.model = fasttext.cbow(txt_path, self.model_path, **config)
    elif self.mode == "supervised":
        self.model = fasttext.supervised(txt_path, self.model_path, **config)

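# A hedged usage sketch for the mode dispatcher above. The wrapper's class
# name (FastTextWrapper) and constructor are assumptions for illustration
# only, and DEF_CONFIG is not shown. Note that skipgram/cbow and supervised
# accept different keyword sets in the old fasttext wrapper (e.g.
# label_prefix is supervised-only), so the config must match the mode.
wrapper = FastTextWrapper(mode="supervised", model_path="model")  # hypothetical
wrapper.train("train.txt", config={"label_prefix": "__label__",
                                   "epoch": 5, "lr": 0.1})
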
def _trainModel(self, train_x, train_y, tag_level):
    if not self._loadConfig():
        sys.exit(1)
    # create tmp fasttext train file from sklearn train file
    train_file_name = convertSkTrainFileToFastTextFile(
        train_x, train_y, self._model_path, self._label_prefix)
    if train_file_name is None:
        print('convert train file fail')
        sys.exit(1)  # return False
    try:
        if '1' == tag_level:
            model_name = self._level1_tag_model_name
        elif '2' == tag_level:
            model_name = self._level2_tag_model_name
        else:
            print('Error tag_level: ' + tag_level)
            sys.exit(1)
        self._fasttext = fasttext.supervised(train_file_name, model_name,
                                             label_prefix=self._label_prefix)
    except Exception as e:
        print('train fasttext model fail. ' + str(e))
        sys.exit(1)

def train_model(tdata, model):
    classifier = fasttext.supervised(tdata + '.train', model)
    result = classifier.test(tdata + '.test')
    print('P@1:', result.precision)
    print('R@1:', result.recall)
    print('Number of examples:', result.nexamples)
    return result

def train(data):
    x_test = data[2]
    y_test = data[3]
    clf = fasttext.supervised('data/train_ft.txt', 'model', dim=256, ws=5,
                              neg=5, epoch=100, min_count=10, lr=0.1,
                              lr_update_rate=1000, bucket=200000)
    # use predict to classify the test set
    labels = clf.predict(x_test)
    y_preds = np.array(labels).flatten().astype(int)
    # inspect the results
    print(len(y_test))
    print(y_test)
    print(len(y_preds))
    print(y_preds)
    # AUC score
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_preds, pos_label=1)
    print(metrics.auc(fpr, tpr))

def train():
    classifier = fasttext.supervised('train.txt', 'model',
                                     label_prefix='__label__')
    result = classifier.test('test.txt')
    print('Precision:', result.precision)
    print('Recall:', result.recall)

def fit(self, X, y):
    # Check that X and y have correct shape
    # X, y = check_X_y(X, y)
    # Store the classes seen during fit
    self.classes_ = unique_labels(y)
    self.X_ = X
    self.y_ = y
    input_file = self.store_training_data(X, y)
    self.output = '/tmp/fast-text-model-%s' % os.getpid()
    self.model = fasttext.supervised(
        input_file,
        self.output,
        dim=self.dim,
        lr=self.lr,
        epoch=self.epoch,
        min_count=self.min_count,
        word_ngrams=self.word_ngrams,
        bucket=self.bucket,
        thread=self.thread,
        silent=self.silent,
        label_prefix=self.label_prefix
    )
    # Clean up the temporary training data file:
    os.remove(input_file)
    # Return the classifier
    return self

def char_main():
    fileout_tn = 'data/fasttest_train.txt'
    fileout_val = 'data/fasttest_val.txt'
    fileout_ts = 'data/fasttest_ts.txt'
    # convert_data(fileout_tn, fileout_val, fileout_ts)
    # print('convert data done.')
    classifier = fasttext.supervised(fileout_tn, 'fasttextmodel', epoch=50,
                                     min_count=10, word_ngrams=4, minn=0,
                                     maxn=0, dim=300, ws=5, bucket=2000000)
    """
    0.9817 epoch=25, min_count=10, word_ngrams=4, minn=0, maxn=0, dim=500, ws=5,
    """
    result = classifier.test(fileout_val)
    # print('acc:', result.accuracy)
    print('P@1:', result.precision)
    print('R@1:', result.recall)
    print('Number of examples:', result.nexamples)

def classify(data, labels, test, train, validation):
    train_data = [k for k in data.keys() if k in train]
    train_labels = [labels[k] for k in train_data]
    train_data = [data[k] for k in train_data]
    test_data = [k for k in data.keys() if k in test]
    test_labels = [labels[k] for k in test_data]
    test_data = [data[k] for k in test_data]
    validation_data = [k for k in data.keys() if k in validation]
    validation_labels = [labels[k] for k in validation_data]
    validation_data = [data[k] for k in validation_data]
    save_training_file(train_data, train_labels)
    cls = fasttext.supervised('training.txt', 'model',
                              lr_update_rate=200000, epoch=10, lr=0.3)
    predicted = [int(x[0]) for x in cls.predict(validation_data)]
    remove_training_file()
    precision, recall, f1, _ = precision_recall_fscore_support(
        validation_labels, predicted, average='binary')
    return {
        'accuracy': float("{:.3f}".format(round(precision, 3))),
        'recall': float("{:.3f}".format(round(recall, 3))),
        'f1': float("{:.3f}".format(round(f1, 3)))
    }

def url_classification_fasttext():
    url_cat, url_content = text_utility.get_documents(current_path="data",
                                                      pattern="train*.xlsx")
    with open("fasttext_train_nocutall.txt", mode="w") as fd:
        format_data(url_cat, url_content, fd)
        fd.flush()
    classifier = fasttext.supervised(input_file="fasttext_train_nocutall.txt",
                                     output="fasttext_nocutall.model",
                                     label_prefix="__label__")
    # classifier = fasttext.load_model("fasttext.model.bin", label_prefix="__label__")
    print(len(classifier.labels))
    for class_name in classifier.labels:
        print(class_name)
    texts = list()
    with open("test.txt", mode="r") as fd:
        for line in fd:
            line = line.strip()
            segs = line.split(',')
            if len(segs) != 6:
                continue
            url, title, keywords, desc, a_content, p_content = line.split(',')
            content = " ".join([title, keywords, desc, a_content, p_content])
            word_vec = [word for word in jieba.cut(content, cut_all=False)]
            if len(word_vec) == 0:
                continue
            test_content = " ".join(word_vec)
            print(url, test_content)
            texts.append(test_content)
    # predict expects a list of strings as input
    label_list = classifier.predict_proba(texts, len(classifier.labels))
    for label in label_list:
        for value in label:
            print(value[0], value[1])

def train(self):
    """
    Training function
    :return:
    """
    start_time = time.time()
    training_file = self.config.train_file
    save_path = self.config.save_model
    print('Start training model, training file: %s, saved model path: %s.'
          % (training_file, save_path), file=sys.stderr)
    classifier = fasttext.supervised(training_file, save_path,
                                     label_prefix='__label__', dim=100,
                                     word_ngrams=2, bucket=2000000,
                                     loss='softmax')
    end_time = time.time()
    print('Training over. cost %.2fs' % (end_time - start_time), file=sys.stderr)
    return classifier

def main():
    trainDataPath = "/home/singh/Desktop/emocontext/starterkitdata/train.txt"
    testDataPath = "/home/singh/Desktop/emocontext/starterkitdata/devwithoutlabels.txt"
    solutionPath = "/home/singh/Desktop/emocontext/fast_text/fast_text/test3.txt"
    print("Processing training data...")
    # data_train = preprocessDatatrain(trainDataPath, mode="train")
    print("Processing test data...")
    # data_test = preprocessDatatest(testDataPath, mode="test")
    list1 = preprocessDatalist(testDataPath, mode="test")
    classifier = fasttext.supervised(
        input_file="fasttext_dataset_training.txt",
        output='model/model3',
        dim=300,
        lr=0.01,
        epoch=30)
    labels = classifier.predict(list1)
    # print(list1[:5], labels[:5])
    with io.open(solutionPath, "w", encoding="utf8") as fout:
        fout.write('\t'.join(["id", "turn1", "turn2", "turn3", "label"]) + '\n')
        with io.open(testDataPath, encoding="utf8") as fin:
            fin.readline()
            for lineNum, line in enumerate(fin):
                fout.write('\t'.join(line.strip().split('\t')[:4]) + '\t')
                fout.write(label2emotion[int(labels[lineNum][0])] + '\n')

def train(self, train, test, model, **kwargs):
    params = {
        'dim': [100, 150],
        'lr': [0.1, 0.5, 1],
        'loss': ['ns', 'hs'],
        'ws': [5, 10]
    }
    for k, v in kwargs.items():
        params[k] = v
    keys, values = list(params.keys()), list(params.values())
    best, best_score = '', 0
    for p in product(*values):
        ps = {keys[i]: p[i] for i in range(len(keys))}
        name = '%s_%s_%s_%s' % (p[0], p[1], p[2], p[3])
        clf = fasttext.supervised(train, model + name, **ps)
        result = clf.test(test)
        print(name)
        print('Precision: %.2f%%' % (result.precision * 100))
        print('Recall Rate: %.2f%%\n' % (result.recall * 100))
        f1 = float(2.0 * result.precision * result.recall) / float(result.precision + result.recall)
        if best_score < f1:
            best, best_score = name, f1
    print('%s\n%.2f' % (best, best_score))

def train():
    classifier = fasttext.supervised('train80ft_mecab.txt', 'classify_title',
                                     dim=100, epoch=50, bucket=200000,
                                     word_ngrams=2)

def train_model(train_data_path, test_data_path, model_save_path):
    t1 = time.time()
    classifier = fasttext.supervised(train_data_path, model_save_path,
                                     label_prefix="__label__")
    t2 = time.time()
    print('train model over. it took {0:.2f}s'.format(t2 - t1))
    result = classifier.test(test_data_path)
    print("P@1:", result.precision)  # precision
    print("R@1:", result.recall)  # recall
    print("Number of examples:", result.nexamples)  # number of evaluated examples

    # predict on the test data
    y_true, y_pred = [], []
    with open(test_data_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip()
            parts = line.split(' , ')
            if len(parts) != 2:
                continue
            cls, txt = parts[0], parts[1]
            prediction = classifier.predict([txt])
            y_pred.append(prediction[0][0])
            y_true.append(cls.replace('__label__', '').strip())

    # print per-class test metrics
    print(y_true[:10], y_pred[:10])
    classify_report = metrics.classification_report(y_true, y_pred)
    print(classify_report)

def train(self):
    classifier = fasttext.supervised('data/train_fasttext.txt',
                                     'save_model/fast_text/fasttext_Model',
                                     label_prefix='__label__')
    result = classifier.test('data/train_fasttext.txt')
    print("pre:" + str(result.precision))
    print("recall:" + str(result.recall))

def execute():
    # Verify that mandatory arguments are present
    if "-i" not in args:
        return "ERROR: No input file was given"
    if "-t" not in args:
        return "ERROR: No model type was given"
    # Extract arguments
    train_file = args[args.index("-i") + 1]
    model_type = args[args.index("-t") + 1]
    # Extract optional arguments
    epoch = get_optional_param('--epoch', 5)
    ngrams = get_optional_param('--ngrams', 1)
    label_prefix = get_optional_param('--label', '__label__')
    # Create temporary file
    tmp, modelname = tempfile.mkstemp()
    # Use the specified classifier with parameters and output the model to the
    # name of the temporary file; label_prefix only applies to supervised
    # training, so it is not passed to skipgram/cbow
    if model_type == "supervised":
        classifier = fasttext.supervised(train_file, modelname, epoch=epoch,
                                         word_ngrams=ngrams,
                                         label_prefix=label_prefix)
    elif model_type == "skipgram":
        classifier = fasttext.skipgram(train_file, modelname, epoch=epoch,
                                       word_ngrams=ngrams)
    elif model_type == "cbow":
        classifier = fasttext.cbow(train_file, modelname, epoch=epoch,
                                   word_ngrams=ngrams)
    # Return the temporary file name
    return modelname

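# A hedged usage sketch for execute() above: args is assumed to be a global,
# argv-style list scanned by position (as the function does); the surrounding
# module and get_optional_param are not shown, so this only illustrates the
# expected flag layout.
args = ["-i", "train.txt", "-t", "supervised"]
model_prefix = execute()  # trains and returns the temp-file model prefix
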
def fast_cv(df):
    X = df['Discuss'].values
    y = df['Score'].values
    fast_pred = []
    folds = list(KFold(n_splits=5, shuffle=True, random_state=2018).split(X, y))
    for train_index, test_index in folds:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        train_file = fasttext_data(X_train, y_train)
        classifier = fasttext.supervised(train_file, '../input/model', lr=0.01,
                                         dim=128, label_prefix="__label__",
                                         encoding='utf-8-sig')
        result = classifier.predict_proba(df.loc[test_index, 'Discuss'].tolist(), k=5)
        print(result[0:100])
        pred = [[int(sco) * proba for sco, proba in result_i] for result_i in result]
        pred = [sum(pred_i) for pred_i in pred]
        print(pred[0:100])
        print(rmsel(y_test, pred))
        test_result = classifier.predict_proba(test_df['Discuss'].tolist(), k=5)
        fast_predi = [[int(sco) * proba for sco, proba in result_i] for result_i in test_result]
        fast_predi = [sum(pred_i) for pred_i in fast_predi]
        fast_pred.append(fast_predi)
    fast_pred = np.array(fast_pred)
    fast_pred = np.mean(fast_pred, axis=0)
    return fast_pred

def classify(**kvargs):
    classifier = ft.supervised("news_fasttext_train.txt",
                               "news_fasttext.model",
                               label_prefix="__label__")
    # classifier = ft.load_model(
    #     'news_fasttext.model.bin', label_prefix='__label__')
    news = News.objects.all()[:20]
    for new in news:
        text = new.n_abs
        seg_text = jieba.cut(text.replace("\t", " ").replace("\n", " "))
        outline = " ".join(seg_text)
        texts = [outline.encode("utf8")]
        labels = classifier.predict(texts)
        print(text + ":" + labels[0][0])

def train(cls, input_file, output, **kwargs):
    """
    Train the model

    * input_file          training file path (required)
    * output              output file path (required)
    * label_prefix        label prefix ['__label__']
    * lr                  learning rate [0.1]
    * lr_update_rate      change the rate of updates for the learning rate [100]
    * dim                 size of word vectors [100]
    * ws                  size of the context window [5]
    * epoch               number of epochs [5]
    * min_count           minimal number of word occurrences [1]
    * neg                 number of negatives sampled [5]
    * word_ngrams         max length of word ngram [1]
    * loss                loss function {ns, hs, softmax} [softmax]
    * bucket              number of buckets [0]
    * minn                min length of char ngram [0]
    * maxn                max length of char ngram [0]
    * thread              number of threads [12]
    * t                   sampling threshold [0.0001]
    * silent              disable the log output from the C++ extension [1]
    * encoding            specify input_file encoding [utf-8]
    * pretrained_vectors  pretrained word vectors (.vec file) for supervised learning []
    """
    config = get_config()
    kwargs.setdefault('lr', config.get('model', 'lr'))
    kwargs.setdefault('lr_update_rate', config.get('model', 'lr_update_rate'))
    kwargs.setdefault('dim', config.get('model', 'dim'))
    kwargs.setdefault('ws', config.get('model', 'ws'))
    kwargs.setdefault('epoch', config.get('model', 'epoch'))
    kwargs.setdefault('word_ngrams', config.get('model', 'word_ngrams'))
    kwargs.setdefault('loss', config.get('model', 'loss'))
    kwargs.setdefault('bucket', config.get('model', 'bucket'))
    kwargs.setdefault('thread', config.get('model', 'thread'))
    kwargs.setdefault('silent', config.get('model', 'silent'))
    cls.__model = ft.supervised(input_file, output, **kwargs)
    return cls.__model

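# A hedged usage sketch for the classmethod above: the owning class is not
# shown, so Classifier is a placeholder name, and the config file behind
# get_config() is assumed to have a [model] section with the defaulted keys.
model = Classifier.train('train.txt', 'model', epoch=10, dim=100)
result = model.test('test.txt')  # the returned object is a fasttext classifier
print(result.precision, result.recall)
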
def main():
    data_path = '/Users/ruizhang/Documents/NLP_dataset/'

    # Load train set
    train_file = data_path + 'dbpedia_csv/train.csv'
    df = pd.read_csv(train_file, header=None,
                     names=['class', 'name', 'description'])

    # Load test set
    test_file = data_path + 'dbpedia_csv/test.csv'
    df_test = pd.read_csv(test_file, header=None,
                          names=['class', 'name', 'description'])

    # Mapping from class number to class name
    class_dict = {
        1: 'Company',
        2: 'EducationalInstitution',
        3: 'Artist',
        4: 'Athlete',
        5: 'OfficeHolder',
        6: 'MeanOfTransportation',
        7: 'Building',
        8: 'NaturalPlace',
        9: 'Village',
        10: 'Animal',
        11: 'Plant',
        12: 'Album',
        13: 'Film',
        14: 'WrittenWork'
    }
    df['class_name'] = df['class'].map(class_dict)
    df.head()

    desc = df.groupby('class')
    desc.describe().transpose()

    # Transform datasets
    df_train_clean = clean_dataset(df, True, False)
    df_test_clean = clean_dataset(df_test, False, False)

    # Write files to disk
    train_file_clean = data_path + 'dbpedia.train'
    df_train_clean.to_csv(train_file_clean, header=None, index=False,
                          columns=['class', 'name', 'description'])
    test_file_clean = data_path + 'dbpedia.test'
    df_test_clean.to_csv(test_file_clean, header=None, index=False,
                         columns=['class', 'name', 'description'])

    # Train a classifier
    output_file = data_path + 'dp_model'
    classifier = fasttext.supervised(train_file_clean, output_file,
                                     label_prefix='__label__')
    result = classifier.test(test_file_clean)
    print('P@1:', result.precision)
    print('R@1:', result.recall)
    print('Number of examples:', result.nexamples)

    sentence1 = ['Picasso was a famous painter born in Malaga, Spain. He revolutionized the art in the 20th century.']
    labels1 = classifier.predict(sentence1)
    class1 = int(labels1[0][0])
    print("Sentence: ", sentence1[0])
    print("Label: %d; label name: %s" % (class1, class_dict[class1]))

    sentence2 = ['One of my favourite tennis players in the world is Rafa Nadal.']
    labels2 = classifier.predict_proba(sentence2)
    class2, prob2 = labels2[0][0]  # it returns class2 as string
    print("Sentence: ", sentence2[0])
    print("Label: %s; label name: %s; certainty: %f"
          % (class2, class_dict[int(class2)], prob2))

    sentence3 = ['Say what one more time, I dare you, I double-dare you m**********r!']
    number_responses = 3
    labels3 = classifier.predict_proba(sentence3, k=number_responses)
    print("Sentence: ", sentence3[0])
    for l in range(number_responses):
        class3, prob3 = labels3[0][l]
        print("Label: %s; label name: %s; certainty: %f"
              % (class3, class_dict[int(class3)], prob3))

    # Load train set
    train_file = data_path + 'amazon_review_polarity_train.csv'
    df_sentiment_train = pd.read_csv(train_file, header=None,
                                     names=['class', 'name', 'description'])

    # Load test set
    test_file = data_path + 'amazon_review_polarity_test.csv'
    df_sentiment_test = pd.read_csv(test_file, header=None,
                                    names=['class', 'name', 'description'])

    # Transform datasets
    df_train_clean = clean_dataset(df_sentiment_train, True, False)
    df_test_clean = clean_dataset(df_sentiment_test, False, False)

    # Write files to disk
    train_file_clean = data_path + 'amazon.train'
    df_train_clean.to_csv(train_file_clean, header=None, index=False,
                          columns=['class', 'name', 'description'])
    test_file_clean = data_path + 'amazon.test'
    df_test_clean.to_csv(test_file_clean, header=None, index=False,
                         columns=['class', 'name', 'description'])

    dim = 10
    lr = 0.1
    epoch = 5
    min_count = 1
    word_ngrams = 2
    bucket = 10000000
    thread = 12
    label_prefix = '__label__'

    # Train a classifier
    output_file = data_path + 'amazon_model'
    classifier = fasttext.supervised(train_file_clean, output_file, dim=dim,
                                     lr=lr, epoch=epoch, min_count=min_count,
                                     word_ngrams=word_ngrams, bucket=bucket,
                                     thread=thread, label_prefix=label_prefix)

    # Evaluate classifier
    result = classifier.test(test_file_clean)
    print('P@1:', result.precision)
    print('R@1:', result.recall)
    print('Number of examples:', result.nexamples)

    class_dict = {
        1: "Negative",
        2: "Positive"
    }

    sentence1 = ["The product design is nice but it's working as expected"]
    labels1 = classifier.predict_proba(sentence1)
    class1, prob1 = labels1[0][0]  # it returns class as string
    print("Sentence: ", sentence1[0])
    # print("Label: %s; label name: %s; certainty: %f" % (class1, class_dict[int(class1)], prob1))

    sentence2 = ["I bought the product a month ago and it was working correctly. But now is not working great"]
    labels2 = classifier.predict_proba(sentence2)
    class2, prob2 = labels2[0][0]  # it returns class as string
    print("Sentence: ", sentence2[0])
    # print("Label: %s; label name: %s; certainty: %f" % (class2, class_dict[int(class2)], prob2))

    url = "https://twitter.com/miguelgfierro/status/805827479139192832"
    response = urlopen(url).read()
    title = str(response).split('<title>')[1].split('</title>')[0]
    print(title)

    # # Format tweet
    # tweet = unescape(title)
    # print(tweet)
    #
    # # Classify tweet
    # label_tweet = classifier.predict_proba([tweet])
    # class_tweet, prob_tweet = label_tweet[0][0]
    # print("Label: %s; label name: %s; certainty: %f" % (class_tweet, class_dict[int(class_tweet)], prob_tweet))

    wiki_dataset_original = data_path + 'enwik9'
    wiki_dataset = data_path + 'text9'
    if not os.path.isfile(wiki_dataset):
        os.system("perl wikifil.pl " + wiki_dataset_original + " > " + wiki_dataset)
    output_skipgram = data_path + 'skipgram'
    if os.path.isfile(output_skipgram + '.bin'):
        skipgram = fasttext.load_model(output_skipgram + '.bin')
    else:
        skipgram = fasttext.skipgram(wiki_dataset, output_skipgram, lr=0.02,
                                     dim=50, ws=5, epoch=1, min_count=5,
                                     neg=5, loss='ns', bucket=2000000, minn=3,
                                     maxn=6, thread=4, t=1e-4,
                                     lr_update_rate=100)
    # Get the vector of some word
    print(np.asarray(skipgram['king']))
    print("Number of words in the model: ", len(skipgram.words))

    # Element-wise distances between word vectors
    Droyals = np.sqrt(pow(np.asarray(skipgram['king']) - np.asarray(skipgram['queen']), 2)).sum()
    print(Droyals)
    Dpeople = np.sqrt(pow(np.asarray(skipgram['king']) - np.asarray(skipgram['woman']), 2)).sum()
    print(Dpeople)
    Dpeople2 = np.sqrt(pow(np.asarray(skipgram['man']) - np.asarray(skipgram['woman']), 2)).sum()
    print(Dpeople2)
    print(len(skipgram.words))

    targets = ['man', 'woman', 'king', 'queen', 'brother', 'sister', 'father',
               'mother', 'grandfather', 'grandmother', 'cat', 'dog', 'bird',
               'squirrel', 'horse', 'pig', 'dove', 'wolf', 'kitten', 'puppy']
    classes = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
    X_target = []
    for w in targets:
        X_target.append(skipgram[w])
    X_target = np.asarray(X_target)
    word_list = list(skipgram.words)[:10000]
    X_subset = []
    for w in word_list:
        X_subset.append(skipgram[w])
    X_subset = np.asarray(X_subset)
    X_target = np.concatenate((X_subset, X_target))
    print(X_target.shape)
    X_tsne = TSNE(n_components=2, perplexity=40, init='pca', method='exact',
                  random_state=0, n_iter=200, verbose=2).fit_transform(X_target)
    print(X_tsne.shape)
    X_tsne_target = X_tsne[-20:, :]
    print(X_tsne_target.shape)
    plot_words(X_tsne_target, targets, classes=classes)
    plot_words(X_tsne_target, targets, xlimits=[0.5, 0.7], ylimits=[-3.7, -3.6])

train_file = '/home/alex/data/hsbianma_page10_sogou_huaxue.csv'
train_file_output = '/home/alex/data/hsbianma_page10_sogou_huaxue.fasttext.txt'


def format_train_file(csv_file, train_file):
    with open(csv_file) as r, open(train_file, 'w') as w:
        csv_r = csv.DictReader(r, fieldnames=('kind', 'content'))
        for row in csv_r:
            row['content'] = row['content'].strip()
            if "\n" in row['content']:
                continue
            words = jieba.cut(row['content'])
            w.write('__label__' + row['kind'][:4] + ' ' + ' '.join(words) + "\n")


# Format the training file
format_train_file(train_file, train_file_output)

# Train
classifier = fasttext.supervised(train_file_output, 'classify_model')

# Test
result = classifier.test(train_file_output)
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)

# Predict
classifier.predict([' '.join(jieba.cut("锦纶天丝面料"))], k=3)

import fasttext

# Train the model
classifier = fasttext.supervised("train.txt", "model",
                                 label_prefix="__label__", dim=100, epoch=2,
                                 word_ngrams=1, min_count=1, lr=0.1,
                                 bucket=200000)

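# A minimal follow-up sketch (not in the original snippet): reload the saved
# model and classify new text. "model.bin" is the file fasttext writes for
# the "model" output prefix above; the input strings are assumed to be
# pre-tokenized the same way as the training data.
classifier = fasttext.load_model("model.bin", label_prefix="__label__")
print(classifier.predict(["some tokenized text"], k=1))
print(classifier.predict_proba(["some tokenized text"], k=2))
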
# -*- coding: utf-8 -*-
#
# Author: alex
# Created Time: Sunday, 2017-09-10 13:18:07
import csv
import jieba
import fasttext

train_file = 'fasttext.train.txt'
test_file = 'fasttext.test.txt'

# Train
classifier = fasttext.supervised(train_file, 'classify_model',
                                 lr=1.0,
                                 epoch=30,
                                 # word_ngrams=2,
                                 loss='hs')

# Test on the training set
result = classifier.test(train_file)
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)

# Test on the held-out test set
result = classifier.test(test_file)
print('P@1:', result.precision)
print('R@1:', result.recall)
print('Number of examples:', result.nexamples)