class FastTextEmbedding(Embedding):
    """Embedding that serves word vectors from a pyfasttext model file.

    The model is not read in the constructor; call :meth:`load` first.
    """

    def __init__(self, binfile, normalize = False):
        """Remember the model path and whether vectors should be normalized."""
        self.normalize = normalize
        self.file = binfile
        # Placeholder until load() measures the real vector length.
        self.vdim = -1

    def load(self):
        """Read the model from ``self.file`` and return ``self`` for chaining."""
        print('Loading fasttext model.')
        self.ftmodel = FastText()
        self.ftmodel.load_model(self.file)
        # The dimension is taken from the vector of the probe word 'is'.
        probe_vector = self.ftmodel['is']
        self.vdim = len(probe_vector)
        print('Finished loading fasttext model.')
        return self

    def getVector(self, word):
        """Return the numpy vector for ``word`` (normalized if configured)."""
        use_normalized = self.normalize
        return self.ftmodel.get_numpy_vector(word, normalized = use_normalized)

    def search(self, q, topk = 4):
        """Not supported by this embedding; always raises."""
        raise NotImplementedError()

    def wordForVec(self, v):
        """Return the first ``(word, similarity)`` match for the vector ``v``."""
        matches = self.ftmodel.words_for_vector(v)
        word, sim = matches[0]
        return word, sim

    def containsWord(self, word):
        """Always report ``True``; no vocabulary lookup is performed."""
        return True

    def vocabulary(self):
        """Return the model's ``words`` attribute."""
        return self.ftmodel.words

    def dim(self):
        """Return the vector length recorded by load() (-1 before loading)."""
        return self.vdim
def predict(self, test_set, test_labels_vector=None, report_accuracy=True):
    """Predict labels for ``test_set`` with the trained model and optionally
    print the prediction accuracy.

    The model is loaded from ``'ft_extras/' + self.model_name + '.bin'``.
    Nothing is returned; the accuracy is only printed.

    :param test_set: the test set, passed as-is to ``predict_proba``
    :param test_labels_vector: expected label for each test item, indexed in
        the same order as the predictions; required for accuracy reporting
    :param report_accuracy: whether to compute and print the accuracy
    """
    # Guard clause: without a model name there is nothing to load.
    if not self.model_name:
        print('Please use the train method to train a model first.')
        return

    # Imported lazily so the dependency is only needed when predicting.
    from pyfasttext import FastText

    predictor = FastText()
    predictor.load_model('ft_extras/' + self.model_name + '.bin')
    predicted_labels = predictor.predict_proba(test_set)

    # Explicit None/length check: plain truthiness raises for numpy arrays
    # holding more than one element.
    has_labels = (test_labels_vector is not None
                  and len(test_labels_vector) > 0)
    if not (report_accuracy and has_labels):
        return

    correct_predictions = 0
    invalid_labels = 0
    for index, labels in enumerate(predicted_labels):
        if len(labels) == 0:
            # Items without any predicted label are left out of the score.
            invalid_labels += 1
            continue
        # Each entry is indexed as (label, probability); keep the most
        # probable one.
        best_label = max(labels, key=lambda label: label[1])
        if best_label[0] == test_labels_vector[index]:
            correct_predictions += 1

    scored_items = len(test_set) - invalid_labels
    if scored_items <= 0:
        # Previously this case crashed with ZeroDivisionError.
        print('Prediction accuracy: n/a (no valid predictions)\n')
        return
    print('Prediction accuracy:{}\n'.format(correct_predictions / scored_items))
def get_language(text):
    """Return the language-identification prediction for a single text.

    The model is loaded from ``settings.LID_MODEL_PATH`` on the first call
    and cached on ``settings._lang_detector`` for later calls. ``text`` is
    wrapped in a one-element list for ``predict`` and the first (only) entry
    of the result is returned.
    """
    needs_loading = not hasattr(settings, '_lang_detector')
    if needs_loading:
        detector = FastText()
        detector.load_model(settings.LID_MODEL_PATH)
        settings._lang_detector = detector

    predictions = settings._lang_detector.predict([text])
    return predictions[0]
def pyfasttext_sample():
    """Print predictions for every line of the cooking validation file.

    Library reference: https://pypi.org/project/pyfasttext/
    """
    classifier = FastText()
    # The compressed .ftz model is used here; an uncompressed
    # 'output/model_cooking_6.bin' was the previously tried alternative.
    classifier.load_model('output/model_cooking_5.ftz')

    # Second positional argument is 2 — presumably the number of labels
    # requested per line (``k`` in the pyfasttext docs); confirm there.
    predictions = classifier.predict_file('data/cooking/pre_cooking.valid', 2)
    for line_number, labels in enumerate(predictions):
        print(line_number, labels)
def init():
    """Populate the module-level helpers used by the rest of the module.

    Sets, in this order: ``processtext``, ``labels_list`` (unpickled from
    both_labels.pkl), ``contcmp`` and ``fasttext_model``.
    """
    global processtext, labels_list, contcmp, fasttext_model

    processtext = ProcessText()

    # NOTE: pickle.load can execute code embedded in the file, so
    # both_labels.pkl must come from a trusted source.
    with open("both_labels.pkl", "rb") as labels_file:
        labels_list = pickle.load(labels_file)

    contcmp = ContCmp("root_feature_file.allid")

    model_path = '3Ngram_3mincount_1wminlabel.bin'
    fasttext_model = FastText()
    fasttext_model.load_model(model_path)
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    return numerator / denominator if denominator else 0.0


def _f1_score(precision, recall):
    """Return the harmonic mean of precision and recall (0.0 if both are 0)."""
    return _safe_ratio(2 * precision * recall, precision + recall)


def validate_model(model_file, validate_file):
    """Score a trained model against a labelled validation file.

    The model is loaded from ``model_file + '.bin'``. ``validate_file`` is
    read as a header-less, tab-separated file with the columns ``label``
    (``__label__GOOD`` / ``__label__BAD``) and ``review``; each review is
    classified with ``predict_label`` and compared with its label. Rows with
    any other label or prediction (e.g. NEUTRAL) only affect the totals they
    naturally fall into; no NEUTRAL metric is computed.

    Any ratio whose denominator is zero (a class that is never predicted or
    never present, or precision + recall == 0) is reported as 0.0 instead of
    raising ``ZeroDivisionError``.

    :param model_file: path of the model file without the ``.bin`` suffix
    :param validate_file: path of the tab-separated validation file
    :return: tuple ``(micro_precision, micro_recall, micro_f1,
        macro_precision, macro_recall, macro_f1)``. Note that the values
        named ``micro_*`` are the precision, recall and F1 of the GOOD class
        only; the ``macro_*`` values average GOOD and BAD.
    """
    model = FastText()
    model.load_model(model_file + '.bin')

    validate = pd.read_csv(validate_file,
                           sep='\t',
                           names=['label', 'review'],
                           lineterminator='\n')
    validate['predict'] = validate.apply(
        lambda row: predict_label(model, row['review']), axis=1)

    predicted_good = validate['predict'] == 'GOOD'
    predicted_bad = validate['predict'] == 'BAD'
    labelled_good = validate['label'] == '__label__GOOD'
    labelled_bad = validate['label'] == '__label__BAD'

    # Plain ints keep the arithmetic below in ordinary Python numbers.
    good_good = int((predicted_good & labelled_good).sum())
    bad_bad = int((predicted_bad & labelled_bad).sum())
    all_good = int(labelled_good.sum())
    all_bad = int(labelled_bad.sum())
    good_all = int(predicted_good.sum())
    bad_all = int(predicted_bad.sum())

    good_precision = _safe_ratio(good_good, good_all)
    good_recall = _safe_ratio(good_good, all_good)
    bad_precision = _safe_ratio(bad_bad, bad_all)
    bad_recall = _safe_ratio(bad_bad, all_bad)

    micro_precision = good_precision
    micro_recall = good_recall
    micro_f1 = _f1_score(micro_precision, micro_recall)

    macro_precision = (good_precision + bad_precision) / 2
    macro_recall = (good_recall + bad_recall) / 2
    macro_f1 = _f1_score(macro_precision, macro_recall)

    return (micro_precision, micro_recall, micro_f1,
            macro_precision, macro_recall, macro_f1)
class FastTextEmbedding(Embedding):
    """Embedding that serves word vectors from a pyfasttext model file.

    The model is not read in the constructor; call :meth:`load` before using
    any method that touches ``self.ftmodel``.
    """

    def __init__(self, binfile, normalize=False):
        """Store the model path and the normalization flag.

        :param binfile: path of the fastText model file to load later
        :param normalize: passed as ``normalized`` when fetching vectors
        """
        self.file = binfile
        # Placeholder until load() measures the real vector length.
        self.vdim = -1
        self.normalize = normalize

    def load(self):
        """Load the model from ``self.file`` and return ``self`` for chaining."""
        print('Loading fasttext model.')
        self.ftmodel = FastText()
        self.ftmodel.load_model(self.file)
        # The dimension is taken from the vector of the probe word 'is'.
        self.vdim = len(self.ftmodel['is'])
        print('Finished loading fasttext model.')
        return self

    def getVector(self, word):
        """Return the numpy vector for ``word`` (normalized if configured)."""
        return self.ftmodel.get_numpy_vector(word, normalized=self.normalize)

    def wordForVec(self, v):
        """Return the first ``(word, similarity)`` match for the vector ``v``."""
        word, sim = self.ftmodel.words_for_vector(v)[0]
        return word, sim

    def nearest_neighbors(self, word, n=200):
        """Return the model's ``n`` nearest neighbours of ``word``."""
        # Fixed: the model lives on the instance; bare ``ftmodel`` raised
        # NameError.
        return self.ftmodel.nearest_neighbors(word, n)

    def nearest_neighbors_by_vector(self, word, n=200):
        """Return the ``n`` words the model matches to a vector.

        :param word: despite its name (kept for backward compatibility) this
            is the query *vector* handed to ``words_for_vector``
        :param n: number of matches to request
        """
        # Fixed: the body referenced an undefined name ``v`` instead of the
        # parameter, so every call raised NameError.
        return self.ftmodel.words_for_vector(word, n)

    def containsWord(self, word, explicit=False):
        """Report whether ``word`` is available.

        With ``explicit=False`` this is always ``True``; with
        ``explicit=True`` the word must appear in :meth:`vocabulary`.
        """
        if explicit:
            # Fixed: ``vocabulary`` is a method and must be called on self.
            return word in self.vocabulary()
        return True

    def vocabulary(self):
        """Return the model's ``words`` attribute."""
        return self.ftmodel.words

    def dim(self):
        """Return the vector length recorded by load() (-1 before loading)."""
        return self.vdim
# NOTE(review): the statements down to the logger call look like the tail of
# a function whose header lies outside the visible source (the log line
# reports the current frame's code name) — confirm the original indentation.

# Join business_raw with review_processed_short on the business_id column.
business_review_score = business_raw.merge(review_processed_short,
                                           on=['business_id'])
# Keep only the columns listed here, in this order.
business_review_score = business_review_score[[
    'business_id', 'name', 'city', 'state', 'stars', 'review_count',
    'categories', 'review_stars', 'text', 'predict_score'
]]
# Keep the rows whose stars value is one of '1' or '5'.
business_review_score_short = business_review_score.query(
    " stars in ('1', '5')")
# Written tab-separated, without the index and without a header row.
business_review_score_short.to_csv(const.business_review_score_file_name,
                                   sep='\t',
                                   index=False,
                                   header=False)
logger.info('end process:' + sys._getframe().f_code.co_name)

# Script entry point: the first command-line argument is the model file path.
if __name__ == '__main__':
    if len(sys.argv) <= 1:
        # No model path given: abort with a usage hint.
        raise Exception(
            'Need to run command as python assign.py [path_to_model_bin_file]')
    else:
        model_file = sys.argv[1]
        model = FastText()
        model.load_model(model_file)
        # The loaded model is handed to the first step only; the second step
        # is called without arguments.
        assign_score_to_review(model)
        assign_score_to_review_with_text()
import math
import json
import string

N_GRAM = 3
WORD_SIZE = 6
# Length of the sentence vector built by sent2Vec.
VECTOR_NUM = 300
MODEL_PATH = 'model.bin'

# Translation table that replaces each character of the first string with the
# character at the same position in the second string and deletes the
# characters of the third string.
# NOTE(review): 'I' occurs twice in the first string, so only its second
# mapping ('I' -> 'i') takes effect — confirm whether the second occurrence
# was meant to be a different character.
TO_LOWER = str.maketrans('ABCÇDEFGĞHIIJKLMNOÖPQRSŞTUÜVWXYZ',
                         'abcçdefgğhıijklmnoöpqrsştuüvwxyz', '’“”')

# The model file is loaded at import time.
print('Vektörlerimiz Yükleniyor... Lütfen Bekleyiniz...')
model = FastText()
model.load_model(MODEL_PATH)

# def load_model():
#     model = FastText()
#     model.load_model(MODEL_PATH)
#     return model


# From word vectors to a sentence vector.
def sent2Vec(sentence):
    # Accumulator with one slot per vector component, starting at zero.
    sentenceVector = [0.0] * VECTOR_NUM
    # Words are obtained by splitting on whitespace.
    split_sentence = sentence.split()
    for word in split_sentence:
        wordVector = model[word]
        # Add the word vector to the accumulator component by component.
        for i in range(VECTOR_NUM):
            sentenceVector[i] += wordVector[i]
    # NOTE(review): the visible source ends here; the rest of the function
    # (including whatever it returns) is outside this view.
def loadModel():
    """Create the module-level ``fasttext_model`` and load its model file."""
    global fasttext_model

    model_path = '3Ngram_3mincount_1wminlabel.bin'
    # The global is bound before loading, so it exists even if loading fails.
    fasttext_model = FastText()
    fasttext_model.load_model(model_path)
from sentiment_analysis import load_sentiment_model, load_tagging_model, sentiment_analysis
from mem_absa.load_data import init_word_embeddings
from mem_absa.load_data import read_sample
from mem_absa.mapping import mapping_sentiments
import configuration as configuration
import mysql.connector
import spacy
from pyfasttext import FastText

# Loaded once at import time: the spaCy "fr" pipeline and the fastText model
# whose path comes from configuration.pathFasttext.
fr_nlp = spacy.load("fr")
wiki_model = FastText()
wiki_model.load_model(configuration.pathFasttext)

# Connection to the database (opened at import time).
conn = mysql.connector.connect(host="localhost",
                               user="******",
                               password="******",
                               database="resolution")


def generate_review_data():
    cursor = conn.cursor()
    # Fetch up to 300 distinct code_etab values for code_an8 = 54053000.
    cursor.execute(
        "select distinct code_etab from resolution.contributions_etab_nihel where code_an8=54053000 limit 300"
    )
    etabs = cursor.fetchall()
    data = []
    # NOTE(review): the visible source ends here; the rest of the function is
    # outside this view.
import jieba

# Local service endpoints.
SERVICE_URL_SS = "http://127.0.0.1:8001/service"
SERVICE_URL_ZD2 = "http://127.0.0.1:8002/service"  # coarse granularity
SERVICE_URL_ZD = "http://127.0.0.1:8006/service"  # fine granularity
SERVICE_URL_ZD_SEG = "http://127.0.0.1:8006/seg"  # fine granularity
# SERVICE_URL_SS = "http://172.19.91.91:8001/service"
# SERVICE_URL_ZD = "http://172.19.91.91:8002/service"
HEADERS = {'content-type': 'application/json'}

# bcjl_model: coarse word-segmentation granularity (e.g. 肾积水 kept whole);
# bcjl_model1: fine word-segmentation granularity (e.g. 肾/积水 split in two).
ft_model=FastText()
ft_model.load_model('../../model/bcjl_model.bin')
# ft_model=""

SYN_THRESHOLD=0.75
SIMILARITY_THRESHOLD=0.8

# MongoDB client on localhost:27017; `db` is its bzxt1 database.
conn = MongoClient('localhost', 27017)
db = conn.bzxt1


def file_compare(f1_path,f2_path):
    # Compare whether two files are identical.
    def get_file_md5(f):
        m = hashlib.md5()
        while True:
            # If the file is not opened in binary mode, the data has to be
            # encoded first:
            #data = f.read(1024).encode('utf-8')
            # NOTE(review): the visible source ends here; the rest of the
            # loop and of file_compare is outside this view.
import re
from mem_absa.load_data import init_word_embeddings
from mem_absa.load_data import read_sample, read_vocabulary
from mem_absa.mapping import mapping_sentiments
from mem_absa.model import MemN2N
from mem_absa.config_mem import Configure
import spacy

# Loaded once at import time.
fr_nlp = spacy.load("fr")

# Flags are obtained from the Configure object for the path "..".
path = ".."
configure = Configure()
FLAGS = configure.get_flags(path)

from pyfasttext import FastText

# fastText model loaded at import time from the path held in the flags.
wiki_model = FastText()
wiki_model.load_model(FLAGS.pathFasttext)


def main(_):
    # Pretty-print the flag values.
    configure.pp.pprint(FLAGS.__flags)
    # Both containers are passed to read_vocabulary together with the
    # training data path — presumably filled in place; confirm in
    # mem_absa.load_data.
    source_count = []
    source_word2idx = {}
    read_vocabulary(fr_nlp, FLAGS.train_data, source_count, source_word2idx)
    print('loading pre-trained word vectors...')
    FLAGS.pre_trained_context_wt = init_word_embeddings(
        wiki_model, source_word2idx, FLAGS.nbwords)
    # Zero out the row at index FLAGS.pad_idx.
    FLAGS.pre_trained_context_wt[FLAGS.pad_idx, :] = 0
    model = MemN2N(FLAGS)
    # NOTE(review): the visible source ends here; the rest of the function is
    # outside this view.
from commons.logger import logger, configure
from commons import configuration
import configuration as confi
from sentiment_analysis import load_sentiment_model, load_tagging_model, sentiment_analysis
import spacy
from pyfasttext import FastText

# Load the spaCy "fr" pipeline and the fastText model at import time; the
# model path comes from the top-level `configuration` module (alias confi).
fr_nlp = spacy.load("fr")
wiki_model = FastText()
wiki_model.load_model(confi.pathFasttext)
import traceback

# Load the configuration (via commons.configuration).
conf = configuration.load()
script_dir = os.path.dirname(__file__)


def _init_app(p_conf):
    # Logger configuration.
    # NOTE(review): the values are read from the module-level `conf`, not
    # from the `p_conf` parameter — confirm this is intended.
    configure(conf['log']['level_values'][conf['log']['level']],
              conf['log']['dir'], conf['log']['filename'],
              conf['log']['max_filesize'], conf['log']['max_files'])
    # Load app config into Flask WSGI running instance
    # NOTE(review): the visible source ends here; the rest of the function is
    # outside this view.
import numpy as np
import os
from pyfasttext import FastText
import string
import re

# shared global variables to be imported from model also
UNK = "$UNK$"
NUM = "$NUM$"
NONE = "O"

# NOTE(review): absolute, machine-specific directory; the fastText model is
# loaded from it at import time.
path = '/Users/nkooli/Documents/docs/avis/review_analysis_pj/mem_absa'
wiki_model = FastText()
wiki_model.load_model(path + '/model_pyfasttext100.bin')


# special error message
class MyIOError(Exception):
    # Exception whose message names the missing file and suggests running
    # build_data.py first.
    def __init__(self, filename):
        # custom error message
        message = """ ERROR: Unable to locate file {}. FIX: Have you tried running python build_data.py first? This will build vocab file from your train, test and dev sets and trimm your word vectors. """.format(filename)
        super(MyIOError, self).__init__(message)


class CoNLLDataset(object):
    # NOTE(review): the visible source ends at the class header; the class
    # body is outside this view.