def test_load_skipgram_model(self):
    model = ft.load_model(skipgram_file, encoding='utf-8')
    # Make sure the model is returned correctly
    self.assertEqual(model.model_name, 'skipgram')
    # Make sure all params loaded correctly
    # see Makefile on target test-skipgram for the params
    self.assertEqual(model.dim, 100)
    self.assertEqual(model.ws, 5)
    self.assertEqual(model.epoch, 1)
    self.assertEqual(model.min_count, 1)
    self.assertEqual(model.neg, 5)
    self.assertEqual(model.loss_name, 'ns')
    self.assertEqual(model.bucket, 2000000)
    self.assertEqual(model.minn, 3)
    self.assertEqual(model.maxn, 6)
    self.assertEqual(model.lr_update_rate, 100)
    self.assertEqual(model.t, 1e-4)
    # Make sure the vector has the right dimension
    self.assertEqual(len(model['the']), model.dim)
    # Make sure we support unicode characters
    unicode_str = 'Καλημέρα'
    self.assertTrue(unicode_str in model.words)
    self.assertEqual(len(model[unicode_str]), model.dim)
def embed_titles(infile=TOPICS, outfile=EMBEDDED_TOPIC_TITLES_PATH):
    '''
    Vectorize topic titles with fasttext and save to file
    ntopic=188, embeddings_dim=100
    '''
    with open(infile, "r") as f:
        topics_json = json.load(f)

    topic_titles = []
    topic_vectors = []
    model = fasttext.load_model(EMBEDDINGS_MODEL_PATH)
    for i, topic in enumerate(topics_json):
        title = topic['title']
        topic_titles.append(title)
        # embed titles with fasttext
        topic_vectors.append(model[title])

    assert len(topic_vectors) == len(topic_titles)
    with open(outfile, 'wb') as f:
        pickle.dump(topic_vectors, f)
    return topic_titles, topic_vectors
def embed_topics(infile=TOPICS, outfile=EMBEDDED_TOPICS_PATH):
    '''
    Vectorize topics (title + description + narrative) with fasttext and save to file
    ntopic=188, embeddings_dim=100
    '''
    with open(infile, "r") as f:
        topics_json = json.load(f)

    topic_vectors = []
    model = fasttext.load_model(EMBEDDINGS_MODEL_PATH)
    for i, topic in enumerate(topics_json):
        title = topic['title']
        description = topic['description']
        narrative = topic['narrative']
        joint = " ".join([title, description, narrative])
        # embed topics with fasttext
        topic_vectors.append(model[joint])

    assert len(topic_vectors) == len(topics_json)
    with open(outfile, 'wb') as f:
        pickle.dump(topic_vectors, f)
def load_model(cls):
    """Load the trained model."""
    config = get_config()
    model_path = '{}.bin'.format(config.get('train', 'model_path'))
    if os.path.exists(model_path):
        cls.__model = ft.load_model(model_path)
def __init__(self, model_path=EMBEDDINGS_MODEL_PATH, embeddings_path=EMBEDDED_TOPICS_PATH):
    # load fasttext model
    self.model = fasttext.load_model(model_path)
    # load topic titles
    self.topic_titles = load_titles()
    # load topic embeddings
    with open(embeddings_path, 'rb') as f:
        self.topic_vectors = pickle.load(f)
def load(self):
    if self.flavor == 'w2v':
        self.model = Word2Vec.load(self.path)
        self.model.init_sims(replace=True)
        self.size = self.model.size
    elif self.flavor == 'ft':
        self.model = fasttext.load_model(self.path + '.bin')
        self.size = self.model.dim
    self.fitted = True
def loadfromfile(cls, repdir, word_model_file):
    ont = pickle.load(open(repdir + '/ont.pickle', "rb"))

    class Config(object):
        def __init__(self, d):
            self.__dict__ = d

    config = Config(json.load(open(repdir + '/config.json', 'r')))
    word_model = fasttext.load_model(word_model_file)
    model = cls(config, ont, word_model)
    model.load_params(repdir)
    return model
def load(self, *args, **kwargs):
    """
    Load dict of embeddings from file

    Args:
        fname: file name
    """
    if self.load_path:
        if self.load_path.is_file():
            print("[loading embeddings from `{}`]".format(self.load_path))
            model_file = str(self.load_path)
            if self.emb_module == 'fasttext':
                import fasttext as Fasttext
                model = Fasttext.load_model(model_file)
            elif self.emb_module == 'pyfasttext':
                from pyfasttext import FastText as Fasttext
                model = Fasttext(model_file)
            else:
                from gensim.models.wrappers.fasttext import FastText as Fasttext
                model = Fasttext.load_fasttext_format(model_file)
        elif isinstance(self.load_path, Path):
            raise ConfigError("Provided `load_path` for {} doesn't exist!".format(
                self.__class__.__name__))
    else:
        warn("No `load_path` is provided for {}".format(self.__class__.__name__))
        if self.embedding_url:
            try:
                print('[trying to download a pretrained fasttext model from repository]')
                local_filename, _ = urllib.request.urlretrieve(self.embedding_url)
                with open(local_filename, 'rb') as fin:
                    model_file = fin.read()
                mp = self.save_path
                self.load_path = self.save_path
                model = self.load()
                print("[saving downloaded fasttext model to {}]".format(mp))
                with open(str(mp), 'wb') as fout:
                    fout.write(model_file)
            except Exception as e:
                raise RuntimeError('Looks like the provided fasttext url is incorrect', e)
        else:
            raise FileNotFoundError(
                'No pretrained fasttext model provided or provided "load_path" is incorrect.'
                ' Please include "load_path" to json.')
    return model
def test_load_cbow_model(self):
    model = ft.load_model(cbow_file)
    # Make sure the model is returned correctly
    self.assertEqual(model.model_name, 'cbow')
    # Make sure all params loaded correctly
    # see Makefile on target test-cbow for the params
    self.assertEqual(model.dim, 50)
    self.assertEqual(model.ws, 5)
    self.assertEqual(model.epoch, 1)
    self.assertEqual(model.min_count, 3)
    self.assertEqual(model.neg, 5)
    self.assertEqual(model.loss_name, 'ns')
    self.assertEqual(model.bucket, 2000000)
    self.assertEqual(model.minn, 3)
    self.assertEqual(model.maxn, 6)
    self.assertEqual(model.lr_update_rate, 100)
    self.assertEqual(model.t, 1e-4)
    # Make sure the vector has the right dimension
    self.assertEqual(len(model.get_vector('the')), model.dim)
def __init__(self, model_path):
    self.model_path = model_path
    self.model = fasttext.load_model(model_path)
    self.n_labels = len(self.model.get_labels())
    self.max_entropy = -1 * np.log(1.0 / self.n_labels)
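# Note (not part of the original class): max_entropy above equals log(n_labels),
# which suggests it is intended for normalizing prediction entropy. A hypothetical
# usage sketch under that assumption; `normalized_entropy` is an illustrative helper,
# not from the original project.
import numpy as np
import fasttext

def normalized_entropy(model, text):
    # k=-1 asks fastText for probabilities over all labels
    labels, probs = model.predict(text, k=-1)
    probs = np.clip(np.asarray(probs), 1e-12, 1.0)
    entropy = -np.sum(probs * np.log(probs))
    return entropy / np.log(len(model.get_labels()))  # normalized to [0, 1]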
def fasttext_predict(model_filepath, test_dataset):
    classifier = fasttext.load_model(model_filepath, label_prefix='__label__')
    result = classifier.test(test_dataset)
    print(result)
    return result
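# The helper above targets the legacy fasttext wrapper, where load_model() accepts
# label_prefix and test() returns an object with .precision/.recall. For comparison,
# a hedged sketch against the official fasttext package (assuming version >= 0.9),
# where load_model() takes only a path and test() returns (N, precision, recall).
import fasttext

def fasttext_predict_modern(model_filepath, test_dataset):
    classifier = fasttext.load_model(model_filepath)
    n_examples, precision, recall = classifier.test(test_dataset)
    print(n_examples, precision, recall)
    return precision, recall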
# -*- coding:utf-8 -*-
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import fasttext
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# load the trained model
classifier = fasttext.load_model('comment_code_fasttext.model.bin', label_prefix='__label__')
result = classifier.test("test.txt")
print
print "precision:", (result.precision)
print(result.recall)

labels_right = []
texts = []
with open("test.txt") as fr:
    lines = fr.readlines()
    for line in lines:
        if line == '\n':
            continue
        labels_right.append(line.split("\t")[1].rstrip().replace("__label__", "").encode('utf-8'))
        texts.append(line.split("\t")[0].decode("utf-8"))
        # print labels
        # print texts
        # break

labels_predict = [e[0].encode('utf-8')
def load(self):
    if os.path.exists(self.model_path + 'bin'):
        return fasttext.load_model(self.model_path)
    else:
        return None
import numpy as np
import tensorflow as tf
import fasttext as ft
import math
import sys
from cnn_lstm_crf import CNN_BLSTM_CRF

Word2vec = ft.load_model('vi.bin')


def make_char_dictionary(data_path, dict_path):
    ### initialize dictionary set
    char_dictionary = ['<UNK>', '<PAD>']
    ### make character dictionary
    f = open(data_path, 'r')
    for row in f:
        row_split = row[:-1].split(' ')
        for word in row_split:
            for char in word:
                char_dictionary.append(char)
    f.close()
    ### remove duplicate characters
    char_dictionary = list(set(char_dictionary))
    ### save character dictionary
    f = open(dict_path, 'w')
    for char in char_dictionary:
        f.write(char + '\n')
    f.close()


def load_dictionary(dict_path):
# train
# classifier = fasttext.supervised('news_fasttext_train.txt', 'news_fasttext.model', label_prefix='__label__')

# test
clf = fasttext.load_model('news_fasttext.model.bin', label_prefix='__label__')
rel = clf.test('news_fasttext_test.txt')
print(rel.precision)
print(rel.recall)

'''
# prediction test
clf = fasttext.load_model('news_fasttext.model.bin')
text = [
    '最高人民法宣宣布周某某因涉嫌贪污受贿,利用不正当手段为他人谋取各种利益等,判处其无期徒刑,剥夺政治权利终身。',
    '婚姻大事不必铺张浪费',
    '小编祝大家新年快乐',
    '中国大陆多次强调,编排出武力夺取台湾',
    '它被誉为天下第一果,补益气血,养阴生津,现在吃正应季! 六七月是桃子大量上市的季节,因其色泽红润,肉质鲜美,有个在实验基地里接受治疗的妹子。广受大众的喜爱。’'
]
label = clf.predict(text)
print(label)
'''

# train word vectors
model = fasttext.skipgram('news_fasttext_train.txt', 'model1')
import re
import numpy as np
import fasttext

model = fasttext.load_model("cc.vi.300.bin")

import torch
from collections import Counter
from typing import List

# Load PhoBERT-base in fairseq
from fairseq.models.roberta import RobertaModel
phobert = RobertaModel.from_pretrained('PhoBERT_base_fairseq', checkpoint_file='model.pt')
phobert.eval()  # disable dropout (or leave in train mode to finetune)

# Incorporate the BPE encoder into PhoBERT-base
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq import options

parser = options.get_preprocessing_parser()
parser.add_argument('--bpe-codes', type=str, help='path to fastBPE BPE',
                    default="PhoBERT_base_fairseq/bpe.codes")
args = parser.parse_args()
phobert.bpe = fastBPE(args)


def align_bpe_to_words(roberta, bpe_tokens: torch.LongTensor, other_tokens: List[str]):
    """
    Helper to align GPT-2 BPE to other tokenization formats (e.g., spaCy).

    Args:
def exportByDistance(action, modelFileExtension, modelsFolder, fromYear, toYear,
                     neighborsCount, fasttextPath):
    """
    @param action:
    @param modelFileExtension:
    @param modelsFolder:
    @param fromYear:
    @param toYear:
    @param neighborsCount:
    @param fasttextPath:
    @return:
    @rtype: None
    """
    fromYearFilename = fromYear + modelFileExtension
    toYearFilename = toYear + modelFileExtension
    modelA = fasttext.load_model(os.path.join(modelsFolder, fromYearFilename))
    modelB = fasttext.load_model(os.path.join(modelsFolder, toYearFilename))

    clearVectorModelA = {}
    clearVectorModelB = {}
    for label in modelA.get_labels():
        clearVectorModelA[label] = modelA.get_word_vector(label)
    for label in modelB.get_labels():
        clearVectorModelB[label] = modelB.get_word_vector(label)

    # alignedEmbeddingsB = vector.alignTwoEmbeddings(clearVectorModelA, clearVectorModelB)

    results = {}
    for word in modelA.words:
        if word in modelB.words:
            if action == 'getCD':
                results[word] = vector.getCosineDistance(
                    clearVectorModelA[word], clearVectorModelB[word])
            elif action == 'getCS':
                results[word] = vector.getCosineSimilarity(
                    clearVectorModelA[word], clearVectorModelB[word])

    if action == 'getCD':
        sortedResults = sorted(results.items(), key=lambda x: x[1], reverse=True)
    elif action == 'getCS':
        sortedResults = sorted(results.items(), key=lambda x: x[1])

    resultsPerPeriod = {}
    for wordTuple in sortedResults[:50]:
        word = wordTuple[0]
        resultsPerPeriod[word] = {}
        resultsPerPeriod[word][str(fromYear)] = getNeighboursForWord(
            word, fromYearFilename, modelsFolder, fasttextPath, neighborsCount)
        resultsPerPeriod[word][str(toYear)] = getNeighboursForWord(
            word, toYearFilename, modelsFolder, fasttextPath, neighborsCount)

    # print(resultsPerPeriod)
    file.exportTextToFile(resultsPerPeriod, './shifts.json', True)
def __init__(self, model_path):
    super(FeatureGenerator, self).__init__()
    self.model = ft.load_model(model_path)
def __init__(self, dataset='yelp'):
    acc_path = 'hoang/acc_' + str(dataset) + '.bin'
    ppl_path = 'hoang/ppl_' + str(dataset) + '.bin'
    self.classifier = fasttext.load_model(acc_path)
    self.ppl_model = kenlm.Model(ppl_path)
    self.dataset = dataset
INPUT_TXT = '/path/to/file.txt'
OUTPUT_PATH_SKIPGRAM = '/tmp/skipgram'
OUTPUT_PATH_CBOW = '/tmp/cbow'

# Learn the word representation using the skipgram model
skipgram = fasttext.skipgram(INPUT_TXT, OUTPUT_PATH_SKIPGRAM, lr=0.02, dim=300, ws=5,
                             epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000,
                             minn=3, maxn=6, thread=4, t=1e-4, lr_update_rate=100)

# Get the vector of some word
print(skipgram['word'])

# Learn the word representation using the cbow model
cbow = fasttext.cbow(INPUT_TXT, OUTPUT_PATH_CBOW, lr=0.02, dim=300, ws=5,
                     epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000,
                     minn=3, maxn=6, thread=4, t=1e-4, lr_update_rate=100)

# Get the vector of some word
print(cbow['word'])

# Load pre-trained skipgram model
SKIPGRAM_BIN = OUTPUT_PATH_SKIPGRAM + '.bin'
skipgram = fasttext.load_model(SKIPGRAM_BIN)
print(skipgram['word'])

# Load pre-trained cbow model
CBOW_BIN = OUTPUT_PATH_CBOW + '.bin'
cbow = fasttext.load_model(CBOW_BIN)
print(cbow['word'])
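# The block above uses the legacy fasttext Python wrapper (fasttext.skipgram /
# fasttext.cbow). As a rough, hedged equivalent with the current official fasttext
# package (assuming version 0.9+, where hyperparameter names are camelCase), a sketch
# might look like the following. Paths reuse the constants defined above.
import fasttext

skipgram = fasttext.train_unsupervised(
    INPUT_TXT, model='skipgram', lr=0.02, dim=300, ws=5, epoch=1,
    minCount=5, neg=5, loss='ns', bucket=2000000, minn=3, maxn=6,
    thread=4, t=1e-4, lrUpdateRate=100)
print(skipgram.get_word_vector('word'))
skipgram.save_model(OUTPUT_PATH_SKIPGRAM + '.bin')

# CBOW works the same way with model='cbow'
cbow = fasttext.train_unsupervised(INPUT_TXT, model='cbow', dim=300)
cbow.save_model(OUTPUT_PATH_CBOW + '.bin')

# Reloading is unchanged
skipgram = fasttext.load_model(OUTPUT_PATH_SKIPGRAM + '.bin')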
import json
import pickle
import fasttext
from elasticsearch import Elasticsearch
from kafka import KafkaConsumer
from tunga.preprocessing import normalization
import spacy

es = Elasticsearch()
model = fasttext.load_model("/home/burak/Desktop/cc.en.300.bin")
rfc = pickle.load(open("../random_forest.model", 'rb'))
nlp = spacy.load('../../../data/buyuk')

consumer = KafkaConsumer('crawled_tweets', bootstrap_servers=['localhost:9091'])
print(consumer)

i = 1
for message in consumer:
    data = json.loads(message.value)
    tweet_text = data["tweet"]
    tweet_text = tweet_text.lower()
    tweet_text = tweet_text.strip()
    tweet_text = tweet_text.replace("\n", " ")
    tweet_text = normalization.remove_url(tweet_text)
    tweet_text = normalization.remove_hashtag(tweet_text)
    tweet_text = normalization.remove_emojis(tweet_text)
    prediction = rfc.predict([model.get_sentence_vector(tweet_text)])[0]
    if prediction == 0:
def __init__(self, path_fasttext_model):
    self.fasttext_model = fasttext.load_model(path_fasttext_model)
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import textrank
import random
import pandas as pd
import fasttext
import fasttext.util
import pickle
import os

# model = KeyedVectors.load("models/normalized.model")
fasttext.util.download_model('en', if_exists='ignore')
model = fasttext.load_model('cc.en.300.bin')

stop_words = set(stopwords.words('english'))
vocab = set(model.words)

cache = {}
try:
    with open("cache.pkl", "rb") as f:
        cache = pickle.load(f)
except:
    pass


def shuffleDict(dictionary):
    keys = list(dictionary.keys())
    random.shuffle(keys)
    shuffled = {}
    for key in keys:
from app import app, db
from app.parser import parse_entities
from app.models import Article, Entity
from app.entity import load_all_entities, clean_entity, get_entity_info
from app.summarize import summarize_text
from app.ner import extract_entities
from flask import request
import fasttext
import json
import requests
import re
from sacremoses import MosesTokenizer
import sentencepiece as spm

id_model = fasttext.load_model(r'/home/dion/Downloads/work/textonomy/backend/app/fasttext_w2v_indon.bin')
ms_model = fasttext.load_model(r'/home/dion/Downloads/work/textonomy/backend/app/fasttext_w2v_ms.bin')

ENT_DF = load_all_entities()
html = re.compile('<.*?>|<.*?>')
mtoken_source = MosesTokenizer(lang='id')
token_source = lambda text: mtoken_source.tokenize(re.sub(html, '', str(text)), return_str=True).strip().lower()

indon_sp = spm.SentencePieceProcessor()
indon_sp.load(r'/home/dion/Downloads/work/textonomy/backend/app/source.model')
eng_sp = spm.SentencePieceProcessor()
eng_sp.load(r'/home/dion/Downloads/work/textonomy/backend/app/target.model')


@app.route('/api/articles', methods=['GET'])
def get_all_articles():
def load_model(self):
    loaded_model = fasttext.load_model('fasttext_sarcasm.ftz')
    return loaded_model
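# For context: the .ftz extension conventionally denotes a quantized (compressed)
# fastText model. A minimal sketch of how such a file could be produced, assuming a
# supervised classifier; 'sarcasm_train.txt' is a placeholder file name, not from the
# original project.
import fasttext

clf = fasttext.train_supervised('sarcasm_train.txt')
clf.quantize(input='sarcasm_train.txt', retrain=True)  # product quantization shrinks the model
clf.save_model('fasttext_sarcasm.ftz')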
import fasttext
import pandas as pd
import pdb
from tqdm import tqdm

model1 = fasttext.load_model("./model/fasttext_1.bin")
model2 = fasttext.load_model("./model/fasttext_2.bin")
model3 = fasttext.load_model("./model/fasttext_3.bin")

test = pd.read_csv("../../data/test.txt", header=None, sep='\t')
checking = pd.read_csv("./checking_sheet.csv")

answers = []
for sent in tqdm(test[0]):
    checking['score'] = 0

    ans = model1.predict(sent, k=10)
    for i in range(10):
        label = ans[0][i][9:]
        score = ans[1][i]
        checking.loc[checking['level1'] == label, 'score'] += score

    ans = model2.predict(sent, k=10)
    for i in range(10):
        label = ans[0][i][9:]
        score = ans[1][i]
        checking.loc[checking['level2'] == label, 'score'] += score

    ans = model3.predict(sent, k=10)
    for i in range(10):
        label = ans[0][i][9:]
        score = ans[1][i]
import fasttext
import numpy as np
import scipy.spatial.distance
import nltk

PRETRAINED_MODEL_PATH = "vectors/english/cc.en.300.bin"
model = fasttext.load_model(PRETRAINED_MODEL_PATH)


def cos_similarity(sentence, word):
    sent1_emb = model.get_sentence_vector(sentence)
    sent2_emb = model.get_word_vector(word)
    return 1 - scipy.spatial.distance.cosine(sent1_emb, sent2_emb)


good_barometer = "good"
bad_barometer = "bad"
test_good_sentence = "Wow, this is a really great sentence. I love it."
test_bad_sentence = "This is terrible. I hate it."

good_good = cos_similarity(test_good_sentence, good_barometer)
good_bad = cos_similarity(test_good_sentence, bad_barometer)
bad_bad = cos_similarity(test_bad_sentence, bad_barometer)
bad_good = cos_similarity(test_bad_sentence, good_barometer)

print("How good is the test good sentence?", good_good)
print("How bad is the test good sentence?", good_bad)
print("Test good sentence is most likely '{}'.".format("good" if good_good > good_bad else "bad"))
print("How bad is the test bad sentence?", bad_bad)
def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
    super().__init__(component_config)
    path = os.path.join(component_config["cache_dir"], component_config["file"])
    self.model = fasttext.load_model(path)
from flask import Flask, render_template, request, jsonify, session
from pipeline import *

app = Flask(__name__)
store = RedisStore(redis.StrictRedis())
KVSessionExtension(store, app)
app.secret_key = 'PZ2HKD7WIAM1D708OE9I78KZ0'

data_path = os.path.join('..', 'project_historian')
models_path = os.path.join(data_path, 'models')
rss_path = os.path.join(data_path, 'rss_data')
model_path = os.path.join(models_path, 'fasttext_model.bin')
db_path = os.path.join(rss_path, 'rss_database.db')

model = fasttext.load_model(model_path)
print('- Model loaded successfully.')


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/query', methods=['POST'])
def query():
    keywords = []
    if 'keywords' not in request.form:
        return None
    t_keywords = request.form['keywords'].split(',')
    keywords = ['_'.join(kw.strip().split()) for kw in t_keywords]
def initialization():
    global logging_level

    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__)

    parser.add_argument('input', nargs='?',
                        type=argparse.FileType('rt', errors="replace"),
                        default=io.TextIOWrapper(sys.stdin.buffer, errors="replace"),
                        help="Tab-separated bilingual tagged file")
    parser.add_argument('output', nargs='?',
                        type=argparse.FileType('wt'), default=sys.stdout,
                        help="Output of the classification")
    parser.add_argument('--annotated_output', default=False, action='store_true',
                        help="Adds an extra column with each sentence's evaluation "
                             "(\"keep\" if the sentence is good, otherwise the reason for rejecting")

    #groupM = parser.add_argument_group('Mandatory')
    #groupM.add_argument("-s", "--source_lang", type=str, required=True, help="Source language (SL) of the input")
    #groupM.add_argument("-t", "--target_lang", type=str, required=True, help="Target language (TL) of the input")

    groupO = parser.add_argument_group('Optional')
    groupO.add_argument('--tmp_dir', default=gettempdir(),
                        help="Temporary directory where creating the temporary files of this program")
    groupO.add_argument('-b', '--block_size', type=int, default=10000,
                        help="Sentence pairs per block")
    groupO.add_argument('-p', '--processes', type=int, default=max(1, cpu_count() - 1),
                        help="Number of processes to use")
    groupO.add_argument('--disable_lang_ident', default=False, action='store_true',
                        help="Don't apply rules that use language detecting")
    groupO.add_argument('--disable_minimal_length', default=False, action='store_true',
                        help="Don't apply minimal length rule")
    groupO.add_argument('--disable_porn_removal', default=False, action='store_true',
                        help="Don't apply p**n removal")
    groupO.add_argument("-s", "--source_lang", type=str, default=None,
                        help="Source language (SL) of the input")
    groupO.add_argument("-t", "--target_lang", type=str, default=None,
                        help="Target language (TL) of the input")
    groupO.add_argument("--scol", default=1, type=check_positive,
                        help="Source sentence column (starting in 1)")
    groupO.add_argument("--tcol", default=2, type=check_positive,
                        help="Target sentence column (starting in 1)")
    groupO.add_argument("-S", "--source_tokenizer_command", default=None, type=str,
                        help="Source language (SL) tokenizer full command")
    groupO.add_argument("-T", "--target_tokenizer_command", default=None, type=str,
                        help="Target language (TL) tokenizer full command")

    # LM filtering
    groupO.add_argument('--disable_lm_filter', default=False, action='store_true',
                        help="Don't apply LM filtering")
    groupO.add_argument('--metadata', type=argparse.FileType('r'), default=None,
                        help="Training metadata (YAML file)")
    groupO.add_argument('--lm_threshold', type=check_positive_between_zero_and_one, default=0.5,
                        help="Threshold for language model fluency scoring.")
    #groupO.add_argument('--keep_lm_result', action='store_true', help="Add an additional column to the results with the language model fluency score.")

    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q', '--quiet', action='store_true', help='Silent logging mode')
    groupL.add_argument('--debug', action='store_true', help='Debug logging mode')
    groupL.add_argument('--logfile', type=argparse.FileType('a'), default=sys.stderr,
                        help="Store log to a file")
    #groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")

    args = parser.parse_args()
    logging_setup(args)
    logging_level = logging.getLogger().level

    # Ensure that directory exists; if not, create it
    if not os.path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    # Try loading metadata for LM filtering and p**n removal
    if not (args.disable_lm_filter and args.disable_porn_removal) and args.metadata != None:
        logging.info("Loading metadata info")
        try:
            args.metadata_yaml = yaml.safe_load(args.metadata)
            args.metadata_yaml["yamlpath"] = os.path.dirname(os.path.abspath(args.metadata.name))

            if not ("source_lm" in args.metadata_yaml and "target_lm" in args.metadata_yaml):
                args.disable_lm_filter = True
                logging.warning("LM file not present in metadata.")

            if not ("porn_removal_file" in args.metadata_yaml):
                args.disable_porn_removal = True
                logging.warning("P**n removal classifier not present in metadata.")
            else:
                try:
                    args.porn_removal = fasttext.load_model(
                        os.path.join(args.metadata_yaml["yamlpath"],
                                     args.metadata_yaml['porn_removal_file']))
                except:
                    args.porn_removal = fasttext.load_model(args.metadata_yaml['porn_removal_file'])

            if "source_tokenizer_command" in args.metadata_yaml:
                args.source_tokenizer_command = args.metadata_yaml["source_tokenizer_command"]
            if "target_tokenizer_command" in args.metadata_yaml:
                args.target_tokenizer_command = args.metadata_yaml["target_tokenizer_command"]

            parser.set_defaults(**args.metadata_yaml)
        except:
            logging.warning("Error loading metadata.")
            args.disable_lm_filter = True
            args.disable_porn_removal = True
            traceback.print_exc()
            #sys.exit(1)
    else:
        if args.metadata == None:
            logging.warning("Metadata file not provided.")
        args.disable_lm_filter = True
        args.disable_porn_removal = True

    if (args.source_lang == None or args.target_lang == None):
        if (args.metadata == None):
            logging.error("No source or target languages provided.")
            sys.exit(1)
        else:
            try:
                if not "metadata_yaml" in args or args.metadata_yaml == None:
                    args.metadata_yaml = yaml.safe_load(args.metadata)
                    #args.metadata_yaml["yamlpath"] = os.path.dirname(os.path.abspath(args.metadata.name))
                args.source_lang = args.metadata_yaml["source_lang"]
                args.target_lang = args.metadata_yaml["target_lang"]
            except:
                traceback.print_exc()
                logging.error("Error retrieving source or target languages from metadata.")
                sys.exit(1)

    if args.disable_lm_filter:
        logging.info("LM filtering disabled.")
    if args.disable_porn_removal:
        logging.info("P**n removal disabled.")

    return args
def prediction(n_clicks, uploaded_filenames):
    threshold = 0.80
    global mapping_dict
    print(n_clicks, uploaded_filenames)
    if n_clicks is not None and uploaded_filenames is not None and threshold is not None:
        page_text = ''
        doc = fitz.open(".\\assets\\docs\\" + uploaded_filenames[0])
        print(doc.pageCount)
        for i in range(doc.pageCount):
            page = doc.loadPage(i)
            page_str = page.getText("text")
            page_text = page_text + page_str
        text = page_text.lower()

        # Remove numbers
        text = re.sub(r'\d+', ' ', text)
        # Remove punctuation
        text = re.sub(r'[^\w\s]', ' ', text)
        # Remove \xa0
        text = re.sub(r'\xa0', '', text)
        # Remove \x0c
        text = re.sub(r'\x0c', '', text)
        # Remove stop words and stem
        token_text = word_tokenize(text)
        tokens_without_sw = [word for word in token_text if not word in stop_words]
        text_stem = [ps.stem(word) for word in tokens_without_sw]
        text = (" ").join(text_stem)
        # Remove newline characters
        text = text.replace('\n', ' ')
        # Remove duplicate spaces
        text = " ".join(text.split())

        # Common word removal, then classification
        model = fasttext.load_model(join(project_root, 'RTA_Future_Scanner.bin'))
        predicted_label_1 = model.predict(text, k=-1)[0][0]
        predicted_label_1_probab = model.predict(text, k=-1)[1][0]
        predicted_label_2 = model.predict(text, k=-1)[0][1]
        predicted_label_2_probab = model.predict(text, k=-1)[1][1]
        predicted_label_3 = model.predict(text, k=-1)[0][2]
        predicted_label_3_probab = model.predict(text, k=-1)[1][2]
        predicted_label_4 = model.predict(text, k=-1)[0][3]
        predicted_label_4_probab = model.predict(text, k=-1)[1][3]

        predicted_label_1 = predicted_label_1.replace("__label__", '').replace("__n_", '')
        predicted_label_2 = predicted_label_2.replace("__label__", '').replace("__n_", '')
        predicted_label_3 = predicted_label_3.replace("__label__", '').replace("__n_", '')
        predicted_label_4 = predicted_label_4.replace("__label__", '').replace("__n_", '')

        predicted_label_1 = " ".join(re.findall('[3]*[A-Z][a-z]*', predicted_label_1))
        predicted_label_1 = mapping_dict.get("".join(predicted_label_1.lower().split(" ")), predicted_label_1)
        predicted_label_2 = " ".join(re.findall('[3]*[A-Z][a-z]*', predicted_label_2))
        predicted_label_2 = mapping_dict.get("".join(predicted_label_2.lower().split(" ")), predicted_label_2)
        predicted_label_3 = " ".join(re.findall('[3]*[A-Z][a-z]*', predicted_label_3))
        predicted_label_3 = mapping_dict.get("".join(predicted_label_3.lower().split(" ")), predicted_label_3)
        predicted_label_4 = " ".join(re.findall('[3]*[A-Z][a-z]*', predicted_label_4))
        predicted_label_4 = mapping_dict.get("".join(predicted_label_4.lower().split(" ")), predicted_label_4)

        Confidence_Score = '-'
        j1 = ""
        j2 = 0
        j3 = ""
        j4 = 0
        j5 = ""
        j6 = 0
        if predicted_label_1_probab >= threshold:
            Sample1 = [predicted_label_1, predicted_label_1_probab]
            j1 = Sample1[0]
            j2 = round(Sample1[1], 2)
        elif (predicted_label_1_probab + predicted_label_2_probab) >= threshold:
            Sample1 = [predicted_label_1, predicted_label_1_probab]
            Sample2 = [predicted_label_2, predicted_label_2_probab]
            j1 = Sample1[0]
            j2 = round(Sample1[1], 2)
            j3 = Sample2[0]
            j4 = round(Sample2[1], 2)
        elif (predicted_label_1_probab + predicted_label_2_probab + predicted_label_3_probab) >= threshold:
            Sample1 = [predicted_label_1, predicted_label_1_probab]
            Sample2 = [predicted_label_2, predicted_label_2_probab]
            Sample3 = [predicted_label_3, predicted_label_3_probab]
            j1 = Sample1[0]
            j2 = round(Sample1[1], 2)
            j3 = Sample2[0]
            j4 = round(Sample2[1], 2)
            j5 = Sample3[0]
            j6 = round(Sample3[1], 2)
        else:
            j1 = '-'
            j2 = '-'
            j3 = '-'
            j4 = '-'
            j5 = '-'
            j6 = '-'

        j2 = str(j2 * 100) + " %" if j2 != "-" else str(j2)
        j4 = str(j4 * 100) + " %" if j4 != "-" else str(j4)
        j6 = str(j6 * 100) + " %" if j6 != "-" else str(j6)

        text_lem = [wn.lemmatize(word) for word in tokens_without_sw]
        word_text = (" ").join(text_lem)
        # Remove newline characters
        word_text = word_text.replace('\n', ' ')
        # Remove duplicate spaces
        word_text = " ".join(word_text.split())

        tfidf_vectorizer = TfidfVectorizer()
        top_unigram_words = wc(word_text, tfidf_vectorizer)
        tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 2))
        top_bigram_words = wc(word_text, tfidf_vectorizer)
        tfidf_vectorizer = TfidfVectorizer(ngram_range=(3, 3))
        top_trigram_words = wc(word_text, tfidf_vectorizer)

        test = top_unigram_words  # + top_bigram_words + top_trigram_words
        d = {}
        for a, x in test:
            d[a] = 10 * x

        wordcloud = WordCloud(width=1450, height=700, background_color='white')
        wordcloud.generate_from_frequencies(frequencies=d)
        wordcloud.to_file(join(data_Path, 'wc.png'))
        # words = list(d.keys())
        # weights = [round(each) for each in list(d.values())]
        wordcloud_fig = word_cloud()

        df = {
            "Prediction": [j1, j3, j5],
            "Probability": [j2, j4, j6],
        }
        df = pd.DataFrame(df)
        df = df[df['Prediction'] != ""]
        # fig.show()
        print("reached here=====================================================1")
        return df.to_dict('records'), [{"name": i, "id": i} for i in df.columns], wordcloud_fig, ""
import fasttext

model = fasttext.load_model(
    '/users/aaronrank/developer/recipe-ai/recipeai/recipes/.ingredient_classifier_model'
)


def predict(text):
    '''Returns label, confidence'''
    clf = model.predict(text)
    return clf[0][0], clf[1][0]
def prep_emb(fn, gen_emb, domain_emb, prep_dir, gen_dim=300, domain_dim=100):
    text = []
    with open(fn) as f:
        for line in f:
            ob = json.loads(line)
            review = ob["text"]
            token = word_tokenize(review)
            text = text + token
    vocab = sorted(set(text))

    word_idx = {}
    if os.path.exists(prep_dir + 'word_idx.json'):
        with io.open(prep_dir + 'word_idx.json') as f:
            prev_word = json.load(f)
    else:
        prev_word = {}

    wx = 0
    new_word = []
    for word in vocab:
        if word not in prev_word:
            wx = wx + 1
            new_word.append(word)
            word_idx[word] = wx + len(prev_word)
    prev_word.update(word_idx)
    if new_word == []:
        return

    # create embeddings
    embedding_gen = np.zeros((len(prev_word) + 2, gen_dim))
    embedding_domain = np.zeros((len(prev_word) + 2, domain_dim))
    if os.path.exists(prep_dir + 'gen.vec.npy'):
        gen_emb_prev = np.load(prep_dir + "gen.vec.npy")
        embedding_gen[:gen_emb_prev.shape[0], :] = gen_emb_prev
    if os.path.exists(prep_dir + 'restaurant_emb.vec.npy'):
        domain_emb_prev = np.load(prep_dir + 'restaurant_emb.vec.npy')
        embedding_domain[:domain_emb_prev.shape[0], :] = domain_emb_prev

    # read the general embedding .vec file
    with open(gen_emb) as f:
        for l in f:
            rec = l.rstrip().split(' ')
            if len(rec) == 2:  # skip the header line
                continue
            # if the word is in word_idx, fill the embedding
            if rec[0] in new_word:
                embedding_gen[prev_word[rec[0]]] = np.array([float(r) for r in rec[1:]])

    # read the domain embedding .vec file
    with open(domain_emb) as f:
        for l in f:
            # for each line, get the word and its vector
            rec = l.rstrip().split(' ')
            if len(rec) == 2:  # skip the header line
                continue
            # if the word is in word_idx, fill the embedding
            if rec[0] in new_word:
                embedding_domain[prev_word[rec[0]]] = np.array([float(r) for r in rec[1:]])

    # back off to subword vectors from the fastText .bin for words missing from the .vec file
    ftmodel = load_model(domain_emb + ".bin")
    for w in new_word:
        if embedding_domain[word_idx[w]].sum() == 0.:
            embedding_domain[word_idx[w]] = ftmodel.get_word_vector(w)

    with io.open(prep_dir + 'word_idx.json', 'w') as outfile:
        outfile.write(json.dumps(prev_word))
    np.save(prep_dir + 'gen.vec.npy', embedding_gen.astype('float32'))
    np.save(prep_dir + 'restaurant_emb.vec.npy', embedding_domain.astype('float32'))
    '../../TSD/augmented_labels/data/normalized/transcripts/swedish/test.txt')
tags_test = prepare_data.load_tags(
    '../../TSD/augmented_labels/data/normalized/ner/swedish/ner_test.txt')

# compare against conventional NER
# features_test = prepare_data.load_features_combined('../augmented_labels/data/normalized/features/test.npy')
# target_test = prepare_data.load_transcripts('output/parliament/e2e_asr_combined.txt')
# tags_test = prepare_data.load_tags('output/parliament/conventional_ner.txt')

features_test = features_test[:50]
target_test = target_test[:50]
tags_test = tags_test[:50]

print('Loading embeddings...')
embeddings = fasttext.load_model('weights/embeddings/cc.sv.300.bin')
print('Done...')

tag2idx = {'O': 1, 'PER': 2, 'LOC': 3, 'ORG': 4}
idx2tag = {1: 'O', 2: 'PER', 3: 'LOC', 4: 'ORG'}

with open('weights/char2idx_swe.pkl', 'rb') as f:
    char2idx = pickle.load(f)
with open('weights/idx2char_swe.pkl', 'rb') as f:
    idx2char = pickle.load(f)

char2idx['~'] = len(char2idx) + 1
idx2char[len(idx2char) + 1] = '~'

char2idx_ctc = {}
idx2char_ctc = {}
# trans.overSampling(file_prefix + train_file, "0", "1")
# trans.overSampling(file_prefix + valid_file, "0", "1")
# trans.overSampling(file_prefix + test_file, "0", "1")
# else:
#     trans.overSampling(file_prefix + train_file, "1", "2", "3")
#     trans.overSampling(file_prefix + valid_file, "1", "2", "3")
#     trans.overSampling(file_prefix + test_file, "1", "2", "3")

"""
get and save the training model
"""
model = [None for i in range(4)]
# for i in range(1, 4):
#     file_prefix = "villa/stage" + str(i) + "/"
#     model[i] = anal.train(file_prefix + train_file, file_prefix + valid_file)
#     model[i].save_model("villa/model_stage" + str(i) + "_jdComment.bin")

"""
test the training model
"""
for i in range(1, 4):
    model[i] = fasttext.load_model("villa/model_stage" + str(i) + "_jdComment.bin")
    # print(model[i].test("villa/stage" + str(i) + "/" + test_file))

tot = 0
bingo = 0
with open("villa/" + test_file) as infile:
    for row in infile:
        i = 9
        tag = ""
        while row[i] != ' ':
            tag += row[i]
            i += 1
        res = anal.predictComment(model, row[i + 1:-2])
        if res == tag:
            bingo += 1
        tot += 1
def load(self, path):
    return fasttext.load_model(path)
def load(cls, load_dir, batch_size=4, gpu=False, embedder_only=True):
    import fasttext

    if os.path.isfile(load_dir):
        return cls(model=fasttext.load_model(load_dir))
    else:
        logger.error(f"Fasttext model file does not exist at: {load_dir}")
def main():
    data_path = '/Users/ruizhang/Documents/NLP_dataset/'

    #############
    #
    ############

    # Load train set
    train_file = data_path + 'dbpedia_csv/train.csv'
    df = pd.read_csv(train_file, header=None, names=['class', 'name', 'description'])

    # Load test set
    test_file = data_path + 'dbpedia_csv/test.csv'
    df_test = pd.read_csv(test_file, header=None, names=['class', 'name', 'description'])

    # Mapping from class number to class name
    class_dict = {
        1: 'Company',
        2: 'EducationalInstitution',
        3: 'Artist',
        4: 'Athlete',
        5: 'OfficeHolder',
        6: 'MeanOfTransportation',
        7: 'Building',
        8: 'NaturalPlace',
        9: 'Village',
        10: 'Animal',
        11: 'Plant',
        12: 'Album',
        13: 'Film',
        14: 'WrittenWork'
    }
    df['class_name'] = df['class'].map(class_dict)
    df.head()

    #############
    #
    ############

    desc = df.groupby('class')
    desc.describe().transpose()

    # Transform datasets
    df_train_clean = clean_dataset(df, True, False)
    df_test_clean = clean_dataset(df_test, False, False)

    # Write files to disk
    train_file_clean = data_path + 'dbpedia.train'
    df_train_clean.to_csv(train_file_clean, header=None, index=False,
                          columns=['class', 'name', 'description'])
    test_file_clean = data_path + 'dbpedia.test'
    df_test_clean.to_csv(test_file_clean, header=None, index=False,
                         columns=['class', 'name', 'description'])

    # Train a classifier
    output_file = data_path + 'dp_model'
    classifier = fasttext.supervised(train_file_clean, output_file, label_prefix='__label__')

    result = classifier.test(test_file_clean)
    print('P@1:', result.precision)
    print('R@1:', result.recall)
    print('Number of examples:', result.nexamples)

    sentence1 = ['Picasso was a famous painter born in Malaga, Spain. He revolutionized the art in the 20th century.']
    labels1 = classifier.predict(sentence1)
    class1 = int(labels1[0][0])
    print("Sentence: ", sentence1[0])
    print("Label: %d; label name: %s" % (class1, class_dict[class1]))

    sentence2 = ['One of my favourite tennis players in the world is Rafa Nadal.']
    labels2 = classifier.predict_proba(sentence2)
    class2, prob2 = labels2[0][0]  # it returns class2 as string
    print("Sentence: ", sentence2[0])
    print("Label: %s; label name: %s; certainty: %f" % (class2, class_dict[int(class2)], prob2))

    sentence3 = ['Say what one more time, I dare you, I double-dare you m**********r!']
    number_responses = 3
    labels3 = classifier.predict_proba(sentence3, k=number_responses)
    print("Sentence: ", sentence3[0])
    for l in range(number_responses):
        class3, prob3 = labels3[0][l]
        print("Label: %s; label name: %s; certainty: %f" % (class3, class_dict[int(class3)], prob3))

    # Load train set
    train_file = data_path + 'amazon_review_polarity_train.csv'
    df_sentiment_train = pd.read_csv(train_file, header=None, names=['class', 'name', 'description'])

    # Load test set
    test_file = data_path + 'amazon_review_polarity_test.csv'
    df_sentiment_test = pd.read_csv(test_file, header=None, names=['class', 'name', 'description'])

    # Transform datasets
    df_train_clean = clean_dataset(df_sentiment_train, True, False)
    df_test_clean = clean_dataset(df_sentiment_test, False, False)

    # Write files to disk
    train_file_clean = data_path + 'amazon.train'
    df_train_clean.to_csv(train_file_clean, header=None, index=False,
                          columns=['class', 'name', 'description'])
    test_file_clean = data_path + 'amazon.test'
    df_test_clean.to_csv(test_file_clean, header=None, index=False,
                         columns=['class', 'name', 'description'])

    dim = 10
    lr = 0.1
    epoch = 5
    min_count = 1
    word_ngrams = 2
    bucket = 10000000
    thread = 12
    label_prefix = '__label__'

    # Train a classifier
    output_file = data_path + 'amazon_model'
    classifier = fasttext.supervised(train_file_clean, output_file, dim=dim, lr=lr, epoch=epoch,
                                     min_count=min_count, word_ngrams=word_ngrams, bucket=bucket,
                                     thread=thread, label_prefix=label_prefix)

    # Evaluate classifier
    result = classifier.test(test_file_clean)
    print('P@1:', result.precision)
    print('R@1:', result.recall)
    print('Number of examples:', result.nexamples)

    class_dict = {
        1: "Negative",
        2: "Positive"
    }

    sentence1 = ["The product design is nice but it's working as expected"]
    labels1 = classifier.predict_proba(sentence1)
    class1, prob1 = labels1[0][0]  # it returns class as string
    print("Sentence: ", sentence1[0])
    # print("Label: %s; label name: %s; certainty: %f" % (class1, class_dict[int(class1)], prob1))

    sentence2 = ["I bought the product a month ago and it was working correctly. But now is not working great"]
    labels2 = classifier.predict_proba(sentence2)
    class2, prob2 = labels2[0][0]  # it returns class as string
    print("Sentence: ", sentence2[0])
    # print("Label: %s; label name: %s; certainty: %f" % (class2, class_dict[int(class2)], prob2))

    url = "https://twitter.com/miguelgfierro/status/805827479139192832"
    response = urlopen(url).read()
    title = str(response).split('<title>')[1].split('</title>')[0]
    print(title)

    # # Format tweet
    # tweet = unescape(title)
    # print(tweet)
    #
    # # Classify tweet
    # label_tweet = classifier.predict_proba([tweet])
    # class_tweet, prob_tweet = label_tweet[0][0]
    # print("Label: %s; label name: %s; certainty: %f" % (class_tweet, class_dict[int(class_tweet)], prob_tweet))

    wiki_dataset_original = data_path + 'enwik9'
    wiki_dataset = data_path + 'text9'
    if not os.path.isfile(wiki_dataset):
        os.system("perl wikifil.pl " + wiki_dataset_original + " > " + wiki_dataset)

    output_skipgram = data_path + 'skipgram'
    if os.path.isfile(output_skipgram + '.bin'):
        skipgram = fasttext.load_model(output_skipgram + '.bin')
    else:
        skipgram = fasttext.skipgram(wiki_dataset, output_skipgram, lr=0.02, dim=50, ws=5,
                                     epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000,
                                     minn=3, maxn=6, thread=4, t=1e-4, lr_update_rate=100)
    print(np.asarray(skipgram['king']))
    print("Number of words in the model: ", len(skipgram.words))

    # Get the vector of some word
    Droyals = np.sqrt(pow(np.asarray(skipgram['king']) - np.asarray(skipgram['queen']), 2)).sum()
    print(Droyals)
    Dpeople = np.sqrt(pow(np.asarray(skipgram['king']) - np.asarray(skipgram['woman']), 2)).sum()
    print(Dpeople)
    Dpeople2 = np.sqrt(pow(np.asarray(skipgram['man']) - np.asarray(skipgram['woman']), 2)).sum()
    print(Dpeople2)
    print(len(skipgram.words))

    targets = ['man', 'woman', 'king', 'queen', 'brother', 'sister', 'father', 'mother',
               'grandfather', 'grandmother', 'cat', 'dog', 'bird', 'squirrel', 'horse', 'pig',
               'dove', 'wolf', 'kitten', 'puppy']
    classes = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
    X_target = []
    for w in targets:
        X_target.append(skipgram[w])
    X_target = np.asarray(X_target)

    word_list = list(skipgram.words)[:10000]
    X_subset = []
    for w in word_list:
        X_subset.append(skipgram[w])
    X_subset = np.asarray(X_subset)
    X_target = np.concatenate((X_subset, X_target))
    print(X_target.shape)

    X_tsne = TSNE(n_components=2, perplexity=40, init='pca', method='exact',
                  random_state=0, n_iter=200, verbose=2).fit_transform(X_target)
    print(X_tsne.shape)
    X_tsne_target = X_tsne[-20:, :]
    print(X_tsne_target.shape)

    plot_words(X_tsne_target, targets, classes=classes)
    plot_words(X_tsne_target, targets, xlimits=[0.5, 0.7], ylimits=[-3.7, -3.6])
def __init__(self):
    self.model = fasttext.load_model("model.ftz")
# -*- coding: utf-8 -*-
import jieba
import re
import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import fasttext
import os

####### train or load the model ###########
model_path = './tmp/senti_model.model'
if os.path.exists(model_path + '.bin'):
    classifier = fasttext.load_model(model_path + '.bin')
else:
    train_file = './tmp/training.txt'
    # model_path = './experiment/fasttext-classification/senti_model.model'
    classifier = fasttext.supervised(train_file, model_path, label_prefix="__label__")
    test_file = './tmp/test.txt'
    result = classifier.test(test_file)
    print(result.precision)
    print(result.recall)

######## load the stop-word list #######
stop_word_path = '../stopwords_cn.txt'
stop_word = []
with open(stop_word_path, 'r') as f:
    for line in f.readlines():
        stop_word.append(line.strip())

punction_list = list('、,。?!:;“”¥%&*@~#()】【,.?!;:" "')
def __init__(self, model):
    self.model = fasttext.load_model(model)