Example #1
    def test_load_skipgram_model(self):
        model = ft.load_model(skipgram_file, encoding='utf-8')

        # Make sure the model is returned correctly
        self.assertEqual(model.model_name, 'skipgram')

        # Make sure all params loaded correctly
        # see Makefile on target test-skipgram for the params
        self.assertEqual(model.dim, 100)
        self.assertEqual(model.ws, 5)
        self.assertEqual(model.epoch, 1)
        self.assertEqual(model.min_count, 1)
        self.assertEqual(model.neg, 5)
        self.assertEqual(model.loss_name, 'ns')
        self.assertEqual(model.bucket, 2000000)
        self.assertEqual(model.minn, 3)
        self.assertEqual(model.maxn, 6)
        self.assertEqual(model.lr_update_rate, 100)
        self.assertEqual(model.t, 1e-4)

        # Make sure the vectors have the right dimension
        self.assertEqual(len(model['the']), model.dim)

        # Make sure we support unicode characters
        unicode_str = 'Καλημέρα'
        self.assertTrue(unicode_str in model.words)
        self.assertEqual(len(model[unicode_str]), model.dim)
Example #2
def embed_titles(infile=TOPICS, outfile=EMBEDDED_TOPIC_TITLES_PATH):
    '''
    Vectorize topics with fasttext and save to file
    ntopic=188, embeddings_dim=100
    '''
    with open(infile, "r") as f:
        topics_json = json.load(f)

    topic_titles = []
    topic_vectors = []

    model = fasttext.load_model(EMBEDDINGS_MODEL_PATH)

    for i, topic in enumerate(topics_json):
        title = topic['title']
        topic_titles.append(title)
        # embed titles with fasttext
        topic_vectors.append(model[title])

    assert len(topic_vectors) == len(topic_titles)

    with open(outfile, 'wb') as f:
        pickle.dump(topic_vectors, f)

    return topic_titles, topic_vectors
Example #3
def embed_topics(infile=TOPICS, outfile=EMBEDDED_TOPICS_PATH):
    '''
    Vectorize topics with fasttext and save to file
    ntopic=188, embeddings_dim=100
    '''
    with open(infile, "r") as f:
        topics_json = json.load(f)

    topic_vectors = []

    model = fasttext.load_model(EMBEDDINGS_MODEL_PATH)

    for i, topic in enumerate(topics_json):
        title = topic['title']
        description = topic['description']
        narrative = topic['narrative']

        joint = " ".join([title, description, narrative])
        # embed topics with fasttext
        topic_vectors.append(model[joint])

    assert len(topic_vectors) == len(topics_json)

    with open(outfile, 'wb') as f:
        pickle.dump(topic_vectors, f)
Example #4
 def load_model(cls):
     """
     Load the model
     """
     config = get_config()
     model_path = '{}.bin'.format(config.get('train', 'model_path'))
     if os.path.exists(model_path):
         cls.__model = ft.load_model(model_path)
Example #5
 def __init__(self, model_path=EMBEDDINGS_MODEL_PATH,
              embeddings_path=EMBEDDED_TOPICS_PATH):
     # load fasttext model
     self.model = fasttext.load_model(model_path)
     # load topic titles
     self.topic_titles = load_titles()
     # load topic embeddings
     with open(embeddings_path, 'rb') as f:
         self.topic_vectors = pickle.load(f)
Example #6
    def load(self):
        if self.flavor == 'w2v':
            self.model = Word2Vec.load(self.path)
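            # L2-normalize the vectors in place to save memory (init_sims is deprecated in gensim 4.x)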
            self.model.init_sims(replace=True)
            self.size = self.model.size

        elif self.flavor == 'ft':
            self.model = fasttext.load_model(self.path + '.bin')
            self.size = self.model.dim

        self.fitted = True
Example #7
    def loadfromfile(cls, repdir, word_model_file):
        ont = pickle.load(open(repdir + '/ont.pickle', "rb"))

        class Config(object):
            def __init__(self, d):
                self.__dict__ = d
        config = Config(json.load(open(repdir+'/config.json', 'r')))

        word_model = fasttext.load_model(word_model_file)

        model = cls(config, ont, word_model)
        model.load_params(repdir)
        return model
Example #8
    def load(self, *args, **kwargs):
        """
        Load dict of embeddings from file
        Args:
            fname: file name
        """

        if self.load_path:
            if self.load_path.is_file():
                print("[loading embeddings from `{}`]".format(self.load_path))
                model_file = str(self.load_path)
                if self.emb_module == 'fasttext':
                    import fasttext as Fasttext
                    model = Fasttext.load_model(model_file)
                elif self.emb_module == 'pyfasttext':
                    from pyfasttext import FastText as Fasttext
                    model = Fasttext(model_file)
                else:
                    from gensim.models.wrappers.fasttext import FastText as Fasttext
                    model = Fasttext.load_fasttext_format(model_file)
            elif isinstance(self.load_path, Path):
                raise ConfigError("Provided `load_path` for {} doesn't exist!".format(
                    self.__class__.__name__))
        else:
            warn("No `load_path` is provided for {}".format(self.__class__.__name__))
            if self.embedding_url:
                try:
                    print('[trying to download a pretrained fasttext model from repository]')
                    local_filename, _ = urllib.request.urlretrieve(self.embedding_url)
                    with open(local_filename, 'rb') as fin:
                        model_file = fin.read()

                    mp = self.save_path
                    self.load_path = self.save_path
                    model = self.load()
                    print("[saving downloaded fasttext model to {}]".format(mp))
                    with open(str(mp), 'wb') as fout:
                        fout.write(model_file)
                except Exception as e:
                    raise RuntimeError(
                        'Looks like the provided fasttext url is incorrect', e)
            else:
                raise FileNotFoundError(
                    'No pretrained fasttext model provided or provided "load_path" is incorrect.'
                    ' Please include "load_path" to json.')

        return model
Example #9
    def test_load_cbow_model(self):
        model = ft.load_model(cbow_file)

        # Make sure the model is returned correctly
        self.assertEqual(model.model_name, 'cbow')

        # Make sure all params loaded correctly
        # see Makefile on target test-cbow for the params
        self.assertEqual(model.dim, 50)
        self.assertEqual(model.ws, 5)
        self.assertEqual(model.epoch, 1)
        self.assertEqual(model.min_count, 3)
        self.assertEqual(model.neg, 5)
        self.assertEqual(model.loss_name, 'ns')
        self.assertEqual(model.bucket, 2000000)
        self.assertEqual(model.minn, 3)
        self.assertEqual(model.maxn, 6)
        self.assertEqual(model.lr_update_rate, 100)
        self.assertEqual(model.t, 1e-4)

        # Make sure the vectors have the right dimension
        self.assertEqual(len(model.get_vector('the')), model.dim)
Example #10
 def __init__(self, model_path):
     self.model_path = model_path
     self.model = fasttext.load_model(model_path)
     self.n_labels = len(self.model.get_labels())
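     # entropy of a uniform distribution over n_labels classes,
     # i.e. the maximum entropy a prediction can have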
     self.max_entropy = -1 * np.log(1.0 / self.n_labels)
Example #11
def fasttext_predict(model_filepath, test_dataset):
    classifier = fasttext.load_model(model_filepath, label_prefix='__label__')
    result = classifier.test(test_dataset)
    print(result)
    return result
Example #12
# -*- coding:utf-8 -*-

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
import fasttext
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
# load the trained model
classifier = fasttext.load_model('comment_code_fasttext.model.bin',
                                 label_prefix='__label__')
result = classifier.test("test.txt")

print("precision:", result.precision)
print("recall:", result.recall)
labels_right = []
texts = []
with open("test.txt") as fr:
    lines = fr.readlines()
for line in lines:
    if line == '\n':
        continue
    labels_right.append(
        line.split("\t")[1].rstrip().replace("__label__", "").encode('utf-8'))
    texts.append(line.split("\t")[0].decode("utf-8"))
labels_predict = [e[0].encode('utf-8')
Example #13
 def load(self):
     if os.path.exists(self.model_path + '.bin'):
         return fasttext.load_model(self.model_path + '.bin')
     else:
         return None
Example #14
import numpy as np
import tensorflow as tf
import fasttext as ft
import math
import sys
from cnn_lstm_crf import CNN_BLSTM_CRF

Word2vec = ft.load_model('vi.bin')


def make_char_dictionary(data_path, dict_path):
    ### initialize dictionary set
    char_dictionary = ['<UNK>', '<PAD>']
    ### make character dictionary
    f = open(data_path, 'r')
    for row in f:
        row_split = row[:-1].split(' ')
        for word in row_split:
            for char in word:
                char_dictionary.append(char)
    f.close()
    ### remove duplicate characters
    char_dictionary = list(set(char_dictionary))
    ### save character dictionary
    f = open(dict_path, 'w')
    for char in char_dictionary:
        f.write(char + '\n')
    f.close()


def load_dictionary(dict_path):
Example #15

#train
#classfier=fasttext.supervised('news_fasttext_train.txt','news_fasttext.model',label_prefix='__label__')


#test

clf = fasttext.load_model('news_fasttext.model.bin',label_prefix = '__label__')
rel = clf.test('news_fasttext_test.txt')
print(rel.precision)
print(rel.recall)
'''

# test
clf = fasttext.load_model('news_fasttext.model.bin')
text = [
    '最高人民法宣宣布周某某因涉嫌贪污受贿,利用不正当手段为他人谋取各种利益等,判处其无期徒刑,剥夺政治权利终身。', '婚姻大事不必铺张浪费',
    '小编祝大家新年快乐', '中国大陆多次强调,编排出武力夺取台湾',
    '它被誉为天下第一果,补益气血,养阴生津,现在吃正应季!  六七月是桃子大量上市的季节,因其色泽红润,肉质鲜美,有个在实验基地里接受治疗的妹子。广受大众的喜爱。’'
]
label = clf.predict(text)
print(label)
'''

# train word vectors

model = fasttext.skipgram('news_fasttext_train.txt','model1')
Example #16
import re
import numpy as np
import fasttext
model = fasttext.load_model("cc.vi.300.bin")
import torch
from collections import Counter
from typing import List
# Load PhoBERT-base in fairseq
from fairseq.models.roberta import RobertaModel
phobert = RobertaModel.from_pretrained('PhoBERT_base_fairseq',
                                       checkpoint_file='model.pt')
phobert.eval()  # disable dropout (or leave in train mode to finetune)

# Incorporate the BPE encoder into PhoBERT-base
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq import options

parser = options.get_preprocessing_parser()
parser.add_argument('--bpe-codes',
                    type=str,
                    help='path to fastBPE BPE',
                    default="PhoBERT_base_fairseq/bpe.codes")
args = parser.parse_args()
phobert.bpe = fastBPE(args)


def align_bpe_to_words(roberta, bpe_tokens: torch.LongTensor,
                       other_tokens: List[str]):
    """
    Helper to align GPT-2 BPE to other tokenization formats (e.g., spaCy).
    Args:
Example #17
def exportByDistance(action, modelFileExtension, modelsFolder, fromYear,
                     toYear, neighborsCount, fasttextPath):
    """

    @param action:
    @param modelFileExtension:
    @param modelsFolder:
    @param fromYear:
    @param toYear:
    @param neighborsCount:
    @param fasttextPath:
    @return:
    @rtype: None
    """
    fromYearFilename = fromYear + modelFileExtension
    toYearFilename = toYear + modelFileExtension

    modelA = fasttext.load_model(os.path.join(modelsFolder, fromYearFilename))
    modelB = fasttext.load_model(os.path.join(modelsFolder, toYearFilename))

    clearVectorModelA = {}
    clearVectorModelB = {}

    for label in modelA.get_labels():
        clearVectorModelA[label] = modelA.get_word_vector(label)

    for label in modelB.get_labels():
        clearVectorModelB[label] = modelB.get_word_vector(label)

    # alignedEmbeddingsB = vector.alignTwoEmbeddings(clearVectorModelA, clearVectorModelB)

    results = {}
    for word in modelA.words:
        if word in modelB.words:
            if action == 'getCD':
                results[word] = vector.getCosineDistance(
                    clearVectorModelA[word], clearVectorModelB[word])
            elif action == 'getCS':
                results[word] = vector.getCosineSimilarity(
                    clearVectorModelA[word], clearVectorModelB[word])

    if action == 'getCD':
        sortedResults = sorted(results.items(),
                               key=lambda x: x[1],
                               reverse=True)
    elif action == 'getCS':
        sortedResults = sorted(results.items(), key=lambda x: x[1])

    resultsPerPeriod = {}

    for wordTuple in sortedResults[:50]:
        word = wordTuple[0]
        resultsPerPeriod[word] = {}

        resultsPerPeriod[word][str(fromYear)] = getNeighboursForWord(
            word, fromYearFilename, modelsFolder, fasttextPath, neighborsCount)
        resultsPerPeriod[word][str(toYear)] = getNeighboursForWord(
            word, toYearFilename, modelsFolder, fasttextPath, neighborsCount)

    # print(resultsPerPeriod)
    file.exportTextToFile(resultsPerPeriod, './shifts.json', True)
Example #18
 def __init__(self, model_path):
     super(FeatureGenerator, self).__init__()
     self.model = ft.load_model(model_path)
Example #19
 def __init__(self, dataset='yelp'):
     acc_path = 'hoang/acc_' + str(dataset) + '.bin'
     ppl_path = 'hoang/ppl_' + str(dataset) + '.bin'
     self.classifier = fasttext.load_model(acc_path)
     self.ppl_model = kenlm.Model(ppl_path)
     self.dataset = dataset
Example #20
import fasttext

INPUT_TXT = '/path/to/file.txt'
OUTPUT_PATH_SKIPGRAM = '/tmp/skipgram'
OUTPUT_PATH_CBOW = '/tmp/cbow'

# Learn the word representation using skipgram model
skipgram = fasttext.skipgram(INPUT_TXT, OUTPUT_PATH_SKIPGRAM, lr=0.02, dim=300, ws=5,
        epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000, minn=3, maxn=6,
        thread=4, t=1e-4, lr_update_rate=100)

# Get the vector of some word
print(skipgram['word'])

# Learn the word representation using cbow model
cbow = fasttext.cbow(INPUT_TXT, OUTPUT_PATH_CBOW, lr=0.02, dim=300, ws=5,
        epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000, minn=3, maxn=6,
        thread=4, t=1e-4, lr_update_rate=100)

# Get the vector of some word
print(cbow['word'])

# Load pre-trained skipgram model
SKIPGRAM_BIN = OUTPUT_PATH_SKIPGRAM + '.bin'
skipgram = fasttext.load_model(SKIPGRAM_BIN)
print(skipgram['word'])

# Load pre-trained cbow model
CBOW_BIN = OUTPUT_PATH_CBOW + '.bin'
cbow = fasttext.load_model(CBOW_BIN)
print(cbow['word'])
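
# Note: fasttext.skipgram and fasttext.cbow come from the old 0.8.x Python
# wrapper. With the current official fasttext package, a rough equivalent
# (a sketch, assuming the same corpus file) would be:
#
#     skipgram = fasttext.train_unsupervised(INPUT_TXT, model='skipgram', lr=0.02, dim=300)
#     print(skipgram.get_word_vector('word'))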

Example #21
import json
import pickle

import fasttext
from elasticsearch import Elasticsearch
from kafka import KafkaConsumer
from tunga.preprocessing import normalization
import spacy

es = Elasticsearch()

model = fasttext.load_model("/home/burak/Desktop/cc.en.300.bin")
rfc = pickle.load(open("../random_forest.model", 'rb'))
nlp = spacy.load('../../../data/buyuk')
consumer = KafkaConsumer('crawled_tweets',
                         bootstrap_servers=['localhost:9091'])
print(consumer)
i = 1
for message in consumer:
    data = json.loads(message.value)

    tweet_text = data["tweet"]
    tweet_text = tweet_text.lower()
    tweet_text = tweet_text.strip()
    tweet_text = tweet_text.replace("\n", " ")
    tweet_text = normalization.remove_url(tweet_text)
    tweet_text = normalization.remove_hashtag(tweet_text)
    tweet_text = normalization.remove_emojis(tweet_text)

    # embed the cleaned tweet with fasttext and classify its sentence vector with the random forest
    prediction = rfc.predict([model.get_sentence_vector(tweet_text)])[0]
    if prediction == 0:
Example #22
 def __init__(self, path_fasttext_model):
     self.fasttext_model = fasttext.load_model(path_fasttext_model)
Example #23
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import textrank
import random
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import fasttext
import fasttext.util
import pickle
import os
from nltk.corpus import stopwords


#model = KeyedVectors.load("models/normalized.model")
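# downloads cc.en.300.bin into the working directory if it is not already present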
fasttext.util.download_model('en', if_exists='ignore')
model = fasttext.load_model('cc.en.300.bin')
stop_words = set(stopwords.words('english'))
vocab = set(model.words)

cache={}
try:
    with open("cache.pkl", "rb") as f:
        cache=pickle.load(f)
except:
    pass

def shuffleDict(dictionary):
    keys = list(dictionary.keys())
    random.shuffle(keys)
    shuffled = {}
    for key in keys:
Example #24
from app import app, db
from app.parser import parse_entities
from app.models import Article, Entity
from app.entity import load_all_entities, clean_entity, get_entity_info
from app.summarize import summarize_text
from app.ner import extract_entities
from flask import request
import fasttext
import json
import requests
import re
from sacremoses import MosesTokenizer
import sentencepiece as spm

id_model = fasttext.load_model(r'/home/dion/Downloads/work/textonomy/backend/app/fasttext_w2v_indon.bin')
ms_model = fasttext.load_model(r'/home/dion/Downloads/work/textonomy/backend/app/fasttext_w2v_ms.bin')
ENT_DF = load_all_entities()

html = re.compile('<.*?>|&lt;.*?&gt;')
mtoken_source = MosesTokenizer(lang='id')
token_source = lambda text: mtoken_source.tokenize(re.sub(html, '', str(text)), return_str=True).strip().lower()

indon_sp = spm.SentencePieceProcessor()
indon_sp.load(r'/home/dion/Downloads/work/textonomy/backend/app/source.model')

eng_sp = spm.SentencePieceProcessor()
eng_sp.load(r'/home/dion/Downloads/work/textonomy/backend/app/target.model')


@app.route('/api/articles', methods=['GET'])
def get_all_articles():
Example #25
 def load_model(self):
     loaded_model = fasttext.load_model('fasttext_sarcasm.ftz')
     return loaded_model
Example #26
import fasttext
import pandas as pd
import pdb
from tqdm import tqdm

model1 = fasttext.load_model("./model/fasttext_1.bin")
model2 = fasttext.load_model("./model/fasttext_2.bin")
model3 = fasttext.load_model("./model/fasttext_3.bin")
test = pd.read_csv("../../data/test.txt", header=None, sep='\t')
checking = pd.read_csv("./checking_sheet.csv")

answers = []
for sent in tqdm(test[0]):
    checking['score'] = 0
    ans = model1.predict(sent, k=10)
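    # predict returns (labels, scores); each label carries the 9-character
    # "__label__" prefix, which the [9:] slice below strips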
    for i in range(10):
        label = ans[0][i][9:]
        score = ans[1][i]
        checking.loc[checking['level1'] == label, 'score'] += score

    ans = model2.predict(sent, k=10)
    for i in range(10):
        label = ans[0][i][9:]
        score = ans[1][i]
        checking.loc[checking['level2'] == label, 'score'] += score

    ans = model3.predict(sent, k=10)
    for i in range(10):
        label = ans[0][i][9:]
        score = ans[1][i]
Example #27
import fasttext
import numpy as np
import scipy
import nltk

PRETRAINED_MODEL_PATH = "vectors/english/cc.en.300.bin"
model = fasttext.load_model(PRETRAINED_MODEL_PATH)


def cos_similarity(sentence, word):
    sent1_emb = model.get_sentence_vector(sentence)
    sent2_emb = model.get_word_vector(word)
    return (1 - scipy.spatial.distance.cosine(sent1_emb, sent2_emb))


good_barometer = "good"
bad_barometer = "bad"

test_good_sentence = "Wow, this is a really great sentence. I love it."
test_bad_sentence = "This is terrible. I hate it."

good_good = cos_similarity(test_good_sentence, good_barometer)
good_bad = cos_similarity(test_good_sentence, bad_barometer)
bad_bad = cos_similarity(test_bad_sentence, bad_barometer)
bad_good = cos_similarity(test_bad_sentence, good_barometer)

print("How good is the test good sentence?", good_good)
print("How bad is the test good sentence?", good_bad)
print("Test good sentence is most likely '{}'.".format(
    "good" if good_good > good_bad else "bad"))
print("How bad is the test bad sentence?", bad_bad)
Example #28
 def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
     super().__init__(component_config)
     path = os.path.join(component_config["cache_dir"], component_config["file"])
     self.model = fasttext.load_model(path)
Example #29
from flask import Flask, render_template, request, jsonify, session
from pipeline import *

app = Flask(__name__)
store = RedisStore(redis.StrictRedis())
KVSessionExtension(store, app)
app.secret_key = 'PZ2HKD7WIAM1D708OE9I78KZ0'

data_path = os.path.join('..', 'project_historian')
models_path = os.path.join(data_path, 'models')
rss_path = os.path.join(data_path, 'rss_data')

model_path = os.path.join(models_path, 'fasttext_model.bin')
db_path = os.path.join(rss_path, 'rss_database.db')

model = fasttext.load_model(model_path)
print('- Model loaded successfully.')


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/query', methods=['POST'])
def query():
    keywords = []
    if 'keywords' not in request.form:
        return None
    t_keywords = request.form['keywords'].split(',')
    keywords = ['_'.join(kw.strip().split()) for kw in t_keywords]
Example #30
def initialization():
    global logging_level

    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__)
    parser.add_argument('input',
                        nargs='?',
                        type=argparse.FileType('rt', errors="replace"),
                        default=io.TextIOWrapper(sys.stdin.buffer,
                                                 errors="replace"),
                        help="Tab-separated bilingual tagged file")
    parser.add_argument('output',
                        nargs='?',
                        type=argparse.FileType('wt'),
                        default=sys.stdout,
                        help="Output of the classification")
    parser.add_argument(
        '--annotated_output',
        default=False,
        action='store_true',
        help=
        "Adds an extra column with each sentence's evaluation (\"keep\" if the sentence is good, otherwise the reason for rejecting it)"
    )

    #groupM = parser.add_argument_group('Mandatory')
    #groupM.add_argument("-s", "--source_lang", type=str, required=True, help="Source language (SL) of the input")
    #groupM.add_argument("-t", "--target_lang", type=str, required=True, help="Target language (TL) of the input")

    groupO = parser.add_argument_group('Optional')
    groupO.add_argument(
        '--tmp_dir',
        default=gettempdir(),
        help=
        "Temporary directory where this program creates its temporary files"
    )
    groupO.add_argument('-b',
                        '--block_size',
                        type=int,
                        default=10000,
                        help="Sentence pairs per block")
    groupO.add_argument('-p',
                        '--processes',
                        type=int,
                        default=max(1,
                                    cpu_count() - 1),
                        help="Number of processes to use")

    groupO.add_argument('--disable_lang_ident',
                        default=False,
                        action='store_true',
                        help="Don't apply rules that use language detecting")
    groupO.add_argument('--disable_minimal_length',
                        default=False,
                        action='store_true',
                        help="Don't apply minimal length rule")
    groupO.add_argument('--disable_porn_removal',
                        default=False,
                        action='store_true',
                        help="Don't apply p**n removal")

    groupO.add_argument("-s",
                        "--source_lang",
                        type=str,
                        default=None,
                        help="Source language (SL) of the input")
    groupO.add_argument("-t",
                        "--target_lang",
                        type=str,
                        default=None,
                        help="Target language (TL) of the input")

    groupO.add_argument("--scol",
                        default=1,
                        type=check_positive,
                        help="Source sentence column (starting in 1)")
    groupO.add_argument("--tcol",
                        default=2,
                        type=check_positive,
                        help="Target sentence column (starting in 1)")

    groupO.add_argument("-S",
                        "--source_tokenizer_command",
                        default=None,
                        type=str,
                        help="Source language (SL) tokenizer full command")
    groupO.add_argument("-T",
                        "--target_tokenizer_command",
                        default=None,
                        type=str,
                        help="Target language (TL) tokenizer full command")

    #LM  filtering
    groupO.add_argument('--disable_lm_filter',
                        default=False,
                        action='store_true',
                        help="Don't apply LM filtering")
    groupO.add_argument('--metadata',
                        type=argparse.FileType('r'),
                        default=None,
                        help="Training metadata (YAML file)")
    groupO.add_argument('--lm_threshold',
                        type=check_positive_between_zero_and_one,
                        default=0.5,
                        help="Threshold for language model fluency scoring.")
    #groupO.add_argument('--keep_lm_result',action='store_true', help="Add an additional column to the results with the language model fluency score.")

    # Logging group
    groupL = parser.add_argument_group('Logging')
    groupL.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Silent logging mode')
    groupL.add_argument('--debug',
                        action='store_true',
                        help='Debug logging mode')
    groupL.add_argument('--logfile',
                        type=argparse.FileType('a'),
                        default=sys.stderr,
                        help="Store log to a file")
    #groupL.add_argument('-v', '--version', action='version', version="%(prog)s " + __version__, help="show version of this script and exit")

    args = parser.parse_args()
    logging_setup(args)

    logging_level = logging.getLogger().level

    # Ensure that directory exists; if not, create it
    if not os.path.exists(args.tmp_dir):
        os.makedirs(args.tmp_dir)

    # Try loading metadata for LM filtering and porn removal
    if not (args.disable_lm_filter
            and args.disable_porn_removal) and args.metadata is not None:
        logging.info("Loading metadata info")

        try:
            args.metadata_yaml = yaml.safe_load(args.metadata)
            args.metadata_yaml["yamlpath"] = os.path.dirname(
                os.path.abspath(args.metadata.name))

            if not ("source_lm" in args.metadata_yaml
                    and "target_lm" in args.metadata_yaml):
                args.disable_lm_filter = True
                logging.warning("LM file not present in metadata.")
            if not ("porn_removal_file" in args.metadata_yaml):
                args.disable_porn_removal = True
                logging.warning(
                    "P**n removal classifier not present in metadata.")
            else:
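                # try the classifier path relative to the metadata file's
                # directory first, then fall back to the path exactly as given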
                try:
                    args.porn_removal = fasttext.load_model(
                        os.path.join(args.metadata_yaml["yamlpath"],
                                     args.metadata_yaml['porn_removal_file']))
                except:
                    args.porn_removal = fasttext.load_model(
                        args.metadata_yaml['porn_removal_file'])

            if "source_tokenizer_command" in args.metadata_yaml:
                args.source_tokenizer_command = args.metadata_yaml[
                    "source_tokenizer_command"]
            if "target_tokenizer_command" in args.metadata_yaml:
                args.target_tokenizer_command = args.metadata_yaml[
                    "target_tokenizer_command"]

            parser.set_defaults(**args.metadata_yaml)

        except:
            logging.warning("Error loading metadata.")
            args.disable_lm_filter = True
            args.disable_porn_removal = True
            traceback.print_exc()
            #sys.exit(1)
    else:
        if args.metadata is None:
            logging.warning("Metadata file not provided.")
            args.disable_lm_filter = True
            args.disable_porn_removal = True

    if args.source_lang is None or args.target_lang is None:
        if args.metadata is None:
            logging.error("No source or target languages provided.")
            sys.exit(1)
        else:
            try:
                if not "metadata_yaml" in args or args.metadata_yaml == None:
                    args.metadata_yaml = yaml.safe_load(args.metadata)
                #args.metadata_yaml["yamlpath"] = os.path.dirname(os.path.abspath(args.metadata.name))

                args.source_lang = args.metadata_yaml["source_lang"]
                args.target_lang = args.metadata_yaml["target_lang"]
            except:
                traceback.print_exc()
                logging.error(
                    "Error retrieving source or target languages from metadata."
                )
                sys.exit(1)

    if args.disable_lm_filter:
        logging.info("LM filtering disabled.")
    if args.disable_porn_removal:
        logging.info("P**n removal disabled.")

    return args
Example #31
def prediction(n_clicks, uploaded_filenames):
    threshold = 0.80
    global mapping_dict
    print(n_clicks, uploaded_filenames)
    if n_clicks is not None and uploaded_filenames is not None and threshold is not None:
        page_text = ''
        doc = fitz.open(".\\assets\\docs\\" + uploaded_filenames[0])
        print(doc.pageCount)
        for i in range(doc.pageCount):
            page = doc.loadPage(i)
            page_str = page.getText("text")
            page_text = page_text + page_str
        text = page_text.lower()

        # /*********************Remove number*******************/
        text = re.sub(r'\d+', ' ', text)
        # /*****************Remove Punctuation****************/
        text = re.sub(r'[^\w\s]', ' ', text)
        # /*****************Remove \xa0****************/
        text = re.sub(r'\xa0', '', text)
        # /*****************Remove \x0c****************/
        text = re.sub(r'\x0c', '', text)
        #    /*****************Remove stop words************/
        token_text = word_tokenize(text)
        tokens_without_sw = [word for word in token_text if not word in stop_words]
        text_stem = [ps.stem(word) for word in tokens_without_sw]
        text = (" ").join(text_stem)
        # /***************Remove space line character*********/
        text = text.replace('\n', ' ')
        # /********************Remove duplicate space**********/
        text = " ".join(text.split())  # /**********Common word removal************/

        model = fasttext.load_model(join(project_root,'RTA_Future_Scanner.bin'))
        # call predict once; it returns (labels, probabilities) sorted by probability
        labels, probabilities = model.predict(text, k=-1)
        predicted_label_1 = labels[0]
        predicted_label_1_probab = probabilities[0]

        predicted_label_2 = labels[1]
        predicted_label_2_probab = probabilities[1]
        predicted_label_3 = labels[2]
        predicted_label_3_probab = probabilities[2]
        predicted_label_4 = labels[3]
        predicted_label_4_probab = probabilities[3]

        predicted_label_1 = predicted_label_1.replace("__label__", '').replace("__n_", '')
        predicted_label_2 = predicted_label_2.replace("__label__", '').replace("__n_", '')
        predicted_label_3 = predicted_label_3.replace("__label__", '').replace("__n_", '')
        predicted_label_4 = predicted_label_4.replace("__label__", '').replace("__n_", '')

        predicted_label_1 = " ".join(re.findall('[3]*[A-Z][a-z]*', predicted_label_1))
        predicted_label_1 = mapping_dict.get("".join(predicted_label_1.lower().split(" ")), predicted_label_1)
        predicted_label_2 = " ".join(re.findall('[3]*[A-Z][a-z]*', predicted_label_2))
        predicted_label_2 = mapping_dict.get("".join(predicted_label_2.lower().split(" ")), predicted_label_2)
        predicted_label_3 = " ".join(re.findall('[3]*[A-Z][a-z]*', predicted_label_3))
        predicted_label_3 = mapping_dict.get("".join(predicted_label_3.lower().split(" ")), predicted_label_3)
        predicted_label_4 = " ".join(re.findall('[3]*[A-Z][a-z]*', predicted_label_4))
        predicted_label_4 = mapping_dict.get("".join(predicted_label_4.lower().split(" ")), predicted_label_4)

        Confidence_Score = '-'
        j1 = ""
        j2 = 0
        j3 = ""
        j4 = 0
        j5 = ""
        j6 = 0

        # keep adding top labels until their cumulative probability reaches the threshold
        if predicted_label_1_probab >= threshold:
            Sample1 = []
            Sample1.append(predicted_label_1)
            Sample1.append(predicted_label_1_probab)
            j1 = (Sample1[0])
            j2 = round(Sample1[1], 2)
        elif (predicted_label_1_probab + predicted_label_2_probab) >= threshold:
            Sample1 = []
            Sample1.append(predicted_label_1)
            Sample1.append(predicted_label_1_probab)
            Sample2 = []
            Sample2.append(predicted_label_2)
            Sample2.append(predicted_label_2_probab)
            j1 = (Sample1[0])
            j2 = round(Sample1[1], 2)
            j3 = Sample2[0]
            j4 = round(Sample2[1], 2)
        elif (predicted_label_1_probab + predicted_label_2_probab + predicted_label_3_probab) >= threshold:
            Sample1 = []
            Sample1.append(predicted_label_1)
            Sample1.append(predicted_label_1_probab)
            Sample2 = []
            Sample2.append(predicted_label_2)
            Sample2.append(predicted_label_2_probab)
            Sample3 = []
            Sample3.append(predicted_label_3)
            Sample3.append(predicted_label_3_probab)
            j1 = (Sample1[0])
            j2 = round(Sample1[1], 2)
            j3 = Sample2[0]
            j4 = round(Sample2[1], 2)
            j5 = Sample3[0]
            j6 = round(Sample3[1], 2)
        else:
            j1 = '-'
            j2 = '-'
            j3 = '-'
            j4 = '-'
            j5 = '-'
            j6 = '-'
        j2 = str(j2 * 100) + " %" if j2 != "-" else str(j2)
        j4 = str(j4 * 100) + " %" if j4 != "-" else str(j4)
        j6 = str(j6 * 100) + " %" if j6 != "-" else str(j6)

        text_lem = [wn.lemmatize(word) for word in tokens_without_sw]
        word_text = (" ").join(text_lem)
        # /***************Remove space line character*********/
        word_text = word_text.replace('\n', ' ')
        #    /********************Remove duplicate space**********/
        word_text = " ".join(word_text.split())
        tfidf_vectorizer = TfidfVectorizer()
        top_unigram_words = wc(word_text, tfidf_vectorizer)
        tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 2))
        top_bigram_words = wc(word_text, tfidf_vectorizer)
        tfidf_vectorizer = TfidfVectorizer(ngram_range=(3, 3))
        top_trigram_words = wc(word_text, tfidf_vectorizer)
        test = []
        test = top_unigram_words
        # + top_bigram_words + top_trigram_words
        d = {}
        for a, x in test:
            d[a] = 10 * x

        wordcloud = WordCloud(width=1450, height=700, background_color='white')
        wordcloud.generate_from_frequencies(frequencies=d)
        wordcloud.to_file(join(data_Path,'wc.png'))

        # words = list(d.keys())
        # weights = [round(each) for each in list(d.values())]
        wordcloud_fig = word_cloud()
        df = {
            "Prediction": [j1, j3, j5],
            "Probability": [j2, j4, j6],
        }

        df = pd.DataFrame(df)
        df = df[df['Prediction'] != ""]
        # fig.show()
        print("reached here=====================================================1")

        return df.to_dict('records'), [{"name": i, "id": i} for i in df.columns], wordcloud_fig, ""
Example #32
import fasttext

model = fasttext.load_model(
    '/users/aaronrank/developer/recipe-ai/recipeai/recipes/.ingredient_classifier_model'
)


def predict(text):
    '''Returns label, confidence'''
    clf = model.predict(text)
    return clf[0][0], clf[1][0]
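
# A minimal usage sketch for the helper above (the sample text is hypothetical):
#
#     label, confidence = predict('2 cups flour')
#     print(label, confidence)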
Example #33
def prep_emb(fn, gen_emb, domain_emb, prep_dir, gen_dim=300, domain_dim=100):
    text = []
    with open(fn) as f:
        for line in f:
            ob = json.loads(line)
            review = ob["text"]
            token = word_tokenize(review)
            text=text+token
    vocab = sorted(set(text))
    word_idx = {}
    if os.path.exists(prep_dir+'word_idx.json'):
        with io.open(prep_dir+'word_idx.json') as f:
            prev_word = json.load(f)
    else:
        prev_word = {}
    wx = 0
    new_word = []
    for word in vocab:
        if word not in prev_word:
            wx = wx+1
            new_word.append(word)
            word_idx[word] = wx+len(prev_word)
    prev_word.update(word_idx)          
    if new_word == []:
        return
    # create embedding
    embedding_gen=np.zeros((len(prev_word)+2, gen_dim) )
    embedding_domain=np.zeros((len(prev_word)+2, domain_dim) )    
    if os.path.exists(prep_dir+'gen.vec.npy'):
        gen_emb_prev=np.load(prep_dir+"gen.vec.npy")
        embedding_gen[:gen_emb_prev.shape[0],:] = gen_emb_prev
    if os.path.exists(prep_dir+'restaurant_emb.vec.npy'):
        domain_emb_prev=np.load(prep_dir+'restaurant_emb.vec.npy')
        embedding_domain[:domain_emb_prev.shape[0],:] = domain_emb_prev
    with open(gen_emb) as f:
        # read the embedding .vec file
        for l in f:
            rec=l.rstrip().split(' ')
            if len(rec)==2: #skip the first line.
                continue 
            # if the word in word_idx, fill the embedding
            if rec[0] in new_word:
                embedding_gen[prev_word[rec[0]]] = np.array([float(r) for r in rec[1:] ])
    with open(domain_emb) as f:
        # read the embedding .vec file
        for l in f:
            # for each line, get the word and its vector
            rec=l.rstrip().split(' ')
            if len(rec)==2: #skip the first line.
                continue
            # if the word in word_idx, fill the embedding
            if rec[0] in new_word:
                embedding_domain[prev_word[rec[0]]] = np.array([float(r) for r in rec[1:] ])
    ftmodel = load_model(domain_emb+".bin")
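    # back-fill any new word still missing a vector with subword vectors from the binary model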
    for w in new_word:
        if embedding_domain[word_idx[w] ].sum()==0.:
            embedding_domain[word_idx[w] ] = ftmodel.get_word_vector(w)
    with io.open(prep_dir+'word_idx.json', 'w') as outfile:
        outfile.write(json.dumps(prev_word)) 
    np.save(prep_dir+'gen.vec.npy', embedding_gen.astype('float32') )
    np.save(prep_dir+'restaurant_emb.vec.npy', embedding_domain.astype('float32') )    
Example #34
        '../../TSD/augmented_labels/data/normalized/transcripts/swedish/test.txt'
    )
    tags_test = prepare_data.load_tags(
        '../../TSD/augmented_labels/data/normalized/ner/swedish/ner_test.txt')

    # compare againt conventional NER
    #features_test = prepare_data.load_features_combined('../augmented_labels/data/normalized/features/test.npy')
    #target_test = prepare_data.load_transcripts('output/parliament/e2e_asr_combined.txt')
    #tags_test = prepare_data.load_tags('output/parliament/conventional_ner.txt')

    features_test = features_test[:50]
    target_test = target_test[:50]
    tags_test = tags_test[:50]

    print('Loading embeddings...')
    embeddings = fasttext.load_model('weights/embeddings/cc.sv.300.bin')
    print('Done...')

    tag2idx = {'O': 1, 'PER': 2, 'LOC': 3, 'ORG': 4}
    idx2tag = {1: 'O', 2: 'PER', 3: 'LOC', 4: 'ORG'}

    with open('weights/char2idx_swe.pkl', 'rb') as f:
        char2idx = pickle.load(f)
    with open('weights/idx2char_swe.pkl', 'rb') as f:
        idx2char = pickle.load(f)

    char2idx['~'] = len(char2idx) + 1
    idx2char[len(idx2char) + 1] = '~'

    char2idx_ctc = {}
    idx2char_ctc = {}
Example #35
# 	trans.overSampling(file_prefix + train_file, "0", "1")
# 	trans.overSampling(file_prefix + valid_file, "0", "1")
# 	trans.overSampling(file_prefix + test_file, "0", "1")
# else:
# 	trans.overSampling(file_prefix + train_file, "1", "2", "3")
# 	trans.overSampling(file_prefix + valid_file, "1", "2", "3")
# 	trans.overSampling(file_prefix + test_file, "1", "2", "3")
""" get and save the training model """
model = [None for i in range(4)]
# for i in range(1, 4):
# file_prefix = "villa/stage" + str(i) + "/"
# model[i] = anal.train(file_prefix + train_file, file_prefix + valid_file)
# model[i].save_model("villa/model_stage" + str(i) + "_jdComment.bin")
""" test the training model """
for i in range(1, 4):
    model[i] = fasttext.load_model("villa/model_stage" + str(i) +
                                   "_jdComment.bin")
    # print(model[i].test("villa/stage" + str(i) + "/" + test_file))

tot = 0
bingo = 0
with open("villa/" + test_file) as infile:
    for row in infile:
        i = 9  # skip the 9-character "__label__" prefix
        tag = ""
        while row[i] != ' ':
            tag += row[i]
            i += 1
        res = anal.predictComment(model, row[i + 1:-2])
        if res == tag:
            bingo += 1
        tot += 1
Example #36
 def load(self, path):
     return fasttext.load_model(path)
Example #37
 def load(cls, load_dir, batch_size=4, gpu=False, embedder_only=True):
     import fasttext
     if os.path.isfile(load_dir):
         return cls(model=fasttext.load_model(load_dir))
     else:
         logger.error(f"Fasttext model file does not exist at: {load_dir}")
Example #38
def main():
    data_path = '/Users/ruizhang/Documents/NLP_dataset/'


    #############
    #
    ############
    # Load train set
    train_file = data_path +'dbpedia_csv/train.csv'
    df = pd.read_csv(train_file, header=None, names=['class', 'name', 'description'])

    # Load test set
    test_file = data_path + 'dbpedia_csv/test.csv'
    df_test = pd.read_csv(test_file, header=None, names=['class', 'name', 'description'])

    # Mapping from class number to class name
    class_dict = {
        1: 'Company',
        2: 'EducationalInstitution',
        3: 'Artist',
        4: 'Athlete',
        5: 'OfficeHolder',
        6: 'MeanOfTransportation',
        7: 'Building',
        8: 'NaturalPlace',
        9: 'Village',
        10: 'Animal',
        11: 'Plant',
        12: 'Album',
        13: 'Film',
        14: 'WrittenWork'
    }
    df['class_name'] = df['class'].map(class_dict)
    df.head()

    #############
    #
    ############
    desc = df.groupby('class')
    desc.describe().transpose()

    # Transform datasets
    df_train_clean = clean_dataset(df, True, False)
    df_test_clean = clean_dataset(df_test, False, False)

    # Write files to disk
    train_file_clean = data_path + 'dbpedia.train'
    df_train_clean.to_csv(train_file_clean, header=None, index=False, columns=['class', 'name', 'description'])

    test_file_clean = data_path + 'dbpedia.test'
    df_test_clean.to_csv(test_file_clean, header=None, index=False, columns=['class', 'name', 'description'])

    # Train a classifier
    output_file = data_path + 'dp_model'
    classifier = fasttext.supervised(train_file_clean, output_file, label_prefix='__label__')

    result = classifier.test(test_file_clean)
    print('P@1:', result.precision)
    print('R@1:', result.recall)
    print('Number of examples:', result.nexamples)

    sentence1 = ['Picasso was a famous painter born in Malaga, Spain. He revolutionized the art in the 20th century.']
    labels1 = classifier.predict(sentence1)
    class1 = int(labels1[0][0])
    print("Sentence: ", sentence1[0])
    print("Label: %d; label name: %s" % (class1, class_dict[class1]))

    sentence2 = ['One of my favourite tennis players in the world is Rafa Nadal.']
    labels2 = classifier.predict_proba(sentence2)
    class2, prob2 = labels2[0][0]  # it returns class2 as string
    print("Sentence: ", sentence2[0])
    print("Label: %s; label name: %s; certainty: %f" % (class2, class_dict[int(class2)], prob2))

    sentence3 = ['Say what one more time, I dare you, I double-dare you m**********r!']
    number_responses = 3
    labels3 = classifier.predict_proba(sentence3, k=number_responses)
    print("Sentence: ", sentence3[0])
    for l in range(number_responses):
        class3, prob3 = labels3[0][l]
        print("Label: %s; label name: %s; certainty: %f" % (class3, class_dict[int(class3)], prob3))

    # Load train set
    train_file = data_path + 'amazon_review_polarity_train.csv'
    df_sentiment_train = pd.read_csv(train_file, header=None, names=['class', 'name', 'description'])

    # Load test set
    test_file = data_path + 'amazon_review_polarity_test.csv'
    df_sentiment_test = pd.read_csv(test_file, header=None, names=['class', 'name', 'description'])

    # Transform datasets
    df_train_clean = clean_dataset(df_sentiment_train, True, False)
    df_test_clean = clean_dataset(df_sentiment_test, False, False)

    # Write files to disk
    train_file_clean = data_path + 'amazon.train'
    df_train_clean.to_csv(train_file_clean, header=None, index=False, columns=['class', 'name', 'description'])

    test_file_clean = data_path + 'amazon.test'
    df_test_clean.to_csv(test_file_clean, header=None, index=False, columns=['class', 'name', 'description'])

    dim = 10
    lr = 0.1
    epoch = 5
    min_count = 1
    word_ngrams = 2
    bucket = 10000000
    thread = 12
    label_prefix = '__label__'

    # Train a classifier
    output_file = data_path + 'amazon_model'
    classifier = fasttext.supervised(train_file_clean, output_file, dim=dim, lr=lr, epoch=epoch,
                                     min_count=min_count, word_ngrams=word_ngrams, bucket=bucket,
                                     thread=thread, label_prefix=label_prefix)

    # Evaluate classifier
    result = classifier.test(test_file_clean)
    print('P@1:', result.precision)
    print('R@1:', result.recall)
    print('Number of examples:', result.nexamples)

    class_dict = {
        1: "Negative",
        2: "Positive"
    }

    sentence1 = ["The product design is nice but it's working as expected"]
    labels1 = classifier.predict_proba(sentence1)
    class1, prob1 = labels1[0][0]  # it returns class as string
    print("Sentence: ", sentence1[0])
    # print("Label: %s; label name: %s; certainty: %f" % (class1, class_dict[int(class1)], prob1))

    sentence2 = ["I bought the product a month ago and it was working correctly. But now is not working great"]
    labels2 = classifier.predict_proba(sentence2)
    class2, prob2 = labels2[0][0]  # it returns class as string
    print("Sentence: ", sentence2[0])
    # print("Label: %s; label name: %s; certainty: %f" % (class2, class_dict[int(class2)], prob2))

    url = "https://twitter.com/miguelgfierro/status/805827479139192832"
    response = urlopen(url).read()
    title = str(response).split('<title>')[1].split('</title>')[0]
    print(title)

    # # Format tweet
    # tweet = unescape(title)
    # print(tweet)
    #
    # # Classify tweet
    # label_tweet = classifier.predict_proba([tweet])
    # class_tweet, prob_tweet = label_tweet[0][0]
    # print("Label: %s; label name: %s; certainty: %f" % (class_tweet, class_dict[int(class_tweet)], prob_tweet))


    wiki_dataset_original = data_path + 'enwik9'
    wiki_dataset = data_path + 'text9'
    if not os.path.isfile(wiki_dataset):
        os.system("perl wikifil.pl " + wiki_dataset_original + " > " + wiki_dataset)

    output_skipgram = data_path + 'skipgram'
    if os.path.isfile(output_skipgram + '.bin'):
        skipgram = fasttext.load_model(output_skipgram + '.bin')
    else:
        skipgram = fasttext.skipgram(wiki_dataset, output_skipgram, lr=0.02, dim=50, ws=5,
                                     epoch=1, min_count=5, neg=5, loss='ns', bucket=2000000, minn=3, maxn=6,
                                     thread=4, t=1e-4, lr_update_rate=100)
    print(np.asarray(skipgram['king']))

    print("Number of words in the model: ", len(skipgram.words))

    # Get the vector of some word
    Droyals = np.sqrt(pow(np.asarray(skipgram['king']) - np.asarray(skipgram['queen']), 2)).sum()
    print(Droyals)
    Dpeople = np.sqrt(pow(np.asarray(skipgram['king']) - np.asarray(skipgram['woman']), 2)).sum()
    print(Dpeople)
    Dpeople2 = np.sqrt(pow(np.asarray(skipgram['man']) - np.asarray(skipgram['woman']), 2)).sum()
    print(Dpeople2)

    print(len(skipgram.words))
    targets = ['man', 'woman', 'king', 'queen', 'brother', 'sister', 'father', 'mother', 'grandfather', 'grandmother',
               'cat', 'dog', 'bird', 'squirrel', 'horse', 'pig', 'dove', 'wolf', 'kitten', 'puppy']
    classes = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
               2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
    X_target = []
    for w in targets:
        X_target.append(skipgram[w])
    X_target = np.asarray(X_target)
    word_list = list(skipgram.words)[:10000]
    X_subset = []
    for w in word_list:
        X_subset.append(skipgram[w])
    X_subset = np.asarray(X_subset)
    X_target = np.concatenate((X_subset, X_target))
    print(X_target.shape)
    X_tsne = TSNE(n_components=2, perplexity=40, init='pca', method='exact',
                  random_state=0, n_iter=200, verbose=2).fit_transform(X_target)
    print(X_tsne.shape)
    X_tsne_target = X_tsne[-20:, :]
    print(X_tsne_target.shape)
    plot_words(X_tsne_target, targets, classes=classes)
    plot_words(X_tsne_target, targets, xlimits=[0.5, 0.7], ylimits=[-3.7, -3.6])
Example #39
 def __init__(self):
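     # .ftz files are quantized (compressed) fasttext models; load_model handles both .bin and .ftz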
     self.model = fasttext.load_model("model.ftz")
Example #40
# -*- coding: utf-8 -*-
import jieba
import re
import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import fasttext
import os

####### Train the model ###########
model_path = './tmp/senti_model.model'
if os.path.exists(model_path + '.bin'):
    classifier = fasttext.load_model(model_path + '.bin')
else:
    train_file = './tmp/training.txt'
    # model_path = './experiment/fasttext-classification/senti_model.model'
    classifier = fasttext.supervised(train_file,
                                     model_path,
                                     label_prefix="__label__")

    test_file = './tmp/test.txt'
    result = classifier.test(test_file)
    print(result.precision)
    print(result.recall)

######## Load the stopword list #######
stop_word_path = '../stopwords_cn.txt'
stop_word = []
with open(stop_word_path, 'r') as f:
    for line in f.readlines():
        stop_word.append(line.strip())
punction_list = list('、,。?!:;“”¥%&*@~#()】【,.?!;:" "')
Example #41
 def __init__(self, model):
     self.model = fasttext.load_model(model)