Exemplo n.º 1
0
class FastTextEmbedding(Embedding):
  """Embedding backed by a pyfasttext ``FastText`` model stored in a file.

  Construction is cheap: the model file is only read when ``load`` is
  called, so ``load`` must run before any lookup method is used.
  """

  def __init__(self, binfile, normalize=False):
    """Remember the model path and the normalisation flag.

    :param binfile: path later handed to ``FastText.load_model``
    :param normalize: forwarded as ``normalized`` by ``getVector``
    """
    self.normalize = normalize
    self.file = binfile
    # Placeholder until load() measures the real vector size.
    self.vdim = -1

  def load(self):
    """Read the model from ``self.file`` and return ``self`` for chaining."""
    print('Loading fasttext model.')
    self.ftmodel = FastText()
    self.ftmodel.load_model(self.file)
    # The length of the vector looked up for 'is' is taken as the dimension.
    probe_vector = self.ftmodel['is']
    self.vdim = len(probe_vector)
    print('Finished loading fasttext model.')
    return self

  def getVector(self, word):
    """Return the model's numpy vector for ``word``."""
    vector = self.ftmodel.get_numpy_vector(word, normalized=self.normalize)
    return vector

  def search(self, q, topk=4):
    """Not supported by this embedding; always raises NotImplementedError."""
    raise NotImplementedError()

  def wordForVec(self, v):
    """Return the first ``(word, similarity)`` pair the model gives for ``v``."""
    candidates = self.ftmodel.words_for_vector(v)
    word, sim = candidates[0]
    return word, sim

  def containsWord(self, word):
    """Always report ``True``; no vocabulary check is performed."""
    return True

  def vocabulary(self):
    """Return the model's ``words`` attribute."""
    return self.ftmodel.words

  def dim(self):
    """Return the vector size recorded by ``load`` (``-1`` before loading)."""
    return self.vdim
Exemplo n.º 2
0
    def predict(self, test_set, test_labels_vector=None, report_accuracy=True):
        """
        Use the trained model to predict the test set and optionally print accuracy.

        The model is loaded from ``'ft_extras/' + self.model_name + '.bin'``.
        Nothing is returned; the accuracy (if requested) is printed.

        :param test_set: the test set, passed unchanged to ``FastText.predict_proba``
        :param test_labels_vector: expected labels, index-aligned with ``test_set``;
            only needed for the accuracy computation
        :param report_accuracy: whether to compute and print the prediction accuracy
        """
        if not self.model_name:
            print('Please use the train method to train a model first.')
            return

        # Imported lazily so the module can be used without pyfasttext
        # until a prediction is actually requested.
        from pyfasttext import FastText
        predictor = FastText()
        predictor.load_model('ft_extras/' + self.model_name + '.bin')
        predicted_labels = predictor.predict_proba(test_set)

        # Explicit None/length checks instead of truth-testing: truth-testing a
        # multi-element numpy array raises ValueError.
        if not report_accuracy or test_labels_vector is None:
            return
        if len(test_labels_vector) == 0:
            return

        test_set_size = len(test_set)
        correct_predictions = 0
        invalid_labels = 0
        for index, labels in enumerate(predicted_labels):
            if len(labels) == 0:
                # No label was predicted for this sample; exclude it from the score.
                invalid_labels += 1
                continue
            # Each entry is indexed as (label, probability); keep the most probable one.
            best_label = max(labels, key=lambda label: label[1])
            if best_label[0] == test_labels_vector[index]:
                correct_predictions += 1

        scored_samples = test_set_size - invalid_labels
        if scored_samples <= 0:
            # Every sample came back without labels: avoid dividing by zero.
            print('Prediction accuracy could not be computed: no valid predictions.\n')
            return
        print('Prediction accuracy:{}\n'.format(correct_predictions / scored_samples))
Exemplo n.º 3
0
def get_language(text):
    """Return the language prediction for a single piece of text.

    The language-identification model is loaded from
    ``settings.LID_MODEL_PATH`` on the first call and cached on ``settings``
    as ``_lang_detector``; subsequent calls reuse the cached model.

    ``text`` is wrapped in a one-element list for ``predict`` and the first
    (only) entry of the result is returned.
    """
    if hasattr(settings, '_lang_detector'):
        detector = settings._lang_detector
    else:
        detector = FastText()
        detector.load_model(settings.LID_MODEL_PATH)
        settings._lang_detector = detector
    predictions = detector.predict([text])
    return predictions[0]
Exemplo n.º 4
0
def pyfasttext_sample():
    """Print indexed predictions for the cooking validation file.

    Loads the compressed model ``output/model_cooking_5.ftz``, runs
    ``predict_file`` on ``data/cooking/pre_cooking.valid`` with ``2`` as the
    second argument, and prints each result with its position.

    See https://pypi.org/project/pyfasttext/
    """
    classifier = FastText()
    # Alternative model file: 'output/model_cooking_6.bin'
    classifier.load_model('output/model_cooking_5.ftz')
    predictions = classifier.predict_file('data/cooking/pre_cooking.valid', 2)
    for position, labels in enumerate(predictions):
        print(position, labels)
Exemplo n.º 5
0
def init():
    """Create the module-level resources used by the rest of this module.

    Sets four globals, in this order: ``processtext``, ``labels_list``
    (unpickled from ``both_labels.pkl``), ``contcmp`` and ``fasttext_model``
    (loaded from ``3Ngram_3mincount_1wminlabel.bin``).
    """
    global processtext, labels_list, contcmp, fasttext_model

    processtext = ProcessText()

    # NOTE: pickle.load can execute arbitrary code; the file must be trusted.
    with open("both_labels.pkl", "rb") as labels_file:
        labels_list = pickle.load(labels_file)

    contcmp = ContCmp("root_feature_file.allid")

    # The model is loaded inline here instead of through loadModel().
    model_path = '3Ngram_3mincount_1wminlabel.bin'
    fasttext_model = FastText()
    fasttext_model.load_model(model_path)
Exemplo n.º 6
0
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _f1_score(precision, recall):
    """Return the harmonic mean of precision and recall (0.0 when both are zero)."""
    return _safe_ratio(2 * precision * recall, precision + recall)


def validate_model(model_file, validate_file):
    """Score a trained model against a tab-separated validation file.

    :param model_file: model path without the ``.bin`` suffix (it is appended here)
    :param validate_file: TSV file with two columns, read as ``label`` and ``review``
    :return: tuple ``(micro_precision, micro_recall, micro_f1,
        macro_precision, macro_recall, macro_f1)``

    NOTE: the values returned under the ``micro_*`` names are the precision,
    recall and F1 of the GOOD class only; the ``macro_*`` values average the
    GOOD and BAD classes. The NEUTRAL class is not evaluated.

    Any ratio whose denominator is zero (e.g. a class that is never predicted
    or never present) is reported as 0.0 instead of raising ZeroDivisionError.
    """
    model = FastText()
    model.load_model(model_file + '.bin')

    validate = pd.read_csv(validate_file,
                           sep='\t',
                           names=['label', 'review'],
                           lineterminator='\n')

    validate['predict'] = validate.apply(
        lambda row: predict_label(model, row['review']), axis=1)

    def count(expression):
        # Number of validation rows matching a DataFrame.query expression.
        return validate.query(expression).shape[0]

    # Correctly classified rows per class.
    good_good = count(" predict == 'GOOD' and label == '__label__GOOD' ")
    bad_bad = count(" predict == 'BAD' and label == '__label__BAD' ")

    # Rows per true label.
    all_good = count(" label == '__label__GOOD' ")
    all_bad = count(" label == '__label__BAD' ")

    # Rows per predicted label.
    good_all = count(" predict == 'GOOD' ")
    bad_all = count(" predict == 'BAD' ")

    good_precision = _safe_ratio(good_good, good_all)
    good_recall = _safe_ratio(good_good, all_good)
    bad_precision = _safe_ratio(bad_bad, bad_all)
    bad_recall = _safe_ratio(bad_bad, all_bad)

    micro_precision = good_precision
    micro_recall = good_recall
    micro_f1 = _f1_score(micro_precision, micro_recall)

    macro_precision = (good_precision + bad_precision) / 2
    macro_recall = (good_recall + bad_recall) / 2
    macro_f1 = _f1_score(macro_precision, macro_recall)

    return micro_precision, micro_recall, micro_f1, macro_precision, macro_recall, macro_f1
Exemplo n.º 7
0
class FastTextEmbedding(Embedding):
    """Word-embedding adapter around a pyfasttext ``FastText`` model.

    The model file is not read in ``__init__``; call :meth:`load` before
    using any lookup method.
    """

    def __init__(self, binfile, normalize=False):
        """
        :param binfile: path of the model file passed to ``FastText.load_model``
        :param normalize: forwarded as ``normalized`` to ``get_numpy_vector``
        """
        self.file = binfile
        self.vdim = -1  # unknown until load() has run
        self.normalize = normalize

    def load(self):
        """Load the model from ``self.file`` and return ``self`` for chaining."""
        print('Loading fasttext model.')
        self.ftmodel = FastText()
        self.ftmodel.load_model(self.file)
        # The vector length of the word 'is' is used as the embedding dimension.
        self.vdim = len(self.ftmodel['is'])
        print('Finished loading fasttext model.')
        return self

    def getVector(self, word):
        """Return the numpy vector of ``word``."""
        return self.ftmodel.get_numpy_vector(word, normalized=self.normalize)

    def wordForVec(self, v):
        """Return the first ``(word, similarity)`` pair the model gives for ``v``."""
        word, sim = self.ftmodel.words_for_vector(v)[0]
        return word, sim

    def nearest_neighbors(self, word, n=200):
        """Return the model's ``n`` nearest neighbours of ``word``."""
        # Fixed: the model lives on the instance, not in a global ``ftmodel``.
        return self.ftmodel.nearest_neighbors(word, n)

    def nearest_neighbors_by_vector(self, word, n=200):
        """Return up to ``n`` words the model associates with a vector.

        :param word: the query *vector*; the parameter keeps its original
            name so existing keyword callers continue to work
        :param n: number of results requested from ``words_for_vector``
        """
        # Fixed: the original referenced an undefined name ``v`` here.
        return self.ftmodel.words_for_vector(word, n)

    def containsWord(self, word, explicit=False):
        """Tell whether ``word`` is covered by the embedding.

        With ``explicit=False`` this always returns ``True``; with
        ``explicit=True`` the word must be present in the model vocabulary.
        """
        if explicit:
            # Fixed: ``vocabulary`` is a method and must be called on self.
            return word in self.vocabulary()
        return True

    def vocabulary(self):
        """Return the model's ``words`` attribute."""
        return self.ftmodel.words

    def dim(self):
        """Return the vector size recorded by ``load`` (``-1`` before loading)."""
        return self.vdim
Exemplo n.º 8
0
    business_review_score = business_raw.merge(review_processed_short,
                                               on=['business_id'])
    business_review_score = business_review_score[[
        'business_id', 'name', 'city', 'state', 'stars', 'review_count',
        'categories', 'review_stars', 'text', 'predict_score'
    ]]
    business_review_score_short = business_review_score.query(
        " stars in ('1', '5')")
    business_review_score_short.to_csv(const.business_review_score_file_name,
                                       sep='\t',
                                       index=False,
                                       header=False)

    logger.info('end process:' + sys._getframe().f_code.co_name)


if __name__ == '__main__':
    # Usage: python assign.py [path_to_model_bin_file]
    # The model path is mandatory; fail early when it is missing.
    if len(sys.argv) < 2:
        raise Exception(
            'Need to run command as python assign.py [path_to_model_bin_file]')
    model_file = sys.argv[1]

    # Load the classifier once and hand it to the scoring step.
    model = FastText()
    model.load_model(model_file)

    assign_score_to_review(model)
    assign_score_to_review_with_text()
Exemplo n.º 9
0
import math
import json
import string

N_GRAM = 3
WORD_SIZE = 6
# Number of vector components summed per word in sent2Vec.
VECTOR_NUM = 300
# Path handed to FastText.load_model when this module is imported.
MODEL_PATH = 'model.bin'
# Translation table: maps each character of the first string to the character
# at the same position in the second string (upper case -> lower case,
# Turkish letters included); characters in the third string are deleted.
# NOTE(review): 'I' appears twice in the first string, so the later pair
# ('I' -> 'i') wins and 'ı' is never produced. The second 'I' was presumably
# meant to be 'İ' — confirm before relying on Turkish-correct lowercasing.
TO_LOWER = str.maketrans('ABCÇDEFGĞHIIJKLMNOÖPQRSŞTUÜVWXYZ',
                         'abcçdefgğhıijklmnoöpqrsştuüvwxyz', '’“”')

# Load the model file (this runs at import time).
print('Vektörlerimiz Yükleniyor... Lütfen Bekleyiniz...')
model = FastText()
model.load_model(MODEL_PATH)

# def load_model():
#     model = FastText()
#     model.load_model(MODEL_PATH)
#     return model


# From word vectors to a sentence vector.
def sent2Vec(sentence):
    """Accumulate the element-wise sum of the word vectors of ``sentence``.

    The sentence is split on whitespace and every word is looked up in the
    module-level ``model``.

    NOTE(review): as shown here the function ends after the accumulation
    loop without returning ``sentenceVector``; the remainder may be missing
    from this excerpt — confirm against the full file.
    """
    sentenceVector = [0.0] * VECTOR_NUM
    split_sentence = sentence.split()
    for word in split_sentence:
        wordVector = model[word]
        # Only the first VECTOR_NUM components of each word vector are read.
        for i in range(VECTOR_NUM):
            sentenceVector[i] += wordVector[i]
Exemplo n.º 10
0
def loadModel():
    """Create the module-level ``fasttext_model`` and load its weights.

    The global is bound to a fresh ``FastText`` instance first, then the
    model file ``3Ngram_3mincount_1wminlabel.bin`` is loaded into it.
    """
    global fasttext_model
    model_path = '3Ngram_3mincount_1wminlabel.bin'
    fasttext_model = FastText()
    fasttext_model.load_model(model_path)
Exemplo n.º 11
0
from sentiment_analysis import load_sentiment_model, load_tagging_model, sentiment_analysis
from mem_absa.load_data import init_word_embeddings
from mem_absa.load_data import read_sample
from mem_absa.mapping import mapping_sentiments

import configuration as configuration

import mysql.connector

import spacy
from pyfasttext import FastText

# Module-level resources, created once at import time.
fr_nlp = spacy.load("fr")

# fastText model loaded from the path given by configuration.pathFasttext.
wiki_model = FastText()
wiki_model.load_model(configuration.pathFasttext)

# Connect to the database.
# NOTE(review): the connection is opened at import time and the credentials
# are hard-coded (masked here) — consider moving them to configuration.
conn = mysql.connector.connect(host="localhost",
                               user="******",
                               password="******",
                               database="resolution")


def generate_review_data():
    cursor = conn.cursor()
    cursor.execute(
        "select distinct code_etab from resolution.contributions_etab_nihel where code_an8=54053000 limit 300"
    )
    etabs = cursor.fetchall()
    data = []
Exemplo n.º 12
0
import jieba


# Local service endpoints.
SERVICE_URL_SS = "http://127.0.0.1:8001/service"
SERVICE_URL_ZD2 = "http://127.0.0.1:8002/service" # coarse granularity
SERVICE_URL_ZD = "http://127.0.0.1:8006/service" # fine granularity
SERVICE_URL_ZD_SEG = "http://127.0.0.1:8006/seg" # fine granularity
# SERVICE_URL_SS = "http://172.19.91.91:8001/service"
# SERVICE_URL_ZD = "http://172.19.91.91:8002/service"

HEADERS = {'content-type': 'application/json'}


# bcjl_model: coarse word-segmentation granularity (e.g. 肾积水 kept whole);
# bcjl_model1: fine word-segmentation granularity (e.g. 肾/积水).
# The model is loaded at import time.
ft_model=FastText()
ft_model.load_model('../../model/bcjl_model.bin')
# ft_model=""

SYN_THRESHOLD=0.75
SIMILARITY_THRESHOLD=0.8

# MongoDB connection on localhost:27017, database ``bzxt1``.
conn = MongoClient('localhost', 27017)
db = conn.bzxt1

def file_compare(f1_path,f2_path):
    # 比较两个文件是否相同
    def get_file_md5(f):
        m = hashlib.md5()
        while True:
            #如果不用二进制打开文件,则需要先编码
            #data = f.read(1024).encode('utf-8')
Exemplo n.º 13
0
import re
from mem_absa.load_data import init_word_embeddings
from mem_absa.load_data import read_sample, read_vocabulary
from mem_absa.mapping import mapping_sentiments
from mem_absa.model import MemN2N
from mem_absa.config_mem import Configure

import spacy
# Module-level resources, created once at import time.
fr_nlp = spacy.load("fr")
# Base path handed to Configure.get_flags to build FLAGS.
path = ".."
configure = Configure()
FLAGS = configure.get_flags(path)

from pyfasttext import FastText
# fastText model loaded from the path given by FLAGS.pathFasttext.
wiki_model = FastText()
wiki_model.load_model(FLAGS.pathFasttext)


def main(_):
    configure.pp.pprint(FLAGS.__flags)
    source_count = []
    source_word2idx = {}

    read_vocabulary(fr_nlp, FLAGS.train_data, source_count, source_word2idx)

    print('loading pre-trained word vectors...')
    FLAGS.pre_trained_context_wt = init_word_embeddings(
        wiki_model, source_word2idx, FLAGS.nbwords)
    FLAGS.pre_trained_context_wt[FLAGS.pad_idx, :] = 0

    model = MemN2N(FLAGS)
Exemplo n.º 14
0
from commons.logger import logger, configure
from commons import configuration

import configuration as confi

from sentiment_analysis import load_sentiment_model, load_tagging_model, sentiment_analysis

import spacy
from pyfasttext import FastText

# Load the French spaCy model (runs at import time).
fr_nlp = spacy.load("fr")

# fastText model loaded from the path given by confi.pathFasttext.
wiki_model = FastText()
wiki_model.load_model(confi.pathFasttext)

import traceback

# Load the configuration.
conf = configuration.load()
# NOTE(review): ``os`` is not imported in the visible part of this file —
# confirm it is imported elsewhere.
script_dir = os.path.dirname(__file__)


def _init_app(p_conf):
    # Configuration du logger
    configure(conf['log']['level_values'][conf['log']['level']],
              conf['log']['dir'], conf['log']['filename'],
              conf['log']['max_filesize'], conf['log']['max_files'])

    # Load app config into Flask WSGI running instance
Exemplo n.º 15
0
import numpy as np
import os
from pyfasttext import FastText
import string
import re

# shared global variables to be imported from model also
UNK = "$UNK$"
NUM = "$NUM$"
NONE = "O"

# NOTE(review): absolute, user-specific directory — the model load below
# fails on any machine without this path; consider making it configurable.
path = '/Users/nkooli/Documents/docs/avis/review_analysis_pj/mem_absa'
# fastText model loaded at import time from ``path``.
wiki_model = FastText()
wiki_model.load_model(path + '/model_pyfasttext100.bin')


# special error message
class MyIOError(Exception):
    """Raised when an expected data file cannot be found.

    The exception message names the missing file and suggests running
    ``build_data.py`` first.
    """

    def __init__(self, filename):
        # The empty entries produce the leading, separating and trailing
        # newlines of the multi-line message.
        template_lines = (
            "",
            "ERROR: Unable to locate file {}.",
            "",
            "FIX: Have you tried running python build_data.py first?",
            "This will build vocab file from your train, test and dev sets and",
            "trimm your word vectors.",
            "",
        )
        template = "\n".join(template_lines)
        super(MyIOError, self).__init__(template.format(filename))


class CoNLLDataset(object):