Example #1
    def test_subword_vector(self):
        f = load_model(self.output + '.bin')
        words, _ = f.get_words(include_freq=True)
        words += get_random_words(10000, 1, 200)
        input_matrix = f.get_input_matrix()
        for word in words:

            # Universal api to get word vector
            vec1 = f.get_word_vector(word)

            # Build word vector from subwords
            subwords, subinds = f.get_subwords(word)
            subvectors = list(map(lambda x: f.get_input_vector(x), subinds))
            subvectors = np.stack(subvectors)
            vec2 = np.sum((subvectors / len(subwords)), 0)

            # Build word vector from subinds
            vec3 = np.sum(input_matrix[subinds] / len(subinds), 0)

            # Build word vectors from word and subword ids
            wid = f.get_word_id(word)
            if wid >= 0:
                swids = list(map(lambda x: f.get_subword_id(x), subwords[1:]))
                swids.append(wid)
            else:
                swids = list(map(lambda x: f.get_subword_id(x), subwords))
            swids = np.array(swids)
            vec4 = np.sum(input_matrix[swids] / len(swids), 0)

            self.assertTrue(np.isclose(vec1, vec2, atol=1e-5, rtol=0).all())
            self.assertTrue(np.isclose(vec2, vec3, atol=1e-5, rtol=0).all())
            self.assertTrue(np.isclose(vec3, vec4, atol=1e-5, rtol=0).all())
            self.assertTrue(np.isclose(vec4, vec1, atol=1e-5, rtol=0).all())
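For reference, get_subwords returns the subword strings together with their row indices into the input matrix, and for an in-vocabulary word the first entry is the word itself, which is why the test above skips subwords[0] when a word id exists. A minimal sketch of the same identity the test asserts (the model path and the word 'hello' are assumed placeholders):

f = load_model('model.bin')                  # assumed path
subwords, subinds = f.get_subwords('hello')  # e.g. ['hello', '<he', 'hel', ...] plus their ids
avg = f.get_input_matrix()[subinds].mean(axis=0)
# avg should match f.get_word_vector('hello') up to floating-point tolerance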
Example #2
 def test_vocab(self):
     f = load_model(self.output + '.bin')
     words, freq = f.get_words(include_freq=True)
     self.eprint(
         "There is no way to access words from the cli yet. "
         "Therefore there can be no rigorous test."
     )
Example #3
 def test_subwords(self):
     f = load_model(self.output + '.bin')
     words, _ = f.get_words(include_freq=True)
     words += get_random_words(10, 1, 10)
     for w in words:
         f.get_subwords(w)
     self.eprint(
         "There is no way to access words from the cli yet. "
         "Therefore there can be no test."
     )
Example #4
File: utils.py Project: tpetmanson/MUSE
def load_fasttext_model(path):
    """
    Load a binarized fastText model.
    """
    try:
        import fastText
    except ImportError:
        raise Exception("Unable to import fastText. Please install fastText for Python: "
                        "https://github.com/facebookresearch/fastText")
    return fastText.load_model(path)
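A minimal usage sketch of the helper above (the model path is an assumed placeholder):

model = load_fasttext_model('wiki.en.bin')  # assumed path
vec = model.get_word_vector('hello')
print(vec.shape)  # (dim,), e.g. (300,) for the wiki.en model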
Example #5
File: tagger.py Project: CaliOpen/Caliopen
 def __init__(self, model_name="model_cat1", k=5, threshold=0):
     try:
         self.model = fastText.load_model(
             resources_path + "models/{}.bin".format(model_name)
         )
         log.info('Load tagging model {}'.format(model_name))
     except ValueError as exc:
         log.error(
             'Error loading tagging model {}: {}'.format(model_name, exc)
         )
         raise exc
     self.k = k
     self.threshold = threshold
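A hedged sketch of how the stored k and threshold might be used for classification; the classify method and its return format are assumptions, not part of the original class:

 def classify(self, text):
     # hypothetical helper: top-k labels above the probability threshold
     labels, probs = self.model.predict(text, k=self.k, threshold=self.threshold)
     return [(label.replace('__label__', ''), float(prob))
             for label, prob in zip(labels, probs)]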
Example #6
 def test_getvector(self):
     f = load_model(self.output + '.bin')
     words, _ = f.get_words(include_freq=True)
     words += get_random_words(100, 1, 100)
     ftbin_vectors = self.get_word_vectors_from_list(self.output, words)
     ftbin_vectors = ftbin_vectors.decode('utf-8').split('\n')[:-1]
     for v in ftbin_vectors:
         word = v.split(' ')[0]
         vector = v.split(' ')[1:-1]
         vector = np.array(list(map(float, vector)))
         pvec = f.get_word_vector(word)
         # The fasttext cli returns floats with 5 digits,
         # but we use the full 6 digits.
         self.assertTrue(np.allclose(vector, pvec, rtol=1e-04))
Example #7
def get_word_vector(data, model):
    t1 = time.time()
    print("Reading")
    with open(data, 'r') as f:
        tokens = tokenize(f.read())
    t2 = time.time()
    print("Read TIME: " + str(t2 - t1))
    print("Read NUM : " + str(len(tokens)))
    f = load_model(model)
    # This is not equivalent to piping the data into
    # print-word-vector, because the data is tokenized
    # first.
    t3 = time.time()
    i = 0
    for t in tokens:
        f.get_word_vector(t)
        i += 1
        if i % 10000 == 0:
            sys.stderr.write("\ri: " + str(float(i / len(tokens))))
            sys.stderr.flush()
    t4 = time.time()
    print("\nVectoring: " + str(t4 - t3))
Example #8
    def load(self, *args, **kwargs) -> Fasttext.FastText._FastText:
        """
        Load fastText binary model from self.load_path

        Args:
            *args: arguments
            **kwargs: arguments

        Returns:
            fastText pre-trained model
        """

        if self.load_path and self.load_path.is_file():
            log.info("[loading embeddings from `{}`]".format(self.load_path))
            model_file = str(self.load_path)
            model = Fasttext.load_model(model_file)
        else:
            log.error('No pretrained fasttext model provided or provided load_path "{}" is incorrect.'
                      .format(self.load_path))
            sys.exit(1)

        return model
Example #9
            prev_time = m.timestamp
        feat_vector.append(curr_cnt)
        # print(user, len(feat_vector))
        user_density[user] = feat_vector
    return user_density


def calculate_chat_density(user_time):
    user_density = {}
    for username in user_time:
        times = user_time[username]
        if max(times) != min(times):
            density = len(times) / (max(times) - min(times))
            if density > 1e-2:
                continue
            user_density[username] = density * 1e5
            print(username, user_density[username])
    return user_density


if __name__ == "__main__":
    user_log = parse_log("chat_log_target.csv")
    # load_data(user_log, output_file="pretrain_data.txt")
    # print("generating model...")
    # model = train_unsupervised(input="pretrain_data.txt", model='skipgram', dim=500)
    # model.save_model("pretrain_token.bin")

    model = fastText.load_model("pretrain_token.bin")
    feat_path = "fasttext_feat_500_sent_target/"
    generate_fasttext_embeddings(user_log, model)
Example #10
    def test_predict(self):
        # TODO: I went a little crazy here as an exercise for
        # a rigorous test case. This could be turned into
        # a few utility functions.
        f = load_model(self.output_sup + '.bin')

        def _test(N, min_length, max_length, k, add_vocab=0):
            words = get_random_words(N, min_length, max_length)
            if add_vocab > 0:
                vocab, _ = f.get_words(include_freq=True)
                for _ in range(add_vocab):
                    ind = random.randint(0, len(vocab))
                    words += [vocab[ind]]
            all_labels = []
            all_probs = []
            ii = 0
            gotError = False
            for w in words:
                try:
                    labels, probs = f.predict(w, k)
                except ValueError:
                    gotError = True
                    continue
                all_labels.append(labels)
                all_probs.append(probs)
                ii += 1
            preds, _, retcode = self.get_predictions_from_list(
                self.output_sup, words, k
            )
            if gotError and retcode == 0:
                self.eprint(
                    "Didn't get error. Make sure your compiled "
                    "binary kept the assert statements"
                )
                self.assertTrue(False)
            else:
                return
            preds = preds.split('\n')[:-1]
            self.assertEqual(len(preds), len(all_labels))
            for i in range(len(preds)):
                labels = preds[i].split()
                probs = np.array(list(map(float, labels[1::2])))
                labels = np.array(labels[::2])
                self.assertTrue(np.allclose(probs, all_probs[i], rtol=1e-04))
                self.assertTrue(np.array_equal(labels, all_labels[i]))

        _test(0, 0, 0, 0)
        _test(1, 0, 0, 0)
        _test(10, 0, 0, 0)
        _test(1, 1, 1, 0)
        _test(1, 1, 1, 1)
        _test(1, 2, 3, 0)
        _test(1, 2, 3, 1)
        _test(10, 1, 1, 1)
        _test(1, 1, 1, 0, add_vocab=10)
        _test(1, 1, 1, 1, add_vocab=10)
        _test(1, 2, 3, 0, add_vocab=10)
        _test(1, 2, 3, 1, add_vocab=10)
        reach = 10
        for _ in range(10):
            N = random.randint(0, reach)
            init = random.randint(0, reach)
            offset = random.randint(0, reach)
            k = random.randint(0, reach)
            _test(N, init, init + offset, k)
Example #11
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
from fastText import load_model

embed_file = '../model/wiki.en.bin'
ft_model = load_model(embed_file)
input_matrix = ft_model.get_input_matrix()
output_matrix = ft_model.get_output_matrix()

emb_mean, emb_std = input_matrix.mean(), input_matrix.std()
print(emb_mean)
print(emb_std)

emb_mean, emb_std = output_matrix.mean(), output_matrix.std()
print(emb_mean)
print(emb_std)


def load_crawl_embed_index(file_path):
    embed_index = {}
    with open(file_path) as f:
        for line in f.read().split("\n")[1:-1]:
            values = line.split(" ")
            word = values[0]
            coefs = np.asarray(values[1:-1], dtype='float32')
            embed_index[word] = coefs

    print('Found %s word vectors.' % len(embed_index))
    return embed_index
Example #12
if initial_rows - rows_with_text_empty == len(df):
    print("Out of {} rows, {} rows were found with empty text field. They were dropped, and the number of rows now is {}".format(
        initial_rows, rows_with_text_empty, len(df)))
else:
    print("Check again!")

df.drop('photourl', axis=1, inplace=True)

print(list(df.columns.values))

df['ext_hashtags'] = df.apply(lambda row: extract_hashtags(row['text']),
                              axis=1)
print(list(df.columns.values))

# Load language identification model
model = fastText.load_model('fastText_models/lid.176.bin')

# Get predictions for language identification
df['langid'] = df['text'].apply(lambda x: detect_lang(x))

# drop rows if langid was not successful
df.dropna(subset=['langid'], inplace=True)

# Save DataFrame to disk
print('Saving to a file')
df.to_csv('lang_instagram.tsv',
          sep='\t',
          quoting=csv.QUOTE_NONNUMERIC,
          encoding='latin-1',
          index=False)
Example #13
def before_request():
    g.ft_model = fastText.load_model(app.config["FT_SERVER_MODEL_PATH"])
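A hedged sketch of a matching request handler that uses the model stored on g; the route, the query parameter, and the imports (from flask import g, request, jsonify) are assumptions, not from the original snippet:

@app.route('/predict')  # assumed route and parameter name
def predict_handler():
    text = request.args.get('text', '').replace('\n', ' ')  # predict() rejects newlines
    labels, probs = g.ft_model.predict(text, k=1)
    return jsonify({'label': labels[0], 'prob': float(probs[0])})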
Example #14
import fastText
import dropbox_helper

try:
    dropbox_helper.load('./ml/tags/model/tags_model_new', '/tags_model_new')
    model = fastText.load_model('ml/tags/model/tags_model_new')
    print('loaded last tags model')
except ValueError:
    model = fastText.load_model('ml/tags/model/tags_model')
    print('loaded default tags model')


def predict(text):

    labels, probs = model.predict([text], k=5)

    print(labels, probs)

    tags = []

    for i, label in enumerate(labels[0]):
        tags.append({
            'label': label.replace('__label__', ''),
            'probability': round(probs[0][i], 3)
        })

    return tags
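A small usage sketch of the predict helper above (the sample text is an assumed placeholder):

tags = predict('how do I merge two dictionaries in python')  # assumed sample text
# -> a list of up to five dicts such as {'label': 'python', 'probability': 0.873},
#    with the actual labels depending on the trained tags model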


Example #15
# fast code for result
# def build_model() {
# Replace miscellaneous characters
train['comment_text'] = train['comment_text'].str.replace('ı', 'i')
test['comment_text'] = test['comment_text'].str.replace('ı', 'i')

# Normalize comment_text (IMPLEMENTED IN GENERATOR)
#train['comment_text'] = train['comment_text'].apply(normalize)
#test['comment_text'] = test['comment_text'].apply(normalize)

# Split comment_text (IMPLEMENTED IN GENERATOR)
#train['comment_text'] = train['comment_text'].str.split()
#test['comment_text'] = test['comment_text'].str.split()
"""
Loading FT model
"""
print('Loading FT model')
ft_model = load_model('/home/kazuki_onodera/wiki.en.bin')
# Embedding dimension
n_features = ft_model.get_dimension()
"""
Define models
"""


def build_model(logdir='.'):
    # Bidirectional-LSTM
    if logdir is not None and os.path.exists(logdir):
        tb_cb = TensorBoard(log_dir='.', histogram_freq=0, write_graph=True)

    inp = Input(shape=(window_length, 300))
    x = Bidirectional(
        LSTM(50, return_sequences=True, dropout=0.1,
Example #16
    zero, diag_margin = Variable(zero), Variable(diag_margin)

    x = x / torch.norm(x, 2, 1, keepdim=True)
    v = v / torch.norm(v, 2, 1, keepdim=True)
    prod = torch.matmul(x, v.transpose(0, 1))
    diag = torch.diag(prod)
    for_x = torch.max(zero,
                      margin - torch.unsqueeze(diag, 1) + prod) - diag_margin
    for_v = torch.max(zero,
                      margin - torch.unsqueeze(diag, 0) + prod) - diag_margin
    return (torch.sum(for_x) + torch.sum(for_v)) / x.size(0)


if __name__ == '__main__':
    print('Loading a pretrained fastText model...')
    word_embedding = fasttext.load_model(args.fasttext_model)

    print('Loading a dataset...')
    train_data = ReedICML2016(
        args.img_root, args.caption_root, args.trainclasses_file,
        word_embedding, args.max_nwords,
        transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ]))

    word_embedding = None

    train_loader = data.DataLoader(train_data,
Example #17
corpus_path = "senti_corpus.csv"
df_corpus = pd.read_csv(corpus_path, encoding="utf-8")
train_list = (df_corpus["label"] + " , " + df_corpus["seged_weibo"]).tolist()
# input must be a filepath
train_path = "train.txt"
model_path = "senti-model.bin"
with open(train_path, "w", encoding="utf_8_sig") as fw:
    for line in train_list:
        fw.write(u"{}\n".format(line))
# train
model_classifier = fastText.train_supervised(
    train_path,
    label="__label__",
    dim=200,
    lr=0.2,
    epoch=25,
    wordNgrams=2,
)
model_classifier.save_model(model_path)
"""test"""
test_path = "senti_test.csv"
df_test = pd.read_csv(test_path, encoding="utf-8")
test_list = (df_test["label"] + " , " + df_test["seged_weibo"]).tolist()
test_path2 = "test.txt"
with open(test_path2, "w", encoding="utf_8_sig") as fw:
    for line in test_list:
        fw.write(u"{}\n".format(line))
model_path = "senti-model.bin"
model_classifier = fastText.load_model(model_path)  # input must be a filepath
result = model_classifier.test(test_path2)
print(result[1])  # precision@1 (equals accuracy for single-label data)
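The loaded classifier can also be queried on a single segmented sentence; a minimal sketch (the sample text is an assumed placeholder):

labels, probs = model_classifier.predict("今天 天气 真 好", k=1)  # assumed sample sentence
print(labels[0].replace("__label__", ""), probs[0])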
Example #18
class FastTextDataGenerator():
    # class variables
    print('loading FastText model...',
          flush=True)  # flush set true has no effect?
    ft_model_path = '/home/kai/data/resources/FastText/wiki.en.bin'
    ft_model = load_model(ft_model_path)
    n_features = ft_model.get_dimension()
    print('fasttext model loaded. embedding dimension: {}'.format(n_features))

    def __init__(self,
                 df,
                 label_cols,
                 text_column_name,
                 window_length,
                 batch_size,
                 shuffle=True):
        """
        Params:
            df: (dataframe) at least contains a text column and label columns
            label_cols: (list) names of label columns
                        e.g.: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
            text_column_name: (str) text column used for fastText embedding
                        e.g.: comment or comment_text_cleaned
            window_length: (int) pick at most the first n words from a text
            batch_size: (int) how large to generate at each batch
            shuffle: (boolean)  whether to shuffle df each epoch 
        Returns:
            (tuple) contains training data and labels of the batch size
            
        """
        #self._ft_model = load_model(ft_model_path)
        #         self._n_features = FastTextDataGenerator.ft_model.get_dimension()
        #         print('fasttext model loaded. embedding dimension: {}'.format(self._n_features))

        self._df = df
        self._label_cols = label_cols
        self._text_column_name = text_column_name
        self._window_length = window_length
        self._batch_size = batch_size
        self._shuffle = shuffle
        self.training_steps_per_epoch = round(len(df) / batch_size)

    def load_new_ft_model(self, new_ft_model_path):
        print('loading new model...', flush=True)
        FastTextDataGenerator.ft_model = load_model(new_ft_model_path)
        FastTextDataGenerator.n_features = FastTextDataGenerator.ft_model.get_dimension()
        print('fasttext model loaded. embedding dimension: {}'.format(
            FastTextDataGenerator.n_features))

    def data_gen(self):
        """
        Given a raw dataframe, generates infinite batches of FastText vectors.
        """
        batch_i = 0  # Counter inside the current batch vector
        batch_x = None  # The current batch's x data
        batch_y = None  # The current batch's y data

        while True:  # Loop forever
            if self._shuffle:
                self._df = self._df.sample(frac=1)

            for i, row in self._df.iterrows():
                comment = row[self._text_column_name][
                    0]  # add [0] to get the string from the pd.Series

                if batch_x is None:
                    batch_x = np.zeros((self._batch_size, self._window_length,
                                        FastTextDataGenerator.n_features),
                                       dtype='float32')
                    batch_y = np.zeros(
                        (self._batch_size, len(self._label_cols)),
                        dtype='float32')

                batch_x[batch_i] = FastTextDataGenerator.text_to_vector(
                    comment, self._window_length)
                batch_y[batch_i] = row[self._label_cols].values
                batch_i += 1

                if batch_i == self._batch_size:
                    # Ready to yield the batch
                    yield batch_x, batch_y
                    batch_x = None
                    batch_y = None
                    batch_i = 0

    @staticmethod
    def text_to_vector(text, window_length):
        """
        Given a string, normalizes it, then splits it into words and finally converts
        it to a sequence of word vectors.
        """
        text = FastTextDataGenerator.normalize(text)
        words = text.split()
        window = words[-window_length:]

        x = np.zeros((window_length, FastTextDataGenerator.n_features))

        for i, word in enumerate(window):
            x[i, :] = FastTextDataGenerator.ft_model.get_word_vector(
                word).astype('float32')

        return x

    @staticmethod
    def normalize(s):
        """
        Given a text, cleans and normalizes it. Feel free to add your own stuff.
        """
        #s = s.lower()
        # Replace ips
        #s = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' _ip_ ', s)
        # Isolate punctuation
        #s = re.sub(r'([\'\"\.\(\)\!\?\-\\\/\,])', r' \1 ', s)
        # Remove some special characters
        #s = re.sub(r'([\;\:\|•«\n])', ' ', s)
        # Replace numbers and symbols with language
        s = s.replace('&', ' and ')
        s = s.replace('@', ' at ')
        s = s.replace('0', ' zero ')
        s = s.replace('1', ' one ')
        s = s.replace('2', ' two ')
        s = s.replace('3', ' three ')
        s = s.replace('4', ' four ')
        s = s.replace('5', ' five ')
        s = s.replace('6', ' six ')
        s = s.replace('7', ' seven ')
        s = s.replace('8', ' eight ')
        s = s.replace('9', ' nine ')
        return s

    @staticmethod
    def df_to_data(df, text_column_name, window_length):
        """
        Convert a given dataframe to a dataset of inputs for the NN.
        """
        x = np.zeros(
            (len(df), window_length, FastTextDataGenerator.n_features),
            dtype='float32')

        for i, comment in enumerate(df[text_column_name].values):
            x[i, :] = FastTextDataGenerator.text_to_vector(
                comment, window_length)

        return x
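A sketch of how this generator might feed a Keras model; df_train, the label columns (taken from the docstring example), window_length, batch_size, and the compiled model object are assumptions:

gen = FastTextDataGenerator(df_train,  # df_train and model are assumed to exist
                            label_cols=['toxic', 'severe_toxic', 'obscene',
                                        'threat', 'insult', 'identity_hate'],
                            text_column_name='comment_text',
                            window_length=200,
                            batch_size=128)
model.fit_generator(gen.data_gen(),
                    steps_per_epoch=gen.training_steps_per_epoch,
                    epochs=3)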
Example #19
    fin.close()

    return test_data, test_lable


# fact=argv[1]
if __name__ == '__main__':

    stop_words = read_stop_words('F:/FastTextpredict/stopwords.txt')
    #
    # print('train acc...')
    # classifier = ff.train_supervised(input="acc_train.txt", label="__label__" ,wordNgrams=2)
    # classifier.save_model('acc_train.model.bin')

    # load the trained model
    classifier = ff.load_model('F:/FastTextpredict/acc_train.model.bin')
    # test the model

    # code used when the system is run
    # list =[]
    # list.append(fact)
    # list =cut_text(list,stop_words)
    # lable, pro = classifier.predict(list)
    # for i,text in enumerate(lable):
    #    print(text[9:] + " probability: " + str(pro[i]))

    #end here

    fact = '昌宁县人民检察院指控,2014年4月19日下午16时许,被告人段某驾拖车经过鸡飞乡澡塘街子,' \
           '时逢堵车,段某将车停在“冰凉一夏”冷饮店门口,被害人王某的侄子王2某示意段某靠边未果,' \
           '后上前敲打车门让段某离开,段某遂驾车离开,但对此心生怨愤。同年4月21日22时许,被告人' \
Example #20
#!/usr/bin/env python
import re
import sys
import json
from waitress import serve
import fastText as fasttext
from flask import Flask, request, jsonify

model = fasttext.load_model('model.bin')
app = Flask(__name__)


def tokenize(line):
    line = re.sub(r'[,./<>?;:\"!@#$%^&*()=\[\]{}()]', ' ', line)
    line = re.sub(r'[ \t]{2,}', ' ', line).lower()
    line = re.sub(r'(.)\1\1+', r'\1\1\1', line)
    return line


def predict(line):
    line = tokenize(line)
    predict = model.predict(line)
    return {
        "labels": list(predict[0]),
        "scores": list(predict[1]),
    }


@app.route("/")
def handler():
    review = request.args.get('review')
Example #21
 def __init__(self, model_path):
     self.model = load_model(model_path)
     input_matrix = self.model.get_input_matrix()
     input_matrix_shape = input_matrix.shape
     super().__init__(input_matrix_shape[0], input_matrix_shape[1])
     self.weight.data.copy_(torch.FloatTensor(input_matrix))
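Example #22 below shows the tail of the corresponding forward method; a reconstructed sketch of the whole method for context (it assumes numpy, torch, and Variable are imported as in the surrounding file):

    def forward(self, words):
        # gather subword indices for every word, plus per-word offsets for EmbeddingBag
        word_subinds = np.empty([0], dtype=np.int64)
        word_offsets = [0]
        for word in words:
            _, subinds = self.model.get_subwords(word)
            word_subinds = np.concatenate((word_subinds, subinds))
            word_offsets.append(word_offsets[-1] + len(subinds))
        word_offsets = word_offsets[:-1]
        ind = Variable(torch.LongTensor(word_subinds))
        offsets = Variable(torch.LongTensor(word_offsets))
        return super().forward(ind, offsets)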
Example #22
        offsets = Variable(torch.LongTensor(word_offsets))
        return super().forward(ind, offsets)


def random_word(N):
    return ''.join(
        random.choices(
            string.ascii_uppercase + string.ascii_lowercase + string.digits,
            k=N
        )
    )


if __name__ == "__main__":
    ft_emb = FastTextEmbeddingBag("fil9.bin")
    model = load_model("fil9.bin")
    num_lines = 200
    total_seconds = 0.0
    total_words = 0
    for _ in range(num_lines):
        words = [
            random_word(random.randint(1, 10))
            for _ in range(random.randint(15, 25))
        ]
        total_words += len(words)
        words_average_length = sum([len(word) for word in words]) / len(words)
        start = time.clock()
        words_emb = ft_emb(words)
        total_seconds += (time.clock() - start)
        for i in range(len(words)):
            word = words[i]
Example #23
 def test_dimension(self):
     f = load_model(self.output + '.bin')
     f.get_dimension()
Example #24
                    help='path to out file',
                    required=True)

parser.add_argument('--fasttext-model',
                    action='store',
                    dest='model',
                    help='.bin model of fasttext',
                    required=True)

args = parser.parse_args()

#
# load data & work
# -------------------------------
#
model = fastText.load_model(args.model)

ind_stats = []

with open(args.allen_vocab, "r", encoding="utf8") as in_file:
    with open(args.out_file, "w", encoding="utf8") as out_file:
        for line in tqdm(in_file):
            line = line.strip()

            _, indices = model.get_subwords(line)
            ind_stats.append(len(indices))

            out_file.write(line + " " +
                           " ".join(map(str, map(lambda x: x + 1, indices))) +
                           "\n")
Example #25
def create_fasttext_embedding(tokens, bin_path):
    model = fastText.load_model(bin_path)
    emb = {tok: model.get_word_vector(tok) for tok in tokens}

    return emb
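A brief usage sketch (the token list and model path are assumed placeholders):

emb = create_fasttext_embedding(['hello', 'world'], 'cc.en.300.bin')  # assumed path
print(emb['hello'].shape)  # (300,) for a 300-dimensional model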
Example #26
    words = ' '.join(words)
    return words


# print(sentences_to_words(celebratity['Tom Cruise']))

celebratity_type = dict()
import fastText

model_name = 'model/' + 'classify_with_videos' + '.model'
type_set = [
    'ISTJ', 'ISFJ', 'INFJ', 'INTJ', 'ISTP', 'ISFP', 'INFP', 'INTP', 'ESTP',
    'ESFP', 'ENFP', 'ENTP', 'ESTJ', 'ESFJ', 'ENFJ', 'ENTJ'
]

classifier = fastText.load_model(model_name)
count = 0
for name in celebratity.keys():
    count += 1
    words = sentences_to_words(celebratity[name])
    print(words)
    words = words.replace('\n', '')
    #    with open('words%d.txt'%count,'w') as f:
    #        f.write('i love you')
    results = classifier.predict(words, 16)
    print(name, results)
    type, proba = results
    real_type = type[0][-4:]
    print(real_type)
    type = [item[-4:] for item in type]
    proba = [item for item in proba]
Example #27
from fastText import load_model
import sys
import os
import joblib

repos = sys.argv[1]
model_f = sys.argv[2]
model = load_model(model_f)

res = {}

for repo_voc in os.listdir(repos):
    full_path = os.path.join(repos, repo_voc)
    print(full_path)
    with open(full_path) as handle:
        sent = handle.read()
        v = model.get_sentence_vector(sent)
        res[repo_voc] = v

joblib.dump(res, 'repo_vectors')
Example #28
def read_polyglot(path, word_tokenizer):
    with open(path, 'rb') as file:
        model = pickle.load(file, encoding='bytes')
    word_index = word_tokenizer.word_index
    embedding = np.zeros([len(word_index) + 1, len(list(model.values())[0])])
    for v, i in word_index.items():
        if v in model:
            embedding[i] = model[v]
    return embedding


if __name__ == '__main__':
    from data_utils.constants import WORD_VEC_PATH
    import pickle
    with open('output/word_tokenizer.pkl', 'rb') as file:
        word_tokenizer = pickle.load(file)
    print(word_tokenizer.word_index[word_tokenizer.oov_token])
    print(len(word_tokenizer.word_index))
    model = fastText.load_model(WORD_VEC_PATH)
    embedding = read_word_vec(WORD_VEC_PATH, word_tokenizer)
    assert (np.sum(embedding[word_tokenizer.word_index['nhà']] ==
                   model.get_word_vector('nhà')) == model.get_dimension())
    assert (np.sum(embedding[word_tokenizer.word_index['Bán']] ==
                   model.get_word_vector('Bán')) == model.get_dimension())
    assert (np.sum(embedding[word_tokenizer.word_index['tầng']] ==
                   model.get_word_vector('tầng')) == model.get_dimension())
    assert (np.sum(embedding[word_tokenizer.word_index['<UNK>']] ==
                   model.get_word_vector('<UNK>')) == model.get_dimension())
    print(embedding[word_tokenizer.word_index[word_tokenizer.oov_token]])
Example #29
from gensim.models import KeyedVectors
import fastText as ft
import numpy as np
import os
import sys
import gc
import shutil

print("loading vocabulary")
embeddings = KeyedVectors.load("Norsk_embeddings")
print("loading out of vocabulary")
outofvocab = ft.load_model("norsk.bin")
print("loading data")
basepath = os.path.normpath(os.path.realpath(__file__))
while os.path.basename(basepath) != "Minerva":
    basepath = os.path.dirname(basepath)

directory = os.path.normpath(
    os.path.join(basepath, "data/clean/mftd_norwegian"))

newWords = {}
seen = set()
print("Loaded data")
for filename in os.listdir(directory):
    with open(directory + "/" + filename, 'r') as file:
        data = file.read()
    tokens = [token for token in data.split(" ") if token != ""]
    for token in tokens:
        if token in seen:
            continue
        seen.add(token)
Example #30
for del_key in delete_key_words:
    del ast_word2word[del_key]

# Normalize comment_text (IMPLEMENTED IN GENERATOR)
#train['comment_text'] = train['comment_text'].apply(normalize)
#test['comment_text'] = test['comment_text'].apply(normalize)

# Split comment_text (IMPLEMENTED IN GENERATOR)
#train['comment_text'] = train['comment_text'].str.split()
#test['comment_text'] = test['comment_text'].str.split()

"""
Loading FT model
"""
print('Loading FT model')
ft_model = load_model('../external_data/pretrained/fasttext/wiki/wiki.en.bin')
# Embedding dimension
n_features = ft_model.get_dimension()


"""
Define models
"""

def build_lstm_stack_model(logdir='attention'):
    # Bidirectional-LSTM
    inp = Input(shape=(window_length, 300))
    inp_dr = SpatialDropout1D(0.05)(inp)
    l_lstm = Bidirectional(CuDNNGRU(512, return_sequences=True))(inp_dr)
    l_lstm = Dropout(0.05)(l_lstm)
    x_gmp = GlobalMaxPool1D()(l_lstm)
Example #31
    def __call__(self,
                 data=None,
                 save_csv=False,
                 full=False,
                 verbose=True,
                 runKNN=False):
        """
        By calling this function the user will start the processing of the
        pipeline.

        If no data is provided to this function under the data parameter it
        will take the path provided in the config['data'] entry, load it
        and use it. It is useful as a fallback option, however it is expected
        that as part of the integration of Optimus into a pipeline, some
        data will be passed to this function call.

        Parameters
        ----------
        data : pd.core.series.Series
            a series object containing the strings that need to be processed
            (default=None)

        save_csv : bool
            this dictates if the full prowl will be saved as a csv
            (default=False)

        full : bool
            this dictates if the data returned to the user in the form of a
            full dataframe or just a series of predicted labels
            (default=False)

        verbose : bool
            this parameter dictates how much will be printed. if false only a
            few lines will be output.
            (default=True)

        runKNN : bool
            this parameter dictates if the K Nearest Neighbour algorithm will
            be applied to the labels that are not picked up in the normal run
            of optimus

        Returns
        -------
        pd.core.series.Series / pd.core.frame.DataFrame
            depending on the full setting this will return the output of the
            last depth or a full dataframe with outputs from each iteration

        """
        # set the verbosity setting
        self.verbose = verbose

        # notes
        self.vprint('-- Performing setup')
        self.vprint('_' * 79)

        # load config before each run
        self.config = self.load_config(self.config_path)

        # reformat provided series into accepted format
        data = self.catch_input(data)

        # build looping mechanism, adding 1 to the depth of
        # ratchet and changing the dataset passing through the classes

        # free text loading
        self.vprint("-- Loading descriptions")
        if data:
            self.vprint("-- Ingesting provided series")
            L = Loader(self.config, data)
        else:
            self.vprint("-- No custom data provided, using data from config")
            L = Loader(self.config)

        # start a dataframe that will track the labels at each level
        prowl = pd.DataFrame.from_dict(L.linked, orient='index')
        prowl = prowl.reset_index()
        prowl.columns = ['original', 'current_labels']

        # embed the words using fastText
        if hasattr(self, "matrix"):
            self.vprint("-- Model already loaded")

        else:
            self.vprint("-- Loading model")
            self.matrix = ft.load_model(self.config['model'])

        self.vprint("-- Embedding")
        clusterer = Clusterer(L, self.matrix, self.config)

        # clustering
        self.vprint("-- Clustering")
        CC = ClusterConstructor(clusterer, self.config)

        # start the loop for each depth
        self.vprint('_' * 79)  # some decoration
        while CC.iterate:

            self.vprint(f"-- Depth: {CC.distance}")  # some decoration
            self.vprint('_' * 79)  # some decoration

            # edit distance based metrics
            ED = EditDistance(CC, self.config)
            # push the rejected clusters back to the ClusterConstructor
            # for the next phase
            CC.clusters = ED.rejected
            self.vprint(
                f"    ** | Edit Distance   | classified: {len(ED.accepted)}")

            # class for character and word n-gram and scoring
            WG = WordGram(CC, self.config)
            # push the rejected clusters back to CC for the next phase
            CC.clusters = WG.rejected
            self.vprint(
                f"    ** | Word Grams      | classified: {len(WG.accepted)}")

            # class for character and word n-gram and scoring
            CG = CharGram(CC, self.config)
            # push the rejected clusters back to CC for the next phase
            CC.clusters = CG.rejected
            self.vprint(
                f"    ** | Character Grams | classified: {len(CG.accepted)}")

            # class for finding suitable hypernyms from WordNet
            HN = Hypernyms(CC, self.config)
            # push the rejected clusters back to CC for the next phase
            CC.clusters = HN.rejected
            self.vprint(
                f"    ** | Hyponyms        | classified: {len(HN.accepted)}")

            # gatekeeper, overwrites the CC with what it needs for the
            # next push
            H = Gatekeeper(CC, ED, WG, CG, HN, self.matrix, self.config, prowl)

            CC = H.clusterconstructor
            prowl = H.prowl

            self.vprint('_' * 79)

        # if requested run a KNN on the non_labeled data
        #if runKNN:
        #    self.vprint(f"-- Performing KNN")
        #    K = KNN(H, self.matrix)
        #    self.KNN_predictions = pd.DataFrame(K())
        #    self.KNN_predictions.to_csv('knn_results.csv')
        #    self.vprint(self.KNN_predictions)

        # clean up after yourself
        self.clean_up()

        # return output
        return self.handle_output(prowl, save_csv=save_csv, full=full)
Example #32
import json

import fastText
model_file = './model/ft_train.bin'
number_of_labels = 3

with open('./model/class_label.json','r',encoding = 'utf-8') as f:
    class_label = json.load(f)
model = fastText.load_model(model_file)

def predict_field(text):
    res = []
    tmp = model.predict(text,number_of_labels)
    labels = tmp[0]
    prob = tmp[1]
    for i,label in enumerate(labels):
        # if prob[i] > 0.05:
        res.append(label.replace('__label__','').replace('_',' '))
    return res


def intersect(a, b):
    return list(set(a) & set(b))

def getListJD(text,field):
    if (field not in class_label.keys()):
        return []
    return class_label[field]

def retrieval_jd(text,fields):
    list_jd = []
Example #33
import pandas as pd
from keras.layers import Bidirectional, CuDNNGRU, SpatialDropout1D, GlobalAveragePooling1D, concatenate, Dropout, Dense
from keras.models import Model
from fastText import load_model
import tensorflow as tf
import os
import keras.backend.tensorflow_backend as KTF

classes = ['requires_reply']

path = './'
training_filename = ''
test_filename = ''
fasttext_filename = 'cc.de.300.bin'

ft_model = load_model(path + fasttext_filename)
n_features = ft_model.get_dimension()
print('Dimensions ' + str(n_features))

train = pd.read_csv(path + training_filename)
test = pd.read_csv(path + test_filename)


def get_session(gpu_fraction=0.9):

    num_threads = os.environ.get('OMP_NUM_THREADS')
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)

    if num_threads:
        return tf.Session(config=tf.ConfigProto(
            gpu_options=gpu_options, intra_op_parallelism_threads=num_threads))
Example #34
    X = (train["description"].fillna("") + " " +
         train["title"].fillna("")).values
    X_test = (test["description"].fillna("") + " " +
              test["title"].fillna("")).values
    y = pd.read_csv("../input/train.csv")["deal_probability"].values

    embed_size = EMBEDDING_SIZE
    maxlen_desc = 120
    maxlen_title = 21
    embedding_vectors = {}
    word_idx = {}
    embedding_vectors["UNKNOWN"] = np.zeros(embed_size)
    word_idx["UNKNOWN"] = len(word_idx)

    model = fastText.load_model(EMBEDDING_FILE)  #.replace(".vec", ".bin"))
    print("getting embedding vectors...")
    for text in tqdm(list(X) + list(X_test)):  # max_length, padding,
        for w in text.split():
            try:
                embedding_vectors[w] = model.get_word_vector(w)
                if w in word_idx:
                    continue
                else:
                    word_idx[w] = len(word_idx)
            except:
                pass
    idx_to_word = {v: k for k, v in word_idx.items()}
    del model

    train_test_index_list_1 = []
Example #35
BIG_CATEGORY = 'mobile'
N_CLASSES = 27

if __name__ == "__main__":
    psychic_learners_dir = Path.cwd().parent
    valid_data = os.path.join(psychic_learners_dir, 'data',
                              f'{BIG_CATEGORY}_valid_split.csv')
    test_data = os.path.join(psychic_learners_dir, 'data',
                             f'{BIG_CATEGORY}_test_split.csv')
    ROOT_PROBA_FOLDER = os.path.join(psychic_learners_dir, 'data',
                                     'probabilities', BIG_CATEGORY,
                                     'extractions_fasttext')
    valid_data = pd.read_csv(valid_data)
    test_data = pd.read_csv(test_data)
    model = load_model(
        str(psychic_learners_dir / 'data' / 'fasttext_models' /
            f'{BIG_CATEGORY}_extractions_model.bin'))
    valid_preds = []
    test_preds = []
    for title in valid_data['extractions'].values:
        if title == '0':  # comment out if normal title
            valid_preds.append(np.zeros(N_CLASSES))
            continue
        title = ' '.join(literal_eval(title))  # comment out if normal title
        pred = model.predict(title, k=N_CLASSES)
        pred = sorted(zip(pred[0], pred[1]), key=lambda x: x[0])
        pred = [x[1] for x in pred]
        valid_preds.append(pred)

    for title in test_data['extractions'].values:
        if title == '0':  # comment out if normal title
Example #36
import xml.etree.cElementTree as ET
from xml.dom import minidom
import sys
import json
import re
import os
import datetime
import fnmatch
from language_tagger import tag_language
from trending_videos_difference_server import difference
import fastText

# Load fastText language detection model
langdetect = fastText.load_model(
    '/disk/data/share/MTproject/fastText/langdetect.bin')

categories = {
    '1': 'Film & Animation',
    '2': 'Cars & Vehicles',
    '10': 'Music',
    '15': 'Pets & Animals',
    '17': 'Sport',
    '19': 'Travel & Events',
    '20': 'Gaming',
    '22': 'People & Blogs',
    '23': 'Comedy',
    '24': 'Entertainment',
    '25': 'News & Politics',
    '26': 'How-to & Style',
    '27': 'Education',
    '28': 'Science & Technology',
Example #37
 def load_model(cls, model_fpath):
     clf = FasttextClassifier()
     clf.classifier_ = fasttext.load_model(model_fpath)
     print('load model from %s success!' % model_fpath)
     return clf
Example #38
 def __init__(self):
     self.encoder = cPickle.loads(open("encoder2.7.p", "rb").read())
     self.classifier = load_model('models/fasttext.bin')
Example #39
        # Can pass on the entirety of lstm_out to the next layer if it is a seq2seq prediction
        # y_pred = self.linear(lstm_out[-1].view(self.batch_size, -1))
        # print(lstm_out.size())
        # print(lstm_out.contiguous().view(-1, self.hidden_dim*2).size())
        y_pred = self.linear(lstm_out.contiguous().view(
            -1, self.hidden_dim * 2))
        y_pred = y_pred.view(batch_size, -1, self.output_dim)
        # y_pred = torch.nn.functional.softmax(y_pred)
        # y_pred = torch.nn.Softmax(dim=-1)(y_pred)
        # print(y_pred.size())
        # y_pred = nn.Softmax()(y_pred)
        return y_pred


print("loading embeddings...")
ft_hi = fastText.load_model(
    "/home1/zishan/WordEmbeddings/FastText/wiki.bn.bin")

print("loading dictionaries...")
class_index = json.load(open("../../Data/Crosslingual/class_index.json"))
word_index = pd.read_pickle(
    "../../Data/Crosslingual/universal_word_index.pickle")

print("loading training data...")
train = pd.read_pickle('../../Data/Crosslingual/Bengali_train.pickle')
print("loading testing data...")
test = pd.read_pickle('../../Data/Crosslingual/Bengali_test.pickle')
test_trig = np.asarray(test['trigger'].tolist())
test_trig = test_trig.reshape(len(test_trig), 75, 1)
#test_trig = to_categorical(test_trig,2)
test_sentences = np.asarray(test['sentences_token'].tolist())
Example #40
def process_data(data):
    model = load_model("wiki.simple.bin")
    model_s = list()
    dic = {}
    back = {}
    t = 1
    zero = np.zeros(300)
    model_s.append(zero)
    model_s.append(zero)
    print(model_s[0])
    dic['$'] = 0 #end
    dic['&'] = 1 #none
    back[0] = '$'
    back[1] = '&'
    
    max_par_size = 0
    max_que_size = 0

    contexts_input = list()
    contexts_len = list()
    questions_input = list()
    questions_len = list()
    ans_start = list()
    ans_end = list()

    for i in tqdm(range(len(data['data']))):
        # iterate over all Wikipedia articles
        article = data['data'][i]

        # iterate over all paragraphs in this article
        for parag in article['paragraphs']:
            # this list will hold the numeric representation of the context
            context_vector = list()

            context = parag['context']
            t_context = tokenize(context)

            for i in range(len(t_context)):
                if t_context[i] in dic:
                    x = dic[t_context[i]]
                else:
                    t += 1
                    dic[t_context[i]] = t
                    back[t] = t_context[i]
                    # TODO: also handle the case where no such vector exists
                    # !!!
                    model_s.append(model.get_word_vector(t_context[i]))
                    x = t
                context_vector.append(x)

            context_size = len(context_vector)

            # iterate over all questions for this paragraph
            for qn in parag['qas']:
                question = qn['question']
                t_question = tokenize(question)

                quest_vector = list()

                for j in range(len(t_question)):
                    if t_question[j] in dic:
                        x = dic[t_question[j]]
                    else:
                        t += 1
                        dic[t_question[j]] = t
                        back[t] = t_question[j]
                        # TODO: also handle the case where no such vector exists
                        # !!!
                        model_s.append(model.get_word_vector(t_question[j]))
                        x = t
                    quest_vector.append(x)

                quest_size = len(quest_vector)

                answer = qn['answers'][0]['text']
                start, end = find_span(context, answer)

                # append to the arrays:

                contexts_input.append(context_vector)
                contexts_len.append(context_size)
                questions_input.append(quest_vector)
                questions_len.append(quest_size)
                ans_start.append(start)
                ans_end.append(end)

                max_que_size = max(quest_size, max_que_size)

            max_par_size = max(context_size, max_par_size)
            #paragraphs_ft.append(vec)
            
    return model_s, dic, back, max_par_size, max_que_size, contexts_input, contexts_len, questions_input, questions_len, ans_start, ans_end,
Example #41
import fastText
from nltk import sent_tokenize
from gensim.models import Word2Vec
import numpy as np
import json
import sys
import time
from nltk import ngrams
from nltk.stem import PorterStemmer
from nltk import sent_tokenize
from nltk import word_tokenize
ps = PorterStemmer()

model = fastText.load_model(
    '../../../Divers_Data_Maitrise/wiki.simple/wiki.simple.bin')
#model = fastText.load_model('../embeding_perso_fastText/data_embeding.bin')
#model = fastText.load_model('../embeding_perso_fastText/train_steam_embeding.bin')

path_data = '../../../Data_Maitrise/data/'
path_dest = '../../../Data_Maitrise/data_perso/'

time1 = time.time()

with_steming_param = False
k_best_sentences = 1
n = 2
similarity_type = 1  #1 : cosine, 2:dice


def get_best_sentence_no_repli(model: fastText,
                               list_sentence,
Example #42
        help="Model to use",
    )
    parser.add_argument(
        "question_words",
        help="word questions similar to tmikolov's file (see help for link)",
    )
    parser.add_argument(
        "threshold",
        help="threshold used to limit number of words used",
    )
    args = parser.parse_args()
    args.threshold = int(args.threshold)

    # Retrieve list of normalized word vectors for the first words up
    # until the threshold count.
    f = load_model(args.model)
    # Gets words with their associated frequency, sorted by default in descending order
    words, freq = f.get_words(include_freq=True)
    words = words[:args.threshold]
    vectors = np.zeros((len(words), f.get_dimension()), dtype=float)
    for i in range(len(words)):
        wv = f.get_word_vector(words[i])
        wv = wv / np.linalg.norm(wv)
        vectors[i] = wv

    total_correct = 0
    total_qs = 0
    total_num_lines = 0

    total_se_correct = 0
    total_se_qs = 0
Example #43
def fastText_predict(text):
    model = fastText.load_model('lid.176.ftz')
    # labels look like '__label__en'; the last two characters give the ISO language code
    return model.predict(text)[0][0][-2:]
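A quick usage sketch with the lid.176 language-identification model referenced above (the sample sentence is an assumed placeholder):

print(fastText_predict('Bonjour tout le monde'))  # assumed sample; expected to print 'fr'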