Example #1
def onehot_char_training_data(lang='deu', n=700, data_paths=()):
    """ Build one-hot character-level seq2seq training arrays from Anki translation pairs.

    `n` is a number of rows, or a fraction of the corpus if `n` <= 1; `data_paths` are
    optional paths where the three returned arrays are cached with `np.save`.
    """
    df = get_data(lang)
    n = int(len(df) * n) if n <= 1 else n
    df = df.iloc[:n]
    input_texts, target_texts = [], []  # <1>
    input_vocabulary = set()  # <3>
    output_vocabulary = set()
    start_token, stop_token = '\t\n'  # <2>
    n = len(df)

    for input_text, target_text in tqdm(zip(df.eng, df[lang]), total=n):
        target_text = start_token + target_text \
            + stop_token  # <7>
        input_texts.append(input_text)
        target_texts.append(target_text)
        for char in input_text:  # <8>
            if char not in input_vocabulary:
                input_vocabulary.add(char)
        for char in target_text:
            if char not in output_vocabulary:
                output_vocabulary.add(char)

    input_vocabulary = sorted(input_vocabulary)  # <1>
    output_vocabulary = sorted(output_vocabulary)

    input_vocab_size = len(input_vocabulary)  # <2>
    output_vocab_size = len(output_vocabulary)
    max_encoder_seq_length = max([len(txt) for txt in input_texts])  # <3>
    max_decoder_seq_length = max([len(txt) for txt in target_texts])

    input_token_index = dict([
        (char, i) for i, char in enumerate(input_vocabulary)
    ])  # <4>
    target_token_index = dict([(char, i)
                               for i, char in enumerate(output_vocabulary)])

    encoder_input_data = np.zeros(
        (n, max_encoder_seq_length, input_vocab_size), dtype='float32')  # <2>
    decoder_input_data = np.zeros(
        (n, max_decoder_seq_length, output_vocab_size), dtype='float32')
    decoder_target_data = np.zeros(
        (n, max_decoder_seq_length, output_vocab_size), dtype='float32')
    for i, (input_text, target_text) in enumerate(
            tqdm(zip(input_texts, target_texts),
                 total=len(target_texts))):  # <3>
        for t, char in enumerate(input_text):  # <4>
            encoder_input_data[i, t, input_token_index[char]] = 1.  # <5>
        for t, char in enumerate(target_text):  # <6>
            decoder_input_data[i, t, target_token_index[char]] = 1.
            if t > 0:
                decoder_target_data[i, t - 1, target_token_index[char]] = 1

    trainset = (encoder_input_data, decoder_input_data, decoder_target_data)
    for i, p in enumerate(data_paths):
        np.save(p, trainset[i][:n], allow_pickle=False)

    return encoder_input_data, decoder_input_data, decoder_target_data
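# A minimal usage sketch (hypothetical .npy cache paths; `n` <= 1 is treated as a
# fraction of the Anki corpus, per the function above):
encoder_input_data, decoder_input_data, decoder_target_data = onehot_char_training_data(
    lang='deu', n=0.1,
    data_paths=('encoder_input.npy', 'decoder_input.npy', 'decoder_target.npy'))
encoder_input_data.shape  # (n_samples, max_encoder_seq_length, input_vocab_size)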
Example #2
def load_data(data_dir=None):
    data_dir = data_dir or os.path.join(os.path.sep + 'midata', 'viddesc')
    descriptions = pd.read_table(
        os.path.join(data_dir, 'LSMDC16_annos_training.csv'), header=None)
    descriptions.columns = 'filename start_2s end_2s start end description'.split()
    embeddings = load_embeddings(os.path.join(data_dir, 'embeddings'))
    wv = get_data('word2vec')
    return wv, descriptions, embeddings
Example #3
def load_dialog(self, name='movie_dialog'):
    if name == 'dsfaq':
        db = load_faq()
    else:
        db = get_data(name)
    log.info(f'Loaded {len(db)} {self.name} statement-reply pairs.')
    if self.limit <= len(db):
        log.info(
            f'Limiting {self.name} database to {self.limit} statement-reply pairs.'
        )
        db = db.iloc[:self.limit]
    db = dict(zip(db[db.columns[0]], db[db.columns[1]]))
    return db
Example #4
def get_word_vectors(vocab):
    """ Create a word2vec embedding matrix for all the words in the vocab """
    wv = get_data('word2vec')
    vectors = np.zeros((len(vocab), len(wv['the'])))  # one row per vocab entry, one column per word2vec dimension
    for i, tok in enumerate(vocab):
        word = tok[0]
        variations = (word, word.lower(), word.lower()[:-1])
        for w in variations:
            if w in wv:
                vectors[i, :] = wv[w]
                break  # keep the first (most exact) variation that word2vec knows
        if not np.sum(np.abs(vectors[i])):
            logger.warning('Unable to find {}, {}, or {} in word2vec.'.format(*variations))
    return vectors
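# A minimal usage sketch (hypothetical vocab; each entry is a (token, count) pair so
# tok[0] is the word, matching the loop above):
vocab = [('Tokyo', 3), ('purple', 2), ('qwertyuiop', 1)]
embedding = get_word_vectors(vocab)
embedding.shape  # (3, 300) with the Google News word2vec vectors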
def get_anki_phrases_english(limit=None):
    """ Return all the English phrases in the Anki translation flashcards

    >>> len(get_anki_phrases_english(limit=100)) > 700
    True
    """
    texts = set()
    for lang in ANKI_LANGUAGES:
        df = get_data(lang)
        phrases = df.eng.str.strip().values
        texts = texts.union(set(phrases))
        if limit and len(texts) >= limit:
            break
    return sorted(texts)
Example #6
def get_anki_phrases(lang='english', limit=None):
    """ Retrieve as many anki paired-statement corpora as you can for the requested language

    If `ankis` (requested languages) is more than one, then get the english texts associated with those languages.

    TODO: improve modularity: def function that takes a single language and call it recursively if necessary
    >>> get_anki_phrases('afr')[:2]
    ["'n Groen piesang is nie ryp genoeg om te eet nie.",
     "'n Hond het agter die kat aan gehardloop."]
    """
    lang = lang.strip().lower()[:3]
    lang = LANG2ANKI[lang[:2]] if lang not in ANKI_LANGUAGES else lang
    if lang[:2] == 'en':
        return get_anki_phrases_english(limit=limit)
    return sorted(get_data(lang).iloc[:, -1].str.strip().values)
Example #7
import os
import re
import sys

import pandas as pd
import numpy as np
from tqdm import tqdm
from nlpia.loaders import get_data

if len(sys.argv) > 1:
    lang = sys.argv[1][:3].lower()
else:
    lang = 'spa'


df = get_data(lang)
if lang not in df.columns:
    # print(df.columns)
    print(f"changing language name {lang} to {list(df.columns)[-1]}")
    lang = list(df.columns)[-1]

input_texts, target_texts = [], []  # <1>
input_vocabulary = set()  # <3>
output_vocabulary = set()
start_token, stop_token = '\t\n'  # <2>
n = int(len(df) * .1)
encoder_input_path = 'encoder_input_data-{}-{}.np'.format(lang, n)
decoder_input_path = 'decoder_input_data-{}-{}.np'.format(lang, n)
decoder_target_path = 'decoder_target_data-eng-{}.np'.format(n)

Example #8
"""
@author: tonymullen
"""

import numpy as np
import shared

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, GlobalMaxPooling1D

from nlpia.loaders import get_data

np.random.seed(1337)

word_vectors = get_data('w2v', limit=200000)
data_file_root = '/Users/tonymullen/Dropbox/Northeastern/Classes/NLP/Datasets'
# https://ai.stanford.edu/~amaas/data/sentiment/

number_of_files = 5000

dataset = shared.pre_process_data(data_file_root + '/aclimdb/train',
                                  number_of_files)
# dataset = shared.pre_process_data(data_file_root + '/miniImdb/train')

vectorized_data = shared.tokenize_and_vectorize(dataset, word_vectors)
expected = shared.collect_expected(dataset)
split_point = int(len(vectorized_data) * .8)

x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
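# The held-out split presumably mirrors the 80/20 training split above (an assumption;
# the original snippet stops at x_train/y_train):
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]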
Example #9
""" Example python snippets and listing in [Chapter 6](http://bit.ly/ghnlpia)"""
# import pandas as pd
import numpy as np
from nlpia.loaders import get_data
from sklearn.decomposition import PCA

wv = get_data('word2vec')
"""
>>> from nlpia.loaders import get_data
>>> wv = get_data('word2vec')

>>> naive_vector = wv['woman'] + wv['Europe'] + wv['physics'] +\
...     wv['scientist']

>>> naive_vector
array([ 0.87109375, -0.08511353,  0.7817383 ,  0.25634766, -0.10058594,
        ...
        0.20800781,  0.06420898,  0.09033203,  0.8261719 , -0.2545166 ],
      dtype=float32)
>>> wv.similar_by_vector(naive_vector)
[('scientist', 0.7510349750518799),
 ('physicist', 0.7328184843063354),
 ('physics', 0.7083248496055603),
 ('theoretical_physicist', 0.6038039922714233),
 ('astrophysicist', 0.6009320020675659),
 ('mathematician', 0.5989038944244385),
 ('particle_physicist', 0.5962826013565063),
 ('Physicist', 0.5940043926239014),
 ('biochemist', 0.5833224058151245),
 ('physicists', 0.577854573726654)]
>>> for input_text, target_text in zip(df.statement, df.reply):
...     target_text = start_token + target_text \
...         + stop_token  # <5>
...     input_texts.append(input_text)
...     target_texts.append(target_text)
...     for char in input_text:  # <6>
...         if char not in input_vocabulary:
...             input_vocabulary.add(char)
...     for char in target_text:
...         if char not in output_vocabulary:
...             output_vocabulary.add(char)
"""
import os
from nlpia.loaders import get_data, DATA_PATH

df = get_data(os.path.join(DATA_PATH, '..', 'book', 'data', 'dialog.txt'))
df.columns = 'statement reply'.split()
df = df.fillna(' ')
input_texts, target_texts = [], []  # <1>
input_vocabulary = set()  # <2>
output_vocabulary = set()
start_token = '\t'  # <3>
stop_token = '\n'
max_training_samples = min(25000, len(df) - 1)  # <4>

for input_text, target_text in zip(df.statement, df.reply):
    target_text = start_token + target_text \
        + stop_token  # <5>
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:  # <6>
        if char not in input_vocabulary:
            input_vocabulary.add(char)
    for char in target_text:
        if char not in output_vocabulary:
            output_vocabulary.add(char)
Example #11
import glob
import os
from random import shuffle
from nltk.tokenize import TreebankWordTokenizer
from nlpia.loaders import get_data
word_vectors = get_data('wv')

def pre_process_data(filepath):
    """
    Load pos and neg examples from separate dirs then shuffle them
    together.
    """
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')
    pos_label = 1
    neg_label = 0
    dataset = []
    for filename in glob.glob(os.path.join(positive_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((pos_label, f.read()))
    for filename in glob.glob(os.path.join(negative_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((neg_label, f.read()))
    shuffle(dataset)
    return dataset


>>> def tokenize_and_vectorize(dataset):
...     tokenizer = TreebankWordTokenizer()
...     vectorized_data = []
...     for sample in dataset:
>>> for reply in df.reply:
...     output_vocab.update(set(reply))
>>> input_vocab = tuple(sorted(input_vocab))
>>> output_vocab = tuple(sorted(output_vocab))
>>> input_vocabulary = tuple(sorted(input_vocab)) 
>>> output_vocabulary = tuple(sorted(output_vocab))

>>> max_encoder_seq_len = df.statement.str.len().max()  # <3>
>>> max_decoder_seq_len = df.target.str.len().max()
>>> max_encoder_seq_len, max_decoder_seq_len
(100, 102)
"""
import os
from nlpia.loaders import get_data

df = get_data('moviedialog')
df.columns = 'statement reply'.split()
df = df.dropna()
input_texts, target_texts = [], []  # <1>
start_token, stop_token = '\t\n'  # <3>
input_vocab = set()  # <2>
output_vocab = set(start_token + stop_token)
n_samples = min(100000, len(df))  # <4>

df['target'] = start_token + df.reply + stop_token
for statement in df.statement:
    input_vocab.update(set(statement))
for reply in df.reply:
    output_vocab.update(set(reply))
input_vocab = tuple(sorted(input_vocab))  #<6>
output_vocab = tuple(sorted(output_vocab))

max_encoder_seq_len = df.statement.str.len().max()
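# A sketch of the next step, mirroring Example #1 and the doctest-style session quoted
# above: record the decoder sequence length and map each character to an index.
max_decoder_seq_len = df.target.str.len().max()
input_token_index = dict((char, i) for i, char in enumerate(input_vocab))
target_token_index = dict((char, i) for i, char in enumerate(output_vocab))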
# script adopted from https://gist.github.com/lampts/026a4d6400b1efac9a13a3296f16e655

import gensim
import numpy as np
import tensorflow as tf
from nlpia.loaders import get_data
from tensorflow.contrib.tensorboard.plugins import projector

words = ('Sacramento', 'California', 'Oregon', 'Salem', 'Washington',
         'Olympia')

# loading your gensim
# model = gensim.models.KeyedVectors.load_word2vec_format('~/Downloads/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000)

model = get_data('w2v', limit=200000)  # <1>

# project a small part of the vocab (here just the 6 words above, 300 dimensions each;
# the `w2v_10K` name is left over from the gist, which projected 10K words)
w2v_10K = np.zeros((6, 300))
with open("/Users/hannes/Downloads/prefix_metadata.tsv",
          'w+') as file_metadata:
    # for i, word in enumerate(model.index2word[:200000]):
    #     w2v_10K[i] = model[word]
    #     file_metadata.write(word.encode('utf-8') + '\n')
    for i, word in enumerate(list(words)):
        w2v_10K[i] = model[word]
        file_metadata.write(word + '\n')  # the file is open in text mode, so write str, not bytes

# define the model without training
sess = tf.InteractiveSession()

with tf.device("/cpu:0"):
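    # Assumed continuation (a hedged sketch based on the gist linked above; TF 1.x contrib API):
    # wrap the embedding matrix in a Variable so the TensorBoard projector can find it by name.
    tf_embedding = tf.Variable(w2v_10K, trainable=False, name='prefix_embedding')

tf.global_variables_initializer().run()

# point the projector plugin at the embedding tensor and the metadata file written above;
# 'prefix_metadata.tsv' and the checkpoint path here are hypothetical, adjust to taste
writer = tf.summary.FileWriter('.', sess.graph)
config = projector.ProjectorConfig()
embedding_conf = config.embeddings.add()
embedding_conf.tensor_name = 'prefix_embedding'
embedding_conf.metadata_path = 'prefix_metadata.tsv'
projector.visualize_embeddings(writer, config)

tf.train.Saver().save(sess, './prefix_embedding.ckpt')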
Example #14
from __future__ import print_function, unicode_literals, division, absolute_import
from builtins import (bytes, dict, int, list, object, range, str,  # noqa
    ascii, chr, hex, input, next, oct, open, pow, round, super, filter, map, zip)
from future import standard_library
standard_library.install_aliases()  # noqa: Counter, OrderedDict,

import os
from itertools import product

import pandas as pd

from nlpia.constants import DATA_PATH
from nlpia.loaders import get_data


wisconsin = get_data('sentiment-word-ratings-sensori-motor-wisconsin.csv.gz')
warriner = get_data('sentiment-word-ratings-warriner.csv.gz')


def get_sentiment_sensori_motor():
    df = pd.read_html('http://www.neuro.mcw.edu/ratings/')[0]
    df.columns = ['word'] + [i.lower() + "_" + j.lower() for i, j in product(df.iloc[0][1:6], df.iloc[1][1:3])]
    df = df.iloc[2:]
    df.to_csv(os.path.join(DATA_PATH, 'sentiment-word-ratings-sensori-motor-wisconsin.csv.gz'), compression='gzip')
    return df


def get_sentiment_warriner():
    senti = pd.read_csv('http://crr.ugent.be/papers/Ratings_Warriner_et_al.csv', index_col='Word', header=0)
    senti.columns = [c.replace('.', '_') for c in senti.columns]
    del senti['Unnamed: 0']
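    # assumed ending, mirroring get_sentiment_sensori_motor() and the
    # 'sentiment-word-ratings-warriner.csv.gz' file loaded above: cache the ratings and return them
    senti.to_csv(os.path.join(DATA_PATH, 'sentiment-word-ratings-warriner.csv.gz'),
                 compression='gzip')
    return senti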
Example #15
def wordvector_training_data(lang='deu', n=700, data_paths=()):
    """ Build padded token-ID sequences and a word2vec embedding matrix from Anki translation pairs.

    `n` is a number of rows, or a fraction of the corpus if `n` <= 1; `data_paths` are
    optional paths where the three returned arrays are cached with `np.save`.
    """
    df = get_data(lang)
    n = int(len(df) * n) if n <= 1 else n
    n = min(len(df), n)
    df = df.iloc[:n]
    input_texts, target_texts = [], []  # <1>
    start_token, stop_token = '<START>', '<STOP>'
    input_tokenizer, output_tokenizer = Tokenizer(), Tokenizer()
    wv = get_data('word2vec')
    EMBEDDING_DIM = len(wv['the'])

    for input_text, target_text in tqdm(zip(df.eng, df[lang]), total=n):
        target_text = start_token + target_text + stop_token
        input_texts.append(input_text)
        target_texts.append(target_text)

    # texts = input_texts + target_texts
    # assert(len(texts) == n * 2)
    # input_texts = texts[:n]
    # target_texts = texts[n:]

    input_tokenizer.fit_on_texts(input_texts)
    output_tokenizer.fit_on_texts(target_texts)
    input_sequences = input_tokenizer.texts_to_sequences(input_texts)
    target_sequences = output_tokenizer.texts_to_sequences(target_texts)
    input_sequences = pad_sequences(input_sequences,
                                    maxlen=MAX_INPUT_SEQUENCE_LENGTH)
    target_sequences = pad_sequences(target_sequences,
                                     maxlen=MAX_TARGET_SEQUENCE_LENGTH)

    embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))
    for w, i in input_tokenizer.word_index.items():
        if i >= MAX_NUM_WORDS:
            continue  # word_index covers every word seen; the matrix only has MAX_NUM_WORDS rows
        if w in wv.vocab:
            embedding_matrix[i] = wv.word_vec(w)
    print('Null word embeddings: %d' %
          np.sum(np.sum(embedding_matrix != 0, axis=1) == 0))

    # The padded token-ID sequences plus the word2vec embedding matrix are presumably
    # the intended training data for this word-level variant (unlike the character-level
    # one-hot arrays built by `onehot_char_training_data`).
    trainset = (input_sequences, target_sequences, embedding_matrix)
    for i, p in enumerate(data_paths):
        np.save(p, trainset[i], allow_pickle=False)

    return input_sequences, target_sequences, embedding_matrix
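# A minimal usage sketch (assumes the module-level constants MAX_INPUT_SEQUENCE_LENGTH,
# MAX_TARGET_SEQUENCE_LENGTH and MAX_NUM_WORDS, plus Keras' Tokenizer and pad_sequences,
# are defined elsewhere, as the function expects):
input_seqs, target_seqs, embedding_matrix = wordvector_training_data(lang='deu', n=0.1)
input_seqs.shape        # (n_samples, MAX_INPUT_SEQUENCE_LENGTH)
embedding_matrix.shape  # (MAX_NUM_WORDS, 300) with the Google News word2vec vectors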