Example #1
    PATH_RAW = '{}/{}/{}'.format(DIR, dset, FILENAME_RAW)
    PATH_OWN = '{}/{}/{}'.format(DIR, dset, FILENAME_OWN)
    PATH_OUR = '{}/{}/{}'.format(DIR, dset, FILENAME_OUR)
    PATH_COMBINED = '{}/{}/{}'.format(DIR, dset, FILENAME_COMBINED)

    with open(PATH_RAW, 'rb') as dataset:  # pickled data should be read in binary mode
        data = pickle.load(dataset)

    # Decode data
    try:
        texts = [unicode(x) for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    wg = WordGenerator(texts)
    vb = VocabBuilder(wg)
    vb.count_all_words()

    # Calculate max length of sequences considered
    # Adjust batch_size accordingly to prevent GPU overflow
    lengths = [len(tokenize(t)) for t in texts]
    maxlen = roundup(np.percentile(lengths, 80.0))

    # Extract labels
    labels = [x['label'] for x in data['info']]

    convert_dataset(PATH_OWN, 50000, {})
    convert_dataset(PATH_OUR, 0, vocab)
    convert_dataset(PATH_COMBINED, 10000, vocab)
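
The roundup and tokenize helpers used above are not shown in this snippet. Below is a minimal sketch of the length-cutoff computation, assuming a plain whitespace tokenizer and a hypothetical round-up-to-a-multiple-of-ten helper:

import math
import numpy as np

def roundup_sketch(x, base=10):
    # Hypothetical helper: round x up to the nearest multiple of `base`.
    return int(math.ceil(x / float(base)) * base)

sample_texts = [u'short text', u'a somewhat longer example sentence goes here']
sample_lengths = [len(t.split()) for t in sample_texts]  # whitespace tokens as a stand-in
maxlen = roundup_sketch(np.percentile(sample_lengths, 80.0))  # 80th percentile, rounded up
print(maxlen)  # 10 for this toy sample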
Example #2
""" Creates a vocabulary from a tsv file.
"""

import codecs
import example_helper
from deepmoji.create_vocab import VocabBuilder
from deepmoji.word_generator import TweetWordGenerator

with codecs.open('../../twitterdata/tweets.2016-09-01', 'rU', 'utf-8') as stream:
    wg = TweetWordGenerator(stream)
    vb = VocabBuilder(wg)
    vb.count_all_words()
    vb.save_vocab()
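
The same two-step pattern (a word generator feeding a VocabBuilder) works for any iterable of unicode strings, not just a tweet stream. A small illustrative variant using the plain WordGenerator seen in the other examples:

from deepmoji.create_vocab import VocabBuilder
from deepmoji.word_generator import WordGenerator

texts = [u'this is a test', u'this is another test']
vb = VocabBuilder(WordGenerator(texts))
vb.count_all_words()
print(vb.word_counts)  # per-word counts accumulated from the generator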
Example #3
""" Extend the given vocabulary using dataset-specific words.

1. First create a vocabulary for the specific dataset.
2. Find all words not in our vocabulary, but in the dataset vocabulary.
3. Take the top X (default=1000) of these words and add them to the vocabulary (see the sketch after this example).
4. Save this combined vocabulary and embedding matrix, which can now be used.
"""

from __future__ import print_function

import json

from deepmoji.create_vocab import extend_vocab, VocabBuilder
from deepmoji.word_generator import WordGenerator

new_words = [u'#zzzzaaazzz', u'newword', u'newword']
word_gen = WordGenerator(new_words)
vb = VocabBuilder(word_gen)
vb.count_all_words()

with open('../model/vocabulary.json') as f:
    vocab = json.load(f)

print(len(vocab))
print(vb.word_counts)
extend_vocab(vocab, vb, max_tokens=1)

# 'newword' should be added because it is the more frequent of the two new words
print(vocab[u'newword'])
print(len(vocab))
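
The numbered steps above are carried out by extend_vocab. The sketch below is a hypothetical stand-in (not the library's implementation, which may differ in details) showing what step 3 amounts to: rank the out-of-vocabulary words by count and append the top ones with fresh indices.

from collections import Counter

def extend_vocab_sketch(vocab, word_counts, max_tokens=1000):
    # Hypothetical stand-in: append the most frequent out-of-vocabulary
    # words, giving each a fresh index at the end of the existing vocab.
    candidates = [(w, c) for w, c in word_counts.items() if w not in vocab]
    candidates.sort(key=lambda wc: -wc[1])
    added = 0
    for word, _count in candidates[:max_tokens]:
        vocab[word] = len(vocab)
        added += 1
    return added

base_vocab = {u'happy': 0, u'sad': 1}
counts = Counter({u'newword': 2, u'#zzzzaaazzz': 1})
print(extend_vocab_sketch(base_vocab, counts, max_tokens=1))  # 1 word added
print(base_vocab[u'newword'])  # 2: appended after the existing entries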
Example #4
    PATH_RAW = '{}/{}/{}'.format(DIR, dset, FILENAME_RAW)
    PATH_OWN = '{}/{}/{}'.format(DIR, dset, FILENAME_OWN)
    PATH_OUR = '{}/{}/{}'.format(DIR, dset, FILENAME_OUR)
    PATH_COMBINED = '{}/{}/{}'.format(DIR, dset, FILENAME_COMBINED)

    with open(PATH_RAW, 'rb') as dataset:  # pickled data should be read in binary mode
        data = pickle.load(dataset)

    # Decode data
    try:
        texts = [unicode(x) for x in data['texts']]
    except UnicodeDecodeError:
        texts = [x.decode('utf-8') for x in data['texts']]

    wg = WordGenerator(texts)
    vb = VocabBuilder(wg)
    vb.count_all_words()

    # Calculate max length of sequences considered
    # Adjust batch_size accordingly to prevent GPU overflow
    lengths = [len(tokenize(t)) for t in texts]
    maxlen = roundup(np.percentile(lengths, 80.0))

    # Extract labels
    labels = [x['label'] for x in data['info']]

    convert_dataset(PATH_OWN, 50000, {})
    convert_dataset(PATH_OUR, 0, vocab)
    convert_dataset(PATH_COMBINED, 10000, vocab)
""" Creates a vocabulary from a tsv file.
"""

import codecs
import example_helper
from deepmoji.create_vocab import VocabBuilder
from deepmoji.word_generator import TweetWordGenerator

with codecs.open('../../twitterdata/tweets.2016-09-01', 'rU',
                 'utf-8') as stream:
    wg = TweetWordGenerator(stream)
    vb = VocabBuilder(wg)
    vb.count_all_words()
    vb.save_vocab()
Example #6
    def split_train_val_test(self,
                             sentences,
                             info_dicts,
                             split_parameter=[0.7, 0.1, 0.2],
                             extend_with=0):
        """ Splits given sentences into three different datasets: training,
            validation and testing.

        # Arguments:
            sentences: The sentences to be tokenized.
            info_dicts: A list of dicts that contain information about each
                sentence (e.g. a label).
            split_parameter: A parameter for deciding the splits between the
                three different datasets. If instead of being passed three
                values, three lists are passed, then these will be used to
                specify which observations belong to which dataset.
            extend_with: An optional parameter. If > 0 then this is the number
                of tokens added to the vocabulary from this dataset. The
                expanded vocab will be generated using only the training set,
                but is applied to all three sets.

        # Returns:
            List of three lists of tokenized sentences,

            List of three corresponding dictionaries with information,

            How many tokens have been added to the vocab. Make sure to extend
            the embedding layer of the model accordingly.
        """

        # If passed three lists, use those directly
        if isinstance(split_parameter, list) and \
                all(isinstance(x, list) for x in split_parameter) and \
                len(split_parameter) == 3:

            # Helper function to verify provided indices are numbers in range
            def verify_indices(inds):
                return [i for i in inds
                        if isinstance(i, numbers.Number) and i < len(sentences)]

            ind_train = verify_indices(split_parameter[0])
            ind_val = verify_indices(split_parameter[1])
            ind_test = verify_indices(split_parameter[2])
        else:
            # Split sentences and dicts
            ind = list(range(len(sentences)))
            ind_train, ind_test = train_test_split(
                ind, test_size=split_parameter[2])
            ind_train, ind_val = train_test_split(ind_train,
                                                  test_size=split_parameter[1])

        # Map indices to data
        train = np.array([sentences[x] for x in ind_train])
        test = np.array([sentences[x] for x in ind_test])
        val = np.array([sentences[x] for x in ind_val])

        info_train = np.array([info_dicts[x] for x in ind_train])
        info_test = np.array([info_dicts[x] for x in ind_test])
        info_val = np.array([info_dicts[x] for x in ind_val])

        added = 0
        # Extend vocabulary with training set tokens
        if extend_with > 0:
            wg = WordGenerator(train)
            vb = VocabBuilder(wg)
            vb.count_all_words()
            added = extend_vocab(self.vocabulary, vb, max_tokens=extend_with)

        # Wrap results
        result = [self.tokenize_sentences(s)[0] for s in [train, val, test]]
        result_infos = [info_train, info_val, info_test]

        return result, result_infos, added
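
A hedged usage sketch of the method above, assuming it belongs to deepmoji's SentenceTokenizer; the vocabulary path matches the earlier example, while the sentences, labels, and fixed length of 30 are illustrative:

import json
from deepmoji.sentence_tokenizer import SentenceTokenizer

with open('../model/vocabulary.json') as f:
    vocabulary = json.load(f)

st = SentenceTokenizer(vocabulary, 30)  # 30 is an illustrative fixed length

sentences = [u'I love this', u'This is terrible', u'Not sure how I feel', u'Great stuff']
info_dicts = [{'label': 1}, {'label': 0}, {'label': 0}, {'label': 1}]

(train, val, test), (info_train, info_val, info_test), added = \
    st.split_train_val_test(sentences, info_dicts,
                            split_parameter=[0.5, 0.25, 0.25],
                            extend_with=0)

print(train.shape, val.shape, test.shape)
print(added)  # 0 here; if extend_with > 0, grow the embedding layer by this amount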