Example #1
def analyze(texts):
    """Analyses the emotional content of the texts given."""

    encoding_to_label = {
        0: 'anger',
        1: 'disgust',
        2: 'fear',
        3: 'joy',
        4: 'sadness',
        5: 'surprise',
    }
    word_index, seqs = tokenize(texts, MAX_NB_WORDS)
    padded = pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
    predictions = model.predict(padded, batch_size=128, verbose=1)

    analyses = []
    for pred in predictions:
        analysis = {}
        for i, prob in enumerate(pred):
            label = encoding_to_label[i]
            percentage = round(prob * 100, 3)
            analysis[label] = percentage

        analyses.append(analysis)

    return analyses
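
A minimal usage sketch for `analyze`, assuming the module-level names it relies on (`model`, `tokenize`, `pad_sequences`, `MAX_NB_WORDS`, `MAX_SEQUENCE_LENGTH`) are already defined as in Example #3; the input strings below are illustrative only:

texts = ['I am so happy today!', 'That noise in the dark scared me.']
for text, analysis in zip(texts, analyze(texts)):
    # Each `analysis` maps an emotion label to a percentage,
    # e.g. {'anger': 0.4, 'disgust': 0.1, ..., 'joy': 87.5}.
    top_emotion = max(analysis, key=analysis.get)
    print(f'{text!r} -> {top_emotion} ({analysis[top_emotion]}%)')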
Example #2
    def preprocess_raw(self):
        ''' Tokenize/BPE-encode the raw text '''
        def is_xml(filename):
            ''' Determine if a file is XML formatted '''
            return filename.endswith('.sgm') or filename.endswith('.xml')

        def filter_lines(in_file, basename):
            ''' Scan the file for any filtered lines '''
            filtered = set()
            xml = is_xml(basename)
            for i, line in enumerate(in_file):
                if not self.preprocess_raw_line(line, xml=xml):
                    filtered.add(i)

            return filtered

        def merge(basename, in_file, out_file, filtered=None):
            ''' Tokenize the passed-in file and write it to the designated output file '''
            filtered = filtered or set()
            xml = is_xml(basename)
            for i, line in enumerate(in_file):
                if i in filtered:
                    continue

                processed_line = self.preprocess_raw_line(line, xml=xml)
                out_file.write(processed_line + '\n')

        # First, clean up any incomplete preprocessing files
        for path in glob.glob(
                os.path.join(self.preprocess_directory, '*.incomplete')):
            # glob already returns paths prefixed with the directory
            os.remove(path)

        bpe_code_path = os.path.join(self.preprocess_directory, 'bpe.32000')
        if not os.path.exists(bpe_code_path):
            for split, file_pairs in type(self).RAW_SPLITS.items():
                for pair in file_pairs:
                    # First, determine which lines must be skipped in both
                    # files, since the two files form a parallel corpus.
                    filtered = set()
                    for filename, lang in zip(pair, type(self).LANGUAGE_PAIR):
                        in_path = os.path.join(self.preprocess_directory,
                                               filename)
                        with ExitStack() as stack:
                            in_file = stack.enter_context(Open(in_path, 'rt'))
                            filtered.update(
                                filter_lines(in_file,
                                             os.path.basename(filename)))

                    for filename, lang in zip(pair, type(self).LANGUAGE_PAIR):
                        basename = os.path.basename(filename)
                        in_path = os.path.join(self.preprocess_directory,
                                               filename)
                        split_path = os.path.join(self.preprocess_directory,
                                                  f'{split}.{lang}')

                        if os.path.exists(split_path):
                            continue

                        with ExitStack() as stack:
                            out_path = f'{split_path}.incomplete'
                            in_file = stack.enter_context(Open(in_path, 'rt'))
                            out_file = stack.enter_context(Open(
                                out_path, 'at'))

                            merge(basename, in_file, out_file, filtered)

            word_counts = Counter()
            for split in type(self).RAW_SPLITS:
                for lang in type(self).LANGUAGE_PAIR:
                    try:
                        split_path = os.path.join(self.preprocess_directory,
                                                  f'{split}.{lang}')
                        os.rename(f'{split_path}.incomplete', split_path)
                    except FileNotFoundError:
                        # This can happen if the preprocessing is interrupted
                        pass

                    tokenized_path = os.path.join(self.preprocess_directory,
                                                  f'{split}.tok.{lang}')
                    word_counts.update(
                        preprocess.tokenize(split_path, tokenized_path,
                                            self.preprocess_buffer_size))

            print('Learning BPE')
            preprocess.learn_bpe(bpe_code_path, word_counts.items())

        vocab_path = os.path.join(self.preprocess_directory, 'vocab.bpe.32000')
        if not os.path.exists(vocab_path):
            vocab = set()
            for split in type(self).RAW_SPLITS:
                for lang in type(self).LANGUAGE_PAIR:
                    in_path = os.path.join(self.preprocess_directory,
                                           f'{split}.tok.{lang}')
                    bpe_path = os.path.join(self.preprocess_directory,
                                            f'{split}.tok.bpe.32000.{lang}')

                    vocab.update(
                        preprocess.apply_bpe(bpe_code_path, in_path, bpe_path,
                                             self.preprocess_buffer_size))

            incomplete_vocab_path = f'{vocab_path}.incomplete'
            with Open(incomplete_vocab_path, 'wt') as vocab_file:
                vocab_file.write('\n'.join(sorted(vocab)))
            os.rename(incomplete_vocab_path, vocab_path)
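
The `.incomplete` suffix together with `os.rename` used throughout `preprocess_raw` is a write-then-rename pattern: each output is produced under a temporary name and only renamed into place once it is fully written, so an interrupted run leaves behind `*.incomplete` files (cleaned up by the glob at the top of the method) rather than truncated files that look finished. A standalone sketch of the same idea, using a hypothetical helper name and the built-in `open` rather than the project's `Open` wrapper:

import os

def write_atomically(path, lines):
    ''' Illustrative helper (not part of the original code): write the
    output under a temporary ".incomplete" name, then rename it into
    place only after the write has completed. '''
    incomplete_path = f'{path}.incomplete'
    with open(incomplete_path, 'wt') as out_file:
        out_file.write('\n'.join(lines))
    # A crash before this point leaves only an *.incomplete file behind,
    # never a truncated `path` that could be mistaken for a finished one.
    os.rename(incomplete_path, path)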
Example #3
from keras.layers import Embedding, Dense, Dropout, Activation, LSTM
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

from data.preprocess import clean, tokenize, remove_common_words, undersample

EMBEDDINGS_DIM = 25
MAX_SEQUENCE_LENGTH = 280  # Max tweet length
MAX_NB_WORDS = 30000

df = clean('./Jan9-2012-tweets-clean.txt')
df = undersample(df, 'joy', 3000)

X = df['tweet']
y = df['emotion']

word_index, sequences = tokenize(X, MAX_NB_WORDS)
word_index, sequences = remove_common_words(word_index, sequences, 100)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

label_encoder = LabelEncoder()
label_encoder.fit(y)

print('Labels:', label_encoder.classes_)
print('Labels encodings:', label_encoder.transform(label_encoder.classes_))

labels = to_categorical(label_encoder.transform(y))

print("Shape of data: {}.".format(data.shape))

embeddings_index = {}
with open('./glove.twitter.27B/glove.twitter.27B.{}d.txt'.format(