Example #1
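# Note: this example assumes `mtok` is an alias for the mosestokenizer package
# (e.g. `import mosestokenizer as mtok`) and that train_subword_model() and
# create_vocab() are helper functions defined elsewhere in the same project.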
def preprocess(data_dict,
               src_lang,
               trg_lang,
               min_freq=0,
               short_list=0,
               max_sent_len=-1):
    src_text = []
    trg_text = []

    src_tokenizer = mtok.MosesTokenizer(src_lang)
    trg_tokenizer = mtok.MosesTokenizer(trg_lang)

    # tokenize data
    for doc in data_dict:
        for i, t in enumerate(data_dict[doc]["source"]):
            data_dict[doc]["source"][i] = ' '.join(src_tokenizer(t))
            src_text.append(data_dict[doc]["source"][i])
        for i, t in enumerate(data_dict[doc]["target"]):
            data_dict[doc]["target"][i] = ' '.join(trg_tokenizer(t))
            trg_text.append(data_dict[doc]["target"][i])
    src_tokenizer.close()
    trg_tokenizer.close()
    # train subword model
    subword_model, subword_src_vocab, subword_trg_vocab, src_text, trg_text = train_subword_model(
        src_text, trg_text, nb_symbols=short_list)

    # Re-create the vocabulary with the nmtpy format
    src_vocab = create_vocab(src_text, min_freq, short_list)
    trg_vocab = create_vocab(trg_text, min_freq, short_list)

    # NOTE: put the processed text back into the data dict
    # (iterate over a copy of the keys, since documents may be deleted below)
    for doc in list(data_dict):
        nbsent = len(data_dict[doc]["source"])
        data_dict[doc]['source'] = []
        data_dict[doc]['target'] = []
        for i in range(nbsent):
            src = src_text.pop(0)
            trg = trg_text.pop(0)
            if len(src.split(' ')) <= max_sent_len and len(
                    trg.split(' ')) <= max_sent_len:
                data_dict[doc]['source'].append(src)
                data_dict[doc]['target'].append(trg)

        if len(data_dict[doc]['source']) == 0:
            print(
                "Document {} contains only lengthy sentences, I'm removing it..."
                .format(doc))
            del data_dict[doc]

    assert len(
        src_text) == 0, "preprocess: something is wrong with source text"
    assert len(
        trg_text) == 0, "preprocess: something is wrong with target text"

    return data_dict, src_vocab, trg_vocab, subword_model, subword_src_vocab, subword_trg_vocab
Example #2
 def __init__(self, lang):
     try:
         self._moses_tokenizer = mosestokenizer.MosesTokenizer(lang)
     except NameError as err:
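         # NameError here implies the optional module-level import of
         # mosestokenizer failed, so the name is undefined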
         logger.error(
             "Install mosestokenizer to support moses tokenization")
         raise err
Example #3
    def __init__(self, lang: str, bpe_model=None):
        self.lang = lang
        self.word_tokenize = mosestokenizer.MosesTokenizer(self.lang)
        self.word_detokenize = mosestokenizer.MosesDetokenizer(self.lang)
        self.splitsents = mosestokenizer.MosesSentenceSplitter(self.lang)
        self.normalize = mosestokenizer.MosesPunctuationNormalizer(self.lang)

        if bpe_model:
            codes = codecs.open(bpe_model)
Example #4
def tokenize(text, language, join=True):
    """
    Tokenize text with mosestokenizer 
    """
    tokenize = mosestokenizer.MosesTokenizer(language)
    tokens = tokenize(text)
    tokenize.close()
    if join:
        tokens = ' '.join(tokens)
    return tokens
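
A minimal usage sketch for the helper above (the sample sentence and expected output are illustrative, assuming the mosestokenizer package is installed):

import mosestokenizer

print(tokenize("Hello, world!", "en"))              # -> "Hello , world !"
print(tokenize("Hello, world!", "en", join=False))  # -> ['Hello', ',', 'world', '!']

Note that this helper creates and closes a new MosesTokenizer on every call; Example #5 below caches one tokenizer per language to avoid that overhead.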
Example #5
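# Note: this variant assumes a module-level cache dict, e.g. `tokenizers = {}`,
# defined elsewhere in the source file, so each language's tokenizer is created
# only once and reused across calls.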
def tokenize(text, language, join=True):
    """
    Tokenize text with mosestokenizer 
    """
    if language not in tokenizers:
        tokenizers[language] = mosestokenizer.MosesTokenizer(language)
    tokenizer = tokenizers[language]
    tokens = tokenizer(text)
    # the cached tokenizer is intentionally left open so it can be reused across calls
    if join:
        tokens = ' '.join(tokens)
    return tokens
Example #6
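    # Note: this snippet is the __init__ of a vocabulary class; it assumes `mos`
    # is an alias for mosestokenizer (e.g. `import mosestokenizer as mos`) and
    # that Counter comes from collections.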
    def __init__(self, config):
        self.SOS_token = config["SOS_token"]
        self.EOS_token = config["EOS_token"]
        self.UNK_token = config["UNK_token"]
        self.mask_token = config["mask_token"]
        self.max_features = config["en_voc"]

        self.word2index = {"SOS": self.SOS_token,
                           "EOS": self.EOS_token,
                           "UNK": self.UNK_token,
                           "MASK": self.mask_token
                           }
        self.index2word = {self.SOS_token: "SOS",
                           self.EOS_token: "EOS",
                           self.UNK_token: "UNK",
                           self.mask_token: "MASK"}
        self.word2count = Counter()
        self.added = set()
        self.n_words = 4
        self.tokenizer = mos.MosesTokenizer("en")
Example #7
from bs4 import BeautifulSoup
import mosestokenizer
import os

outdir = 'processed_target_extraction_dedup_noleak'

# Ensure the datasets appear in chronological order.
# Repeated sentences with different labelling would be taken from newer datasets during deduplication.
datasets = {'train_valid': ['ABSA-15_Restaurants_Train_Final.xml', 'ABSA16_Restaurants_Train_SB1_v2.xml', 'Restaurants_Train_v2.xml'],
            'test': ['ABSA15_Restaurants_Test.xml', 'Restaurants_Test_Data_phaseB.xml']}

sentences = []
labels = []
tokenize = mosestokenizer.MosesTokenizer('en')

def processSentence(sentence):
    text = sentence.text.strip('\n')
    opinions = sentence.findAll('opinion')
    target_str = 'target'
    if not opinions:
        opinions = sentence.findAll('aspectterm')
        # Opinions are called aspectTerms in this file, and targets are called 'term'
        target_str = 'term'

    targets = [] #(target_str, beg, end)
    for op in opinions:

        if op[target_str] == 'NULL':
            continue
        else:
            beg = int(op['from']) # text.find(op['target'])
Example #8
from torchtext import data, datasets
import mosestokenizer
import torch.nn as nn
import torch.optim as optim
import torch
from torch import Tensor
from typing import Tuple


tokenizer_en = mosestokenizer.MosesTokenizer('en')
tokenizer_de = mosestokenizer.MosesTokenizer('de')
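# MosesTokenizer instances are callable (str -> list of tokens), so they can be
# passed directly as the `tokenize` argument of a torchtext Field below.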

BOS = '<s>'
EOS = '</s>'
PAD = '<pad>'

src = data.Field(sequential=True,
                 use_vocab=True,
                 pad_token=PAD,
                 tokenize=tokenizer_en,
                 lower=True,
                 include_lengths=True,
                 )  # unk=0, pad=1

tgt = data.Field(sequential=True,
                 use_vocab=True,
                 pad_token=PAD,
                 tokenize=tokenizer_de,
                 lower=True,
                 init_token=BOS,
Example #9
from torchtext.data import Field, BucketIterator, interleave_keys
from torchtext.datasets import TranslationDataset
from torchtext.data import Example
import mosestokenizer
import torch

from typing import Tuple
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor

tokenizer_en = mosestokenizer.MosesTokenizer('en')
tokenizer_fr = mosestokenizer.MosesTokenizer('fr')

#src = Field(sequential=True,
#            use_vocab=True,
#            pad_token=PAD,
#            #tokenize=tokenizer_en,
#            lower=True,
#            batch_first=True)

BOS = '<s>'
EOS = '</s>'
PAD = '<pad>'

src = Field(sequential=True,
            use_vocab=True,
            pad_token=PAD,
            tokenize=tokenizer_en,
            lower=True,
Example #10
class MotherJokeGenerator:
    """
    Generator for mother jokes in the Russian language
    """

    morph = pymorphy2.MorphAnalyzer()
    tokenizer = mosestokenizer.MosesTokenizer()
    detokenizer = mosestokenizer.MosesDetokenizer()

    joke_beginning = "твоя мамка"

    duplicating_punctuation_remover = re.compile(r"(\W)\1+")
    punctuation_space_remover = re.compile(r"\s([.,:;?!](?:\s|$))")
    pair_punctuation_space_remover = re.compile(
        r'(["\[({<])\s*(.*?)\s*(["\])}>])')

    unsuitable_verbs = "быть бывать".split()
    suitable_verb_form = "буду".split()

    pronouns_p1 = "я меня мне меня мной мы нас нам нас нами".split()
    pronouns_p2 = "ты тебя тебе тебя тобой вы вас вам вас вами".split()

    prepositions_p1 = "обо со".split()
    prepositions_p2 = "о с".split()

    pronoun_flip_map = dict(
        chain(zip(pronouns_p1, pronouns_p2), zip(pronouns_p2, pronouns_p1)))
    preposition_flip_map = dict(
        chain(zip(prepositions_p1, prepositions_p2),
              zip(prepositions_p2, prepositions_p1)))

    preposition_flip_pronouns = set("мне тебе мной тобой".split())

    def __init__(
        self,
        min_token_count=3,
        max_token_count=15,
        min_verb_length=3,
        min_words_after_verb=1,
    ):
        """
        :param min_token_count: do not process sentences that have less than that number
        of tokens
        :param max_token_count: do not process sentences that have more than that number
        of tokens
        :param min_verb_length: limits minimal verb length (saves some time skipping
        small words)
        :param min_words_after_verb: minimal words number after a verb (joke is usually
        better with some trailing words)
        """

        super().__init__()

        self.min_token_count = min_token_count
        self.max_token_count = max_token_count
        self.min_word_length = min_verb_length
        self.min_words_after_verb = min_words_after_verb

    def get_joke(self, sentence):
        """
        Generate a joke from the sentence if it contains verbs.
        Morphological analysis is performed to find verbs and transform them.
        :returns: the joke string, or None if a joke could not be generated
        """

        clean_sentence = self.duplicating_punctuation_remover.sub(
            r"\1", sentence.lower()).strip(",.!?;:")
        tokens = self.tokenizer(clean_sentence)

        tokens_count = len(tokens)
        if self.min_token_count > tokens_count or tokens_count > self.max_token_count:
            return None

        inflected_verb_index = -1
        inflected_verb = None

        # find the last verb in the sentence and change its form;
        # enumeration starts at 1 to simplify the index arithmetic below
        for reverse_index, word in enumerate(reversed(tokens), 1):
            if len(word) < self.min_word_length:
                continue

            # taking the most probable morphological analysis result
            word_morph = self.morph.parse(word)[0]
            is_verb = word_morph.tag.POS == "VERB"

            if not is_verb:
                continue

            if (word_morph.normal_form in self.unsuitable_verbs
                    and word not in self.suitable_verb_form):
                continue

            if reverse_index <= self.min_words_after_verb:
                break

            inflected_verb_morph = self._inflect_verb(word_morph)

            if inflected_verb_morph:
                inflected_verb_index = len(tokens) - reverse_index
                inflected_verb = inflected_verb_morph.word

            # inflected_verb may still be None here if inflecting the verb failed;
            # the sentence is ignored in that case
            break

        return (self._compile_joke(tokens[inflected_verb_index + 1:],
                                   inflected_verb) if inflected_verb else None)

    @staticmethod
    def _inflect_verb(verb_morph):
        """
        Change the form of the verb so it agrees with the beginning of the joke
        """

        tense = verb_morph.tag.tense
        if tense == "past":
            inflected_verb_morph = verb_morph.inflect({"VERB", "femn"})
        elif tense:
            inflected_verb_morph = verb_morph.inflect({"VERB", "sing", "3per", tense})
        else:
            inflected_verb_morph = None
        return inflected_verb_morph

    def _flip_pronouns(self, rest_of_joke):
        """
        Flip the pronouns; the joke usually makes more sense that way round
        """

        for index, word in enumerate(rest_of_joke):
            if word in self.pronoun_flip_map:
                rest_of_joke[index] = self.pronoun_flip_map[word]
                prev_word = rest_of_joke[index - 1]
                if (index > 0 and word in self.preposition_flip_pronouns
                        and prev_word in self.preposition_flip_map):
                    rest_of_joke[index - 1] = self.preposition_flip_map[prev_word]

    def _compile_joke(self, remaining_tokens, inflected_verb):
        """
        Final preparations and formatting
        """

        self._flip_pronouns(remaining_tokens)
        joke_ending = self.detokenizer(remaining_tokens)
        joke = f"{self.joke_beginning} {inflected_verb} {joke_ending}"
        joke = self.punctuation_space_remover.sub(r"\1", joke)
        joke = self.pair_punctuation_space_remover.sub(r"\1\2\3", joke)
        return joke
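
A minimal usage sketch for the generator above (assuming pymorphy2, mosestokenizer, re, and itertools.chain are imported at module level; the input sentence and the shown joke are illustrative):

generator = MotherJokeGenerator()
joke = generator.get_joke("я хочу спать весь день")
print(joke)  # e.g. "твоя мамка хочет спать весь день"; None if no suitable verb is found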
Example #11
import os
import numpy as np
import time
import re
import pickle as pkl
from torch.utils.data import Dataset
import torch
import torch.nn as nn

import mosestokenizer

puncnormalize = mosestokenizer.MosesPunctuationNormalizer("en")
normalize = mosestokenizer.MosesTokenizer("en")
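# Note: despite its name, `normalize` above wraps the Moses tokenizer;
# `puncnormalize` is the punctuation normalizer.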


def read_in_file(path, labeled=True):
    file = open(path, 'r', encoding='utf-8')
    qids = []
    questions = []
    labels = []
    line = file.readline()
    print(line)
    cnt = 0
    while True:
        line = file.readline()
        if line:
            try:
                qid, question = line.split(',', 1)
                if labeled: