Example #1
    def test_n_output_without_augmentation(self):
        texts = [
            'AAAAAAAAAAA AAAAAAAAAAAAAA'
        ]
        flows = [
            naf.Sequential([
                nac.OcrAug(),
                nac.OcrAug()
            ]),
            naf.Sometimes([
                nac.RandomCharAug(),
                nac.RandomCharAug()
            ], pipeline_p=0.00001)
        ]

        for flow in flows:
            for text in texts:
                for _ in range(5):
                    augmented_texts = flow.augment(text, n=3)
                    all_not_equal = False
                    for augmented_text in augmented_texts:
                        if augmented_text != text:
                            all_not_equal = True
                            break
                    if all_not_equal:
                        break

                self.assertFalse(all_not_equal)
        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))
Example #2
    def test_multiple_actions(self):
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sequential([
                naf.Sometimes([
                    nac.RandomCharAug(action=Action.INSERT),
                    nac.RandomCharAug(action=Action.DELETE)
                ],
                              pipeline_p=0.5),
                naf.Sequential([
                    nac.OcrAug(),
                    nac.QwertyAug(aug_min=1),
                    nac.RandomCharAug(action=Action.SUBSTITUTE,
                                      aug_min=1,
                                      aug_char_p=0.6,
                                      aug_word_p=0.6)
                ],
                               name='Sub_Seq')
            ]),
            naf.Sometimes([
                naf.Sometimes([
                    nac.RandomCharAug(action=Action.INSERT),
                    nac.RandomCharAug(action=Action.DELETE)
                ]),
                naf.Sequential([
                    nac.OcrAug(),
                    nac.QwertyAug(aug_min=1),
                    nac.RandomCharAug(action=Action.SUBSTITUTE,
                                      aug_min=1,
                                      aug_char_p=0.6,
                                      aug_word_p=0.6)
                ])
            ],
                          pipeline_p=0.5)
        ]

        # The pipeline probability may be low, so augmentation may not be applied. Retry up to 5 times.
        for flow in flows:
            at_least_one_not_equal = False
            for _ in range(0, 5):
                for text in texts:
                    self.assertLess(0, len(text))
                    augmented_text = flow.augment(text)

                    if text != augmented_text:
                        at_least_one_not_equal = True

                    self.assertLess(0, len(text))

                if at_least_one_not_equal:
                    break

            self.assertTrue(at_least_one_not_equal)
        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))
Example #3
def augment_text_ocr(comment):
    aug = nac.OcrAug(aug_char_p=0.3, aug_word_p=0.4, aug_word_min=len(comment))
    try:
        augmented_texts = aug.augment(comment, n=1)
    except Exception:
        augmented_texts = None
    return augmented_texts
Example #4
def char_level(text, n):
    # Augment data at the character level.
    aug = nac.OcrAug()
    attacked_texts = aug.augment(text, n=n)
    # returns n augmented variants (n is the number of augmented forms the user wants)
    print("Attacked Text:")
    print(attacked_texts)
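A minimal usage sketch for the helper above; the call is hypothetical and assumes nlpaug.augmenter.char is imported as nac, as in the other examples.

import nlpaug.augmenter.char as nac

# Hypothetical invocation: prints three augmented variants of a sample sentence.
char_level('The quick brown fox jumps over the lazy dog', n=3)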
Example #5
    def test_multiple_actions(self):
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sequential(
                [nac.RandomCharAug(action=Action.INSERT),
                 naw.RandomWordAug()]),
            naf.Sequential([
                nac.OcrAug(),
                nac.KeyboardAug(aug_min=1),
                nac.RandomCharAug(action=Action.SUBSTITUTE,
                                  aug_min=1,
                                  aug_char_p=0.6,
                                  aug_word_p=0.6)
            ])
        ]

        for flow in flows:
            for text in texts:
                augmented_text = flow.augment(text)

                self.assertNotEqual(text, augmented_text)
                self.assertLess(0, len(text))

            self.assertLess(0, len(texts))

        self.assertLess(0, len(flows))
Example #6
    def test_multi_thread(self):
        text = 'The quick brown fox jumps over the lazy dog'
        n = 3

        w2v_model_path = os.path.join(os.environ["MODEL_DIR"], 'word',
                                      'word_embs',
                                      'GoogleNews-vectors-negative300.bin')

        flows = [
            naf.Sequential([
                naf.Sequential([
                    nac.OcrAug(),
                    naw.WordEmbsAug(model_type='word2vec',
                                    model_path=w2v_model_path)
                ]),
                naf.Sequential([
                    nac.RandomCharAug(),
                ]),
                naw.ContextualWordEmbsAug(model_path='xlnet-base-cased',
                                          action="substitute",
                                          temperature=0.7,
                                          device='cpu')
            ]),
            naf.Sometimes([
                naf.Sequential([
                    nac.OcrAug(),
                    nac.RandomCharAug(),
                ]),
                naf.Sometimes([
                    naw.WordEmbsAug(model_type='word2vec',
                                    model_path=w2v_model_path)
                ],
                              pipeline_p=0.999),
                naw.ContextualWordEmbsAug(model_path='xlnet-base-cased',
                                          action="substitute",
                                          temperature=0.7,
                                          device='cpu')
            ],
                          pipeline_p=0.9999)
        ]

        for num_thread in [1, 3]:
            for flow in flows:
                augmented_data = flow.augment(text, n=n, num_thread=num_thread)
                self.assertEqual(len(augmented_data), n)
Example #7
    def test_multiple_actions(self):
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sequential([
                naf.Sometimes([
                    nac.RandomCharAug(action="insert"),
                    nac.RandomCharAug(action="delete")
                ],
                              pipeline_p=0.9),
                naf.Sequential(
                    [
                        # nac.OcrAug(), nac.QwertyAug(aug_min=1),
                        nac.RandomCharAug(action="substitute",
                                          aug_char_min=1,
                                          aug_char_p=0.6,
                                          aug_word_p=0.6)
                    ],
                    name='Sub_Seq')
            ]),
            naf.Sometimes([
                naf.Sometimes([
                    nac.RandomCharAug(action="insert"),
                    nac.RandomCharAug(action="delete")
                ]),
                naf.Sequential([
                    nac.OcrAug(),
                    nac.KeyboardAug(aug_char_min=1),
                    nac.RandomCharAug(action="substitute",
                                      aug_char_min=1,
                                      aug_char_p=0.6,
                                      aug_word_p=0.6)
                ])
            ],
                          pipeline_p=0.9)
        ]

        # The pipeline probability may be low, so augmentation may not be applied. Retry up to 5 times.
        for flow in flows:
            for text in texts:
                at_least_one_not_equal = False
                for _ in range(5):
                    augmented_text = flow.augment(text, n=1)

                    if text != augmented_text:
                        at_least_one_not_equal = True
                        break

                self.assertTrue(at_least_one_not_equal)
        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))
Example #8
def augment_text_ocr(comment):
    """
  OCRAug adds noise to a comment by replacing the target characters with predefined mapping table 
  """

    aug = nac.OcrAug(aug_char_p=0.3, aug_word_p=0.4, aug_word_min=len(comment))
    try:
        augmented_texts = aug.augment(comment, n=1)
    except Exception:
        augmented_texts = None
    return augmented_texts
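A minimal usage sketch; the comment string is made up for illustration and nlpaug.augmenter.char is assumed to be imported as nac.

import nlpaug.augmenter.char as nac

# Hypothetical example comment; depending on the nlpaug version the result
# may be a string or a single-element list, and it may equal the input when
# none of OcrAug's mapped characters occur.
print(augment_text_ocr('The patient was seen on 10/03 and reported no pain.'))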
Example #9
def nlpaug(word):
    aug = naf.Sometimes([
        nac.OcrAug(),
        nac.KeyboardAug(),
        nac.RandomCharAug(action="insert"),
        nac.RandomCharAug(action="substitute"),
        nac.RandomCharAug(action="swap"),
        nac.RandomCharAug(action="delete"),
        naw.SpellingAug(),
    ])
    word = aug.augment(word)
    return word
Example #10
    def __init__(self, template, output_file, augmentation_factor=5):
        assert augmentation_factor >= 2
        self.augmentation_factor = augmentation_factor

        self.base_file = template
        self.output_file = output_file

        self.dataset = {}
        self.intents = {}

        self.character_augmenter = nac.OcrAug()
        self.word_augmenter = naw.ContextualWordEmbsAug()
Example #11
    def test_n_output_without_augmentation(self):
        texts = ['AAAAAAAAAAA AAAAAAAAAAAAAA']
        flows = [
            naf.Sequential([nac.OcrAug(), nac.OcrAug()]),
            naf.Sometimes(
                [nac.RandomCharAug(), nac.RandomCharAug()], pipeline_p=0.00001)
        ]

        for flow in flows:
            for text in texts:
                at_least_one_equal = False
                for _ in range(5):
                    augmented_texts = flow.augment(text, n=3)
                    if len(augmented_texts) == 1 and augmented_texts[0] == text:
                        at_least_one_equal = True
                        break

                self.assertTrue(at_least_one_equal)
        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))
Example #12
    def test_empty(self):
        texts = ['', None]

        augs = [
            nac.OcrAug(),
            nac.KeyboardAug(),
        ]

        for text in texts:
            for aug in augs:
                augmented_text = aug.augment(text)
                self.assertEqual(text, augmented_text)
Example #13
    def test_special_char(self):
        text = '#'
        aug = nac.KeyboardAug(min_char=1)
        augmented_text = aug.augment(text)
        self.assertNotEqual(text, augmented_text)

        # No mapping, return original value
        text = '~'
        augs = [nac.KeyboardAug(min_char=1), nac.OcrAug(min_char=1)]
        for aug in augs:
            augmented_text = aug.augment(text)
            self.assertEqual(text, augmented_text)
Example #14
    def test_stopwords_regex(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        stopwords_regex = "( [a-zA-Z]{1}ox | [a-z]{1}og|(brown)|[a-zA-z]{1}he)|[a-z]{2}mps "

        augs = [
            nac.RandomCharAug(action="delete", stopwords_regex=stopwords_regex),
            nac.KeyboardAug(stopwords_regex=stopwords_regex),
            nac.OcrAug(stopwords_regex=stopwords_regex)
        ]

        for aug in augs:
            for i in range(10):
                augmented_text = aug.augment(text)
                self.assertTrue(
                    'quick' not in augmented_text or 'over' not in augmented_text or 'lazy' not in augmented_text)
Example #15
    def test_stopwords(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        stopwords = ['The', 'brown', 'fox', 'jumps', 'the', 'dog']

        augs = [
            nac.RandomCharAug(stopwords=stopwords),
            nac.KeyboardAug(stopwords=stopwords),
            nac.OcrAug(stopwords=stopwords)
        ]

        for aug in augs:
            for i in range(10):
                augmented_text = aug.augment(text)
                self.assertTrue(
                    'quick' not in augmented_text or 'over' not in augmented_text or 'lazy' not in augmented_text)
Example #16
    def test_empty_input_for_substitute(self):
        texts = ['', '           ']
        augs = [
            nac.RandomCharAug(action='substitute'),
            nac.KeyboardAug(),
            nac.OcrAug()
        ]

        for aug in augs:
            for text in texts:
                augmented_text = aug.augment(text)
                self.assertTrue(augmented_text is None or augmented_text.strip() == '')

            augmented_texts = aug.augment(texts)
            for augmented_text in augmented_texts:
                self.assertTrue(augmented_text is None or augmented_text.strip() == '')
Example #17
    def test_augment_detail(self):
        text = 'The quick brown fox jumps over the lazy dog'

        flows = [
            naf.Sequential([
                naf.Sometimes([
                    nac.RandomCharAug(action="insert"),
                    nac.RandomCharAug(action="delete")
                ],
                              pipeline_p=0.5),
                naf.Sequential([
                    nac.RandomCharAug(action="substitute",
                                      aug_char_min=1,
                                      aug_char_p=0.6,
                                      aug_word_p=0.6)
                ],
                               name='Sub_Seq')
            ],
                           include_detail=True),
            naf.Sometimes([
                naf.Sometimes([
                    nac.RandomCharAug(action="insert"),
                    nac.RandomCharAug(action="delete")
                ]),
                naf.Sequential([
                    nac.OcrAug(),
                    nac.KeyboardAug(aug_char_min=1),
                    nac.RandomCharAug(action="substitute",
                                      aug_char_min=1,
                                      aug_char_p=0.6,
                                      aug_word_p=0.6)
                ])
            ],
                          pipeline_p=1,
                          include_detail=True)
        ]

        for flow in flows:
            augmented_text, augment_details = flow.augment(text)

            self.assertNotEqual(text, augmented_text)
            self.assertGreater(len(augment_details), 0)
            for augment_detail in augment_details:
                self.assertGreater(augment_detail['orig_start_pos'], -1)
                self.assertGreater(augment_detail['new_start_pos'], -1)
                self.assertGreater(augment_detail['change_seq'], 0)
                self.assertIn(augment_detail['action'], Action.getall())
Example #18
    def test_min_char(self):
        text = 'He eats apple'
        augs = [
            nac.RandomCharAug(min_char=5),
            nac.KeyboardAug(min_char=5),
            nac.OcrAug(min_char=5)
        ]

        for aug in augs:
            augmented = False
            for i in range(10):
                augmented_text = aug.augment(text)
                if 'apple' not in augmented_text:
                    augmented = True
                    break

            self.assertTrue(augmented)
Example #19
    def test_tokenizer(self):
        augs = [
            nac.OcrAug(tokenizer=text_tokenizer.split_sentence),
            nac.KeyboardAug(tokenizer=text_tokenizer.split_sentence),
            nac.RandomCharAug(tokenizer=text_tokenizer.split_sentence),
        ]

        text = 'The quick brown fox, jumps over lazy dog.'
        expected_tokens = ['The', ' quick', ' brown', ' fox', ', ', 'jumps', ' over', ' lazy', ' dog', '.']
        for aug in augs:
            tokens = aug.tokenizer(text)
            self.assertEqual(tokens, expected_tokens)

        text = 'The quick !brown fox, jumps # over lazy dog .'
        expected_tokens = ['The', ' quick', ' !', 'brown', ' fox', ', ', 'jumps', ' # ', 'over', ' lazy', ' dog', ' .']
        for aug in augs:
            tokens = aug.tokenizer(text)
            self.assertEqual(tokens, expected_tokens)
Example #20
    def test_augment_detail(self):
        text = 'The quick brown fox jumps over the lazy dog'
        augs = [
            nac.KeyboardAug(min_char=1, include_detail=True),
            nac.OcrAug(min_char=1, include_detail=True),
            nac.RandomCharAug(min_char=2, include_detail=True)
        ]

        for aug in augs:
            augmented_text, augment_details = aug.augment(text)

            self.assertNotEqual(text, augmented_text)
            self.assertGreater(len(augment_details), 0)
            for augment_detail in augment_details:
                self.assertTrue(augment_detail['orig_token'] in text)
                self.assertGreater(augment_detail['orig_start_pos'], -1)
                self.assertGreater(augment_detail['new_start_pos'], -1)
                self.assertGreater(augment_detail['change_seq'], 0)
                self.assertIn(augment_detail['action'], Action.getall())

            # Get back original input by re-engineering
            reengineering_text = augmented_text
            for change_obj in sorted(augment_details, key=lambda item: item['orig_start_pos'], reverse=True):
                if change_obj['action'] == Action.DELETE:
                    text_prefix = reengineering_text[:change_obj['new_start_pos']]
                    text_core = change_obj['orig_token'] + ' '
                    text_suffix = reengineering_text[change_obj['new_start_pos']:]

                elif change_obj['action'] in [Action.INSERT, Action.SUBSTITUTE]:
                    text_prefix = reengineering_text[:change_obj['new_start_pos']]
                    text_core = reengineering_text[change_obj['new_start_pos']:].replace(
                        change_obj['new_token'], change_obj['orig_token'], 1)
                    text_suffix = ''
                # TODO
                # elif change_obj['action'] in Action.SWAP:

                reengineering_text = text_prefix + text_core + text_suffix
                reengineering_text = reengineering_text.strip()

            self.assertEqual(text, reengineering_text)
Example #21
def ocr_aug(corpus):
    aug = nac.OcrAug(tokenizer=whitespace_tokenizer)
    # go through all training sentences (dev and test splits are left unchanged)
    augmented_sentences = []
    for sentence in corpus.train:
        augmented_texts = aug.augment(sentence.to_tokenized_string(), n=3)
        for augmented_text in augmented_texts:
            augmented_sentence: Sentence = Sentence()
            augmented_token_texts = augmented_text.split(" ")
            for augmented_token_text, original_token in zip(augmented_token_texts, sentence):
                # make a new token
                augmented_token = Token(augmented_token_text)
                # transfer annotations over to augmented token
                augmented_token.annotation_layers = original_token.annotation_layers
                # add augmented token to augmented sentence
                augmented_sentence.add_token(augmented_token)
            # add augmented sentence to list of all augmented sentences
            augmented_sentences.append(augmented_sentence)

    corpus = Corpus(train=SentenceDataset(augmented_sentences),
                    dev=corpus.dev,
                    test=corpus.test)

    return corpus
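The whitespace_tokenizer passed to OcrAug above is not shown in the snippet; below is a minimal stand-in under the assumption that plain whitespace splitting is sufficient (hypothetical, not the original helper).

# Hypothetical whitespace tokenizer for nlpaug's tokenizer hook; the original
# helper used by ocr_aug() is not included in the snippet above.
def whitespace_tokenizer(text):
    return text.split(' ')

# Illustrative call, given an existing flair Corpus:
# corpus = ocr_aug(corpus)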
Example #22

import numpy as np
from functools import partial

import imgaug.augmenters as iaa
import nlpaug.augmenter.char as nac


def augment_image(
    dataset: list,
    augmenter,
):
    """
    A generic augmentation step applied over a dataset generator.
    :param dataset: dataset generator yielding (data, label) batches
    :param augmenter: imgaug augmenter applied to each image batch, or None to leave the data unchanged
    :return: generator yielding augmented (data, label) batches as numpy arrays
    """
    for data_point in dataset:
        data, label = data_point
        if augmenter is not None:
            data = augmenter(images=data)
        yield np.asarray(data), np.asarray(label)


# NULL augmenter
none_augmenter = partial(augment_text, augmenter=None)

# OCR error augmenter
text_ocr_augmenter = partial(augment_text, augmenter=nac.OcrAug())

# flip_r image augmenter
image_flip_r_augmenter = partial(augment_image, augmenter=iaa.Fliplr())

# rotation image augmenter
image_rot_augmenter = partial(augment_image, augmenter=iaa.Rotate())
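The augment_text used by the partials above is not shown in the snippet; below is a sketch of what a text counterpart to augment_image might look like (hypothetical, assuming each batch is a pair of raw text strings and labels).

def augment_text(
    dataset: list,
    augmenter,
):
    """
    Hypothetical text counterpart of augment_image: applies an nlpaug
    augmenter to each (texts, labels) batch yielded by the dataset generator.
    """
    for data_point in dataset:
        texts, labels = data_point
        if augmenter is not None:
            texts = [augmenter.augment(text) for text in texts]
        yield texts, labels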
Example #23
import re
import attr
import random
import pandas as pd
from faker import Faker
from .utility import formating
from .CONSTATNTS import *
import nlpaug.augmenter.char as nac

fake = Faker()
aug = nac.OcrAug()


@attr.s
class DataGeneration:
    template_list = attr.ib()
    number = attr.ib()

    @property
    def data(self):
        return_data = []
        const_data = self.constant_generation()
        for i in range(0, self.number):
            temp, class_3_label = random.choice(self.template_list)
            class_2_label = class_3_label
            if class_3_label == 'partial-addressline':
                class_2_label = 'addressline'
            matcher = re.findall(r'\{.*?\}', temp)
            for i in matcher:
                key = i.replace('{', '').replace('}', '')
                tp = random.choice(const_data[key])
Example #24
def augment_dataset(input_df):
    """
    Augment the dataset using the nlpaug library. If the dataset is small, this is a great way to boost its size.
    Augmentation is not applied to the doctor's response, since responses should not contain spelling mistakes.
    The augmentations applied here are character-level and word-level:
    - OCR error augmentation (character level)
    - Keyboard augmentation (character level)
    - Synonym augmentation (word level)
    """

    print('Augmenting the dataset with OCR, keyboard, and synonym augmenters...')

    ocr = nac.OcrAug()
    response_OCR = []
    context_OCR = []

    keyboard = nac.KeyboardAug()
    response_keyboard = []
    context_keyboard = []

    synonym = naw.SynonymAug(aug_src='wordnet')
    response_synonym = []
    context_synonym = []

    for i in input_df.index:

        if i % 10 == 0:
            print('processing {}th line'.format(i))

        response = input_df['response'][i]
        context = input_df['context'][i]

        # OCR augmentation
        ocr_augmented_line = ocr.augment(context, n=3)
        response_OCR.append(response)
        context_OCR.append(ocr_augmented_line)

        #keyboard augmentation
        keyboard_augmented_line = keyboard.augment(context)
        response_keyboard.append(response)
        context_keyboard.append(keyboard_augmented_line)

        #synonym augmentation
        synonym_augmented_line = synonym.augment(context)
        response_synonym.append(response)
        context_synonym.append(synonym_augmented_line)

    ocr_augmented_data = {'response': response_OCR, 'context': context_OCR}
    ocr_df = pd.DataFrame.from_dict(ocr_augmented_data)

    keyboard_augmented_data = {
        'response': response_keyboard,
        'context': context_keyboard
    }
    keyboard_df = pd.DataFrame.from_dict(keyboard_augmented_data)

    synonym_augmented_data = {
        'response': response_synonym,
        'context': context_synonym
    }
    synonym_df = pd.DataFrame.from_dict(synonym_augmented_data)

    augmented_1 = input_df.append(ocr_df, ignore_index=True)
    augmented_2 = augmented_1.append(keyboard_df, ignore_index=True)
    augmented_3 = augmented_2.append(synonym_df, ignore_index=True)

    print('original dataset length: {}'.format(len(input_df)))
    print('Augmented dataset length: {}'.format(len(augmented_3)))

    return augmented_3
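A minimal usage sketch, assuming an older pandas release that still provides DataFrame.append, nlpaug.augmenter.char imported as nac, nlpaug.augmenter.word imported as naw, and the NLTK WordNet data required by SynonymAug; the two-row DataFrame is made-up illustration data.

import pandas as pd
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

# Hypothetical two-row dataset with the 'response' and 'context' columns
# that augment_dataset() expects.
df = pd.DataFrame({
    'response': ['Please rest and drink plenty of fluids.',
                 'Take the prescribed dose twice a day.'],
    'context': ['I have had a sore throat for two days.',
                'My headache gets worse at night.'],
})

augmented_df = augment_dataset(df)
print(augmented_df.shape)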