def execute_by_device(self, device):
        """Run the contextual word-embedding checks for every model on ``device``.

        For each entry of ``self.model_paths`` an insert augmenter and a
        substitute augmenter are built, and the shared check helpers are run
        against both ``self.text`` and ``self.texts``.

        :param device: device identifier forwarded to both augmenters.
        """
        for model_path in self.model_paths:
            insert_aug = naw.ContextualWordEmbsAug(model_path=model_path,
                                                   action="insert",
                                                   force_reload=True,
                                                   device=device)
            # Fix: the substitute augmenter used to be built without
            # ``device``, so only the insert augmenter was exercised on the
            # requested device.
            substitute_aug = naw.ContextualWordEmbsAug(model_path=model_path,
                                                       action="substitute",
                                                       device=device)

            for data in [self.text, self.texts]:
                self.insert(insert_aug, data)
                self.substitute(substitute_aug, data)
                self.substitute_stopwords(substitute_aug, data)
                self.top_k([insert_aug, substitute_aug], data)
                self.top_p([insert_aug, substitute_aug], data)
                self.top_k_top_p([insert_aug, substitute_aug], data)
                self.no_top_k_top_p([insert_aug, substitute_aug], data)
                self.decode_by_tokenizer([insert_aug, substitute_aug])
                self.no_candidiate([insert_aug, substitute_aug])

            self.subword([insert_aug, substitute_aug])
            self.max_length([insert_aug, substitute_aug])
            self.empty_replacement(substitute_aug)
            self.skip_short_token(substitute_aug)

        # Fails when no model path is configured, i.e. the loop above was
        # skipped and nothing was actually checked.
        self.assertLess(0, len(self.model_paths))
示例#2
0
    def test_batch_size(self):
        """Check that ``augment`` yields one output per input for several batch sizes."""

        def check(batch_size, data):
            # A fresh augmenter per scenario: each one needs its own batch size.
            augmenter = naw.ContextualWordEmbsAug(
                model_path='distilbert-base-uncased',
                model_type='bert',
                batch_size=batch_size)
            outputs = augmenter.augment(data)
            self.assertEqual(len(outputs), len(data))

        # 1 per batch
        check(1, self.texts)
        # batch size = input size
        check(len(self.texts), self.texts)
        # batch size > input size
        check(len(self.texts) + 1, self.texts)
        # input size > batch size
        check(2, self.texts * 2)
    def execute_by_device(self, device):
        """Run the full check suite for every model path on ``device``.

        :param device: device identifier forwarded to both augmenters.
        """
        for model_path in self.model_paths:
            shared = dict(model_path=model_path,
                          force_reload=True,
                          device=device)
            inserter = naw.ContextualWordEmbsAug(action="insert", **shared)
            substituter = naw.ContextualWordEmbsAug(action="substitute",
                                                    **shared)

            def both():
                # A new list per call, so one check cannot alter the
                # argument seen by the next one.
                return [inserter, substituter]

            self.oov(both())
            self.insert(inserter)
            self.substitute(substituter)
            self.substitute_stopwords(substituter)
            self.subword(both())
            # NOTE(review): if not_substitute_unknown_word alters the
            # augmenter, the checks below run against that altered state -
            # confirm the ordering is intentional.
            self.not_substitute_unknown_word(substituter)
            self.top_k(both())
            self.top_p(both())
            self.top_k_top_p(both())
            self.no_top_k_top_p(both())
            self.max_length(both())
            self.empty_replacement(substituter)

        # Fails when there were no model paths and nothing was checked.
        self.assertLess(0, len(self.model_paths))
示例#4
0
    def execute_by_device(self, device):
        """Run the contextual word-embedding checks for every model on ``device``.

        For each entry of ``self.model_paths`` an insert and a substitute
        augmenter are built, the device reported by each model is verified,
        and the shared check helpers are run against ``self.text`` and
        ``self.texts``. Progress is printed when ``self.debug`` is truthy.

        :param device: device identifier forwarded to both augmenters.
        """
        for model_path in self.model_paths:
            if self.debug:
                print('=============:', model_path)
            insert_aug = naw.ContextualWordEmbsAug(model_path=model_path,
                                                   action="insert",
                                                   force_reload=True,
                                                   device=device)
            substitute_aug = naw.ContextualWordEmbsAug(model_path=model_path,
                                                       action="substitute",
                                                       device=device)

            # assertEqual / assertIn report both operands on failure, whereas
            # assertTrue on a pre-evaluated boolean only says "False is not
            # true". Pass/fail behaviour is unchanged.
            if device == 'cpu':
                self.assertEqual(device, insert_aug.model.get_device())
                self.assertEqual(device, substitute_aug.model.get_device())
            elif 'cuda' in device:
                self.assertIn('cuda', insert_aug.model.get_device())
                self.assertIn('cuda', substitute_aug.model.get_device())

            for data in [self.text, self.texts]:
                self.insert(insert_aug, data)
                self.substitute(substitute_aug, data)
                if self.debug:
                    print('=============data:', data)
                self.substitute_stopwords(substitute_aug, data)
                self.decode_by_tokenizer([insert_aug, substitute_aug])

            self.subword([insert_aug, substitute_aug])
            self.max_length([insert_aug, substitute_aug])
            self.empty_replacement(substitute_aug)
            self.skip_short_token(substitute_aug)

        # Fails when no model path is configured and nothing was checked.
        self.assertLess(0, len(self.model_paths))
示例#5
0
    def test_stopword_for_preprocess(self):
        """Round-trip stopwords through the reserved-word replacement helpers.

        Each input text is passed to ``replace_stopword_by_reserved_word`` and
        the result is fed back into ``replace_reserve_word_by_stopword``; the
        replaced text, the collected stopwords and the restored text are
        compared with the fixtures below, for an insert and a substitute
        augmenter.
        """
        stopwords = ["[id]", "[year]"]
        texts = [
            "My id is [id], and I born in [year]",  # stopword is the last word
            "[id] id is [id], and I born in [year]",  # stopword is the first word
            "[id] [id] Id is [year] [id]",  # consecutive stopwords
            "[id]  [id] Id is [year]   [id]",  # consecutive stopwords, extra spaces
            "My id is [id], and I   [id] born in [year] a[year] [year]b aa[year]",  # look-alike tokens
            "My id is [id], and I born [UNK] [year]",  # input already has the reserved word; not handled for now
        ]
        expected_replaced_texts = [
            'My id is [UNK], and I born in [UNK]',
            '[UNK] id is [UNK], and I born in [UNK]',
            '[UNK] [UNK] Id is [UNK] [UNK]',
            '[UNK]  [UNK] Id is [UNK]   [UNK]',
            'My id is [UNK], and I   [UNK] born in [UNK] a[year] [year]b aa[year]',
            "My id is [UNK], and I born [UNK] [UNK]",
        ]
        expected_reserved_tokens = [
            ['[year]', '[id]'],
            ['[year]', '[id]', '[id]'],
            ['[id]', '[year]', '[id]', '[id]'],
            ['[id]', '[year]', '[id]', '[id]'],
            ['[year]', '[id]', '[id]'],
            ['[year]', '[id]'],
        ]
        expected_reversed_texts = [
            'My id is [id], and I born in [year]',
            '[id] id is [id], and I born in [year]',
            '[id] [id] Id is [year] [id]',
            '[id]  [id] Id is [year]   [id]',
            'My id is [id], and I   [id] born in [year] a[year] [year]b aa[year]',
            'My id is [UNK], and I born [id] [year]',
        ]

        augs = [
            naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                      action=action,
                                      stopwords=stopwords)
            for action in ("insert", "substitute")
        ]

        # Materialised once so the same cases can be replayed for every augmenter.
        cases = list(zip(texts, expected_replaced_texts,
                         expected_reserved_tokens, expected_reversed_texts))

        for aug in augs:
            unknown_token = aug.model.get_unknown_token()
            if not unknown_token:
                # Fall back to the model's constant when it reports no token.
                unknown_token = aug.model.UNKNOWN_TOKEN

            for text, want_replaced, want_reserved, want_reversed in cases:
                replaced_text, reserved_stopwords = aug.replace_stopword_by_reserved_word(
                    text, aug.stopword_reg, unknown_token)
                assert want_replaced == replaced_text
                assert want_reserved == reserved_stopwords

                reversed_text = aug.replace_reserve_word_by_stopword(
                    replaced_text, aug.reserve_word_reg, reserved_stopwords)
                assert want_reversed == reversed_text
    def __init__(self):
        """Build the fixed list of word augmenters stored in ``self.augs``."""
        bert_checkpoint = 'bert-base-uncased'

        # Order is part of the contract: callers may index into ``self.augs``.
        self.augs = [
            naw.RandomWordAug(),
            naw.ContextualWordEmbsAug(model_path=bert_checkpoint,
                                      action="substitute"),
            naw.SynonymAug(aug_src='wordnet'),
            naw.SplitAug(),
            naw.ContextualWordEmbsAug(model_path=bert_checkpoint,
                                      action="insert"),
        ]
示例#7
0
    def test_reset_model(self):
        """Check that a reloaded augmenter picks up a changed ``top_k``."""
        for model_path in self.model_paths:
            baseline = naw.ContextualWordEmbsAug(model_path=model_path,
                                                 action="insert",
                                                 force_reload=True)
            bumped_top_k = baseline.model.top_k + 1

            reloaded = naw.ContextualWordEmbsAug(model_path=model_path,
                                                 action="insert",
                                                 force_reload=True,
                                                 top_k=bumped_top_k)

            self.assertEqual(bumped_top_k, reloaded.model.top_k)
示例#8
0
class QuestionGenerator:
    """Generate variations of sentences/questions with nlpaug augmenters."""

    # Both augmenters are created once, when the class body executes, and are
    # shared by every call.
    aug = naw.ContextualWordEmbsAug(model_path="bert-base-uncased",
                                    action="substitute")
    aug_single = naw.SynonymAug(aug_src="wordnet")

    @staticmethod
    def augment(text):
        """
        Augment ``text`` contextually, or by synonym for a single token.

        :param text: sentence or single word to augment
        :return: result of the chosen augmenter's ``augment`` call (requested
            with ``n=10``); an empty list when ``text`` has no tokens
        """
        # Tokens are counted on a copy stripped of everything but ASCII
        # letters, digits and spaces.
        tokens = split_sentence(re.sub("[^a-zA-Z0-9 ]+", "", text))
        if not tokens:
            # Nothing to augment; previously this raised IndexError below.
            return []
        if len(tokens) > 1:
            return QuestionGenerator.aug.augment(text, n=10, num_thread=4)
        return QuestionGenerator.aug_single.augment(tokens[0], n=10)

    @staticmethod
    async def generateQuestions(texts):
        """
        Generate a flat list of variations for the given sentence(s).

        :param texts: a single string or a list of strings
        :return: list of variations, in input order
        """
        # isinstance also accepts str subclasses, unlike ``type(x) == str``.
        if isinstance(texts, str):
            texts = [texts]

        # Flatten in one pass; ``sum(lists, [])`` copies the accumulator on
        # every step. Each ``augment`` call is expected to return a list, as
        # the previous list concatenation required.
        return [
            variation
            for text in texts
            for variation in QuestionGenerator.augment(text)
        ]
示例#9
0
class QuestionGenerator:
    """Generate variations of sentences/questions with nlpaug augmenters."""

    # Both augmenters are created once, when the class body executes, and are
    # shared by every call.
    aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                    action="substitute")
    aug_single = naw.SynonymAug(aug_src='wordnet')

    @staticmethod
    def augment(text):
        """Augment ``text`` contextually, or by synonym for a single token.

        Returns the chosen augmenter's ``augment`` result (requested with
        ``n=10``), or an empty list when ``text`` has no tokens.
        """
        # Tokens are counted on a copy stripped of everything but ASCII
        # letters, digits and spaces.
        tokens = split_sentence(re.sub('[^a-zA-Z0-9 ]+', '', text))
        if not tokens:
            # Nothing to augment; previously this raised IndexError below.
            return []
        if len(tokens) > 1:
            return QuestionGenerator.aug.augment(text, n=10, num_thread=4)
        return QuestionGenerator.aug_single.augment(tokens[0], n=10)

    @staticmethod
    async def generateQuestions(texts):
        """ This function generates a list of variations for a given sentence/question.
            E.g. await QuestionGenerator.generateQuestions('your question') will return the list
            of variations for that particular question. A list of questions is
            accepted as well; the variations are returned as one flat list. """

        # isinstance also accepts str subclasses, unlike ``type(x) == str``.
        if isinstance(texts, str):
            texts = [texts]

        # Flatten in one pass; ``sum(lists, [])`` copies the accumulator on
        # every step. Each ``augment`` call is expected to return a list, as
        # the previous list concatenation required.
        return [
            variation
            for text in texts
            for variation in QuestionGenerator.augment(text)
        ]
示例#10
0
def bert_augment(text, aug_p=.2):
    """Augment ``text`` with a ``bert-base-uncased`` contextual augmenter.

    A new augmenter is constructed on every call.

    :param text: input passed straight to ``augment``.
    :param aug_p: forwarded to the augmenter as ``aug_p``.
    :return: whatever ``augment`` returns for ``text``.
    """
    return naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                     aug_p=aug_p).augment(text)
 def test_quicktest(self):
     """Check that every configured model changes the sample sentence."""
     text = 'The quick brown fox jumps over the lazaaaaaaaaay dog'
     for model_path in self.model_paths:
         augmenter = naw.ContextualWordEmbsAug(model_path=model_path)
         self.assertNotEqual(text, augmenter.augment(text))
示例#12
0
    def execute_by_device(self, device):
        """Run the basic insert/substitute checks for every model on ``device``.

        :param device: device identifier forwarded to both augmenters.
        """
        for model_path in self.model_paths:
            shared = dict(model_path=model_path,
                          force_reload=True,
                          device=device)
            inserter = naw.ContextualWordEmbsAug(action="insert", **shared)
            substituter = naw.ContextualWordEmbsAug(action="substitute",
                                                    **shared)

            self.oov([inserter, substituter])
            self.insert(inserter)
            self.substitute(substituter)
            self.substitute_stopwords(substituter)

        # Fails when there were no model paths and nothing was checked.
        self.assertLess(0, len(self.model_paths))
 def test_model_type(self):
     """Smoke test: augment a Russian sentence with an explicit ``model_type``.

     The final assertion always holds, so the test passes as long as neither
     construction nor augmentation raises.
     """
     sentence = "Мозг — это машина  которая пытается снизить ошибку в прогнозе."
     augmenter = naw.ContextualWordEmbsAug(
         model_path="blinoff/roberta-base-russian-v0",
         model_type='roberta',
         force_reload=True)
     augmenter.augment(sentence)
     self.assertTrue(True)
示例#14
0
    def test_multi_thread(self):
        """Check that nested flows return ``n`` outputs with 1 and 3 threads."""
        sentence = 'The quick brown fox jumps over the lazy dog'
        copies = 3

        w2v_model_path = os.path.join(os.environ["MODEL_DIR"], 'word',
                                      'word_embs',
                                      'GoogleNews-vectors-negative300.bin')

        def word2vec_aug():
            return naw.WordEmbsAug(model_type='word2vec',
                                   model_path=w2v_model_path)

        def xlnet_substitute_aug():
            return naw.ContextualWordEmbsAug(model_path='xlnet-base-cased',
                                             action="substitute",
                                             temperature=0.7,
                                             device='cpu')

        # The factories are called inline so augmenters are still built in
        # the order they appear inside each flow.
        sequential_flow = naf.Sequential([
            naf.Sequential([nac.OcrAug(), word2vec_aug()]),
            naf.Sequential([nac.RandomCharAug()]),
            xlnet_substitute_aug(),
        ])
        sometimes_flow = naf.Sometimes(
            [
                naf.Sequential([nac.OcrAug(), nac.RandomCharAug()]),
                naf.Sometimes([word2vec_aug()], pipeline_p=0.999),
                xlnet_substitute_aug(),
            ],
            pipeline_p=0.9999)

        for num_thread in [1, 3]:
            for flow in (sequential_flow, sometimes_flow):
                augmented_data = flow.augment(sentence,
                                              n=copies,
                                              num_thread=num_thread)
                self.assertEqual(len(augmented_data), copies)
def get_aug_type_1(top_k: int = 10,
                   aug_max: int = 5,
                   model_path: str = 'distilroberta-base',
                   action: str = 'substitute'):
    """Build a contextual word augmenter that uses ``STOP_WORDS`` as stopwords.

    All four parameters are forwarded unchanged to the augmenter under the
    same keyword names.
    """
    options = {
        'model_path': model_path,
        'action': action,
        'top_k': top_k,
        # Copied into a list on every call.
        'stopwords': list(STOP_WORDS),
        'aug_max': aug_max,
    }
    return naw.ContextualWordEmbsAug(**options)
示例#16
0
    def test_reset_model(self):
        """Check that a reloaded augmenter picks up changed sampling settings.

        ``temperature``, ``top_k`` and ``top_p`` are each raised by one
        relative to a first augmenter, and the reloaded model must report the
        raised values.
        """
        for model_path in self.model_paths:
            baseline = naw.ContextualWordEmbsAug(
                model_path=model_path, action="insert", force_reload=True, top_p=0.5)
            bumped = {
                'temperature': baseline.model.temperature + 1,
                'top_k': baseline.model.top_k + 1,
                'top_p': baseline.model.top_p + 1,
            }

            reloaded = naw.ContextualWordEmbsAug(
                model_path=model_path, action="insert", force_reload=True,
                **bumped)

            self.assertEqual(bumped['temperature'], reloaded.model.temperature)
            self.assertEqual(bumped['top_k'], reloaded.model.top_k)
            self.assertEqual(bumped['top_p'], reloaded.model.top_p)
示例#17
0
    def test_multiprocess_gpu(self):
        """Check multi-threaded augmentation with the augmenter on ``'cuda'``.

        More than one output must be returned, and none may equal the input.
        """
        sentence = 'The quick brown fox jumps over the lazy dog'
        copies = 3
        augmenter = naw.ContextualWordEmbsAug(force_reload=True, device='cuda')

        # The same count is used for the outputs and for the threads.
        outputs = augmenter.augment(sentence, n=copies, num_thread=copies)
        self.assertGreater(len(outputs), 1)
        for output in outputs:
            self.assertNotEqual(output, sentence)
示例#18
0
    def setUpClass(cls):
        """Load the ``.env`` file two directories up and build ``cls.augs``.

        NOTE(review): no ``@classmethod`` decorator is visible in this
        excerpt - confirm it is present on the line above.
        """
        here = os.path.dirname(__file__)
        env_config_path = os.path.abspath(os.path.join(here, '..', '..', '.env'))
        load_dotenv(env_config_path)

        # One character-level, one word-level and one sentence-level
        # augmenter, all with default arguments.
        cls.augs = [
            nac.RandomCharAug(),
            naw.ContextualWordEmbsAug(),
            nas.ContextualWordEmbsForSentenceAug(),
        ]
示例#19
0
def create_context_aug(params):
    """Build a ``distilbert-base-uncased`` contextual augmenter on ``'cuda'``.

    :param params: mapping that must provide ``'aug_p'``, ``'top_k'``,
        ``'top_p'`` and ``'temperature'``; a missing key raises ``KeyError``.
    :return: the configured ``ContextualWordEmbsAug``.
    """
    # Looked up in the same order as the keyword arguments they feed.
    tuned = {
        key: params[key]
        for key in ('aug_p', 'top_k', 'top_p', 'temperature')
    }
    return naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased',
                                     aug_max=None,
                                     device='cuda',
                                     stopwords=nltk_stopwords,
                                     stopwords_regex=r'".*"',
                                     **tuned)
示例#20
0
def prepare_aug(
        ppdb_model_path='/home/ubuntu/sentiment_analysis/bert-sentiment/syn_model/ppdb-2.0-tldr'):
    """Build the full list of word augmenters.

    The list holds, in order: four contextual word-embedding augmenters, two
    synonym augmenters (WordNet and PPDB), one antonym augmenter and two
    random-word augmenters. A progress line is printed once all of them have
    been constructed.

    :param ppdb_model_path: path passed as ``model_path`` to the PPDB synonym
        augmenter. Defaults to the location that used to be hard-coded, so
        existing callers are unaffected while other machines can override it.
    :return: list of augmenter instances.
    """
    # Contextual word-embedding augmenters: (checkpoint, action) pairs.
    neu_aug = [
        naw.ContextualWordEmbsAug(model_path=model_path, action=action)
        for model_path, action in (
            ('bert-base-uncased', "insert"),
            ('bert-base-uncased', "substitute"),
            ('distilbert-base-uncased', "substitute"),
            ('roberta-base', "substitute"),
        )
    ]

    # Synonym augmenters: WordNet, then PPDB.
    syn_aug = [
        naw.SynonymAug(aug_src='wordnet'),
        naw.SynonymAug(aug_src='ppdb', model_path=ppdb_model_path),
    ]

    # Antonym augmenter.
    ant_aug = [naw.AntonymAug()]

    # Random word augmenters: explicit swap, then the default action.
    random_aug = [
        naw.RandomWordAug(action="swap"),
        naw.RandomWordAug(),
    ]

    print('augmenter initialization finished ...')
    return neu_aug + syn_aug + ant_aug + random_aug
示例#21
0
    def test_multilingual(self):
        """Check that a multilingual model changes French, Japanese and Spanish text."""
        augmenter = naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-uncased')

        inputs = [
            {'lang': 'fra', 'text': "Bonjour, J'aimerais une attestation de l'employeur certifiant que je suis en CDI."},
            {'lang': 'jap', 'text': '速い茶色の狐が怠惰なな犬を飛び越えます'},
            {'lang': 'spa', 'text': 'un rapido lobo marron salta sobre el perro perezoso'},
        ]

        for sample in inputs:
            original = sample['text']
            self.assertNotEqual(original, augmenter.augment(original))
示例#22
0
def run_core():
    """Print a timestamp before importing, building and running the augmenter, and once at the end."""

    def stamp(stage):
        print(datetime.datetime.now(), stage)

    stamp('before import')
    # Imported here, not at module level, so the import happens between the
    # first two timestamps.
    import nlpaug.augmenter.word as naw

    stamp('before init')
    augmenter = naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                                          model_type="bert",
                                          use_custom_api=True)
    sentence = 'The quick brown fox jumps over the lazy dog.'

    stamp('before augment')
    augmenter.augment([sentence, sentence])
    stamp('done')
    def __init__(self, template, output_file, augmentation_factor=5):
        """Store the configuration and create the two augmenters.

        :param template: stored as ``self.base_file``.
        :param output_file: stored as ``self.output_file``.
        :param augmentation_factor: must be at least 2; checked with an
            ``assert``, so it raises ``AssertionError`` only when assertions
            are enabled.
        """
        assert augmentation_factor >= 2

        self.base_file, self.output_file = template, output_file
        self.augmentation_factor = augmentation_factor

        # Both start out as empty dicts.
        self.dataset, self.intents = {}, {}

        self.character_augmenter = nac.OcrAug()
        self.word_augmenter = naw.ContextualWordEmbsAug()
示例#24
0
    def execute_by_device(self, device):
        """Run the insert/substitute/subword checks for every model on ``device``.

        :param device: device identifier forwarded to both augmenters.
        """
        for model_path in self.model_paths:
            shared = dict(model_path=model_path,
                          force_reload=True,
                          device=device)
            inserter = naw.ContextualWordEmbsAug(action="insert", **shared)
            substituter = naw.ContextualWordEmbsAug(action="substitute",
                                                    **shared)

            self.oov([inserter, substituter])
            self.insert(inserter)
            self.substitute(substituter)
            self.substitute_stopwords(substituter)
            self.subword([inserter, substituter])

            # Keep this check last: it changes the augmenter's properties.
            self.not_substitute_unknown_word(substituter)

        # Fails when there were no model paths and nothing was checked.
        self.assertLess(0, len(self.model_paths))
示例#25
0
    def test_non_strip_input(self):
        """Check that text with leading and trailing spaces is still changed."""
        padded_text = ' Good boy '

        augmenters = [
            naw.ContextualWordEmbsAug(action='insert'),
            naw.AntonymAug(),
            naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action="substitute"),
        ]

        for augmenter in augmenters:
            self.assertNotEqual(padded_text, augmenter.augment(padded_text))
示例#26
0
    def test_skip_punctuation(self):
        """Check that a punctuation-only string comes back unchanged."""
        punctuation_only = '. . . . ! ? # @'

        augmenters = [
            naw.ContextualWordEmbsAug(action='insert'),
            naw.AntonymAug(),
            naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action="substitute"),
        ]

        for augmenter in augmenters:
            self.assertEqual(punctuation_only, augmenter.augment(punctuation_only))
示例#27
0
    def test_empty_input_for_insert(self):
        """Check that a single-space input comes back as ``''`` or ``' '``."""
        blank = ' '

        augmenters = [
            naw.ContextualWordEmbsAug(action="insert"),
            naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action="substitute"),
        ]

        for augmenter in augmenters:
            result = augmenter.augment(blank)
            # FIXME: the return value is not standardized yet, so either
            # form is accepted.
            self.assertTrue(result == '' or result == ' ')
示例#28
0
def main(args):
    """Read a CSV, augment it with an insert-mode contextual augmenter, write a CSV.

    :param args: object providing ``input_csv``, ``bert_path``, ``stopwords``,
        ``device``, ``text_key`` and ``output_csv``.
    """
    # The first CSV column is used as the index.
    frame = pd.read_csv(args.input_csv, index_col=0)

    augmenter = naw.ContextualWordEmbsAug(
        top_k=10,
        action='insert',
        model_path=args.bert_path,
        aug_min=2,
        aug_max=4,
        stopwords=get_stopwords(args.stopwords),
        device=args.device,
    )

    result = augment_dataframe(frame, augmenter, args.text_key, batch_size=2)
    result.to_csv(args.output_csv)
示例#29
0
def main():
    """Augment a two-copy batch of the sample sentence with each enabled model."""
    # Only bert-base-uncased is enabled. Checkpoints left disabled:
    # distilbert-base-uncased, bert-base-cased, xlnet-base-cased,
    # roberta-base, distilroberta-base.
    model_paths = ['bert-base-uncased']

    for model_path in model_paths:
        print('-----------------:', model_path)
        augmenter = naw.ContextualWordEmbsAug(model_path=model_path)
        sentence = 'The quick brown fox jumps over the lazaaaaaaaaay dog'
        # The result is not used; the call is made only to run the augmenter.
        augmenter.augment([sentence] * 2)
示例#30
0
    def test_excessive_space(self):
        """Check that the tokenizer collapses runs of spaces between tokens.

        See https://github.com/makcedward/nlpaug/issues/48
        """
        spaced_text = 'The  quick brown fox        jumps over the lazy dog . 1  2 '
        expected_tokens = ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', '1', '2']

        augmenters = [
            naw.ContextualWordEmbsAug(action='insert'),
            naw.AntonymAug(),
            naw.TfIdfAug(model_path=os.environ.get("MODEL_DIR"), action="substitute"),
        ]

        for augmenter in augmenters:
            self.assertEqual(augmenter._tokenizer(spaced_text), expected_tokens)