Python RandomCharAug 예제들, nlpaug.augmenter.char.RandomCharAug Python 예제들

예제 #1

0

파일 보기

파일: test_sequential.py 프로젝트: veeraelluru/nlpaug

    def test_multiple_actions(self):
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sequential(
                [nac.RandomCharAug(action=Action.INSERT),
                 naw.RandomWordAug()]),
            naf.Sequential([
                nac.OcrAug(),
                nac.KeyboardAug(aug_min=1),
                nac.RandomCharAug(action=Action.SUBSTITUTE,
                                  aug_min=1,
                                  aug_char_p=0.6,
                                  aug_word_p=0.6)
            ])
        ]

        for flow in flows:
            for text in texts:
                augmented_text = flow.augment(text)

                self.assertNotEqual(text, augmented_text)
                self.assertLess(0, len(text))

            self.assertLess(0, len(texts))

        self.assertLess(0, len(flows))

예제 #2

0

파일 보기

파일: test_flow.py 프로젝트: x10-utils/nlpaug

    def test_n_output_without_augmentation(self):
        texts = [
            'AAAAAAAAAAA AAAAAAAAAAAAAA'
        ]
        flows = [
            naf.Sequential([
                nac.OcrAug(),
                nac.OcrAug()
            ]),
            naf.Sometimes([
                nac.RandomCharAug(),
                nac.RandomCharAug()
            ], pipeline_p=0.00001)
        ]

        for flow in flows:
            for text in texts:
                for _ in range(5):
                    augmented_texts = flow.augment(text, n=3)
                    all_not_equal = False
                    for augmented_text in augmented_texts:
                        if augmented_text != text:
                            all_not_equal = True
                            break
                    if all_not_equal:
                        break

                self.assertFalse(all_not_equal)
        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))

예제 #3

0

파일 보기

파일: aug_utils.py 프로젝트: jzhai321/nlp-qa-finalproj

def nlpaug(word):
    aug = naf.Sometimes([
        nac.OcrAug(),
        nac.KeyboardAug(),
        nac.RandomCharAug(action="insert"),
        nac.RandomCharAug(action="substitute"),
        nac.RandomCharAug(action="swap"),
        nac.RandomCharAug(action="delete"),
        naw.SpellingAug(),
    ])
    word = aug.augment(word)
    return word

예제 #4

0

파일 보기

파일: test_flow.py 프로젝트: xingbow/nlpaug

    def test_multi_thread(self):
        text = 'The quick brown fox jumps over the lazy dog'
        n = 3

        w2v_model_path = os.path.join(os.environ["MODEL_DIR"], 'word',
                                      'word_embs',
                                      'GoogleNews-vectors-negative300.bin')

        flows = [
            naf.Sequential([
                naf.Sequential([
                    nac.OcrAug(),
                    naw.WordEmbsAug(model_type='word2vec',
                                    model_path=w2v_model_path)
                ]),
                naf.Sequential([
                    nac.RandomCharAug(),
                ]),
                naw.ContextualWordEmbsAug(model_path='xlnet-base-cased',
                                          action="substitute",
                                          temperature=0.7,
                                          device='cpu')
            ]),
            naf.Sometimes([
                naf.Sequential([
                    nac.OcrAug(),
                    nac.RandomCharAug(),
                ]),
                naf.Sometimes([
                    naw.WordEmbsAug(model_type='word2vec',
                                    model_path=w2v_model_path)
                ],
                              pipeline_p=0.999),
                naw.ContextualWordEmbsAug(model_path='xlnet-base-cased',
                                          action="substitute",
                                          temperature=0.7,
                                          device='cpu')
            ],
                          pipeline_p=0.9999)
        ]

        for num_thread in [1, 3]:
            for flow in flows:
                augmented_data = flow.augment(text, n=n, num_thread=num_thread)
                self.assertEqual(len(augmented_data), n)

예제 #5

0

파일 보기

파일: test_text_augmenter.py 프로젝트: xiaosongyuan/nlpaug

    def setUpClass(cls):
        env_config_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '.env'))
        load_dotenv(env_config_path)

        cls.augs = [
            nac.RandomCharAug(),
            naw.ContextualWordEmbsAug(),
            nas.ContextualWordEmbsForSentenceAug()
        ]

예제 #6

0

파일 보기

파일: test_flow.py 프로젝트: liamge/nlpaug

    def test_n_output_without_augmentation(self):
        texts = ['AAAAAAAAAAA AAAAAAAAAAAAAA']
        flows = [
            naf.Sequential([nac.OcrAug(), nac.OcrAug()]),
            naf.Sometimes(
                [nac.RandomCharAug(), nac.RandomCharAug()], pipeline_p=0.00001)
        ]

        for flow in flows:
            for text in texts:
                at_least_one_equal = False
                for _ in range(5):
                    augmented_texts = flow.augment(text, n=3)
                    if len(augmented_texts
                           ) == 1 and augmented_texts[0] == text:
                        at_least_one_equal = True
                        break

                self.assertTrue(at_least_one_equal)
        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))

예제 #7

0

파일 보기

파일: test_char.py 프로젝트: shenjiawei19/nlpaug

    def test_multi_thread(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        n = 3
        augs = [
            nac.KeyboardAug(tokenizer=text_tokenizer.split_sentence),
            nac.RandomCharAug(tokenizer=text_tokenizer.split_sentence),
        ]

        for num_thread in [1, 3]:
            for aug in augs:
                augmented_data = aug.augment(text, n=n, num_thread=num_thread)
                self.assertEqual(len(augmented_data), n)

예제 #8

0

파일 보기

    def test_multiple_actions(self):
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sequential([
                naf.Sometimes([
                    nac.RandomCharAug(action=Action.INSERT),
                    nac.RandomCharAug(action=Action.DELETE)
                ],
                              pipeline_p=0.5),
                naf.Sequential([
                    nac.OcrAug(),
                    nac.QwertyAug(aug_min=1),
                    nac.RandomCharAug(action=Action.SUBSTITUTE,
                                      aug_min=1,
                                      aug_char_p=0.6,
                                      aug_word_p=0.6)
                ],
                               name='Sub_Seq')
            ]),
            naf.Sometimes([
                naf.Sometimes([
                    nac.RandomCharAug(action=Action.INSERT),
                    nac.RandomCharAug(action=Action.DELETE)
                ]),
                naf.Sequential([
                    nac.OcrAug(),
                    nac.QwertyAug(aug_min=1),
                    nac.RandomCharAug(action=Action.SUBSTITUTE,
                                      aug_min=1,
                                      aug_char_p=0.6,
                                      aug_word_p=0.6)
                ])
            ],
                          pipeline_p=0.5)
        ]

        # Since prob may be low and causing do not perform data augmentation. Retry 5 times
        for flow in flows:
            at_least_one_not_equal = False
            for _ in range(0, 5):
                for text in texts:
                    self.assertLess(0, len(text))
                    augmented_text = flow.augment(text)

                    if text != augmented_text:
                        at_least_one_not_equal = True

                    self.assertLess(0, len(text))

                if at_least_one_not_equal:
                    break

        self.assertTrue(at_least_one_not_equal)
        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))

예제 #9

0

파일 보기

    def test_empty_input_for_insert(self):
        texts = ['', '           ']
        augs = [
            nac.RandomCharAug(action='insert')
        ]

        for aug in augs:
            for text in texts:
                augmented_text = aug.augment(text)
                self.assertTrue(augmented_text is None or augmented_text.strip() == '')

            augmented_texts = aug.augment(texts)
            for augmented_text in augmented_texts:
                self.assertTrue(augmented_text is None or augmented_text.strip() == '')

예제 #10

0

파일 보기

    def test_stopwords(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        stopwords = ['The', 'brown', 'fox', 'jumps', 'the', 'dog']

        augs = [
            nac.RandomCharAug(stopwords=stopwords),
            nac.KeyboardAug(stopwords=stopwords),
            nac.OcrAug(stopwords=stopwords)
        ]

        for aug in augs:
            for i in range(10):
                augmented_text = aug.augment(text)
                self.assertTrue(
                    'quick' not in augmented_text or 'over' not in augmented_text or 'lazy' not in augmented_text)

예제 #11

0

파일 보기

    def test_stopwords_regex(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        stopwords_regex = "( [a-zA-Z]{1}ox | [a-z]{1}og|(brown)|[a-zA-z]{1}he)|[a-z]{2}mps "

        augs = [
            nac.RandomCharAug(action="delete", stopwords_regex=stopwords_regex),
            nac.KeyboardAug(stopwords_regex=stopwords_regex),
            nac.OcrAug(stopwords_regex=stopwords_regex)
        ]

        for aug in augs:
            for i in range(10):
                augmented_text = aug.augment(text)
                self.assertTrue(
                    'quick' not in augmented_text or 'over' not in augmented_text or 'lazy' not in augmented_text)

예제 #12

0

파일 보기

파일: test_sequential.py 프로젝트: x10-utils/nlpaug

    def test_single_action(self):
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584 s@#'
        ]

        flow = naf.Sequential([nac.RandomCharAug(action=Action.INSERT, min_char=1)])

        for text in texts:
            augmented_text = flow.augment(text)

            self.assertNotEqual(text, augmented_text)
            self.assertLess(0, len(text))

        self.assertLess(0, len(texts))

예제 #13

0

파일 보기

    def test_empty_input_for_substitute(self):
        texts = ['', '           ']
        augs = [
            nac.RandomCharAug(action='substitute'),
            nac.KeyboardAug(),
            nac.OcrAug()
        ]

        for aug in augs:
            for text in texts:
                augmented_text = aug.augment(text)
                self.assertTrue(augmented_text is None or augmented_text.strip() == '')

            augmented_texts = aug.augment(texts)
            for augmented_text in augmented_texts:
                self.assertTrue(augmented_text is None or augmented_text.strip() == '')

예제 #14

0

파일 보기

파일: test_flow.py 프로젝트: dataframing/nlpaug

    def test_multiple_actions(self):
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sequential([
                naf.Sometimes([
                    nac.RandomCharAug(action="insert"),
                    nac.RandomCharAug(action="delete")
                ],
                              pipeline_p=0.9),
                naf.Sequential(
                    [
                        # nac.OcrAug(), nac.QwertyAug(aug_min=1),
                        nac.RandomCharAug(action="substitute",
                                          aug_char_min=1,
                                          aug_char_p=0.6,
                                          aug_word_p=0.6)
                    ],
                    name='Sub_Seq')
            ]),
            naf.Sometimes([
                naf.Sometimes([
                    nac.RandomCharAug(action="insert"),
                    nac.RandomCharAug(action="delete")
                ]),
                naf.Sequential([
                    nac.OcrAug(),
                    nac.KeyboardAug(aug_char_min=1),
                    nac.RandomCharAug(action="substitute",
                                      aug_char_min=1,
                                      aug_char_p=0.6,
                                      aug_word_p=0.6)
                ])
            ],
                          pipeline_p=0.9)
        ]

        # Since prob may be low and causing do not perform data augmentation. Retry 5 times
        for flow in flows:
            for text in texts:
                at_least_one_not_equal = False
                for _ in range(5):
                    augmented_text = flow.augment(text, n=1)

                    if text != augmented_text:
                        at_least_one_not_equal = True
                        break

                self.assertTrue(at_least_one_not_equal)
        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))

예제 #15

0

파일 보기

파일: test_char.py 프로젝트: shenjiawei19/nlpaug

    def test_min_char(self):
        text = 'He eats apple'
        augs = [
            nac.RandomCharAug(min_char=5),
            nac.KeyboardAug(min_char=5),
            nac.OcrAug(min_char=5)
        ]

        for aug in augs:
            augmented = False
            for i in range(10):
                augmented_text = aug.augment(text)
                if 'apple' not in augmented_text:
                    augmented = True
                    break

            self.assertTrue(augmented)

예제 #16

0

파일 보기

파일: data_generation.py 프로젝트: dannyfriar/lazydate

def apply_noise(datestr: str, format_dict: Dict[str, str],
                noise_dict: Dict[str, Any]) -> str:
    sep = format_dict["separator"]
    sep = sep[0] if len(sep) > 1 else sep
    date_parts = datestr.split(sep)

    if noise_dict["append_day_suffix"]:
        date_parts[0] = date_parts[0] + noise_dict["day_suffix"]

    # Add spelling mistake to month name
    if len(format_dict["month"]) > 2 and np.random.random() <= 0.3:
        aug = nac.RandomCharAug(
            action=noise_dict["aug_char_action"],
            aug_char_min=1,
            aug_char_max=1,
        )
        date_parts[1] = aug.augment(date_parts[1])

    out = ""
    for idx, date_part in enumerate(date_parts):
        part_sep = sep
        rand_val = np.random.random()
        if noise_dict["noisy_separator"] and rand_val <= 0.15:
            part_sep += " "
        if noise_dict["noisy_separator"] and rand_val <= 0.15:
            part_sep = " " + part_sep
        elif noise_dict["noisy_separator"] and rand_val <= 0.5:
            part_sep += "".join(
                np.random.choice(ADDITIONAL_PUNCTUATION, size=2))

        if idx == 0:
            out += date_part
        else:
            out += f"{part_sep}{date_part}"

    # out = f"{sep}".join(date_parts)
    if noise_dict["casing"] == "uppercase":
        out = out.upper()
    elif noise_dict["casing"] == "lowercase":
        out = out.lower()

    if noise_dict["place_in_sentence"]:
        out = put_datestr_in_sentence(out, noise_dict["sentence"])

    return out

예제 #17

0

파일 보기

파일: test_char.py 프로젝트: veeraelluru/nlpaug

    def test_tokenizer(self):
        augs = [
            nac.OcrAug(tokenizer=text_tokenizer.split_sentence),
            nac.KeyboardAug(tokenizer=text_tokenizer.split_sentence),
            nac.RandomCharAug(tokenizer=text_tokenizer.split_sentence),
        ]

        text = 'The quick brown fox, jumps over lazy dog.'
        expected_tokens = ['The', ' quick', ' brown', ' fox', ', ', 'jumps', ' over', ' lazy', ' dog', '.']
        for aug in augs:
            tokens = aug.tokenizer(text)
            self.assertEqual(tokens, expected_tokens)

        text = 'The quick !brown fox, jumps # over lazy dog .'
        expected_tokens = ['The', ' quick', ' !', 'brown', ' fox', ', ', 'jumps', ' # ', 'over', ' lazy', ' dog', ' .']
        for aug in augs:
            tokens = aug.tokenizer(text)
            self.assertEqual(tokens, expected_tokens)

예제 #18

0

파일 보기

파일: test_augmenter.py 프로젝트: x10-utils/nlpaug

    def setUpClass(cls):
        env_config_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.environ.get(
            "DATA_DIR") + 'Yamaha-V50-Rock-Beat-120bpm.wav'
        cls.audio, cls.sampling_rate = librosa.load(cls.sample_wav_file)

        cls.textual_augs = [
            nac.RandomCharAug(),
            naw.ContextualWordEmbsAug(),
            nas.ContextualWordEmbsForSentenceAug()
        ]

        cls.audio_augs = [
            naa.CropAug(sampling_rate=cls.sampling_rate),
            naa.SpeedAug(),
        ]

예제 #19

0

파일 보기

파일: test_char.py 프로젝트: shenjiawei19/nlpaug

    def test_multi_inputs(self):
        texts = [
            'The quick brown fox jumps over the lazy dog.',
            'The quick brown fox jumps over the lazy dog.',
            'nac KeyboardAug ( tokenizer = text_tokenizer . split_sentence )',
            'nac KeyboardAug ( tokenizer = text_tokenizer . split_sentence )'
        ]
        augs = [
            nac.KeyboardAug(tokenizer=text_tokenizer.split_sentence),
            nac.RandomCharAug(tokenizer=text_tokenizer.split_sentence),
        ]

        num_thread = 2
        for aug in augs:
            augmented_data = aug.augment(texts, num_thread=num_thread)
            self.assertEqual(len(augmented_data), len(texts))

        num_thread = 1
        for aug in augs:
            augmented_data = aug.augment(texts, num_thread=num_thread)
            self.assertEqual(len(augmented_data), len(texts))

예제 #20

0

파일 보기

    def test_augment_detail(self):
        text = 'The quick brown fox jumps over the lazy dog'
        augs = [
            nac.KeyboardAug(min_char=1, include_detail=True),
            nac.OcrAug(min_char=1, include_detail=True),
            nac.RandomCharAug(min_char=2, include_detail=True)
        ]

        for aug in augs:
            augmented_text, augment_details = aug.augment(text)

            self.assertNotEqual(text, augmented_text)
            self.assertGreater(len(augment_details), 0)
            for augment_detail in augment_details:
                self.assertTrue(augment_detail['orig_token'] in text)
                self.assertGreater(augment_detail['orig_start_pos'], -1)
                self.assertGreater(augment_detail['new_start_pos'], -1)
                self.assertGreater(augment_detail['change_seq'], 0)
                self.assertIn(augment_detail['action'], Action.getall())

            # Get back original input by re-engineering
            reengineering_text = augmented_text
            for change_obj in sorted(augment_details, key=lambda item: item['orig_start_pos'], reverse=True):
                if change_obj['action'] == Action.DELETE:
                    text_prefix = reengineering_text[:change_obj['new_start_pos']]
                    text_core = change_obj['orig_token'] + ' '
                    text_suffix = reengineering_text[change_obj['new_start_pos']:]

                elif change_obj['action'] in [Action.INSERT, Action.SUBSTITUTE]:
                    text_prefix = reengineering_text[:change_obj['new_start_pos']]
                    text_core = reengineering_text[change_obj['new_start_pos']:].replace(
                        change_obj['new_token'], change_obj['orig_token'], 1)
                    text_suffix = ''
                # TODO
                # elif change_obj['action'] in Action.SWAP:

                reengineering_text = text_prefix + text_core + text_suffix
                reengineering_text = reengineering_text.strip()

            self.assertEqual(text, reengineering_text)

예제 #21

0

파일 보기

파일: test_flow.py 프로젝트: zoeyhub/nlpaug

    def test_augment_detail(self):
        text = 'The quick brown fox jumps over the lazy dog'

        flows = [
            naf.Sequential([
                naf.Sometimes([
                    nac.RandomCharAug(action="insert"),
                    nac.RandomCharAug(action="delete")
                ],
                              pipeline_p=0.5),
                naf.Sequential([
                    nac.RandomCharAug(action="substitute",
                                      aug_char_min=1,
                                      aug_char_p=0.6,
                                      aug_word_p=0.6)
                ],
                               name='Sub_Seq')
            ],
                           include_detail=True),
            naf.Sometimes([
                naf.Sometimes([
                    nac.RandomCharAug(action="insert"),
                    nac.RandomCharAug(action="delete")
                ]),
                naf.Sequential([
                    nac.OcrAug(),
                    nac.KeyboardAug(aug_char_min=1),
                    nac.RandomCharAug(action="substitute",
                                      aug_char_min=1,
                                      aug_char_p=0.6,
                                      aug_word_p=0.6)
                ])
            ],
                          pipeline_p=1,
                          include_detail=True)
        ]

        for flow in flows:
            augmented_text, augment_details = flow.augment(text)

            self.assertNotEqual(text, augmented_text)
            self.assertGreater(len(augment_details), 0)
            for augment_detail in augment_details:
                self.assertGreater(augment_detail['orig_start_pos'], -1)
                self.assertGreater(augment_detail['new_start_pos'], -1)
                self.assertGreater(augment_detail['change_seq'], 0)
                self.assertIn(augment_detail['action'], Action.getall())

예제 #22

0

파일 보기

파일: test_sometimes.py 프로젝트: wshBak/nlpaug

    def test_single_action(self):
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584 s@#'
        ]

        # Since prob may be low and causing do not perform data augmentation. Retry 5 times
        at_least_one_not_equal = False
        for _ in range(0, 5):
            flow = naf.Sometimes([nac.RandomCharAug(action=Action.INSERT)], pipeline_p=0.6)
            for text in texts:
                augmented_text = flow.augment(text)

                if text != augmented_text:
                    at_least_one_not_equal = True

                self.assertLess(0, len(text))

            if at_least_one_not_equal:
                break

        self.assertTrue(at_least_one_not_equal)
        self.assertLess(0, len(texts))

예제 #23

0

파일 보기

파일: nlpaugment.py 프로젝트: TatianaMoteuN/data-augmentation-for-NLP

def random_delete_aug(corpus):
    aug = nac.RandomCharAug(tokenizer=whitespace_tokenizer, action="delete")
    # go through all train and dev sentences
    augmented_sentences = []
    for sentence in corpus.train:
        augmented_texts = aug.augment(sentence.to_tokenized_string(), n=3)
        for augmented_text in augmented_texts:
            augmented_sentence: Sentence = Sentence()
            augmented_token_texts = augmented_text.split(" ")
            for augmented_token_text, original_token in zip(augmented_token_texts, sentence):
                # make a new token
                augmented_token = Token(augmented_token_text)
                # transfer annotations over to augmented token
                augmented_token.annotation_layers = original_token.annotation_layers
                # add augmented token to augmented sentence
                augmented_sentence.add_token(augmented_token)
            # add augmented sentence to list of all augmented sentences
            augmented_sentences.append(augmented_sentence)

    corpus = Corpus(train=SentenceDataset(augmented_sentences),
                    dev=corpus.dev,
                    test=corpus.test)

    return corpus

예제 #24

0

파일 보기

파일: test_flow.py 프로젝트: liamge/nlpaug

    def test_n_output(self):
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584',
            'AAAAAAAAAAA AAAAAAAAAAAAAA'
        ]
        flows = [
            naf.Sequential(
                [nac.RandomCharAug(action=Action.INSERT),
                 naw.RandomWordAug()]),
            naf.Sometimes([
                nac.RandomCharAug(action=Action.INSERT),
                nac.RandomCharAug(action=Action.DELETE)
            ],
                          pipeline_p=0.9),
            naf.Sequential([
                naf.Sequential([
                    nac.RandomCharAug(action=Action.INSERT),
                    naw.RandomWordAug()
                ]),
                naf.Sometimes([
                    nac.RandomCharAug(action=Action.INSERT),
                    nac.RandomCharAug(action=Action.DELETE)
                ],
                              pipeline_p=0.9)
            ])
        ]

        for flow in flows:
            for text in texts:
                augmented_texts = flow.augment(text, n=3)
                self.assertGreater(len(augmented_texts), 1)
                for augmented_text in augmented_texts:
                    self.assertNotEqual(augmented_text, text)

        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))

예제 #25

0

파일 보기

def random_char_swap(text):
    #Swap character randomly
    aug = nac.RandomCharAug(action="swap")
    attacked_text = aug.augment(text)
    print("Attacked Text:")
    print(attacked_text)

예제 #26

0

파일 보기

def random_char_del(text):
    #Delete character randomly
    aug = nac.RandomCharAug(action="delete")
    attacked_text = aug.augment(text)
    print("Attacked Text:")
    print(attacked_text)

예제 #27

0

파일 보기

파일: negative_data.py 프로젝트: yaliqin/chatbot

        rows = []
        for row in csv_reader:
            rows.append(row)

        for row in list(rows): # Substitute character by keyboard distance
            if row[1] != "flag" and row[1] !='R' and row[1]!="":
                counter += 1
                if counter != 2:
                    row[2] = str(keyboard_dis.augment(row[2]))
                if counter == 3:
                    counter = 0
            csv_writer.writerow(row)

with open("input_classification_test_data.csv","r") as input:
    with open("/Users/wenyaxie/Downloads/negative_data_character_insertion.csv","w") as output:
        random_insert = nac.RandomCharAug(action="insert")

        csv_reader = csv.reader(input)
        csv_writer = csv.writer(output)
        counter = 0

        rows = []
        for row in csv_reader:
            rows.append(row)

        for row in list(rows): # Insert character randomly
            if row[1] != "flag" and row[1] !='R' and row[1]!="":
                counter += 1
                if counter != 2:
                    row[2] = str(random_insert.augment(row[2]))
                if counter == 3:

예제 #28

0

파일 보기

def random_char_subsi(text):
    # Substitute character randomly
    aug = nac.RandomCharAug(action="substitute")
    attacked_text = aug.augment(text)
    print("Attacked Text:")
    print(attacked_text)

예제 #29

0

파일 보기

파일: DataCorruptor.py 프로젝트: NautiyalAmit/Data-Science-Application-Feature-Selection

import warnings
import pandas as pd
import numpy as np
from numpy.random import choice
import nlpaug.augmenter.char as nac

from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

warnings.filterwarnings('ignore')

swapRandom = nac.RandomCharAug(action="swap")
replaceTwoCharsBasedOnKeyboard = nac.KeyboardAug()
deleteRandomChar = nac.RandomCharAug(action="delete")

np.random.seed(0)  # Setting seed globally


# r = np.random.RandomState(0) TODO: Setting the seed for the class locally without impacting global numpy seed


class DataCorruptor:

    def __init__(self, data, feature_cols, feature_stats=None, log=False):
        # np.random.seed(0)

        if feature_stats is None:
            # TODO: Take the cardinlal statistics (like most common value), into account while corupting data
            self.feature_stats = data.describe().T[['mean', 'std', 'max', 'min']]
        else:
            self.feature_stats = feature_stats

예제 #30

0

파일 보기

def random_char_insert(text):
    # Insert character randomly
    aug = nac.RandomCharAug(action="insert")
    attacked_text = aug.augment(text)
    print("Attacked Text:")
    print(attacked_text)