Example #1
    def test_reset_top_k(self):
        original_aug = naw.WordEmbsAug(
            model_type='word2vec', model_path=self.word2vec_model_path)
        original_top_k = original_aug.model.top_k

        new_aug = naw.WordEmbsAug(
            model_type='word2vec', model_path=self.word2vec_model_path,
            top_k=original_top_k+1)
        new_top_k = new_aug.model.top_k

        self.assertEqual(original_top_k+1, new_top_k)
Example #2
    def test_reset_top_k(self):
        original_aug = naw.WordEmbsAug(model_type='word2vec',
                                       model_path=os.environ.get("MODEL_DIR") +
                                       'GoogleNews-vectors-negative300.bin')
        original_top_k = original_aug.model.top_k

        new_aug = naw.WordEmbsAug(model_type='word2vec',
                                  model_path=os.environ.get("MODEL_DIR") +
                                  'GoogleNews-vectors-negative300.bin',
                                  top_k=original_top_k + 1)
        new_top_k = new_aug.model.top_k

        self.assertEqual(original_top_k + 1, new_top_k)
Example #3
    def test_incorrect_model_type(self):
        with self.assertRaises(ValueError) as error:
            naw.WordEmbsAug(
                model_type='test_model_type',
                model_path=os.environ.get("MODEL_DIR") + 'GoogleNews-vectors-negative300.bin')

        self.assertTrue('Model type value is unexpected.' in str(error.exception))
Example #4
    def test_incorrect_model_type(self):
        with self.assertRaises(ValueError) as error:
            naw.WordEmbsAug(
                model_type='test_model_type',
                model_path=self.word2vec_model_path)

        self.assertTrue('Model type value is unexpected.' in str(error.exception))
Example #5
def augment_dataset(csv, model_dir):
    """
    Conduct two processes of augmentation:
    1. Synonym augmentation
    2. Word embedding augmentation
    """
    original = pd.read_csv(csv)

    syn_df = original.copy()
    syn_aug = naw.SynonymAug(aug_src='wordnet')

    # synonym augmenter (simple version)
    for i, query in enumerate(syn_df.src):
        synonym = syn_aug.augment(query)
        syn_df.at[i, 'src'] = synonym

    # word embedding augmenter
    word_df = original.copy()
    embed_aug = naw.WordEmbsAug(model_type='fasttext',
                                model_path=model_dir +
                                '/wiki-news-300d-1M.vec',
                                action="insert")

    for i, query in enumerate(word_df.src):
        insertion = embed_aug.augment(query)
        word_df.at[i, 'src'] = insertion

    a1 = pd.concat([original, syn_df])
    a2 = pd.concat([a1, word_df])

    a2.to_csv(os.path.join(model_dir, 'augmented.csv'), index=False)

    return a2
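
A minimal call sketch for the function above, assuming it is already defined or imported, the CSV has a 'src' column, and 'wiki-news-300d-1M.vec' sits in the model directory (both paths below are hypothetical):

# Hypothetical paths: 'data/queries.csv' must contain a 'src' column and
# 'models/' must contain 'wiki-news-300d-1M.vec'.
augmented_df = augment_dataset('data/queries.csv', 'models')
print(augmented_df.shape)  # original rows + synonym rows + embedding-insertion rows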
Example #6
def embeddings(df, model, context_action):
    aug = naw.WordEmbsAug(model_type=model,
                          model_path=r"PATH TO WORD2VEC MODEL",
                          action=context_action)

    print("STARTING EMBEDDINGS: ", context_action)
    text_augmentation(aug, df)
Example #7
def augmentation(text, insert=False, substitute=False, swap=True, delete=True):
    augs = []

    if insert:
        aug = naw.WordEmbsAug(
            model_type='word2vec',
            model_path=
            '/media/jionie/my_disk/Kaggle/Tweet/model/word2vec/GoogleNews-vectors-negative300.bin',
            action="insert")
        augs.append(aug)

    if substitute:
        aug_sub = naw.SynonymAug(aug_src='wordnet')
        augs.append(aug_sub)

    if swap:
        aug_swap = naw.RandomWordAug(action="swap")
        augs.append(aug_swap)

    if delete:
        aug_del = naw.RandomWordAug()
        augs.append(aug_del)

    aug = naf.Sometimes(augs, aug_p=0.5, pipeline_p=0.5)
    # print("before aug:", text)
    text = aug.augment(text, n=1)
    # print("after aug:", text)

    return text
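
A quick call sketch for the function above, assuming the nlpaug imports it relies on (naw, naf) are in scope; with the default flags only swap and delete are active, so no word2vec model is needed (the sample sentence is illustrative):

sample = 'The quick brown fox jumps over the lazy dog'
# Applies the RandomWordAug swap/delete augmenters through naf.Sometimes;
# returns the augmented text (a string or single-element list, depending on the nlpaug version).
print(augmentation(sample, swap=True, delete=True))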
Example #8
    def test_multi_thread(self):
        text = 'The quick brown fox jumps over the lazy dog'
        n = 3

        w2v_model_path = os.path.join(os.environ["MODEL_DIR"], 'word',
                                      'word_embs',
                                      'GoogleNews-vectors-negative300.bin')

        flows = [
            naf.Sequential([
                naf.Sequential([
                    nac.OcrAug(),
                    naw.WordEmbsAug(model_type='word2vec',
                                    model_path=w2v_model_path)
                ]),
                naf.Sequential([
                    nac.RandomCharAug(),
                ]),
                naw.ContextualWordEmbsAug(model_path='xlnet-base-cased',
                                          action="substitute",
                                          temperature=0.7,
                                          device='cpu')
            ]),
            naf.Sometimes([
                naf.Sequential([
                    nac.OcrAug(),
                    nac.RandomCharAug(),
                ]),
                naf.Sometimes([
                    naw.WordEmbsAug(model_type='word2vec',
                                    model_path=w2v_model_path)
                ],
                              pipeline_p=0.999),
                naw.ContextualWordEmbsAug(model_path='xlnet-base-cased',
                                          action="substitute",
                                          temperature=0.7,
                                          device='cpu')
            ],
                          pipeline_p=0.9999)
        ]

        for num_thread in [1, 3]:
            for flow in flows:
                augmented_data = flow.augment(text, n=n, num_thread=num_thread)
                self.assertEqual(len(augmented_data), n)
Example #9
    def setUpClass(cls):
        env_config_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)

        cls.insert_augmenters = [
            naw.WordEmbsAug(model_type='word2vec',
                            model_path=os.environ.get("MODEL_DIR") +
                            'GoogleNews-vectors-negative300.bin',
                            action=Action.INSERT),
            naw.WordEmbsAug(model_type='glove',
                            model_path=os.environ.get("MODEL_DIR") +
                            'glove.6B.50d.txt',
                            action=Action.INSERT),
            naw.WordEmbsAug(model_type='fasttext',
                            model_path=os.environ.get("MODEL_DIR") +
                            'wiki-news-300d-1M.vec',
                            action=Action.INSERT),
        ]

        cls.substitute_augmenters = [
            naw.WordEmbsAug(model_type='word2vec',
                            model_path=os.environ.get("MODEL_DIR") +
                            'GoogleNews-vectors-negative300.bin',
                            action=Action.SUBSTITUTE),
            naw.WordEmbsAug(model_type='glove',
                            model_path=os.environ.get("MODEL_DIR") +
                            'glove.6B.50d.txt',
                            action=Action.SUBSTITUTE),
            naw.WordEmbsAug(model_type='fasttext',
                            model_path=os.environ.get("MODEL_DIR") +
                            'wiki-news-300d-1M.vec',
                            action=Action.SUBSTITUTE),
        ]
Example #10
    def test_case_insensitive(self):
        retry_cnt = 10

        text = 'Good'
        aug = naw.WordEmbsAug(
            model_type='word2vec', model_path=self.word2vec_model_path,
            top_k=2)

        for _ in range(retry_cnt):
            augmented_text = aug.augment(text)
            self.assertNotEqual(text.lower(), augmented_text.lower())

        self.assertLess(0, retry_cnt)
Example #11
    def test_case_insensitive(self):
        retry_cnt = 10

        text = 'Good'
        aug = naw.WordEmbsAug(model_type='word2vec',
                              model_path=os.environ.get("MODEL_DIR") +
                              'GoogleNews-vectors-negative300.bin',
                              top_k=2)

        for _ in range(retry_cnt):
            augmented_text = aug.augment(text)
            self.assertNotEqual(text.lower(), augmented_text.lower())

        self.assertLess(0, retry_cnt)
Example #12
    def test_stopwords(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        stopwords = ['The', 'brown', 'fox', 'jumps', 'the', 'dog']

        augs = [
            naw.RandomWordAug(action="delete", stopwords=stopwords),
            naw.ContextualWordEmbsAug(stopwords=stopwords),
            naw.WordEmbsAug(model_type='word2vec', model_path=self.word2vec_model_path, stopwords=stopwords)
        ]

        for aug in augs:
            for i in range(10):
                augmented_text = aug.augment(text)
                self.assertTrue(
                    'quick' not in augmented_text or 'over' not in augmented_text or 'lazy' not in augmented_text)
Example #13
    def setUpClass(cls):
        env_config_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)

        cls.word2vec_model_path = os.path.join(
            os.environ.get("MODEL_DIR"), 'word', 'word_embs',
            'GoogleNews-vectors-negative300.bin')
        cls.word2vec_model = naw.WordEmbsAug(
            model_type='word2vec', model_path=cls.word2vec_model_path)
        cls.context_word_embs_model = naw.ContextualWordEmbsAug()

        cls.tfidf_model_path = os.path.join(os.environ.get("MODEL_DIR"),
                                            'word', 'tfidf')

        cls._train_tfidf(cls)
Example #14
    def test_stopwords_regex(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        stopwords_regex = "( [a-zA-Z]{1}ox | [a-z]{1}og|(brown)|[a-zA-z]{1}he)|[a-z]{2}mps "

        augs = [
            naw.RandomWordAug(action="delete", stopwords_regex=stopwords_regex),
            naw.ContextualWordEmbsAug(stopwords_regex=stopwords_regex),
            naw.WordEmbsAug(model_type='word2vec', model_path=self.word2vec_model_path,
                            stopwords_regex=stopwords_regex)
        ]

        for aug in augs:
            for i in range(10):
                augmented_text = aug.augment(text)
                self.assertTrue(
                    'quick' not in augmented_text or 'over' not in augmented_text or 'lazy' not in augmented_text)
Example #15
    def test_stopwords(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        stopwords = ['The', 'brown', 'fox', 'jumps', 'the', 'dog']

        augs = [
            naw.RandomWordAug(action="delete", stopwords=stopwords),
            naw.ContextualWordEmbsAug(stopwords=stopwords),
            naw.WordEmbsAug(model_type='word2vec',
                            model_path=os.environ["MODEL_DIR"] + 'GoogleNews-vectors-negative300.bin',
                            stopwords=stopwords)
        ]

        for aug in augs:
            for i in range(10):
                augmented_text = aug.augment(text)
                self.assertTrue(
                    'quick' not in augmented_text or 'over' not in augmented_text or 'lazy' not in augmented_text)
Example #16
    def test_multi_thread(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        augs = [
            naw.RandomWordAug(),
            naw.WordEmbsAug(model_type='word2vec', model_path=self.word2vec_model_path),
            naw.ContextualWordEmbsAug(
                model_path='xlnet-base-cased', action="substitute", device='cpu')
        ]

        for num_thread in [1, 3]:
            for aug in augs:
                augmented_data = aug.augment(text, n=num_thread, num_thread=num_thread)
                if num_thread == 1:
                    # return string
                    self.assertTrue(isinstance(augmented_data, str))
                else:
                    self.assertEqual(len(augmented_data), num_thread)
Example #17
    def augmentation(self,
                     text,
                     insert=False,
                     substitute=False,
                     swap=True,
                     delete=True):

        augs = []

        if insert:
            # aug = naw.ContextualWordEmbsAug(
            #     model_path=self.model_type, action="insert", device='cuda')
            # wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
            aug = naw.WordEmbsAug(
                model_type='word2vec',
                model_path=
                '/C:/Users/admin/Documents/Nitin/mycodes/kaggle_google_quest_qna/data/helpers/word2vec/GoogleNews-vectors-negative300.bin',
                action="insert")
            augs.append(aug)

        if substitute:
            # aug = naw.ContextualWordEmbsAug(
            #     model_path=self.model_type, action="substitute", device='cuda')
            # aug = naw.WordEmbsAug(
            #     model_type='word2vec', model_path='/media/jionie/my_disk/Kaggle/Google_Quest_Answer/model/word2vec/GoogleNews-vectors-negative300.bin',
            #     action="substitute")
            aug_sub = naw.SynonymAug(aug_src='wordnet')
            augs.append(aug_sub)
            # text = aug.augment(text)

        if swap:
            aug_swap = naw.RandomWordAug(action="swap")
            augs.append(aug_swap)
            # text = aug.augment(text)

        if delete:
            aug_del = naw.RandomWordAug()
            augs.append(aug_del)
            # text = aug.augment(text)

        aug = naf.Sometimes(augs, aug_p=0.5, pipeline_p=0.5)
        # print("before aug:", text)
        text = aug.augment(text, n=1)
        # print("after aug:", text)

        return text
Example #18
    def fn_word_emb(self):
        product_choices = list(
            product(self.database_choice, self.action_choices,
                    self.aug_p_choices))
        for item in product_choices:
            aug = naw.WordEmbsAug(model_type=item[0][1],
                                  model_path=item[0][2],
                                  action=item[1],
                                  aug_p=item[2],
                                  stopwords=self.stopwords)
            print("\nmodelname-action-words augmented: {}-{}-{}\n".format(
                item[0][0], item[1], item[2]))
            augmented_text = aug.augment(text, n=self.n_words)
            print(augmented_text, "\n")

            self.write_excel(augmented_text, item, item[1], item[2])
        self.workbook.close()
Example #19
    def test_multi_thread(self):
        text = 'The quick brown fox jumps over the lazy dog.'
        n = 3
        augs = [
            naw.RandomWordAug(),
            naw.WordEmbsAug(model_type='word2vec',
                            model_path=os.environ["MODEL_DIR"] +
                            'GoogleNews-vectors-negative300.bin'),
            naw.ContextualWordEmbsAug(model_path='xlnet-base-cased',
                                      action="substitute",
                                      skip_unknown_word=True,
                                      temperature=0.7,
                                      device='cpu')
        ]

        for num_thread in [1, 3]:
            for aug in augs:
                augmented_data = aug.augment(text, n=n, num_thread=num_thread)
                self.assertEqual(len(augmented_data), n)
Example #20
import pandas as pd
import numpy as np
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action

import swifter

df = pd.read_csv("../data/train.csv").iloc[:, 1:]

aug = naw.WordEmbsAug(model_type="word2vec",
                      model_path="../data/aug/GoogleNews-vectors-negative300",
                      action="insert")

df_copy = df.copy()
# df_copy["description"] = df_copy["description"].apply(lambda x: aug.augment(x))
df_copy["description"] = df_copy["description"].swifter.apply(aug.augment)

print(0)

df_copy.to_csv("../data/train_augmented.csv", index=False)
Example #21
 def test_incorrect_model_type(self):
     with self.assertRaises(ValueError):
         naw.WordEmbsAug(model_type='test_model_type',
                         model_path=os.environ.get("MODEL_DIR") +
                         'GoogleNews-vectors-negative300.bin',
                         action=Action.INSERT)
Example #22
    def tokenize(self):

        # This method rewrites the TRIZ summary according to the user's choice (only one part of the contradiction in the summary, or both parts).
        # It also rewrites every part involved to add spaces between sentences and avoid problems during tokenization.
        # Finally, it tokenizes text + summary using Stanford CoreNLP and saves the results.

        if self.args.mode == 'test':
            print("\n\n\nTokenization in progress...")
            if self.args.input_files is not None:
                files = self.args.input_files
            else:
                addon = '/*' * (self.args.depth_directory + 1)
                files = sorted(
                    glob.glob(self.data_path + addon + self.args.input))

            count = 0
            pool = Pool(self.args.n_cpus)
            for _ in pool.imap_unordered(clean_text, files):
                # Print % processed files
                #########################################################
                sys.stdout.write('\r')
                # the exact output you're looking for:
                j = (count + 1) / len(files)
                sys.stdout.write("[%-20s] %d%%" % ('=' * int(20 * j), 100 * j))
                sys.stdout.flush()
                count += 1
                #########################################################

            pool.close()
            pool.join()

            extracted_patents_dir = os.path.abspath(self.data_path)
            tokenized_patents_dir = os.path.abspath(self.temp_path + '/test/' +
                                                    self.args.input)

            print("Preparing to tokenize %s to %s..." %
                  (extracted_patents_dir, tokenized_patents_dir))

            # make IO list file
            print("Making list of files to tokenize...")
            with open("mapping_for_corenlp.txt", "w") as f:
                for s in files:
                    f.write("%s\n" % (s))
            command = [
                'java', 'edu.stanford.nlp.pipeline.StanfordCoreNLP',
                '-annotators', 'tokenize,ssplit',
                '-ssplit.newlineIsSentenceBreak', 'always', '-filelist',
                'mapping_for_corenlp.txt', '-outputFormat', 'json',
                '-outputDirectory', tokenized_patents_dir
            ]
            print("Tokenizing %i files in %s and saving in %s..." %
                  (len(files), extracted_patents_dir, tokenized_patents_dir))
            subprocess.call(command)

            os.remove("mapping_for_corenlp.txt")

            print("Stanford CoreNLP Tokenizer has finished.")
            print("Successfully finished tokenizing %s to %s.\n" %
                  (extracted_patents_dir, tokenized_patents_dir))

        else:
            pool = Pool(self.args.n_cpus)
            for path in self.data_path:
                corpus_type = path.split('/')[-1]

                patents_directories = sorted(glob.glob(path + '/*'))
                for patent_directory in patents_directories:
                    found_summary = False
                    patent_files = sorted(glob.glob(patent_directory + '/*'))
                    for patent_file in patent_files:
                        if patent_file.find('.SUM') >= 0 and patent_file.find(
                                'SUMMARY') < 0:
                            found_summary = True
                    if not found_summary:
                        with open(
                                patent_directory + '/' +
                                os.path.basename(patent_directory) + '.SUM',
                                'w') as f:
                            f.write('')

                # Data augmentation with double translation
                # Add spaces between sentences in all used texts including summaries to avoid problems during tokenization
                #########################################################################################################################

                to_augment = []
                count_augment = 0

                for tipe in ["SUM"] + self.parts_of_interest:
                    files = sorted(glob.glob(path + '/*/*.' + tipe))
                    count = 0

                    if tipe != "SUM":
                        print()
                        logging.info("Cleaning data...")
                        count = 0

                        for _ in pool.imap_unordered(clean_text, files):
                            # Print % processed files
                            #########################################################
                            sys.stdout.write('\r')
                            # the exact output you're looking for:
                            j = (count + 1) / len(files)
                            sys.stdout.write("[%-20s] %d%%" %
                                             ('=' * int(20 * j), 100 * j))
                            sys.stdout.flush()
                            count += 1
                            #########################################################

                    if self.args.data_augmentation != "None" and corpus_type == 'train':
                        print("\n" + corpus_type +
                              " set: Data augmentation in progress for " +
                              tipe + " parts...")
                    else:
                        print("\n" + corpus_type +
                              " set: Sentences verification for SUM parts...")

                    for num_file, file in enumerate(files):

                        if tipe == "SUM":
                            with open(file, "r", encoding='utf-8') as f:
                                # print(file)
                                data = ''
                                for sentence in f:
                                    if sentence.find("STATE_OF_THE_ART") >= 0:
                                        continue
                                    else:
                                        data += sentence

                                param_sents = data.split("///")
                                if param_sents[0] != 'empty':
                                    to_augment.append(num_file)
                                first_param_sents = param_sents[0].replace(
                                    "\n", "").split("//")
                                if len(param_sents) > 1:
                                    second_param_sents = param_sents[
                                        1].replace("\n", "").split("//")
                                else:
                                    second_param_sents = []

                        # Print % processed files
                        #########################################################
                        sys.stdout.write('\r')
                        # the exact output you're looking for:
                        j = (count + 1) / len(files)
                        sys.stdout.write("[%-20s] %d%%" %
                                         ('=' * int(20 * j), 100 * j))
                        sys.stdout.flush()
                        count += 1
                        #########################################################

                        # Data augmentation with double translation
                        if self.args.data_augmentation != "None" and corpus_type == 'train' and num_file in to_augment:

                            count_augment += 1
                            if self.args.data_augmentation == "transformation" and self.args.transformation_type == "bert_embeddings":
                                aug = naw.ContextualWordEmbsAug(
                                    model_path='bert-base-uncased',
                                    action="substitute")
                            elif self.args.data_augmentation == "transformation" and self.args.transformation_type == "word2vec_embeddings":
                                aug = naw.WordEmbsAug(
                                    model_type='word2vec',
                                    model_path=
                                    './word2vec/GoogleNews-vectors-negative300.bin'
                                )
                            elif self.args.data_augmentation == "transformation" and self.args.transformation_type == "synonyms":
                                aug = naw.SynonymAug()

                            path_augmented_text = file.split('/')
                            path_augmented_text[-2] += 'b'
                            path_augmented_text[-1] = '.'.join([
                                path_augmented_text[-2],
                                path_augmented_text[-1].split('.')[-1]
                            ])
                            path_new_directory = '/'.join(
                                path_augmented_text[:-1])
                            path_augmented_text = '/'.join(path_augmented_text)

                            if os.path.isfile(
                                    path_augmented_text) or file.find('b') > 0:
                                continue

                            if self.args.data_augmentation == "translation":
                                # Not to exceed google translations quotas
                                time.sleep(1.25)

                            augmented_text = ''
                            if tipe != "SUM":
                                for sentence in data.split('.'):
                                    if self.args.data_augmentation == "translation":
                                        augmented_text += translate_client.translate(
                                            translate_client.translate(
                                                sentence + '.',
                                                target_language=self.args.
                                                translation_language)
                                            ['translatedText'].replace(
                                                "&#39;", "'").replace(
                                                    ".", " ") + '.',
                                            target_language='en'
                                        )['translatedText'].replace(
                                            "&#39;", "'").replace(".",
                                                                  " ") + '.'
                                    elif self.args.data_augmentation == "transformation":
                                        augmented_text += aug.augment(
                                            sentence + '.').replace(".",
                                                                    " ") + '.'
                                        # print("ok2")
                            elif first_param_sents[0] != 'empty':
                                for sentence in first_param_sents:
                                    if self.args.data_augmentation == "translation":
                                        augmented_text += translate_client.translate(
                                            translate_client.translate(
                                                sentence + '.',
                                                target_language=self.args.
                                                translation_language)
                                            ['translatedText'].replace(
                                                "&#39;", "'").replace(
                                                    ".", " ") + '.',
                                            target_language='en'
                                        )['translatedText'].replace(
                                            "&#39;", "'").replace(".",
                                                                  " ") + '. //'
                                    elif self.args.data_augmentation == "transformation":
                                        augmented_text += aug.augment(
                                            sentence + '.').replace(
                                                ".", " ") + '. //'
                                        # print("ok3")
                                augmented_text += '/'
                                for sentence in second_param_sents:
                                    if self.args.data_augmentation == "translation":
                                        augmented_text += translate_client.translate(
                                            translate_client.translate(
                                                sentence + '.',
                                                target_language=self.args.
                                                translation_language)
                                            ['translatedText'].replace(
                                                "&#39;", "'").replace(
                                                    ".", " ") + '.',
                                            target_language='en'
                                        )['translatedText'].replace(
                                            "&#39;", "'").replace(".",
                                                                  " ") + '. //'
                                    elif self.args.data_augmentation == "transformation":
                                        augmented_text += aug.augment(
                                            sentence + '.').replace(
                                                ".", " ") + '. //'
                                        # print("ok4")
                                augmented_text = augmented_text[:-3]

                            augmented_text = augmented_text.replace(".", ". ")
                            augmented_text = ' '.join(augmented_text.split())

                            # Write translation
                            try:
                                os.mkdir(path_new_directory)
                            except:
                                pass

                            with open(path_augmented_text, 'w') as f:
                                f.write(augmented_text[:-1])

                #########################################################################################################################
                # Rewriting of summaries with chosen sentences/parameters (one side of the contradiction or both)
                #########################################################################################################################""
                data_analyzer = summary_preparation(path + '/')

                (path_state_of_the_art,
                 summary) = data_analyzer.get_data('both')
                for num in range(0, len(summary[0])):
                    summary_patent_first = ''
                    summary_patent_second = ''
                    for x in range(0, len(summary[0][num])):
                        summary_patent_first += (summary[0][num][x] + ' ')
                    with open(path_state_of_the_art[num][0:-16] + 'SUMTRIZ',
                              "w") as file:
                        file.write(summary_patent_first)

                    for x in range(0, len(summary[1][num])):
                        summary_patent_second += (summary[1][num][x] + ' ')
                    with open(path_state_of_the_art[num][0:-16] + 'SUMTRIZ2',
                              "w") as file:
                        file.write(summary_patent_second)

                # except:
                #     print("No summaries provided for "+corpus_type+' files.')
                #     time.sleep(1)
                #########################################################################################################################""

                # Tokenization using Stanford CoreNLP
                #########################################################################################################################

                add_on = ["SUMTRIZ", "SUMTRIZ2"]

                for tipe in self.parts_of_interest + add_on:
                    print("\n\n\nTokenization in progress...")
                    files = sorted(glob.glob(path + '/*/*.' + tipe))
                    print(
                        str(len(files)) + " " + tipe + " found for " +
                        corpus_type + " set.")

                    extracted_patents_dir = os.path.abspath(path)
                    tokenized_patents_dir = os.path.abspath(self.temp_path +
                                                            '/' + corpus_type +
                                                            '/' + tipe)

                    print("Preparing to tokenize %s to %s..." %
                          (extracted_patents_dir, tokenized_patents_dir))
                    stories = os.listdir(extracted_patents_dir)
                    # make IO list file
                    print("Making list of files to tokenize...")
                    with open("mapping_for_corenlp.txt", "w") as f:
                        for s in files:
                            f.write("%s\n" % (s))
                    command = [
                        'java', 'edu.stanford.nlp.pipeline.StanfordCoreNLP',
                        '-annotators', 'tokenize,ssplit',
                        '-ssplit.newlineIsSentenceBreak', 'always',
                        '-filelist', 'mapping_for_corenlp.txt',
                        '-outputFormat', 'json', '-outputDirectory',
                        tokenized_patents_dir
                    ]
                    print("Tokenizing %i files in %s and saving in %s..." %
                          (len(stories), extracted_patents_dir,
                           tokenized_patents_dir))
                    subprocess.call(command)

                    os.remove("mapping_for_corenlp.txt")

                #########################################################################################################################

            print("Stanford CoreNLP Tokenizer has finished.")
            print("Successfully finished tokenizing %s to %s.\n" %
                  (extracted_patents_dir, tokenized_patents_dir))

            pool.close()
            pool.join()
Example #23
def word2vec_aug(text):
  aug = naw.WordEmbsAug(
    model_type='word2vec', model_path='/spell/leftout/GoogleNews-vectors-negative300.bin',
    action="substitute")
  augmented_text = aug.augment(text)
  return augmented_text
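
A brief call sketch for the helper above; the hard-coded model path must exist for it to run, and the input string is illustrative:

example = 'The quick brown fox jumps over the lazy dog'
print(word2vec_aug(example))  # words substituted by word2vec similarity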
Example #24
    if answers.get('choice') == 'Plot embedding space - TensorboardX':
        writer = SummaryWriter('tensorboard/embeddings')
        writer.add_embedding(pretrained_embeddings, metadata=dataset.TEXT.vocab.itos, tag='Embedding')
        writer.close()
        print('Remember to run Tensorboard thru: tensorboard --logdir=tensorboard')

    if answers.get('choice') == 'More info about dataset':
        dataset.dataset_info()
        dataset.print_dataset_details()

    if answers.get('choice') == 'Evaluate and plot PR curves - TensorboardX':
        model.load_state_dict(torch.load('ezmath-model_83.pt'))
        test_loss, test_acc = train.evaluate_with_pr_plotting(model, test_iterator, criterion, dataset.LABEL.vocab.itos)
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.3f}%')
        print('Remember to run Tensorboard thru: tensorboard --logdir=tensorboard')

    if answers.get('choice') == 'Test Textual Augmenter by word2vec similarity':
        print('Loading Word2Vec model...')
        aug = naw.WordEmbsAug(
            model_type='word2vec', model_path='vector_cache/word2vec_CoNLL17/model.bin',
            action="substitute")
        text = input("Please insert the exercise text to augment: ")
        augmented_text = aug.augment(text)
        print("Original:")
        print(text)
        print("Augmented Text:")
        print(augmented_text)

    answers = inquirer.prompt(questions)
Example #25
    def setUpClass(cls):
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)

        model_dir = os.environ.get("MODEL_DIR")

        full_test_case = False

        cls.augmenters = [
            naw.WordEmbsAug(model_type='word2vec', model_path=model_dir+'GoogleNews-vectors-negative300.bin'),
            naw.WordEmbsAug(model_type='glove', model_path=model_dir+'glove.6B.50d.txt'),
            naw.WordEmbsAug(model_type='fasttext', model_path=model_dir + 'wiki-news-300d-1M.vec')
        ]

        if full_test_case:
            cls.augmenters.extend([
                naw.WordEmbsAug(model_type='glove', model_path=model_dir+'glove.42B.300d.txt'),
                naw.WordEmbsAug(model_type='glove', model_path=model_dir+'glove.840B.300d.txt'),
                naw.WordEmbsAug(model_type='glove', model_path=model_dir+'glove.twitter.27B.25d.txt'),
                naw.WordEmbsAug(model_type='glove', model_path=model_dir+'glove.twitter.27B.50d.txt'),
                naw.WordEmbsAug(model_type='glove', model_path=model_dir+'glove.twitter.27B.100d.txt'),
                naw.WordEmbsAug(model_type='glove', model_path=model_dir+'glove.twitter.27B.200d.txt'),
                naw.WordEmbsAug(model_type='fasttext', model_path=model_dir+'wiki-news-300d-1M-subword.vec'),
                naw.WordEmbsAug(model_type='fasttext', model_path=model_dir+'crawl-300d-2M.vec'),
                naw.WordEmbsAug(model_type='fasttext', model_path=model_dir+'crawl-300d-2M-subword.vec'),
            ])
Example #26
from tqdm import tqdm

SAMPLE_NUM = 1000
model_dir = 'model/'
des_dir = 'sentence/'
data_need_dir = des_dir + 'need_sentence.csv'
data_novel_dir = des_dir + 'novel_sentence.xlsx'
data_need_aug_dir = des_dir + 'need_aug_sentence.csv'
data_need_all_dir = des_dir + 'need_all_sentence.csv'

data_need = pd.read_csv(data_need_dir, index_col=0)

augs = [
    # Substitute word by word2vec similarity
    naw.WordEmbsAug(model_type='word2vec',
                    model_path=model_dir +
                    'GoogleNews-vectors-negative300.bin',
                    action="substitute"),
    # Substitute word by contextual word embeddings (BERT)
    naw.ContextualWordEmbsAug(model_path='bert-base-uncased',
                              action="substitute"),
    # Substitute word by WordNet's synonym
    naw.SynonymAug(aug_src='wordnet'),
    # Substitute word by PPDB's synonym
    naw.SynonymAug(aug_src='ppdb', model_path=model_dir + 'ppdb-2.0-s-all')
]
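
A small sketch of how the augmenter list above might be applied to the loaded sentences; the column name 'sentence' is hypothetical, and each augmenter needs its model files available:

# Hypothetical column name; run every augmenter once over each sentence.
augmented_rows = []
for aug in augs:
    for sent in data_need['sentence']:
        augmented_rows.append(aug.augment(sent))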

trans = BackTranslation(
    url=[
        'translate.google.com',
        #'translate.google.co.kr',
        'translate.google.cn',