Exemplo n.º 1
0
    def process_languages(
        self, lang_executor=None, tok_executor=None, split_executor=None
    ):
        """Tokenize every language, then split each into train/test/valid.

        Per-language split results are stored in ``self.sizes`` keyed by
        the language name (``lang.l``).

        NOTE(review): ``split_executor`` is accepted but never used here;
        the split jobs run on ``lang_executor`` — confirm this is intended.
        """
        executor = LocalExecutor() if lang_executor is None else lang_executor

        if self.lang_pair is None:
            # Tokenize all languages in parallel and wait for completion.
            tok_jobs = [
                executor.submit(
                    language.process_json_and_tok,
                    self.keep_comments,
                    self.extract_mode,
                    tok_executor,
                )
                for language in self.langs
            ]
            for pending in tok_jobs:
                pending.result()
        else:
            # Single language pair: tokenize synchronously, reusing the
            # language executor when no tokenization executor was given.
            tok = tok_executor if tok_executor is not None else executor
            self.lang_pair.process_json_and_tok(
                self.keep_comments, self.extract_mode, tok
            )

        # Split each language and record its resulting size.
        split_jobs = [
            (
                language.l,
                executor.submit(
                    language.split_train_test_valid,
                    self.keep_comments,
                    self.test_size,
                ),
            )
            for language in self.langs
        ]
        for name, job in split_jobs:
            self.sizes[name] = job.result()
Exemplo n.º 2
0
 def process(self,
             keep_comments,
             tok_executor=None,
             test_size=1000,
             split_executor=None):
     """Tokenize this language's json dumps, then ensure the
     train/test/valid splits exist, creating them if necessary.

     Args:
         keep_comments: whether the tokenized corpus keeps code comments
             (selects the ``.with_comments`` file suffix).
         tok_executor: executor forwarded to ``process_json_and_tok``.
         test_size: forwarded to ``split_train_test_valid``.
         split_executor: executor used to run the split job; a
             ``LocalExecutor`` is created when omitted.

     Returns:
         Tuple ``(nlines, size_gb)``: number of train lines and train
         size. NOTE: despite the name, ``size_gb`` holds bytes — it is
         only divided by 1024**3 when printed.
     """
     suffix = '.with_comments' if keep_comments else ''
     print(f"{self.l}: process ...")
     self.process_json_and_tok(keep_comments, tok_executor)
     # The train set is sharded into 8 files. If every shard plus test
     # and valid already exist, estimate the sizes from shard 0 instead
     # of re-splitting.
     if (all(
             self.folder.joinpath(f'train{suffix}.{n}.tok').is_file()
             for n in range(8))
             and self.folder.joinpath(f'test{suffix}.tok').is_file()
             and self.folder.joinpath(f'valid{suffix}.tok').is_file()):
         # Fix: message previously read "train, test and valid for already exist."
         print(f"{self.l}: train, test and valid already exist. ")
         # Assume shards are evenly sized: extrapolate from shard 0.
         shard0 = self.folder.joinpath(f'train{suffix}.0.tok')
         nlines = 8 * get_nlines(shard0)
         size_gb = 8 * shard0.stat().st_size
     else:
         print(f"{self.l}: split train, test and valid ... ")
         if split_executor is None:
             split_executor = LocalExecutor()
         job = split_executor.submit(self.split_train_test_valid,
                                     keep_comments, test_size)
         nlines, size_gb = job.result()
     # Fix: message previously read "train for is {nlines} lines".
     print(
         f"{self.l}: train is {nlines} lines and {size_gb / (1024 ** 3)} Go. "
     )
     return nlines, size_gb
Exemplo n.º 3
0
    def extract_functions_and_apply_bpe(self,
                                        lang_executor=None,
                                        function_executor=None,
                                        bpe_executor=None):
        """Extract functions for every language in parallel, run
        ``truncate_files`` over each test/valid function-file group,
        then BPE-encode the train, test and valid function files.
        """
        print("extract functions ... ")
        executor = lang_executor if lang_executor is not None else LocalExecutor()

        # Extraction for all languages in parallel; block until done.
        extraction = [
            executor.submit(language.extract_functions, self.keep_comments,
                            self.test_size, function_executor)
            for language in self.langs
        ]
        for pending in extraction:
            pending.result()

        # Align each (split, function-type) file group across languages.
        for split_name in ('test', 'valid'):
            for kind in ('functions_standalone', 'functions_class'):
                truncate_files(
                    language.folder.joinpath(
                        f'{split_name}{self.suffix}.{kind}.tok')
                    for language in self.langs)

        print("apply bpe on train ... ")
        self.apply_bpe(f'train{self.suffix}.[01234567].functions_*.tok',
                       use_vocab=False,
                       executor=bpe_executor)
        print("apply bpe on test and valid ...")
        for split_name in ('test', 'valid'):
            self.apply_bpe(f'{split_name}{self.suffix}.functions_*.tok',
                           use_vocab=False,
                           executor=bpe_executor)
Exemplo n.º 4
0
 def binarize_for_XLM(self, files_regex, executor=None):
     """Binarize (to ``.pth``) every file of every language matching
     ``files_regex`` that has not been binarized yet, in parallel.
     """
     print(f"binarize {files_regex} ...")
     if executor is None:
         executor = LocalExecutor()
     pending = []
     for language in self.langs:
         for source in self.folder.glob(f"{language.l}.{files_regex}"):
             # Skip files whose .pth output already exists.
             if Path(str(source) + ".pth").is_file():
                 continue
             print(f"binarizing {source} ...")
             pending.append(
                 executor.submit(binarize_for_XLM_file, source, self.vocab))
     for job in pending:
         job.result()
Exemplo n.º 5
0
 def process_languages(self,
                       lang_executor=None,
                       tok_executor=None,
                       split_executor=None):
     """Run the full per-language pipeline (``lang.process``) for every
     language in parallel and record each returned size in
     ``self.sizes`` keyed by the language name (``lang.l``).
     """
     executor = LocalExecutor() if lang_executor is None else lang_executor
     pending = [
         (language.l,
          executor.submit(language.process, self.keep_comments,
                          tok_executor, self.test_size, split_executor))
         for language in self.langs
     ]
     for name, job in pending:
         self.sizes[name] = job.result()
Exemplo n.º 6
0
 def apply_bpe(self, files_regex, use_vocab=False, executor=None):
     """BPE-encode every file matching ``files_regex`` in each language
     folder, writing ``<lang>.<name>.bpe`` under ``self.folder`` and
     skipping outputs that already exist.
     """
     # An empty vocab string disables vocabulary filtering.
     vocab = "" if use_vocab is False else self.vocab
     if executor is None:
         executor = LocalExecutor()
     pending = []
     for language in self.langs:
         for source in language.folder.glob(files_regex):
             target = self.folder.joinpath(
                 f"{language.l}.{source.name}").with_suffix(".bpe")
             if target.is_file():
                 continue
             print(f"apply bpe on {source} ...")
             pending.append(
                 executor.submit(apply_bpe_file, source, target,
                                 self.codes, vocab))
     for job in pending:
         job.result()
Exemplo n.º 7
0
    def extract_functions(self, lang_executor=None, function_executor=None):
        """Extract functions for every language in parallel, then run
        ``truncate_files`` over each test/valid function-file group.
        """
        print("extract functions ... ")
        executor = LocalExecutor() if lang_executor is None else lang_executor
        pending = [
            executor.submit(language.extract_functions, self.keep_comments,
                            self.test_size, function_executor)
            for language in self.langs
        ]
        for job in pending:
            job.result()

        # Align each (split, function-type) file group across languages.
        for split_name in ('test', 'valid'):
            for kind in ('functions_standalone', 'functions_class'):
                truncate_files(
                    language.folder.joinpath(
                        f'{split_name}{self.suffix}.{kind}.tok')
                    for language in self.langs)