Example no. 1
0
 def extract_docstrings(self, keep_comments, test_size=1000, executor=None):
     """Schedule docstring-extraction jobs for every tokenized split file.

     Collects the train/test/valid ``functions_class`` and
     ``functions_standalone`` ``.tok`` files for this language's folder,
     skips those whose ``.DS-f.ds.tok`` / ``.DS-f.f.tok`` outputs already
     exist, and submits the rest to the executor.

     Args:
         keep_comments: if True, operate on the ``.with_comments`` variant
             of each split file.
         test_size: accepted for interface compatibility but never read
             here.  # NOTE(review): confirm whether callers rely on it.
         executor: job executor; defaults to a fresh ``LocalExecutor``.
     """
     executor = LocalExecutor() if executor is None else executor
     suffix = '.with_comments' if keep_comments else ''

     # Gather candidates in the same order as before: train shards first
     # (class then standalone), then test, then valid.
     candidates = []
     for kind in ('class', 'standalone'):
         candidates.extend(
             self.folder.glob(
                 f'train{suffix}.[01234567].functions_{kind}.tok'))
     for split in ('test', 'valid'):
         for kind in ('class', 'standalone'):
             candidates.append(
                 self.folder.joinpath(
                     f'{split}{suffix}.functions_{kind}.tok'))

     # Keep only files whose two extraction outputs are not both present.
     pending = []
     for path in candidates:
         already_done = (path.with_suffix('.DS-f.ds.tok').is_file()
                         and path.with_suffix('.DS-f.f.tok').is_file())
         if not already_done:
             pending.append(path)

     if pending:
         # `extract_docstrings` here resolves to the module-level worker
         # function, not this method (class scope is not visible inside
         # method bodies).
         jobs = executor.map_array(extract_docstrings, pending,
                                   itertools.repeat(self.l))
         for job in jobs:
             job.result()
Example no. 2
0
 def process_json_and_tok(self, keep_comments, executor=None):
     """Tokenize every raw ``*.json.gz`` file in this language's folder.

     Submits one ``process_and_tokenize_json_file`` job per archive that
     does not already have its ``.tok`` output, then blocks until all
     jobs finish.

     Args:
         keep_comments: if True, produce the ``.with_comments.tok``
             variant of each output file.
         executor: job executor; defaults to a fresh ``LocalExecutor``.

     Raises:
         AssertionError: if the folder contains no ``*.json.gz`` file.
             (Kept as ``assert`` for backward compatibility — note it is
             stripped under ``python -O``.)
     """
     if executor is None:
         executor = LocalExecutor()
     suffix = '.with_comments' if keep_comments else ''
     assert len(list(self.folder.glob(
         '*.json.gz'))) > 0, f"there is no json in {str(self.folder)}"
     # `json_path` (not `json`) avoids shadowing the stdlib json module.
     jsons = [
         json_path for json_path in self.folder.glob('*.json.gz')
         if not Path(str(json_path).replace('.json.gz',
                                            suffix + '.tok')).is_file()
     ]
     print(f"{self.l}: tokenizing {len(jsons)} json files ...")
     # Dead `else: return` tail removed — falling off the end already
     # returns None.
     if len(jsons) > 0:
         jobs = executor.map_array(process_and_tokenize_json_file, jsons,
                                   itertools.repeat(self.l),
                                   itertools.repeat(keep_comments))
         for job in jobs:
             job.result()