def extract_functions_and_apply_bpe(self, lang_executor=None, function_executor=None, bpe_executor=None):
    """Extract functions for every language, truncate the test/valid
    function files, then apply BPE to all function files.

    Args:
        lang_executor: executor driving one extraction job per language
            (defaults to a fresh LocalExecutor).
        function_executor: executor forwarded to each language's
            ``extract_functions``.
        bpe_executor: executor forwarded to ``apply_bpe``.
    """
    print("extract functions ... ")
    if lang_executor is None:
        lang_executor = LocalExecutor()
    pending = [
        lang_executor.submit(lang.extract_functions, self.keep_comments,
                             self.test_size, function_executor)
        for lang in self.langs
    ]
    for job in pending:
        job.result()
    # Presumably trims each split's function files to a common length
    # across languages — TODO confirm against truncate_files.
    for split in ['test', 'valid']:
        for f_type in ['functions_standalone', 'functions_class']:
            truncate_files(
                lang.folder.joinpath(f'{split}{self.suffix}.{f_type}.tok')
                for lang in self.langs)
    print("apply bpe on train ... ")
    self.apply_bpe(f'train{self.suffix}.[01234567].functions_*.tok',
                   use_vocab=False, executor=bpe_executor)
    print("apply bpe on test and valid ...")
    for split in ('test', 'valid'):
        self.apply_bpe(f'{split}{self.suffix}.functions_*.tok',
                       use_vocab=False, executor=bpe_executor)
def process(self, keep_comments, tok_executor=None, test_size=1000, split_executor=None):
    """Tokenize this language's json dumps and split them into
    train (8 shards) / test / valid files.

    If the split files already exist the split is skipped and the train
    size is estimated from the first shard (times 8).

    Args:
        keep_comments: keep code comments in the tokenized output
            (selects the '.with_comments' file suffix).
        tok_executor: executor forwarded to ``process_json_and_tok``.
        test_size: number of lines for the test/valid splits.
        split_executor: executor running the split job (defaults to a
            fresh LocalExecutor).

    Returns:
        (nlines, size): estimated number of train lines and train size
        in bytes.
    """
    suffix = '.with_comments' if keep_comments else ''
    print(f"{self.l}: process ...")
    self.process_json_and_tok(keep_comments, tok_executor)
    if (all(
            self.folder.joinpath(f'train{suffix}.{n}.tok').is_file()
            for n in range(8))
            and self.folder.joinpath(f'test{suffix}.tok').is_file()
            and self.folder.joinpath(f'valid{suffix}.tok').is_file()):
        # Fix: messages below had stray "for" leftovers
        # ("train, test and valid for already exist", "train for is ...").
        print(f"{self.l}: train, test and valid already exist. ")
        # Shards are assumed equally sized, so shard 0 stands in for all 8.
        first_shard = self.folder.joinpath(f'train{suffix}.0.tok')
        nlines = 8 * get_nlines(first_shard)
        size_gb = 8 * first_shard.stat().st_size
    else:
        print(f"{self.l}: split train, test and valid ... ")
        if split_executor is None:
            split_executor = LocalExecutor()
        job = split_executor.submit(self.split_train_test_valid,
                                    keep_comments, test_size)
        nlines, size_gb = job.result()
    # size_gb is bytes; divide by 1024**3 for display ("Go" = gigabytes).
    print(
        f"{self.l}: train is {nlines} lines and {size_gb / (1024 ** 3)} Go. "
    )
    return nlines, size_gb
def extract_docstrings(self, keep_comments, test_size=1000, executor=None):
    """Run docstring extraction over every function file of this
    language (train shards, test and valid, class and standalone),
    skipping files whose '.DS-f.ds.tok' / '.DS-f.f.tok' outputs already
    exist.

    Args:
        keep_comments: selects the '.with_comments' file suffix.
        test_size: unused here; kept for signature compatibility.
        executor: executor running one job per file (defaults to a
            fresh LocalExecutor).
    """
    if executor is None:
        executor = LocalExecutor()
    suffix = '.with_comments' if keep_comments else ''
    candidates = []
    # Train shards first (class, then standalone), matching the
    # original job ordering.
    candidates.extend(
        self.folder.glob(f'train{suffix}.[01234567].functions_class.tok'))
    candidates.extend(
        self.folder.glob(
            f'train{suffix}.[01234567].functions_standalone.tok'))
    for split in ('test', 'valid'):
        for f_type in ('functions_class', 'functions_standalone'):
            candidates.append(
                self.folder.joinpath(f'{split}{suffix}.{f_type}.tok'))
    pending = [
        tok for tok in candidates
        if not (tok.with_suffix('.DS-f.ds.tok').is_file()
                and tok.with_suffix('.DS-f.f.tok').is_file())
    ]
    if len(pending) > 0:
        # `extract_docstrings` here resolves to the module-level helper,
        # not this method.
        jobs = executor.map_array(extract_docstrings, pending,
                                  itertools.repeat(self.l))
        for job in jobs:
            job.result()
def process_languages(
    self, lang_executor=None, tok_executor=None, split_executor=None
):
    """Tokenize all languages (or the language pair) and split each
    language into train/test/valid, recording sizes in ``self.sizes``.

    Args:
        lang_executor: executor for per-language jobs (defaults to a
            fresh LocalExecutor).
        tok_executor: optional executor for tokenization; when None the
            pair branch falls back to ``lang_executor``.
        split_executor: NOTE(review) — accepted but never used below;
            split jobs run on ``lang_executor``. Confirm intended.
    """
    if lang_executor is None:
        lang_executor = LocalExecutor()
    if self.lang_pair is not None:
        # Paired dataset: one combined tokenization over both languages.
        self.lang_pair.process_json_and_tok(
            self.keep_comments,
            self.extract_mode,
            lang_executor if tok_executor is None else tok_executor,
        )
    else:
        # Monolingual datasets: one tokenization job per language.
        jobs = [
            lang_executor.submit(
                lang.process_json_and_tok,
                self.keep_comments,
                self.extract_mode,
                tok_executor,
            )
            for lang in self.langs
        ]
        for job in jobs:
            job.result()
    # Split every language into train/test/valid and collect sizes.
    jobs = [
        lang_executor.submit(
            lang.split_train_test_valid, self.keep_comments, self.test_size
        )
        for lang in self.langs
    ]
    for i, lang in enumerate(self.langs):
        self.sizes[lang.l] = jobs[i].result()
def binarize_for_XLM(self, files_regex, executor=None):
    """Binarize (to '.pth') every per-language file matching
    ``files_regex``, skipping files already binarized.

    Args:
        files_regex: glob pattern appended to each language prefix.
        executor: executor running one job per file (defaults to a
            fresh LocalExecutor).
    """
    print(f"binarize {files_regex} ...")
    if executor is None:
        executor = LocalExecutor()
    submitted = []
    for lang in self.langs:
        for path in self.folder.glob(f"{lang.l}.{files_regex}"):
            # Skip files whose '.pth' output already exists.
            if Path(str(path) + ".pth").is_file():
                continue
            print(f"binarizing {path} ...")
            submitted.append(
                executor.submit(binarize_for_XLM_file, path, self.vocab))
    for job in submitted:
        job.result()
def process_languages(self, lang_executor=None, tok_executor=None, split_executor=None):
    """Run ``process`` for every language in parallel and record each
    language's (nlines, size) result in ``self.sizes``.

    Args:
        lang_executor: executor for the per-language jobs (defaults to
            a fresh LocalExecutor).
        tok_executor: forwarded to each language's ``process``.
        split_executor: forwarded to each language's ``process``.
    """
    if lang_executor is None:
        lang_executor = LocalExecutor()
    pending = [
        lang_executor.submit(lang.process, self.keep_comments, tok_executor,
                             self.test_size, split_executor)
        for lang in self.langs
    ]
    for lang, job in zip(self.langs, pending):
        self.sizes[lang.l] = job.result()
def apply_bpe(self, files_regex, use_vocab=False, executor=None):
    """Apply BPE codes to every per-language file matching
    ``files_regex``, writing '<lang>.<name>.bpe' into ``self.folder``
    and skipping outputs that already exist.

    Args:
        files_regex: glob pattern matched inside each language folder.
        use_vocab: when not False, restrict BPE to ``self.vocab``.
        executor: executor running one job per file (defaults to a
            fresh LocalExecutor).
    """
    vocab = self.vocab if use_vocab is not False else ""
    if executor is None:
        executor = LocalExecutor()
    submitted = []
    for lang in self.langs:
        for src in lang.folder.glob(files_regex):
            dest = self.folder.joinpath(
                f"{lang.l}.{src.name}").with_suffix(".bpe")
            if dest.is_file():
                continue
            print(f"apply bpe on {src} ...")
            submitted.append(
                executor.submit(apply_bpe_file, src, dest, self.codes, vocab))
    for job in submitted:
        job.result()
def extract_functions(self, lang_executor=None, function_executor=None):
    """Extract functions for every language in parallel, then truncate
    the test/valid function files.

    Args:
        lang_executor: executor driving one extraction job per language
            (defaults to a fresh LocalExecutor).
        function_executor: executor forwarded to each language's
            ``extract_functions``.
    """
    print("extract functions ... ")
    if lang_executor is None:
        lang_executor = LocalExecutor()
    submitted = []
    for lang in self.langs:
        submitted.append(
            lang_executor.submit(lang.extract_functions, self.keep_comments,
                                 self.test_size, function_executor))
    for job in submitted:
        job.result()
    # Presumably trims each split's function files to a common length
    # across languages — TODO confirm against truncate_files.
    for split in ('test', 'valid'):
        for f_type in ('functions_standalone', 'functions_class'):
            truncate_files(
                lang.folder.joinpath(f'{split}{self.suffix}.{f_type}.tok')
                for lang in self.langs)
def process_json_and_tok(self, keep_comments, extract_mode, executor=None):
    """Tokenize the paired '*.NNN.json.gz' dumps, concatenate and
    shuffle them, then split the result into one '.tok' file per
    language.

    Args:
        keep_comments: selects the '.with_comments' file suffix.
        extract_mode: forwarded to ``process_language_pair_json``.
        executor: executor for the per-file jobs (defaults to a fresh
            LocalExecutor).
    """
    if executor is None:
        executor = LocalExecutor()
    suffix = ".with_comments" if keep_comments else ""
    jsons = list(self.folder.glob("*.[0-9][0-9][0-9].json.gz"))
    assert (
        len(jsons) > 0
    ), f"there is no *.[0-9][0-9][0-9].json.gz in {str(self.folder)}"
    # Skip inputs whose '<stem>{suffix}.tok.json' output already exists
    # (strip '.gz', then replace '.json' with the output suffix).
    jsons = [
        json
        for json in jsons
        if not json.with_suffix("").with_suffix(suffix + ".tok.json").is_file()
    ]
    print(f"{self.lang1}-{self.lang2}: processing {len(jsons)} json files ...")
    if len(jsons) > 0:
        jobs = map_array(
            executor,
            process_language_pair_json,
            jsons,
            itertools.repeat(self.lang1),
            itertools.repeat(self.lang2),
            itertools.repeat(keep_comments),
            itertools.repeat(extract_mode),
        )
        for job in jobs:
            job.result()
    # join
    all_tok = self.folder.joinpath(f"all{suffix}.tok.json")
    if not all_tok.is_file():
        command = (
            f"cd {self.folder}; cat *.[0-9][0-9][0-9]{suffix}.tok.json > {all_tok}"
        )
        # NOTE(review): proc is never inspected — a failing cat goes
        # unnoticed here; confirm this best-effort behavior is intended.
        proc = subprocess.run(
            command,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            executable="/bin/bash",
        )
    # shuf
    shuf_file(all_tok)
    # extract to language toks
    jobs = map_array(
        executor,
        select_toks_json,
        [self.lang1, self.lang2],
        itertools.repeat(all_tok),
        [
            self.folder_lang1.joinpath(f"all{suffix}.tok"),
            self.folder_lang2.joinpath(f"all{suffix}.tok"),
        ],
    )
    for job in jobs:
        job.result()
def process_json_and_tok(self, keep_comments, executor=None):
    """Tokenize every '*.json.gz' dump of this language whose '.tok'
    output does not exist yet, one job per file.

    Args:
        keep_comments: selects the '.with_comments' file suffix.
        executor: executor for the per-file jobs (defaults to a fresh
            LocalExecutor).
    """
    if executor is None:
        executor = LocalExecutor()
    suffix = '.with_comments' if keep_comments else ''
    # Glob once instead of twice (the original re-scanned the folder
    # for the assert and again for the list comprehension).
    all_jsons = list(self.folder.glob('*.json.gz'))
    assert len(all_jsons) > 0, f"there is no json in {str(self.folder)}"
    jsons = [
        json for json in all_jsons
        if not Path(str(json).replace('.json.gz', suffix + '.tok')).is_file()
    ]
    print(f"{self.l}: tokenizing {len(jsons)} json files ...")
    # Removed a dead trailing `else: return` — falling off the end
    # returns None either way.
    if len(jsons) > 0:
        jobs = executor.map_array(process_and_tokenize_json_file, jsons,
                                  itertools.repeat(self.l),
                                  itertools.repeat(keep_comments))
        for job in jobs:
            job.result()
def process_json_and_tok(self, keep_comments, extract_mode, executor=None):
    """Tokenize this language's '*.json.gz' dumps, then concatenate the
    shard outputs into a single shuffled 'all{suffix}.tok' file.

    Args:
        keep_comments: selects the '.with_comments' file suffix.
        extract_mode: forwarded to ``process_and_tokenize_json_file``.
        executor: executor for the per-file jobs (defaults to a fresh
            LocalExecutor).
    """
    print(f"{self.l}: process ...")
    if executor is None:
        executor = LocalExecutor()
    suffix = ".with_comments" if keep_comments else ""
    assert (
        len(list(self.folder.glob("*.json.gz"))) > 0
    ), f"there is no json in {str(self.folder)}"
    # Only tokenize inputs whose '.tok' output does not exist yet.
    jsons = [
        json
        for json in self.folder.glob("*.json.gz")
        if not Path(str(json).replace(".json.gz", suffix + ".tok")).is_file()
    ]
    print(f"{self.l}: tokenizing {len(jsons)} json files ...")
    if len(jsons) > 0:
        jobs = map_array(
            executor,
            process_and_tokenize_json_file,
            jsons,
            itertools.repeat(self.l),
            itertools.repeat(keep_comments),
            itertools.repeat(extract_mode),
        )
        for job in jobs:
            job.result()
    # join
    all_tok = self.folder.joinpath(f"all{suffix}.tok")
    if not all_tok.is_file():
        command = f"cd {self.folder}; cat *.[0-9][0-9][0-9]{suffix}.tok > {all_tok}"
        # NOTE(review): proc is never inspected — a failing cat goes
        # unnoticed here; confirm this best-effort behavior is intended.
        proc = subprocess.run(
            command,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            executable="/bin/bash",
        )
    # shuf
    shuf_file(all_tok)