def split_train_test_valid(self, keep_comments, test_size):
    """Split ``all{suffix}.tok`` into valid, test and 8 train shards.

    Layout of the source file, by line:

    * lines ``1 .. test_size``               -> ``valid{suffix}.tok``
    * lines ``test_size+1 .. 2*test_size``   -> ``test{suffix}.tok``
    * the remaining lines                    -> ``train{suffix}.0.tok`` ..
      ``train{suffix}.7.tok``, 8 shards of equal length (lines left over
      by the integer division are not written anywhere).

    Valid and test files that already exist are left untouched.  The train
    shards are all rewritten as soon as at least one of them is missing.

    Args:
        keep_comments: when true, operate on the ``.with_comments`` files.
        test_size: number of lines in each of the valid and the test set.

    Returns:
        A tuple ``(n_lines, size)`` describing train shard 0 only: the value
        returned by ``get_nlines`` for it and its size in bytes.

    Raises:
        ValueError: if fewer than 8 lines remain for the train shards.
        subprocess.CalledProcessError: if one of the shell commands fails.
    """
    import shlex

    def quoted(path):
        # Paths are interpolated into a shell command line; quote them so
        # spaces or shell metacharacters cannot break or alter the command.
        return shlex.quote(str(path))

    def run_shell(command):
        # check=True: a failed split must not pass silently and leave an
        # empty or truncated file behind that later runs would then reuse.
        subprocess.run(
            command,
            shell=True,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

    suffix = ".with_comments" if keep_comments else ""
    all_tok = self.folder.joinpath(f"all{suffix}.tok")
    valid_file = self.folder.joinpath(f"valid{suffix}.tok")
    test_file = self.folder.joinpath(f"test{suffix}.tok")
    train_files = [
        self.folder.joinpath(f"train{suffix}.{n}.tok") for n in range(8)
    ]

    if not valid_file.is_file():
        print(f"{self.l}: splitting valid ... ")
        run_shell(f"head -n {test_size} {quoted(all_tok)} > {quoted(valid_file)}")

    if not test_file.is_file():
        print(f"{self.l}: splitting test ... ")
        run_shell(
            f"head -n {2 * test_size} {quoted(all_tok)}"
            f" | tail -n {test_size} > {quoted(test_file)}"
        )

    if not all(train_file.is_file() for train_file in train_files):
        n_lines = get_nlines(all_tok)
        # The first 2 * test_size lines are always reserved for valid/test,
        # whether or not those files were (re)created by this call.  Sizing
        # the shards from only the lines extracted in this call made them
        # too long and overlapping when valid/test already existed.
        train_start = 2 * test_size
        split_len = (n_lines - train_start) // 8
        if split_len <= 0:
            raise ValueError(
                f"{all_tok} has {n_lines} lines, not enough to build 8 train "
                f"shards after reserving {train_start} lines for valid/test"
            )
        print(f"{self.l}: splitting train ({n_lines}) to ({split_len}) ... ")
        for n, train_file in enumerate(train_files):
            shard_end = train_start + (n + 1) * split_len
            run_shell(
                f"head -n {shard_end} {quoted(all_tok)}"
                f" | tail -n {split_len} > {quoted(train_file)}"
            )

    # Only the first shard is measured; the value is in bytes and is
    # converted to GiB for display only.
    n_lines = get_nlines(train_files[0])
    size_bytes = train_files[0].stat().st_size
    print(f"{self.l}: Finished splitting train, test and valid.")
    print(f"{self.l}: train 0 is {n_lines} lines and {size_bytes / (1024 ** 3)} Go. ")
    return n_lines, size_bytes
def process(self, keep_comments, tok_executor=None, test_size=1000, split_executor=None):
    """Run tokenization, then make sure the train/test/valid split exists.

    After ``process_json_and_tok`` has run, the method looks for the 8 train
    shards plus the test and valid files.  If all of them are present, the
    totals are estimated as 8 times the figures of train shard 0.  Otherwise
    ``split_train_test_valid`` is submitted to ``split_executor`` (a
    ``LocalExecutor`` when none is given) and its result is used as is.

    Args:
        keep_comments: when true, operate on the ``.with_comments`` files.
        tok_executor: forwarded to ``process_json_and_tok``.
        test_size: forwarded to ``split_train_test_valid``.
        split_executor: object whose ``submit(...)`` returns a job exposing
            ``result()``; defaults to a new ``LocalExecutor``.

    Returns:
        A tuple ``(nlines, size)``; the size is a byte count (it is only
        converted to GiB for the log message).
    """
    suffix = '.with_comments' if keep_comments else ''
    print(f"{self.l}: process ...")
    self.process_json_and_tok(keep_comments, tok_executor)

    def in_folder(file_name):
        return self.folder.joinpath(file_name)

    first_shard = in_folder(f'train{suffix}.0.tok')
    splits_exist = (
        all(in_folder(f'train{suffix}.{n}.tok').is_file() for n in range(8))
        and in_folder(f'test{suffix}.tok').is_file()
        and in_folder(f'valid{suffix}.tok').is_file()
    )

    if not splits_exist:
        print(f"{self.l}: split train, test and valid ... ")
        executor = LocalExecutor() if split_executor is None else split_executor
        job = executor.submit(self.split_train_test_valid, keep_comments, test_size)
        nlines, size_gb = job.result()
    else:
        print(f"{self.l}: train, test and valid for already exist. ")
        # Only shard 0 is inspected; the other 7 are taken to be the same size.
        nlines = 8 * get_nlines(first_shard)
        size_gb = 8 * first_shard.stat().st_size

    print(f"{self.l}: train for is {nlines} lines and {size_gb / (1024 ** 3)} Go. ")
    return nlines, size_gb
def split_train_test_valid(self, keep_comments, test_size=1000):
    """Regroup the tokenized chunks, shuffle them and split valid/test/train.

    Steps:

    1. Concatenate every ``*[0-4][0-9][0-9]{suffix}.tok`` file of
       ``self.folder`` into ``all{suffix}.tok``.
    2. Call ``shuf_file`` on that file (defined elsewhere; presumably it
       shuffles the lines in place -- TODO confirm).
    3. Write lines ``1 .. test_size`` to ``valid{suffix}.tok``, lines
       ``test_size+1 .. 2*test_size`` to ``test{suffix}.tok`` and every
       remaining line to a single ``train{suffix}.tok``.

    NOTE(review): the train set is written as one file, not as 8
    ``train{suffix}.N.tok`` shards; code that checks for the numbered shards
    will not find them.

    Args:
        keep_comments: when true, operate on the ``.with_comments`` files.
        test_size: number of lines in each of the valid and the test set.

    Returns:
        A tuple ``(n_lines, size)`` describing ``all{suffix}.tok``: the value
        returned by ``get_nlines`` for it and its size in bytes.

    Raises:
        ValueError: if no line is left for the train set.
        subprocess.CalledProcessError: if one of the shell commands fails.
    """
    import shlex

    def quoted(path):
        # Paths are interpolated into a shell command line; quote them so
        # spaces or shell metacharacters cannot break or alter the command.
        return shlex.quote(str(path))

    def run_shell(command, **run_kwargs):
        # check=True: surface a failed command instead of silently going on
        # with an empty or truncated output file.
        subprocess.run(
            command,
            shell=True,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            **run_kwargs,
        )

    suffix = '.with_comments' if keep_comments else ''
    all_tok = self.folder.joinpath(f'all{suffix}.tok')
    valid_file = self.folder.joinpath(f'valid{suffix}.tok')
    test_file = self.folder.joinpath(f'test{suffix}.tok')
    train_file = self.folder.joinpath(f'train{suffix}.tok')

    # Regroup.  "&&" rather than ";" so that cat never runs in the wrong
    # directory when the cd fails.  The glob is left unquoted on purpose.
    run_shell(
        f"cd {quoted(self.folder)} && "
        f"cat *[0-4][0-9][0-9]{suffix}.tok > {quoted(all_tok)}",
        executable='/bin/bash',
    )
    size_gb = all_tok.stat().st_size  # bytes, despite the name
    n_lines = get_nlines(all_tok)

    n_held_out = 2 * test_size
    if n_lines <= n_held_out:
        raise ValueError(
            f"{all_tok} has {n_lines} lines, nothing is left for the train "
            f"set after reserving {n_held_out} lines for valid/test"
        )

    # shuf
    shuf_file(all_tok)

    # select test/valid/train
    run_shell(f"head -n {test_size} {quoted(all_tok)} > {quoted(valid_file)}")
    run_shell(
        f"head -n {n_held_out} {quoted(all_tok)}"
        f" | tail -n {test_size} > {quoted(test_file)}"
    )
    # Everything after the held-out lines goes to the single train file
    # ("tail -n +K" starts the output at line K).
    run_shell(f"tail -n +{n_held_out + 1} {quoted(all_tok)} > {quoted(train_file)}")

    return n_lines, size_gb