def test_rename(self):
    new_tmp_dir = "testmkdir/"
    chainerio.makedirs("file://" + new_tmp_dir)

    src = os.path.join("file://", new_tmp_dir, 'src')
    dst = os.path.join("file://", new_tmp_dir, 'dst')

    with chainerio.open(src, 'w') as fp:
        fp.write('foobar')

    chainerio.rename(src, dst)
    with chainerio.open(dst, 'r') as fp:
        data = fp.read()
        assert data == 'foobar'

    assert not chainerio.exists(src)
    assert chainerio.exists(dst)

    chainerio.remove(new_tmp_dir, True)
def load_vocabulary(path):
    # CHAINERIO add
    with chainerio.open(path, mode='r') as f:
        # CHAINERIO add end
        # +2 for UNK and EOS
        word_ids = {line.strip(): i + 2 for i, line in enumerate(f)}
    word_ids['<UNK>'] = 0
    word_ids['<EOS>'] = 1
    return word_ids
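A minimal usage sketch of `load_vocabulary`; the file name `vocab.en` is hypothetical, and the file is assumed to contain one token per line:

# Hypothetical usage: "vocab.en" is an assumed one-token-per-line file.
word_ids = load_vocabulary('vocab.en')
assert word_ids['<UNK>'] == 0 and word_ids['<EOS>'] == 1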
def maybe_load(self):
    self.global_step = None
    self.f_id = None
    self.files = None
    checkpoint = None

    if chio.exists(self.args.output_dir):
        model_names = [f for f in chio.list(self.args.output_dir)
                       if f.endswith(".pt.{}".format(self.team))]
        if len(model_names) != 0:
            self.args.resume_step = max(
                [int(x.split('.pt.{}'.format(self.team))[0]
                     .split('_')[1].strip())
                 for x in model_names])
            self.global_step = self.args.resume_step

    if self.global_step is not None:
        print("Load from {}".format(
            os.path.join(self.args.output_dir,
                         "ckpt_{}.pt.{}".format(
                             self.global_step, self.team))))
        with chio.open(
                os.path.join(self.args.output_dir,
                             "ckpt_{}.pt.{}".format(
                                 self.global_step, self.team)),
                "rb") as f:
            checkpoint = torch.load(f, map_location="cpu")

        self.model.load_state_dict(checkpoint['model'], strict=False)
        self.another_model.load_state_dict(
            checkpoint['another_model'], strict=False)

        if self.args.phase2:
            self.global_step -= self.args.phase1_end_step
        if is_main_process():
            print("resume step from ", self.args.resume_step)

        if self.args.phase2:
            keys = list(checkpoint['optimizer']['state'].keys())
            # Override hyperparameters from Phase 1
            for key in keys:
                checkpoint['optimizer']['state'][key]['step'] = \
                    self.global_step
            for iter, item in enumerate(
                    checkpoint['optimizer']['param_groups']):
                checkpoint['optimizer']['param_groups'][iter]['t_total'] = \
                    self.args.max_steps
                checkpoint['optimizer']['param_groups'][iter]['warmup'] = \
                    self.args.warmup_proportion
                checkpoint['optimizer']['param_groups'][iter]['lr'] = \
                    self.args.learning_rate
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        # Restore AMP master parameters

        self.f_id = checkpoint['files'][0]
        self.files = checkpoint['files'][1:]
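For context, a hedged sketch of the checkpoint writer this loader implies; the method name `save_checkpoint` and the `files` argument are assumptions, and the dictionary keys simply mirror what `maybe_load` reads above:

def save_checkpoint(self, files):
    # Hypothetical counterpart to maybe_load: writes the checkpoint
    # layout the loader above expects ('model', 'another_model',
    # 'optimizer', 'files'). Name and signature are assumptions.
    path = os.path.join(
        self.args.output_dir,
        "ckpt_{}.pt.{}".format(self.global_step, self.team))
    with chio.open(path, "wb") as f:
        torch.save({
            'model': self.model.state_dict(),
            'another_model': self.another_model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'files': files,  # resume file id first, then remaining shards
        }, f)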
def test_rename(self):
    new_tmp_dir = tempfile.TemporaryDirectory()
    try:
        src = os.path.join("file://", new_tmp_dir.name, 'src')
        dst = os.path.join("file://", new_tmp_dir.name, 'dst')

        with chainerio.open(src, 'w') as fp:
            fp.write('foobar')

        assert chainerio.exists(src)
        assert not chainerio.exists(dst)

        chainerio.rename(src, dst)
        with chainerio.open(dst, 'r') as fp:
            data = fp.read()
            assert data == 'foobar'

        assert not chainerio.exists(src)
        assert chainerio.exists(dst)
    finally:
        new_tmp_dir.cleanup()
def read_corpus(path, max_size):
    # CHAINERIO modify
    with chainerio.open(path, mode='r', encoding='utf-8') as f:
        # CHAINERIO modify end
        trees = []
        for line in f:
            line = line.strip()
            tree = SexpParser(line).parse()
            trees.append(tree)
            if max_size and len(trees) >= max_size:
                break

    return trees
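A minimal usage sketch, assuming a corpus file with one S-expression per line; the path `trees/train.txt` is hypothetical:

# Hypothetical usage: parse at most 1000 trees from an assumed corpus file.
trees = read_corpus('trees/train.txt', max_size=1000)
print('read %d trees' % len(trees))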
def test_root_fs_override(self):
    from pyarrow import hdfs

    hdfs_tmpfile = "tmpfile_hdfs"
    hdfs_file_string = "this is a test string for hdfs"

    conn = hdfs.connect()
    with conn.open(hdfs_tmpfile, "wb") as f:
        f.write(hdfs_file_string.encode('utf-8'))

    chainerio.set_root("hdfs")
    with chainerio.open(hdfs_tmpfile, "r") as fp:
        self.assertEqual(fp.read(), hdfs_file_string)

    # override with full URI
    with open(__file__, "r") as my_script:
        with chainerio.open("file://" + __file__, "r") as fp:
            self.assertEqual(fp.read(), my_script.read())

    with chainerio.open(hdfs_tmpfile, "r") as fp:
        self.assertEqual(fp.read(), hdfs_file_string)

    conn.delete(hdfs_tmpfile)
    conn.close()
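Note that `set_root` changes how every subsequent relative path is resolved, so code that runs after this test may want to switch back. A minimal sketch, assuming the local filesystem handler is named "posix" (an assumption; check the library's documentation for the exact handler name):

# Assumed handler name: "posix" for the local filesystem.
chainerio.set_root("posix")
with chainerio.open("local_file.txt", "r") as fp:  # resolved locally again
    fp.read()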
def load_data(vocabulary, path):
    n_lines = count_lines(path)
    bar = progressbar.ProgressBar()
    data = []
    print('loading...: %s' % path)
    # CHAINERIO add
    with chainerio.open(path, mode='r') as f:
        # CHAINERIO add end
        for line in bar(f, max_value=n_lines):
            words = line.strip().split()
            array = numpy.array([vocabulary.get(w, UNK) for w in words],
                                numpy.int32)
            data.append(array)
    return data
def count_lines(path):
    # CHAINERIO add
    with chainerio.open(path, mode='r') as f:
        # CHAINERIO add end
        return sum([1 for _ in f])
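Taken together, a hedged end-to-end sketch of these seq2seq helpers; the file names are hypothetical, and `UNK` is the unknown-token id (0, per `load_vocabulary`):

# Hypothetical usage tying the helpers together; file names are assumptions.
UNK = 0  # matches word_ids['<UNK>'] in load_vocabulary
word_ids = load_vocabulary('vocab.en')
train_data = load_data(word_ids, 'train.en')
print('%d sentences, %d vocabulary entries' % (len(train_data), len(word_ids)))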
# CHAINERIO import
import chainerio
# CHAINERIO end

import numpy as np
import matplotlib
matplotlib.use('Agg')
from chainer import datasets

mushroomsfile = 'mushrooms.csv'

# Uncomment to use HDFS; remember to put mushrooms.csv on HDFS first.
# chainerio.set_root("hdfs")

# CHAINERIO read file start
mushroomsdata = chainerio.open(mushroomsfile, 'r')
# CHAINERIO read file end
# np.genfromtxt accepts the file-like object returned by chainerio.open.
data_array = np.genfromtxt(
    mushroomsdata, delimiter=',', dtype=str, skip_header=1)

# Encode each categorical column as integer labels.
for col in range(data_array.shape[1]):
    data_array[:, col] = np.unique(data_array[:, col],
                                   return_inverse=True)[1]

X = data_array[:, 1:].astype(np.float32)
Y = data_array[:, 0].astype(np.int32)[:, None]
train, test = datasets.split_dataset_random(
    datasets.TupleDataset(X, Y), int(data_array.shape[0] * .7))