def createWebApp(cls, name, url):
    """Register a new web application from a display name and a URL.

    The page at ``url`` is downloaded and searched for an icon ``<link>``
    element. The resulting description is written as ``app.json`` into the
    directory returned by ``cls.get_local_apps_dir`` for a fresh UUID.

    Raises:
        ValueError: if ``name`` is empty or consists only of whitespace.
    """
    if not name.strip():
        raise ValueError("Name is empty!")

    import BeautifulSoup

    # The request is sent with a browser-like User-Agent header.
    request_headers = {
        'User-Agent': "Mozilla/5.0 (X11; U; Linux i686) " +
                      "Gecko/20071127 Firefox/2.0.0.11"
    }
    page = urllib2.urlopen(urllib2.Request(url, headers=request_headers))
    soup = BeautifulSoup.BeautifulSoup(page)

    descriptor = {
        'uuid': str(uuid.uuid4()),
        'name': name,
        'url': url,
        'icon': 'icon.png',
        'size': [800, 600],
    }

    # Look for an apple-touch icon first. When there is none, fall back to
    # the "shortcut icon" link and switch the stored icon name to the .ico
    # variant (this happens even if the fallback lookup finds nothing).
    icon_link = soup.find("link", rel="apple-touch-icon")
    if icon_link is None:
        icon_link = soup.find("link", rel="shortcut icon")
        descriptor['icon'] = 'icon.ico'
    if icon_link:
        descriptor['icon-url'] = icon_link['href']

    app_dir = cls.get_local_apps_dir(descriptor['uuid'])
    utils.ensure_dir_exists(app_dir)
    with open("%s/%s" % (app_dir, 'app.json'), 'w') as f:
        json.dump(descriptor, f, ensure_ascii=False, indent=4)
def save_current_plot_to_file(self, plot_fn):
    """Save the current matplotlib figure to ``plot_fn`` as a PDF.

    Args:
        plot_fn: Destination path of the PDF. When the path has a directory
            component, that directory is created first.
    """
    import utils
    import os.path

    # A bare file name has an empty dirname; there is no directory to create
    # in that case, so do not hand an empty path to ensure_dir_exists.
    plot_dir = os.path.dirname(plot_fn)
    if plot_dir:
        utils.ensure_dir_exists(plot_dir)

    import matplotlib.pyplot as plt
    plt.savefig(plot_fn, format='pdf')
    # The parenthesised single-argument form prints the same text under
    # both Python 2 and Python 3.
    print("Saved plot to %s" % (plot_fn,))
def make_all_zips(count=50, target_dir='zips', pool_size=cpu_count()):
    """Create ``count`` zip files named ``0.zip`` .. ``<count-1>.zip``.

    The files are produced by ``make_zip`` running in a process pool of
    ``pool_size`` workers, after ``target_dir`` has been created.

    Returns:
        The iterator returned by ``executor.map``. It is returned from
        inside the ``with`` block, so the pool has been shut down by the
        time the caller receives it.
    """
    ensure_dir_exists(target_dir)

    filenames = []
    for index in range(count):
        filenames.append(os.path.join(target_dir, '%d.zip' % index))

    with ProcessPoolExecutor(pool_size) as executor:
        results = executor.map(make_zip, filenames)
        return results
def save_current_plot(graph_name, plot_name, method_name, res_num):
    """Save the current matplotlib figure as a PDF below ``plots/<graph_name>``.

    The file is named ``<plot_name>_<method_name>_r<res_num>.pdf``; the
    directory part of the resulting path is created before saving.
    """
    import utils
    import os.path
    import matplotlib.pyplot as plt

    out_dir = "plots/%s" % (graph_name)
    pdf_path = "%s/%s_%s_r%d.pdf" % (out_dir, plot_name, method_name, res_num)

    # Use the dirname of the final path rather than out_dir: the two differ
    # when plot_name itself contains a path separator.
    target_dir = os.path.dirname(pdf_path)
    utils.ensure_dir_exists(target_dir)

    plt.savefig(pdf_path, format="pdf")
    message = "Saved plot to %s" % (pdf_path)
    print(message)
def get_user_file(self, file_name, make_dir=True, touch_file=False):
    """Return the path of ``file_name`` inside the user directory.

    Args:
        file_name: Path relative to the directory returned by
            ``self.get_user_dir()``.
        make_dir: When true, make sure the parent directory of the
            resulting path exists.
        touch_file: When true and nothing exists at the resulting path,
            create an empty file there.

    Returns:
        The joined path (whether or not anything was created).
    """
    full_path = os.path.join(self.get_user_dir(), file_name)

    if make_dir:
        utils.ensure_dir_exists(os.path.dirname(full_path))

    if touch_file and not os.path.exists(full_path):
        logger.info("Creating file %s", full_path)
        # os.mknod is only available on Unix and may require elevated
        # privileges on some platforms. os.open with O_CREAT creates the
        # same empty regular file portably; 0o600 matches mknod's default
        # mode, and an already-existing file is left untouched.
        fd = os.open(full_path, os.O_WRONLY | os.O_CREAT, 0o600)
        os.close(fd)

    return full_path
def make_csvs(source_dir='zips', target_dir='csvs', pool_size=cpu_count()):
    """Extract data from every zip in ``source_dir`` into two CSV files.

    ``roots.csv`` receives one ``(id, level)`` row per extracted item and
    ``objects.csv`` one ``(id, object_name)`` row per object name of that
    item. Extraction runs through ``extract_ngdata_from_zip`` in a process
    pool of ``pool_size`` workers.

    Args:
        source_dir: Directory that is globbed for ``*.zip`` files.
        target_dir: Directory the CSV files are written to (created first).
        pool_size: Number of worker processes.
    """
    ensure_dir_exists(target_dir)
    zip_filenames = glob(os.path.join(source_dir, '*.zip'))

    # Each handle gets its own try/finally so that the first file is closed
    # even if opening the second one fails, and a failing close on one
    # handle cannot prevent the other from being closed.
    f1, roots = open_csv_writer(os.path.join(target_dir, 'roots.csv'))
    try:
        f2, objects = open_csv_writer(os.path.join(target_dir, 'objects.csv'))
        try:
            roots.writerow(('id', 'level'))
            objects.writerow(('id', 'object_name'))
            with ProcessPoolExecutor(pool_size) as executor:
                ngdata_per_zip = executor.map(extract_ngdata_from_zip,
                                              zip_filenames)
                for ngdata in iflatten(ngdata_per_zip):
                    roots.writerow((ngdata.id, ngdata.level))
                    for object_name in ngdata.object_names:
                        objects.writerow((ngdata.id, object_name))
        finally:
            f2.close()
    finally:
        f1.close()
poison_rates = [ float(x) for x in args.poison_rates.replace(" ", "").split(',') ] assert len(poison_rates) > 0, 'Provide at least one theta value' args.poison_rates = poison_rates except ValueError: raise ValueError("Theta values not provided in correct format") # Before saving anything, make sure directory exists model_dir = os.path.join( args.dir_prefix, "{}/target/" "arch-{}_target-{}_goal-{}_" "rule-{}/loss-{}".format(args.dataset, args.model_arch, args.poison_class, args.attacker_goal, args.c_rule, args.loss)) utils.ensure_dir_exists(model_dir) og_save_poisoned_data = args.save_poisoned_data best_model_obj, best_loss = None, np.inf for ratio in poison_rates: if args.low_confidence and ratio > 1: raise ValueError("Highest-loss selection with ratio > 1 " "makes no sense") # Make sure data is saved if og_save_poisoned_data is not None: args.save_poisoned_data = os.path.join( og_save_poisoned_data, "seed_{}/ratio_{}".format(args.seed, ratio)) utils.ensure_dir_exists(args.save_poisoned_data)
def get_level_path(self, level):
    """Return the log file path for ``level`` and make sure its directory exists.

    The path is ``<base_path>/<textual level>.log``, where the textual name
    comes from ``self.get_level_textual``.
    """
    log_name = self.get_level_textual(level) + ".log"
    level_path = self.base_path + "/" + log_name
    # Only the base directory is created; the log file itself is not.
    utils.ensure_dir_exists(self.base_path)
    return level_path
def run(sess, f, data):
    """Distill a loaded teacher model into a newly created student model.

    The teacher is restored through ``m.get(f.model).load_model`` and its
    softmax output (with gradients stopped) is used as the training target
    of a student built by ``create_model`` on the same ``inputs``.
    Summaries are written below ``<summary_folder>/<run_name>/distill`` and
    checkpoints below its ``checkpoint/`` sub-directory.

    Args:
        sess: Session used to load the teacher and train the student.
        f: Settings object; this function reads ``eval_dataset``, ``model``,
            ``model_meta``, ``model_checkpoint``, ``lr``, ``loss``,
            ``summary_folder``, ``run_name``, ``epochs``,
            ``train_batch_size``, ``test_batch_size``, ``eval_interval`` and
            ``checkpoint_interval`` from it.
        data: Training data; provides ``io_shape`` and
            ``train_epoch_in_batches``.

    NOTE(review): indentation was reconstructed from a flattened source;
    confirm the loop nesting against the original file.
    """
    # load data that will be used for evaluating the distillation process
    eval_data = d.get(f.eval_dataset, f)

    # load teacher graph
    _, output_size = data.io_shape
    inputs, teacher_outputs, _, teacher_feed_dicts = m.get(f.model).load_model(
        sess, f.model_meta, f.model_checkpoint, output_size)
    teacher_outputs = tf.stop_gradient(tf.nn.softmax(teacher_outputs))

    # create student graph
    outputs, _, feed_dicts = m.get(f.model).create_model(inputs, output_size)

    loss, train_step = create_train_ops(outputs, teacher_outputs, lr=f.lr, loss=f.loss)
    accuracy = create_eval_ops(outputs, teacher_outputs)
    summary_op = create_summary_ops(loss, accuracy)

    # only initialize non-initialized vars:
    u.init_uninitted_vars(sess)
    # (this is very important in distill: we don't want to reinit teacher model)

    saver = tf.train.Saver(tf.global_variables())

    summary_dir = os.path.join(f.summary_folder, f.run_name, 'distill')
    train_writer = tf.summary.FileWriter(os.path.join(summary_dir, 'train'), sess.graph)
    trainbatch_writer = tf.summary.FileWriter(os.path.join(summary_dir, 'train_batch'), sess.graph)
    test_writer = tf.summary.FileWriter(os.path.join(summary_dir, 'test'), sess.graph)

    # Stays None until a checkpoint is written, so the final report below
    # cannot hit an unbound name when no checkpoint interval was reached.
    checkpoint_file = None

    with sess.as_default():
        global_step = 0

        print('Note: accuracies here are how much the student correlates to the teacher.]')
        print('For true set accuracy, multiply by teacher\'s accuracy.')

        for i in range(f.epochs):
            print('Epoch: {}'.format(i))
            for batch_x, _ in data.train_epoch_in_batches(f.train_batch_size):
                # train step. we don't need to feed batch_y because the student
                # is being trained to mimic the teacher's temperature-scaled
                # activations.
                summary, _ = sess.run([summary_op, train_step],
                                      feed_dict={**teacher_feed_dicts['distill'],
                                                 **feed_dicts['distill'],
                                                 inputs: batch_x})
                trainbatch_writer.add_summary(summary, global_step)

                if global_step % f.eval_interval == 0:
                    # eval test
                    summaries = []
                    for test_batch_x, test_batch_y in eval_data.test_epoch_in_batches(f.test_batch_size):
                        summary = sess.run(summary_op,
                                           feed_dict={**teacher_feed_dicts['distill'],
                                                      **feed_dicts['distill'],
                                                      inputs: test_batch_x})
                        summaries.append(summary)
                    test_writer.add_summary(merge_summary_list(summaries, True), global_step)

                    # eval train
                    summaries = []
                    for train_batch_x, train_batch_y in data.train_epoch_in_batches(f.train_batch_size):
                        summary = sess.run(summary_op,
                                           feed_dict={**teacher_feed_dicts['distill'],
                                                      **feed_dicts['distill'],
                                                      inputs: train_batch_x})
                        summaries.append(summary)
                    train_writer.add_summary(merge_summary_list(summaries, True), global_step)

                global_step += 1

                if global_step % f.checkpoint_interval == 0:
                    checkpoint_dir = os.path.join(summary_dir, 'checkpoint/')
                    ensure_dir_exists(checkpoint_dir)
                    checkpoint_file = os.path.join(checkpoint_dir, f.model)
                    saver.save(sess, checkpoint_file, global_step=global_step)
                    print('distilled model saved in {}'.format(checkpoint_file))

    # Report the last checkpoint location, if any checkpoint was written.
    if checkpoint_file is not None:
        print('distilled model saved in {}'.format(checkpoint_file))
# Extend the import path with the sibling 12net directory before the
# remaining imports run.
import sys
sys.path.append('../12net')
import numpy as np
import os, cv2
import numpy.random as npr
from utils import IOU, ensure_dir_exists

# Annotation list and image directory names.
anno_file = 'wider_face_train.txt'
img_dir = 'WIDER_train/images'
# Output directories: positive / part / negative sub-directories and their
# common parent.
pos_save_dir = '../12net/12/positive'
part_save_dir = '../12net/12/part'
neg_save_dir = '../12net/12/negative'
save_dir = '../12net/12'
ensure_dir_exists(save_dir)
ensure_dir_exists(pos_save_dir)
ensure_dir_exists(part_save_dir)
ensure_dir_exists(neg_save_dir)
# One text file per category, opened for writing in save_dir.
# NOTE(review): these handles are not closed in the visible part of the
# script - confirm they are closed further down.
f1 = open(os.path.join(save_dir, 'pos_12.txt'), 'w')
f2 = open(os.path.join(save_dir, 'neg_12.txt'), 'w')
f3 = open(os.path.join(save_dir, 'part_12.txt'), 'w')
# Read all annotation lines into memory; num is the number of lines.
with open(anno_file, 'r') as f:
    annotations = f.readlines()
num = len(annotations)
print '%d pics in total' % num
p_idx = 0
def _rsync_dir(source_dir, dest_dir):
    """Run ``gsutil -m rsync`` from ``source_dir`` to ``dest_dir``.

    ``dest_dir`` is created first. Whatever gsutil writes to stderr is
    appended to ``.rsync_log`` in the current working directory. The exit
    status of the command is not checked.
    """
    ensure_dir_exists(dest_dir)
    command = ['gsutil', '-m', 'rsync', source_dir, dest_dir]
    with open('.rsync_log', 'ab') as rsync_log:
        subprocess.call(command, stderr=rsync_log)
def ensure_query_file(qfile: str) -> None:
    '''
    Ensure that the query file exists, creating it when it does not.

    This file will contain an aggregation of all the queries executed
    against the inverted index. An existing regular file is left untouched;
    otherwise the file is created containing an empty JSON object.
    '''
    if os.path.isfile(qfile):
        return
    with open(qfile, mode='w') as f:
        # because we want utils.load_json_from_disk to load an empty dict
        # instead of raising an exception
        f.write("{}")


if __name__ == '__main__':
    args = init_params()
    # Make sure the output directory and the query log exist before running
    # the query.
    utils.ensure_dir_exists('output')
    ensure_query_file(args.output_file)
    # Without a string query there is nothing to run: print a hint and stop.
    if args.query_string == None or type(args.query_string) != str:
        print("Please provide a query str with flag -q")
        exit()
    inv_index: Dict[str, Tuple[int, List[int]]] = utils.load_index(args.input_file)
    result: dict = exec_query(args.query_string, inv_index)
    # The result is keyed by the query string; x is the entry for this query.
    x = result[args.query_string]
    print(f'<{args.query_string}> query was {x["message"]}: {x["frequency"]} hits found')
    queries: dict = utils.load_json_from_disk(args.output_file)
    # The 'message' entry is removed from x after it has been printed.
    del x['message']
action="store_true", help='If true, print per-epoch training statistics') args = parser.parse_args() if args.verbose: args.verbose_pretrain = True try: wanted_errors = [float(x) for x in args.errors.split(",")] print(utils.red_print("Target error rates: %s" % str(wanted_errors))) except ValueError: raise ValueError("Wanted errors provided in invalid format") # Ensure directory exists where model will be saved utils.ensure_dir_exists(args.save_dir) # Print all arguments utils.flash_utils(args) # Prepare logger log_dir = os.path.join( args.log_path, "indiscriminate_" + str(args.n_copies) + "_" + str(args.seed)) utils.ensure_dir_exists(log_dir) logger = SummaryWriter(log_dir=log_dir, flush_secs=10) print(utils.pink_print("Running attack")) indiscriminateAttack(logger, wanted_errors, args) # Close logger
def test(**kwargs):
    """Run both trained generators over their source datasets and save the results.

    For every ``.png``/``.jpg`` file in ``params.dataset_a`` the original
    image and its A-to-B translation are written to the A-to-B test
    directory; the same is done for ``params.dataset_b`` with the B-to-A
    generator. Both output directories live below ``params.test_save_path``.

    Args:
        **kwargs: Settings wrapped into ``Params``; this function reads
            ``use_cuda``, ``checkpoint_path``, ``test_save_path``,
            ``dataset_a`` and ``dataset_b``.
    """
    params = Params(kwargs)
    print('Params:')
    params.pretty_print()
    print()

    use_cuda = params.use_cuda
    if use_cuda:
        assert torch.cuda.is_available()

    with Timer('Loading models'):
        gen_a_to_b, gen_b_to_a = load_models_for_evaluation(
            params.checkpoint_path)

    print('#weights in gen_a_to_b:',
          natural.number.number(model_utils.compute_num_weights(gen_a_to_b)))
    print('#weights in gen_b_to_a:',
          natural.number.number(model_utils.compute_num_weights(gen_b_to_a)))

    if use_cuda:
        gen_a_to_b.cuda()
        gen_b_to_a.cuda()

    a_to_b_save_path = join_path(params.test_save_path, c.A_TO_B_GEN_TEST_DIR)
    b_to_a_save_path = join_path(params.test_save_path, c.B_TO_A_GEN_TEST_DIR)
    ensure_dir_exists(a_to_b_save_path)
    ensure_dir_exists(b_to_a_save_path)

    # One entry per translation direction: progress label, source directory,
    # generator, output directory, and the file-name templates for the
    # source copy and for the generated image.
    passes = (
        ('A to B', params.dataset_a, gen_a_to_b, a_to_b_save_path,
         '{}-a{}', '{}-a-to-b{}'),
        ('B to A', params.dataset_b, gen_b_to_a, b_to_a_save_path,
         '{}-b{}', '{}-b-to-a{}'),
    )
    for desc, source_dir, generator, save_path, source_tpl, fake_tpl in passes:
        filenames = utils.listdir(source_dir, extensions=('.png', '.jpg'))
        for filename in tqdm(filenames, desc=desc):
            image = image_utils.load_image(join_path(source_dir, filename))
            fake = generate_fake_image(image=image,
                                       generator_net=generator,
                                       use_cuda=use_cuda)
            root, ext = os.path.splitext(filename)
            # Save the untouched source next to its translation.
            skimage.io.imsave(
                join_path(save_path, source_tpl.format(root, ext)), image)
            skimage.io.imsave(
                join_path(save_path, fake_tpl.format(root, ext)), fake)
split_1 = ch.cat(split_1) split_2 = ch.cat(split_2) data_first = (X[split_1], Y[split_1]) data_second = (X[split_2], Y[split_2]) return data_first, data_second if __name__ == "__main__": mnist17 = datasets.dataset_helper("mnist17")() train_1, train_2 = stratified_split(mnist17.train) val_1, val_2 = stratified_split(mnist17.val) # Ensure directory exists utils.ensure_dir_exists("./data/datasets/MNIST17/") # Save these files ch.save( { "train": { "data": train_1[0], "targets": train_1[1] }, "val": { "data": val_1[0], "targets": val_1[1] }, }, "./data/datasets/MNIST17/split_1.pt") ch.save(
from collections import Counter import nltk import os from tqdm import tqdm import utils from utils import CACHE_DIR TOKENIZING_REGEX = r"[a-zA-Z]+[-']{0,1}[a-zA-Z]*[']{0,1}" # supports hyphenated and apostrophied words utils.ensure_dir_exists(CACHE_DIR) def custom_tokenizer(text: str, tokenizer, stem_doc=False) -> list: tokens = tokenizer.tokenize(text) stemmer = nltk.stem.PorterStemmer() cleaned_tokens = [] for token in tokens: if token == "": continue while token[-1] == "-" or token[-1] == "'": token = token[:-1] if 1 < len( token ): # don't bother adding single letters to the index because the smallest lastnames should be >= 2 letters token = token.lower() if stem_doc:
from loader import generate_train_data
from prototypes import wavenet
from utils import ensure_dir_exists

if __name__ == '__main__':
    input_length = 4000
    epochs = 500
    # Weight files are written into models/ below, so create it up front.
    ensure_dir_exists('models/')
    model = wavenet(input_length)
    for e in range(epochs):
        # Training batches are generated anew for every epoch.
        train_batches = generate_train_data(input_length, 1000)
        print("Epoch {}/{}:".format(e + 1, epochs))
        # One fit call (epochs=1) per (x, y) pair yielded by the generator.
        for i, (x, y) in enumerate(train_batches):
            model.fit(x, y, batch_size=4, epochs=1, verbose=2)
        # Every 50th epoch the intermediate weights file is overwritten.
        # NOTE(review): indentation reconstructed from a flattened source -
        # confirm this check belongs to the epoch loop, not the batch loop.
        if (e + 1) % 50 == 0:
            print("Saving intermediate model weights...")
            model.save_weights('models/tmp.h5')
    print("\nTraining complete!\nSaving model...")
    model.save_weights('models/final_weights.h5')
    print("Model saved, terminate.")
def train(**kwargs):
    """Train the two generators and two discriminators in an endless loop.

    Each step draws one batch from both datasets, optimizes the generators
    and then each discriminator, and appends a row with the losses and the
    step duration to the CSV file at ``params.log_filename``. Models are
    saved every ``params.save_step`` steps and debug images are written
    every ``params.test_step`` steps. The loop iterates over
    ``itertools.count()`` and contains no ``break``.

    Args:
        **kwargs: Settings wrapped into ``Params``.

    NOTE(review): indentation was reconstructed from a flattened source;
    confirm the extent of the ``with`` blocks against the original file.
    """
    params = Params(kwargs)
    print('Params:')
    params.pretty_print()
    print()

    use_cuda = params.use_cuda
    if use_cuda:
        assert torch.cuda.is_available()

    with Timer('Initializing'):
        a_image_generator = create_image_generator(params.dataset_a)
        b_image_generator = create_image_generator(params.dataset_b)
        gen_a_to_b, gen_b_to_a, discr_a, discr_b = load_models_for_training(params.checkpoint_path)

    print('#weights in gen_a_to_b:', natural.number.number(model_utils.compute_num_weights(gen_a_to_b)))
    print('#weights in gen_b_to_a:', natural.number.number(model_utils.compute_num_weights(gen_b_to_a)))
    print('#weights in discr_a:', natural.number.number(model_utils.compute_num_weights(discr_a)))
    print('#weights in discr_b:', natural.number.number(model_utils.compute_num_weights(discr_b)))

    if use_cuda:
        gen_a_to_b.cuda()
        gen_b_to_a.cuda()
        discr_a.cuda()
        discr_b.cuda()

    # Both generators share one optimizer; each discriminator has its own.
    betas = (params.adam_beta1, params.adam_beta2)
    optimizer_generators = torch.optim.Adam(params=itertools.chain(gen_a_to_b.parameters(), gen_b_to_a.parameters()), lr=params.gen_learning_rate, betas=betas)
    optimizer_discr_a = torch.optim.Adam(params=discr_a.parameters(), lr=params.discr_learning_rate, betas=betas)
    optimizer_discr_b = torch.optim.Adam(params=discr_b.parameters(), lr=params.discr_learning_rate, betas=betas)

    cycle_criterion = nn.L1Loss()
    discr_criterion = nn.MSELoss()

    # Constant all-ones / all-zeros tensors passed to the loss functions.
    one_array = torch.ones((params.batch_size, 1, 30, 30))  # Has the same size as the output of discr_a and discr_b
    if use_cuda:
        one_array = one_array.cuda()
    one_array = Variable(one_array, requires_grad=False)

    zero_array = torch.zeros((params.batch_size, 1, 30, 30))
    if use_cuda:
        zero_array = zero_array.cuda()
    zero_array = Variable(zero_array, requires_grad=False)

    a_fake_image_pool = image_utils.ImagePool(params.image_pool_size)
    b_fake_image_pool = image_utils.ImagePool(params.image_pool_size)

    header = '\t'.join(FIELD_NAMES)
    print(header)

    # Train:
    with open(params.log_filename, 'w') as csvfile:
        dict_writer = csv.DictWriter(csvfile, FIELD_NAMES)
        dict_writer.writeheader()
        for i in itertools.count():
            # The timer covers only the optimization work of this step; its
            # duration is read below, after the block has finished.
            timer = Timer('train step', verbose=False)
            with timer:
                a = generate_batch_variable(a_image_generator, use_cuda, params.batch_size)
                b = generate_batch_variable(b_image_generator, use_cuda, params.batch_size)

                generators_loss = models.compute_generators_loss(gen_a_to_b, gen_b_to_a, discr_a, discr_b, a, b, cycle_criterion, discr_criterion, one_array, a_fake_image_pool, b_fake_image_pool)
                optimize(optimizer_generators, generators_loss)

                discr_a_loss = models.compute_discr_loss(discr_a, a, a_fake_image_pool, discr_criterion, zero_array, one_array)
                optimize(optimizer_discr_a, discr_a_loss)

                discr_b_loss = models.compute_discr_loss(discr_b, b, b_fake_image_pool, discr_criterion, zero_array, one_array)
                optimize(optimizer_discr_b, discr_b_loss)

            # One log row per step, written to the CSV file and echoed
            # tab-separated to stdout.
            row = collections.OrderedDict()
            row['step'] = str(i)
            row['generators_loss'] = float_to_string(generators_loss)
            row['discr_a_loss'] = float_to_string(discr_a_loss)
            row['discr_b_loss'] = float_to_string(discr_b_loss)
            row['total_loss'] = float_to_string(generators_loss + discr_a_loss + discr_b_loss)
            row['duration'] = '{0:.2f}s'.format(timer.get_duration())
            dict_writer.writerow(row)
            print('\t'.join(row.values()))

            # Periodic checkpoint of all four networks (never at step 0).
            if i % params.save_step == 0 and i > 0:
                with Timer('Saving models'):
                    ensure_dir_exists(params.checkpoint_path)
                    torch.save(gen_a_to_b.state_dict(), join_path(params.checkpoint_path, c.A_TO_B_GEN_DIR))
                    torch.save(gen_b_to_a.state_dict(), join_path(params.checkpoint_path, c.B_TO_A_GEN_DIR))
                    torch.save(discr_a.state_dict(), join_path(params.checkpoint_path, c.A_DISCR_DIR))
                    torch.save(discr_b.state_dict(), join_path(params.checkpoint_path, c.B_DISCR_DIR))
                # Re-print the column header after the save.
                print(header)

            # Periodic debug images built from the current batch (never at
            # step 0): both inputs and both translations, named by step.
            if i % params.test_step == 0 and i > 0:
                ensure_dir_exists(params.debug_path)
                with Timer('Creating debug images'):
                    a, b, b_fake, a_fake = create_debug_images(a, b, gen_a_to_b, gen_b_to_a, params.use_cuda)

                    a_filepath = join_path(params.debug_path, '{}-a.jpg'.format(i))
                    skimage.io.imsave(a_filepath, a)

                    b_filepath = join_path(params.debug_path, '{}-b.jpg'.format(i))
                    skimage.io.imsave(b_filepath, b)

                    b_fake_filepath = join_path(params.debug_path, '{}-a-to-b.jpg'.format(i))
                    skimage.io.imsave(b_fake_filepath, b_fake)

                    a_fake_filepath = join_path(params.debug_path, '{}-b-to-a.jpg'.format(i))
                    skimage.io.imsave(a_fake_filepath, a_fake)
def main(unused_argv):
    """Run one end-to-end pass of the reinforcement learning pipeline.

    The steps are: create the working directories, bootstrap a model, run
    selfplay three times (the last run with all games held out), build a
    golden chunk, train a new model on it, validate on the held-out games,
    run selfplay with the trained model, and evaluate it against the
    bootstrap model.
    """
    dir_getters = (
        fsdb.models_dir,
        fsdb.selfplay_dir,
        fsdb.holdout_dir,
        fsdb.sgf_dir,
        fsdb.eval_dir,
        fsdb.golden_chunk_dir,
        fsdb.working_dir,
    )
    for get_dir in dir_getters:
        utils.ensure_dir_exists(get_dir())

    flagfile_arg = '--flagfile=rl_loop/local_flags'

    bootstrap_name = shipname.generate(0)
    bootstrap_model_path = os.path.join(fsdb.models_dir(), bootstrap_name)
    bootstrap_cmd = [
        'python3', 'bootstrap.py',
        '--export_path={}'.format(bootstrap_model_path),
        '--work_dir={}'.format(fsdb.working_dir()),
        flagfile_arg,
    ]
    mask_flags.checked_run(bootstrap_cmd)

    selfplay_cmd = [
        'python3', 'selfplay.py',
        '--load_file={}'.format(bootstrap_model_path),
        '--selfplay_dir={}'.format(
            os.path.join(fsdb.selfplay_dir(), bootstrap_name)),
        '--holdout_dir={}'.format(
            os.path.join(fsdb.holdout_dir(), bootstrap_name)),
        '--sgf_dir={}'.format(fsdb.sgf_dir()),
        '--holdout_pct=0',
        flagfile_arg,
    ]

    # Two regular selfplay games ...
    for _ in range(2):
        mask_flags.checked_run(selfplay_cmd)
    # ... and a third one that is held out for validation. This relies on
    # the flags behavior where the second occurrence of a flag wins.
    mask_flags.checked_run(selfplay_cmd + ['--holdout_pct=100'])

    # Double check that at least one sgf has been generated.
    assert os.listdir(os.path.join(fsdb.sgf_dir(), 'full'))

    print("Making shuffled golden chunk from selfplay data...")
    # TODO(amj): refactor example_buffer so it can be called the same way
    # as everything else.
    eb.make_chunk_for(output_dir=fsdb.golden_chunk_dir(),
                      local_dir=fsdb.working_dir(),
                      game_dir=fsdb.selfplay_dir(),
                      model_num=1,
                      positions=64,
                      threads=8,
                      sampling_frac=1)

    chunk_pattern = os.path.join(fsdb.golden_chunk_dir(), '*.tfrecord.zz')
    tf_records = sorted(gfile.Glob(chunk_pattern))

    trained_model_name = shipname.generate(1)
    trained_model_path = os.path.join(fsdb.models_dir(), trained_model_name)

    # Train on shuffled game data.
    train_cmd = ['python3', 'train.py'] + tf_records + [
        '--work_dir={}'.format(fsdb.working_dir()),
        '--export_path={}'.format(trained_model_path),
        flagfile_arg,
    ]
    mask_flags.checked_run(train_cmd)

    # Validate the trained model on the held out game.
    validate_cmd = [
        'python3', 'validate.py',
        os.path.join(fsdb.holdout_dir(), bootstrap_name),
        '--work_dir={}'.format(fsdb.working_dir()),
        flagfile_arg,
    ]
    mask_flags.checked_run(validate_cmd)

    # Verify that the trained model works for selfplay; again the repeated
    # --load_file flag overrides the earlier one.
    mask_flags.checked_run(
        selfplay_cmd + ['--load_file={}'.format(trained_model_path)])

    evaluate_cmd = [
        'python3', 'evaluate.py',
        bootstrap_model_path, trained_model_path,
        '--games=1',
        '--eval_sgf_dir={}'.format(fsdb.eval_dir()),
        flagfile_arg,
    ]
    mask_flags.checked_run(evaluate_cmd)

    print("Completed integration test!")
def run_game(network, args, device=None, sgf_dir=None, holdout_pct=0.05):
    '''Play one game and save its SGF records and training examples.

    Args:
        network: Passed through to ``play``.
        args: Settings object; ``selfplay_dir``, ``holdout_dir``,
            ``sgf_dir`` and ``model_name`` are read from it.
        device: Passed through to ``play``.
        sgf_dir: Directory for SGF output, used only when ``args.sgf_dir``
            is falsy. When both are unset, no SGF files are written.
        holdout_pct: Probability with which the game's examples are written
            to the holdout directory instead of the selfplay directory.
    '''
    # os.path.join always returns a path, so these two directories are
    # always set; the former "is not None" guards were dead conditions and
    # the directories only need to be created once.
    selfplay_dir = os.path.join(args.selfplay_dir, args.model_name)
    utils.ensure_dir_exists(selfplay_dir)
    holdout_dir = os.path.join(args.holdout_dir, args.model_name)
    utils.ensure_dir_exists(holdout_dir)

    # A directory configured in args takes precedence over the parameter.
    if args.sgf_dir:
        sgf_dir = os.path.join(args.sgf_dir, args.model_name)
        utils.ensure_dir_exists(sgf_dir)
    if sgf_dir is not None:
        minimal_sgf_dir = os.path.join(sgf_dir, 'clean')
        full_sgf_dir = os.path.join(sgf_dir, 'full')
        utils.ensure_dir_exists(minimal_sgf_dir)
        utils.ensure_dir_exists(full_sgf_dir)

    with utils.logged_timer("Playing game"):
        player = play(network, args, device=device)

    features, pis, values = player.extract_data(return_features=True)
    features = np.array(features)
    pis = np.array(pis)
    values = np.array(values)
    assert features.shape[0] == pis.shape[0] == values.shape[0]

    # File names combine the current Unix time with the number of positions.
    output_name = '{}-{}'.format(int(time.time()), features.shape[0])

    if sgf_dir is not None:
        # 'clean' gets the SGF without comments, 'full' the default output.
        with open(os.path.join(minimal_sgf_dir,
                               '{}.sgf'.format(output_name)), 'w') as f:
            f.write(player.to_sgf(use_comments=False))
        with open(os.path.join(full_sgf_dir,
                               '{}.sgf'.format(output_name)), 'w') as f:
            f.write(player.to_sgf())

    # Hold out a holdout_pct fraction of games for validation.
    if random.random() < holdout_pct:
        fname = os.path.join(holdout_dir, "{}.hdf5".format(output_name))
    else:
        fname = os.path.join(selfplay_dir, "{}.hdf5".format(output_name))

    preprocessing.save_h5_examples(fname, features, pis, values)
def main(unused_argv):
    """Wipe the base directory, recreate the working tree and run ``rl_loop``.

    After the directories are recreated, the target model is copied into
    the models directory and a ``reinforcement.log`` file handler is added
    to the root logger. The whole loop is wrapped in a logged timer.
    """
    print('Wiping dir %s' % FLAGS.base_dir, flush=True)
    shutil.rmtree(FLAGS.base_dir, ignore_errors=True)

    dir_getters = (
        fsdb.models_dir,
        fsdb.selfplay_dir,
        fsdb.holdout_dir,
        fsdb.eval_dir,
        fsdb.golden_chunk_dir,
        fsdb.working_dir,
    )
    for get_dir in dir_getters:
        utils.ensure_dir_exists(get_dir())

    # Copy the target model to the models directory so we can find it easily.
    shutil.copy('ml_perf/target.pb', fsdb.models_dir())

    root_logger = logging.getLogger()
    log_path = os.path.join(FLAGS.base_dir, 'reinforcement.log')
    root_logger.addHandler(logging.FileHandler(log_path))

    # Every handler of the root logger (including the new file handler)
    # gets the same timestamped format.
    formatter = logging.Formatter('[%(asctime)s] %(message)s',
                                  '%Y-%m-%d %H:%M:%S')
    for handler in root_logger.handlers:
        handler.setFormatter(formatter)

    with utils.logged_timer('Total time'):
        rl_loop()
(i + 1, tst_nsub_acc))) print() for valid_theta_err in args.theta_values: args.err_threshold = valid_theta_err # Prepare logger log_dir = os.path.join( args.log_path, str(valid_theta_err) + "_mnist17split" + "_" + args.optim_type + "_" + str(args.model_arch) + "_" + str(args.n_copies) + "_" + str(args.optim_steps) + "_" + str(args.optim_trials) + "_signed=" + str(args.signed) + "_dynamic_n=" + str(args.dynamic_repeat) + "_batch_estimate=" + str(args.batch_sample_estimate) + "_" + str(args.optim_lr) + "_" + str(args.seed)) utils.ensure_dir_exists(log_dir) logger = SummaryWriter(log_dir=log_dir, flush_secs=10) print( utils.pink_print("Running attack for theta %.2f" % valid_theta_err)) # Get poison data poison_data, theta_t = modelTargetPoisoningEnsemble( thetas_p, logger, args) dp_x = ch.cat(poison_data[0], 0).numpy() dp_y = ch.cat(poison_data[1], 0).numpy() # Save this data poisoned_data_dir = os.path.join(os.path.join(log_dir, "poisondata")) np.savez(poisoned_data_dir, x=dp_x, y=dp_y)
sh.setLevel(logging.INFO) logger.addHandler(sh) # Load default values from config file nlp_config = json.loads(open('nlp-config.json').read())[socket.gethostname()] workers = nlp_config['workers'] # Allow user to configure options parser = OptionParser() parser.add_option("-n", "--workers", dest="workers", action="store", default=workers, help="Specify the number of worker processes to open") (options, args) = parser.parse_args() WORKERS = int(options.workers) # Directory variables for query_write are set here; set vars for query_tar there EVENT_DIR = ensure_dir_exists('/data/events/') TEMP_EVENT_DIR = ensure_dir_exists('/data/temp_events/') TEXT_DIR = ensure_dir_exists('/data/text/') TEMP_TEXT_DIR = ensure_dir_exists('/data/temp_text/') def write_text(event_file): """Takes event file as input, writes text from all queries contained in event file to TEXT_DIR, and returns a list of documents written""" for line in open(event_file): query = line.strip() logger.info('Writing query from %s: "%s"' % (current_process(), query)) qi = QueryIterator('http://search-s11.prod.wikia.net:8983/solr/main/', {'query': query, 'fields': 'id,wid,html_en,indexed', 'sort': 'id asc'}) for doc in qi: # Sanitize and write text text = '\n'.join(clean_list(doc.get('html_en', ''))) localpath = os.path.join(TEXT_DIR, doc['id'])
logger.addHandler(fh) sh = logging.StreamHandler() sh.setLevel(logging.INFO) logger.addHandler(sh) # Allow user to configure options parser = OptionParser() parser.add_option('-b', '--batchsize', dest='batchsize', action='store', default=500, help='Specify the maximum number of files in a .tgz batch') parser.add_option('-l', '--local', dest='local', action='store_true', default=False, help='Specify whether to store text files locally instead of on S3') (options, args) = parser.parse_args() BATCHSIZE = options.batchsize LOCAL = options.local # Directory variables for query_tar are set here; set vars for query_write there TEXT_DIR = ensure_dir_exists('/data/text/') TEMP_TEXT_DIR = ensure_dir_exists('/data/temp_text/') if not LOCAL: bucket = S3Connection().get_bucket('nlp-data') if __name__ == '__main__': # Set to run indefinitely while True: try: bypass_minimum = False # Attempt to enforce minimum batch size, continue after 30 seconds if not logger.debug('Checking # of files in text directory...') num_text_files = len(os.listdir(TEXT_DIR))
def save_data(df, outputDir, outputFile):
    """Write ``df`` as CSV to ``outputDir/outputFile``.

    ``outputDir`` is created first, and the destination path is printed
    before writing.
    """
    ensure_dir_exists(outputDir)
    filepath = "/".join((outputDir, outputFile))
    print("Saving data to " + filepath)
    df.to_csv(filepath)
def main(unused_argv):
    """Run NUM_LOOP rounds of selfplay, golden-chunk creation and training.

    Each round uses the model of the previous round (a bootstrapped model
    in round 0) for selfplay and trains the next model from the resulting
    games. The fsdb base directory is switched to a per-model directory
    below ``base_dir`` for every round.

    NOTE(review): indentation was reconstructed from a flattened source;
    confirm which statements belong to the loop body.
    """
    for i in range(0, NUM_LOOP):
        if i == 0:
            # First round: bootstrap the initial model inside its own base
            # directory, then switch to the directory of the next model.
            src_model_name = shipname.generate(0)
            fsdb.switch_base(os.path.join(base_dir, src_model_name))
            src_model_path = os.path.join(fsdb.models_dir(), src_model_name)
            bootstrap_model_path = os.path.join(fsdb.models_dir(), src_model_name)
            mask_flags.checked_run([
                'python3', 'bootstrap.py',
                '--export_path={}'.format(bootstrap_model_path),
                '--work_dir={}'.format(fsdb.working_dir()),
                '--flagfile=rl_loop/local_flags'
            ])
            dst_model_name = shipname.generate(1)
            fsdb.switch_base(os.path.join(base_dir, dst_model_name))
        else:
            # Later rounds: the previous destination model becomes the
            # source. Its path is computed before the base is switched, so
            # it still points into the previous round's models directory.
            src_model_name = dst_model_name
            src_model_path = os.path.join(fsdb.models_dir(), src_model_name)
            dst_model_name = shipname.generate(i + 1)
            fsdb.switch_base(os.path.join(base_dir, dst_model_name))

        # Create the directory tree below the (newly switched) base.
        utils.ensure_dir_exists(fsdb.models_dir())
        utils.ensure_dir_exists(fsdb.selfplay_dir())
        utils.ensure_dir_exists(fsdb.holdout_dir())
        utils.ensure_dir_exists(fsdb.sgf_dir())
        utils.ensure_dir_exists(fsdb.eval_dir())
        utils.ensure_dir_exists(fsdb.golden_chunk_dir())
        utils.ensure_dir_exists(fsdb.working_dir())

        #bootstrap_name = shipname.generate(0)
        #bootstrap_model_path = os.path.join(fsdb.models_dir(), bootstrap_name)

        print(src_model_name)
        print(src_model_path)
        selfplay_cmd = [
            'python3', 'selfplay.py',
            '--load_file={}'.format(src_model_path),
            '--selfplay_dir={}'.format(
                os.path.join(fsdb.selfplay_dir(), dst_model_name)),
            '--holdout_dir={}'.format(
                os.path.join(fsdb.holdout_dir(), dst_model_name)),
            '--sgf_dir={}'.format(fsdb.sgf_dir()),
            '--holdout_pct=0',
            '--flagfile=rl_loop/local_flags'
        ]

        # Selfplay twice
        mask_flags.checked_run(selfplay_cmd)
        mask_flags.checked_run(selfplay_cmd)
        # and once more to generate a held out game for validation
        # exploits flags behavior where if you pass flag twice, second one wins.
        mask_flags.checked_run(selfplay_cmd + ['--holdout_pct=100'])

        # Double check that at least one sgf has been generated.
        assert os.listdir(os.path.join(fsdb.sgf_dir(), 'full'))

        print("Making shuffled golden chunk from selfplay data...")
        # TODO(amj): refactor example_buffer so it can be called the same way
        # as everything else.
        eb.make_chunk_for(output_dir=fsdb.golden_chunk_dir(),
                          local_dir=fsdb.working_dir(),
                          game_dir=fsdb.selfplay_dir(),
                          model_num=1,
                          positions=64,
                          threads=8,
                          sampling_frac=1)

        tf_records = sorted(
            gfile.Glob(os.path.join(fsdb.golden_chunk_dir(), '*.tfrecord.zz')))

        #trained_model_name = shipname.generate(1)
        # The model trained in this round is the round's destination model.
        trained_model_name = dst_model_name
        trained_model_path = os.path.join(fsdb.models_dir(), trained_model_name)

        # Train on shuffled game data
        mask_flags.checked_run([
            'python3', 'train.py', *tf_records,
            '--work_dir={}'.format(fsdb.working_dir()),
            '--export_path={}'.format(trained_model_path),
            '--flagfile=rl_loop/local_flags'
        ])

    print("Finished!")
def main(argv):
    """Play evaluation matches between two neural nets.

    ``argv`` must contain exactly three items: the program name followed by
    the black and the white model. SGF output goes to
    ``FLAGS.eval_sgf_dir``, which is created first.
    """
    _program, black_model, white_model = argv
    utils.ensure_dir_exists(FLAGS.eval_sgf_dir)
    num_games = FLAGS.num_evaluation_games
    play_match(black_model, white_model, num_games, FLAGS.eval_sgf_dir)
def run_one_shot():
    '''Build the inverted index from scratch and store it on disk.

    The ``output`` and ``data`` directories are created first; the index is
    saved to the output file named by ``init_params()``.
    '''
    for directory in ('output', 'data'):
        utils.ensure_dir_exists(directory)

    inverted_index: Dict[str, Tuple[int, set]] = from_scratch_index_creation()
    # The parameters are only read after the index has been built.
    target_file = init_params().output_file
    save_index_to_disk(inverted_index, outfile=target_file)
def main(unused_argv):
    """Bootstrap random weights and export the model.

    The parent directory of ``FLAGS.export_path`` is created first.
    ``dual_net.bootstrap()`` runs only when ``FLAGS.create_bootstrap`` is
    set; the model is exported to ``FLAGS.export_path`` in either case.
    """
    export_dir = os.path.dirname(FLAGS.export_path)
    utils.ensure_dir_exists(export_dir)

    if FLAGS.create_bootstrap:
        dual_net.bootstrap()

    dual_net.export_model(FLAGS.export_path)
def swa():
    """Average the weights of several saved models and save the result.

    The first ``FLAGS.count`` models of a fixed list are restored into one
    session each. For every stored variable the values from all models are
    averaged (``global_step`` takes the maximum instead), the result is
    assigned in the first session, and that session is saved as
    ``swa-<count>`` below ``FLAGS.data_dir``.
    """
    path_base = fsdb.models_dir()
    model_names = [
        "000393-lincoln",
        "000390-indus",
        "000404-hannibal",
        "000447-hawke",
        "000426-grief",
        "000431-lion",
        "000428-invincible",
        "000303-olympus",
        "000291-superb",
        "000454-victorious",
    ]
    model_names = model_names[:FLAGS.count]
    model_paths = [os.path.join(path_base, m) for m in model_names]

    # construct the graph
    features, labels = dual_net.get_inference_input()
    dual_net.model_fn(features, labels, tf.estimator.ModeKeys.PREDICT,
                      FLAGS.flag_values_dict())

    # restore all saved weights
    meta_graph_def = meta_graph.read_meta_graph_file(model_paths[0] + '.meta')
    stored_var_names = {
        n.name for n in meta_graph_def.graph_def.node if n.op == 'VariableV2'
    }
    var_list = [v for v in tf.global_variables()
                if v.op.name in stored_var_names]
    var_list.sort(key=lambda v: v.op.name)

    print(stored_var_names)
    print(len(stored_var_names), len(var_list))

    sessions = [tf.Session() for _ in model_paths]
    saver = tf.train.Saver()
    for sess, model_path in zip(sessions, model_paths):
        saver.restore(sess, model_path)

    # Load all VariableV2s for each model.
    values = [sess.run(var_list) for sess in sessions]

    # Iterate over all variables and average the values from all models.
    all_assign = []
    for var, vals in zip(var_list, zip(*values)):
        print("{}x {}".format(len(vals), var))

        if var.name == "global_step:0":
            # The step counter is not averaged; keep the largest value.
            avg = vals[0]
            for val in vals:
                avg = tf.maximum(avg, val)
        else:
            avg = tf.add_n(vals) / len(vals)

        # A stray `continue` after the averaging branch used to skip this
        # line, so the computed averages were discarded and only
        # global_step was ever assigned.
        # NOTE(review): confirm every stored variable other than global_step
        # is floating point, otherwise the division changes its dtype.
        all_assign.append(tf.assign(var, avg))

    # Run all assign ops on an existing model (which has other ops and graph).
    sess = sessions[0]
    sess.run(all_assign)

    # Export a new saved model.
    ensure_dir_exists(FLAGS.data_dir)
    dest_path = os.path.join(FLAGS.data_dir, "swa-" + str(FLAGS.count))
    saver.save(sess, dest_path)
def test(self):
    """Make sure ``self.base_path`` exists as a directory and return True.

    Any exception raised by ``utils.ensure_dir_exists`` propagates to the
    caller; there is no code path that returns False.
    """
    target_dir = self.base_path
    utils.ensure_dir_exists(target_dir)
    return True
def run(sess, f, data):
    """Build the model graph, train it, and log summaries and checkpoints.

    Args:
        sess: TensorFlow session used for every `sess.run` call; its graph is
            attached to the summary writers.
        f: Settings object. Reads `model`, `lr`, `loss`, `summary_folder`,
            `run_name`, `epochs`, `train_batch_size`, `test_batch_size`,
            `eval_interval` and `checkpoint_interval`.
        data: Dataset object. Reads `io_shape` and iterates
            `train_epoch_in_batches` / `test_epoch_in_batches`.
    """
    # create graph
    input_size, output_size = data.io_shape
    inputs = tf.placeholder(tf.float32, [None, input_size], name='inputs')
    outputs, _, feed_dicts = m.get(f.model).create_model(inputs, output_size)
    labels = tf.placeholder(tf.float32, [None, output_size], name='labels')
    loss, train_step = create_train_ops(outputs, labels, lr=f.lr, loss=f.loss)
    accuracy = create_eval_ops(outputs, labels, loss=f.loss)
    summary_op = create_summary_ops(loss, accuracy)

    # only initialize non-initialized vars:
    u.init_uninitted_vars(sess)
    # (this is not super important for training, but its very important
    # in optimize, and in distill)

    saver = tf.train.Saver(tf.global_variables())

    summary_dir = os.path.join(f.summary_folder, f.run_name, 'train')
    train_writer = tf.summary.FileWriter(
        os.path.join(summary_dir, 'train'), sess.graph)
    trainbatch_writer = tf.summary.FileWriter(
        os.path.join(summary_dir, 'train_batch'), sess.graph)
    test_writer = tf.summary.FileWriter(
        os.path.join(summary_dir, 'test'), sess.graph)

    def write_eval_summary(batches, writer, step):
        # Evaluate every batch in eval mode and write one merged summary.
        summaries = []
        for eval_x, eval_y in batches:
            summaries.append(
                sess.run(summary_op,
                         feed_dict={
                             **feed_dicts['eval'],
                             inputs: eval_x,
                             labels: eval_y
                         }))
        writer.add_summary(u.merge_summary_list(summaries, True), step)

    with sess.as_default():
        global_step = 0
        for i in range(f.epochs):
            print('Epoch: {}'.format(i))
            for batch_x, batch_y in data.train_epoch_in_batches(
                    f.train_batch_size):
                summary, _ = sess.run([summary_op, train_step],
                                      feed_dict={
                                          **feed_dicts['train'],
                                          inputs: batch_x,
                                          labels: batch_y
                                      })
                trainbatch_writer.add_summary(summary, global_step)

                if global_step % f.eval_interval == 0:
                    # eval test set
                    write_eval_summary(
                        data.test_epoch_in_batches(f.test_batch_size),
                        test_writer, global_step)
                    # eval train set
                    write_eval_summary(
                        data.train_epoch_in_batches(f.train_batch_size),
                        train_writer, global_step)

                global_step += 1

                if global_step % f.checkpoint_interval == 0:
                    checkpoint_dir = os.path.join(summary_dir, 'checkpoint/')
                    u.ensure_dir_exists(checkpoint_dir)
                    checkpoint_file = os.path.join(checkpoint_dir, f.model)
                    saved_file = saver.save(sess,
                                            checkpoint_file,
                                            global_step=global_step)
                    # Reported once per checkpoint; the original repeated
                    # this message a second time.
                    print('saved model at {}'.format(saved_file))
def main(unused_argv):
    """Reset the working tree under `FLAGS.base_dir` and run the RL loop.

    Wipes `FLAGS.base_dir`, recreates the `fsdb` directories, snapshots the
    flag files and target model, sets up file logging, then calls `rl_loop()`
    inside a timing context. The asyncio event loop is closed afterwards even
    when `rl_loop()` raises.

    Args:
        unused_argv: Ignored positional command-line arguments.
    """
    base_dir = FLAGS.base_dir
    print('Wiping dir %s' % base_dir, flush=True)
    shutil.rmtree(FLAGS.base_dir, ignore_errors=True)

    # Recreate each fsdb directory in the same order as before.
    dir_getters = (
        fsdb.models_dir,
        fsdb.selfplay_dir,
        fsdb.holdout_dir,
        fsdb.eval_dir,
        fsdb.golden_chunk_dir,
        fsdb.working_dir,
    )
    for get_dir in dir_getters:
        utils.ensure_dir_exists(get_dir())

    # Copy the flag files so there's no chance of them getting accidentally
    # overwritten while the RL loop is running.
    flags_copy = os.path.join(FLAGS.base_dir, 'flags')
    shutil.copytree(FLAGS.flags_dir, flags_copy)
    FLAGS.flags_dir = flags_copy

    # Copy the target model to the models directory so we can find it easily.
    shutil.copy('ml_perf/target.pb', fsdb.models_dir())

    root_logger = logging.getLogger()
    log_path = os.path.join(FLAGS.base_dir, 'rl_loop.log')
    root_logger.addHandler(logging.FileHandler(log_path))

    # Apply one timestamped format to every handler on the root logger.
    log_format = logging.Formatter('[%(asctime)s] %(message)s',
                                   '%Y-%m-%d %H:%M:%S')
    for log_handler in logging.getLogger().handlers:
        log_handler.setFormatter(log_format)

    with utils.logged_timer('Total time'):
        try:
            rl_loop()
        finally:
            asyncio.get_event_loop().close()
# skip until the first batch has completed if tar: tar.close() # remove text files after tarring shutil.rmtree(dest_dir) # send to aws and remove tarball if aws: k.key = 'text_events/%s' % os.path.basename(tar_file) k.set_contents_from_filename(tar_file) os.remove(tar_file) # send post requests for each wid covered in this batch for wid in wids: requests.post('http://nlp-s1:5000/wiki/%i' % wid) wids = [] batch_count += 1 dest_dir = ensure_dir_exists(TEXT_DIR + '%s_%i' % (os.path.basename(qqfile), batch_count)) # open tarball for writing tar_file = dest_dir + '.tgz' tar = tarfile.open(tar_file, 'w:gz') wid = int(doc['wid']) if wid not in wids: wids.append(wid) # sanitize and write text text = '\n'.join(clean_list(doc.get('html_en', ''))) localpath = os.path.join(dest_dir, doc['id']) with open(localpath, 'w') as f: f.write(text) # add text file to tarball tar.add(localpath, doc['id']) doc_count += 1 # tar the final batch and send to aws
def get_operators(opts,
                  verts,
                  faces,
                  k_eig,
                  normals=None,
                  overwrite_cache=False,
                  truncate_cache=False):
    """
    See documentation for compute_operators(). This essentially just wraps a
    call to compute_operators, using a cache if possible.

    All arrays are always computed using double precision for stability, then
    truncated to single precision floats to store on disk, and finally
    returned as a tensor with dtype/device matching the `verts` input.

    Args:
        opts: Options object; `opts.eigensystem_cache_dir` is the cache
            directory, or None to disable caching.
        verts: Vertex tensor; its device and dtype are applied to cached
            results.
        faces: Face tensor.
        k_eig: Number of eigenvalues/eigenvectors requested.
        normals: Forwarded to compute_operators() on a cache miss.
        overwrite_cache: If True, delete a matching cache entry and recompute.
        truncate_cache: If True, rewrite a matching entry that holds more
            than `k_eig` eigenvalues with the truncated arrays.

    Returns:
        Tuple `(frames, mass, evals, evecs, grad_from_spectral)`.

    Raises:
        RuntimeError: If `verts` contains NaN values.
    """
    device = verts.device
    dtype = verts.dtype
    verts_np = toNP(verts)
    faces_np = toNP(faces)

    if np.isnan(verts_np).any():
        raise RuntimeError("tried to construct operators from NaN verts")

    # Check the cache directory
    # Note 1: Collisions here are exceptionally unlikely, so we could probably just use the hash...
    #         but for good measure we check values nonetheless.
    # Note 2: There is a small possibility for race conditions to lead to bucket gaps or duplicate
    #         entries in this cache. The good news is that that is totally fine, and at most slightly
    #         slows performance with rare extra cache misses.
    found = False
    if opts.eigensystem_cache_dir is not None:
        utils.ensure_dir_exists(opts.eigensystem_cache_dir)
        hash_key_str = str(utils.hash_arrays((verts_np, faces_np)))
        # print("Building operators for input with hash: " + hash_key_str)

        # Search through buckets with matching hashes.  When the loop exits, this
        # is the bucket index of the file we should write to.
        i_cache_search = 0
        while True:

            # Form the name of the file to check
            search_path = os.path.join(
                opts.eigensystem_cache_dir,
                hash_key_str + "_" + str(i_cache_search) + ".npz")

            try:
                # print('loading path: ' + str(search_path))
                npzfile = np.load(search_path, allow_pickle=True)
                cache_verts = npzfile["verts"]
                cache_faces = npzfile["faces"]
                cache_k_eig = npzfile["k_eig"].item()

                # If the cache doesn't match, keep looking.
                # Compare the NumPy copies, not the raw tensors: converting a
                # CUDA or grad-requiring tensor raises, which the broad
                # handler below would swallow as a permanent cache miss.
                if (not np.array_equal(verts_np, cache_verts)) or (
                        not np.array_equal(faces_np, cache_faces)):
                    i_cache_search += 1
                    print("hash collision! searching next.")
                    continue

                # If we're overwriting, or there aren't enough eigenvalues, just delete it; we'll create a new
                # entry below more eigenvalues
                if overwrite_cache or cache_k_eig < k_eig:
                    print(
                        " overwiting / not enough eigenvalues --- recomputing"
                    )
                    os.remove(search_path)
                    break

                # This entry matches! Return it.
                found = True
                frames = npzfile["frames"]
                mass = npzfile["mass"]
                evals = npzfile["evals"][:k_eig]
                evecs = npzfile["evecs"][:, :k_eig]
                grad_from_spectral = npzfile[
                    "grad_from_spectral"][:, :k_eig, :]

                if truncate_cache and cache_k_eig > k_eig:
                    print("TRUNCATING CACHE {} --> {}".format(
                        cache_k_eig, k_eig))
                    np.savez(
                        search_path,
                        verts=verts_np,
                        frames=frames,
                        faces=faces_np,
                        k_eig=k_eig,
                        mass=mass,
                        evals=evals,
                        evecs=evecs,
                        grad_from_spectral=grad_from_spectral,
                    )

                frames = torch.from_numpy(frames).to(device=device,
                                                     dtype=dtype)
                mass = torch.from_numpy(mass).to(device=device, dtype=dtype)
                evals = torch.from_numpy(evals).to(device=device, dtype=dtype)
                evecs = torch.from_numpy(evecs).to(device=device, dtype=dtype)
                grad_from_spectral = torch.from_numpy(grad_from_spectral).to(
                    device=device, dtype=dtype)

                break

            except FileNotFoundError:
                print(" cache miss -- constructing operators")
                break

            except Exception as E:
                # Best effort: any unreadable entry is treated as a miss.
                print("unexpected error loading file: " + str(E))
                print("-- constructing operators")
                break

    if not found:

        # No matching entry found; recompute.
        frames, mass, evals, evecs, grad_from_spectral = compute_operators(
            verts, faces, k_eig, normals=normals)

        dtype_np = np.float32

        # Store it in the cache. `search_path` is always bound here because
        # the search loop above ran whenever the cache dir is set.
        if opts.eigensystem_cache_dir is not None:
            np.savez(
                search_path,
                verts=verts_np,
                frames=toNP(frames).astype(dtype_np),
                faces=faces_np,
                k_eig=k_eig,
                mass=toNP(mass).astype(dtype_np),
                evals=toNP(evals).astype(dtype_np),
                evecs=toNP(evecs).astype(dtype_np),
                grad_from_spectral=toNP(grad_from_spectral).astype(dtype_np),
            )

    return frames, mass, evals, evecs, grad_from_spectral