示例#1
0
def main():
    """Compute features for every ticker listed in the ticker file.

    Flags give the ticker list file, the directory holding one
    ``<ticker>.csv`` sample file per ticker, the market sample file and the
    output directory.  A ticker is skipped with a warning when its input
    file is missing, or when its output file already exists and
    ``--overwrite`` was not passed.
    """
    parser = argparse.ArgumentParser()
    for required_flag in ('--ticker_file', '--sample_dir',
                          '--market_sample_path', '--output_dir'):
        parser.add_argument(required_flag, required=True)
    parser.add_argument('--er_months', default=ER_MONTHS)
    parser.add_argument('--ev_months', default=EV_MONTHS)
    for switch in ('--overwrite', '--verbose'):
        parser.add_argument(switch, action='store_true')
    args = parser.parse_args()

    utils.setup_logging(args.verbose)

    market_samples = utils.read_samples(args.market_sample_path)
    # Both month flags are comma-separated lists of integers.
    er_months = list(map(int, args.er_months.split(',')))
    ev_months = list(map(int, args.ev_months.split(',')))

    # The ticker file holds one ticker per line.
    with open(args.ticker_file, 'r') as fp:
        tickers = fp.read().splitlines()
    total = len(tickers)
    logging.info('Processing %d tickers', total)

    for position, ticker in enumerate(tickers, start=1):
        # Index symbols such as ^GSPC must not appear in the ticker list.
        assert '^' not in ticker
        logging.info('%d/%d: %s', position, total, ticker)

        stock_sample_path = '%s/%s.csv' % (args.sample_dir, ticker)
        if not path.isfile(stock_sample_path):
            logging.warning('Input file does not exist: %s',
                            stock_sample_path)
            continue

        # Features are written as .txt; the output is no longer csv.
        output_path = '%s/%s.txt' % (args.output_dir, ticker)
        output_exists = path.isfile(output_path)
        if output_exists and not args.overwrite:
            logging.warning('Output file exists: %s, skipping', output_path)
            continue

        compute_features(utils.read_samples(stock_sample_path),
                         market_samples, er_months, ev_months, output_path)
示例#2
0
def main():
  """Write one feature file per ticker from stock and market samples.

  Reads the command-line flags, loads the market samples once, then walks
  the ticker list.  Tickers without an input csv, and tickers whose output
  txt already exists (unless --overwrite is set), are skipped with a warning.
  """
  def parse_months(spec):
    # Month flags are comma-separated integers.
    return [int(month) for month in spec.split(',')]

  parser = argparse.ArgumentParser()
  parser.add_argument('--ticker_file', required=True)
  parser.add_argument('--sample_dir', required=True)
  parser.add_argument('--market_sample_path', required=True)
  parser.add_argument('--output_dir', required=True)
  parser.add_argument('--er_months', default=ER_MONTHS)
  parser.add_argument('--ev_months', default=EV_MONTHS)
  parser.add_argument('--overwrite', action='store_true')
  parser.add_argument('--verbose', action='store_true')
  args = parser.parse_args()

  utils.setup_logging(args.verbose)

  market_samples = utils.read_samples(args.market_sample_path)
  er_months = parse_months(args.er_months)
  ev_months = parse_months(args.ev_months)

  # One ticker per line.
  with open(args.ticker_file, 'r') as ticker_fp:
    tickers = ticker_fp.read().splitlines()
  num_tickers = len(tickers)
  logging.info('Processing %d tickers' % num_tickers)

  for index, ticker in enumerate(tickers):
    # ^GSPC (or any other ^ symbol) should not be in tickers.
    assert ticker.find('^') == -1
    logging.info('%d/%d: %s' % (index + 1, num_tickers, ticker))

    stock_sample_path = '%s/%s.csv' % (args.sample_dir, ticker)
    if not path.isfile(stock_sample_path):
      logging.warning('Input file does not exist: %s' % stock_sample_path)
      continue

    # Output uses the txt extension because the format is no longer csv.
    output_path = '%s/%s.txt' % (args.output_dir, ticker)
    already_written = path.isfile(output_path)
    if already_written and not args.overwrite:
      logging.warning('Output file exists: %s, skipping' % output_path)
      continue

    stock_samples = utils.read_samples(stock_sample_path)
    compute_features(
        stock_samples, market_samples, er_months, ev_months, output_path)
示例#3
0
def approximate_function():
    """Train the single-sample approximator on a shuffled train/val split.

    Reads the samples from the module-level ``sample_file``, shuffles them
    in place, and uses the first 60% for training and the next 20% for
    validation.  The remaining 20% is left untouched as a held-out split.
    Training runs for ``NUM_EPOCHS`` epochs with Adam (lr=1e-2) and an MSE
    criterion; the approximator is given ``'square-single.model'`` as its
    model file.
    """
    samples = read_samples(sample_file)
    random.shuffle(samples)

    train_end = int(0.6 * len(samples))
    val_end = int(0.8 * len(samples))
    train_data = samples[:train_end]
    val_data = samples[train_end:val_end]
    # samples[val_end:] is the held-out test split; it is not used here.

    model = Rd_difference_approximator()
    if GPU:
        # Move the model that is actually trained below onto the GPU.
        model = model.cuda()

    approximator = SingleSampleFunctionApproximator(
        train_data, model=model, model_file='square-single.model')
    optimizer = optim.Adam(model.parameters(), lr=1e-2)
    criterion = nn.MSELoss()
    approximator.approximate(val_data, optimizer, criterion, NUM_EPOCHS)
def main():
    true_path = sys.argv[1]
    samples_path = sys.argv[2]
    base_output_path = sys.argv[3]
    appendix_output_path = sys.argv[4]
    runs = int(sys.argv[5])

    global samples
    samples = utils.read_samples(samples_path)
    samples = np.copy(samples[1000:])

    matches, errors = evaluate_filtered_match_iterative(
        true_path, base_output_path, appendix_output_path, runs)
    print 'Filtered matches:', matches
    print 'Perfect:', np.sum([(x[0] == 24 and x[1] == 24) for x in matches])
    print 'Errors:', errors, np.average(errors)
示例#5
0
def main():
    """Train the comma.ai model and print its evaluation on the test split."""
    # Load the sample rows, drop the first one, and pass every remaining
    # row through append_path.
    rows = read_samples(DATA_DIR)
    rows.pop(0)
    rows = [append_path(row) for row in rows]

    # Hold out 10% for testing, then 10% of the remainder for validation.
    train_set, test_set = train_test_split(rows, test_size=0.1)
    train_set, valid_set = train_test_split(train_set, test_size=0.1)

    # Iterators that load and preprocess the images batch by batch.
    batch = 128
    train_iterator = ImageGenerator(train_set, batch_size=batch, corr=0.2)
    valid_iterator = CenterImageGenerator(valid_set, batch_size=batch)
    test_iterator = CenterImageGenerator(test_set, batch_size=batch)

    model = comma_model()

    # Stop once val_loss has failed to improve for two epochs.
    early_stopping = EarlyStopping(
        monitor='val_loss', min_delta=0, patience=2, verbose=1, mode='auto')
    # Keep a checkpoint of the full model whenever val_loss improves.
    checkpoint = ModelCheckpoint(
        'comma.{epoch:02d}-{val_loss:.2f}.hdf5',
        monitor='val_loss',
        verbose=0,
        save_best_only=True,
        save_weights_only=False,
        mode='auto',
        period=1)

    print('Training comma.ai model')
    model.fit_generator(
        generator=train_iterator,
        epochs=10,
        validation_data=valid_iterator,
        steps_per_epoch=len(train_iterator),
        validation_steps=len(valid_iterator),
        callbacks=[checkpoint, early_stopping])
    print('Done.')

    test_mse = model.evaluate_generator(
        generator=test_iterator, steps=len(test_iterator))
    print('Test mse: {}'.format(test_mse))
示例#6
0
        for i in range(0,len(test_data)*len(ref_x),len(ref_x)):
            predictions = [x[0].detach().numpy() for x in pair_outputs[i:i+len(ref_x)]]
            predictions_reference = [x[1].detach() for x in pair_outputs[i:i+len(ref_x)] ]
            weights = [1-abs((x-y)/y) for x,y in zip(predictions_reference,ref_y) ] 
            mean_prediction = np.average(predictions,weights=weights)
            outputs.append(mean_prediction)
        return outputs
    
    def evaluate(self, test_data, criterion):
        """Print ``criterion`` applied to the predictions for ``test_data``.

        The target of each sample is read from index 1 of that sample.
        """
        predicted = Variable(torch.FloatTensor(self.predict(test_data)))
        expected = Variable(
            torch.FloatTensor([sample[1] for sample in test_data]))
        print(criterion(predicted, expected))
        
def plot_figure(inputs, predictions, outfile):
    """Placeholder for plotting; currently does nothing and returns None."""
    return None

if __name__ == '__main__':
    # Load the two previously saved models.
    siamese_model = load_pytorch_model('square-siamese.model')
    single_model = load_pytorch_model("square-single.model")

    # NOTE(review): the "ood" samples and the reference samples are read
    # from the same file -- confirm that this is intended.
    ood_samples = read_samples('square.csv')
    samples = read_samples('square.csv')

    # Two dimensions, each bounded to [-1, 1].
    R_2 = {'num_dims': 2, 'bounds': [(-1, 1), (-1, 1)]}
    domain = Bounded_Rd(R_2['num_dims'], R_2['bounds'])

    approximator = SamplePairCoApproximator(
        samples, domain, differential_model=siamese_model)
    approximator_single = SingleSampleFunctionApproximator(
        samples, model=single_model)

    # Print the MSE of both approximators, siamese first.
    criterion = nn.MSELoss()
    for evaluator in (approximator, approximator_single):
        evaluator.evaluate(ood_samples, criterion)

    siamese_predictions = approximator.predict(ood_samples)
    single_predictions = approximator_single.predict(ood_samples)
    plot_figure(ood_samples, siamese_predictions, 'results-siamese.png')
    plot_figure(ood_samples, single_predictions, 'results-single.png')
示例#7
0
文件: run.py 项目: aminnj/duck
# Top-level driver script (Python 2).  The snippet is cut off after the
# first statement of the final loop.

# Renew the proxy when the remaining lifetime reported by the helper drops
# below 60.
if u.proxy_hours_left() < 60:
    print ">>> Proxy near end of lifetime, renewing."
    u.proxy_renew()

u.copy_jecs()


# Sample objects created so far; kept across iterations of the loop below.
all_samples = []
for i in range(5000):

    # Fresh per-iteration summary.
    data = { "samples": [], "last_updated": None }

    # read instructions file. if new sample found, add it to list
    # for existing samples, try to update params (xsec, kfact, etc.)
    # NOTE(review): `samp` comes from read_samples while all_samples holds
    # Samples.Sample objects, so `in` / `index` presumably rely on
    # Sample.__eq__ -- confirm.
    for samp in u.read_samples(instructions):
        if samp not in all_samples:
            if DO_TEST: 
                samp["specialdir_test"] = True
                print ">>> You have specified DO_TEST, so final samples will end up in snt/test/!"
            s = Samples.Sample(**samp) 
            all_samples.append(s)
        else:
            all_samples[all_samples.index(samp)].update_params(samp)

    # for isample, s in enumerate(all_samples):
    #     s.nuke()
    # sys.exit()

    # Process every known sample according to its current status.
    for isample, s in enumerate(all_samples):
        stat = s.get_status()
示例#8
0
def train_model(args: dict, hparams: dict):
    """Fine-tune ``bert-base-uncased`` for sequence classification.

    Reads the samples from ``args.dataset_filepath``, splits them 90/10
    into training and validation sets, trains for ``args.n_epochs`` epochs
    and, after every epoch, logs the validation loss/accuracy and saves the
    whole model into a fresh timestamped directory under ``saves/``.

    Args:
        args: Run configuration.  It is read by attribute
            (``dataset_filepath``, ``device_no``, ``lcr``, ``batch_size``,
            ``learning_rate``, ``n_epochs``) despite the ``dict`` annotation.
        hparams: Mapping that provides ``"seed_val"`` and ``"adam_epsilon"``.

    Raises:
        ValueError: If the dataset file yields no samples.
    """
    dataset_path = args.dataset_filepath
    # truncation = args.truncation

    # Seed every RNG before anything random happens, so that the shuffled
    # train/validation split and the model initialisation are reproducible
    # as well as the training loop itself.
    seed_val = hparams["seed_val"]
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    device = utils.get_device(device_no=args.device_no)

    # Every run writes into its own directory named from the start time.
    saves_dir = "saves/"
    Path(saves_dir).mkdir(parents=True, exist_ok=True)
    started_at = datetime.datetime.now()
    saves_path = os.path.join(saves_dir, utils.get_filename(started_at))
    Path(saves_path).mkdir(parents=True, exist_ok=True)

    log_path = os.path.join(saves_path, "training.log")
    logging.basicConfig(filename=log_path, filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
    logger = logging.getLogger()

    logger.info("File: "+str(dataset_path))
    logger.info("Parameters: "+str(args))
    logger.info("Hyperparameters: "+str(hparams))
    # logger.info("Truncation: "+truncation)

    # Load the BERT tokenizer.
    logger.info('Loading BERT tokenizer...')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    samples = utils.read_samples(dataset_path)

    # Join the lower-cased text fields with " [SEP] " so each sample becomes
    # [text, label, extra]; with args.lcr there are three text fields,
    # otherwise two.  Only the text and the label are used below.
    if args.lcr:
        samples = [[val[0].lower()+" [SEP] "+val[1].lower()+" [SEP] "+val[2].lower(), val[3], val[4]] for val in samples]
    else:
        samples = [[val[0].lower()+" [SEP] "+val[1].lower(), val[2], val[3]] for val in samples]

    if not samples:
        # Without this check the failure would surface later as an opaque
        # "math domain error" from math.log2(0).
        raise ValueError("No samples found in " + str(dataset_path))

    no_of_labels = len({val[1] for val in samples})

    logger.info("No of unique labels: "+str(no_of_labels))

    train_size = int(0.9 * len(samples))
    val_size = len(samples) - train_size

    random.shuffle(samples)

    train_samples = samples[:train_size]
    val_samples = samples[train_size:]

    train_samples_text = [val[0] for val in train_samples]
    train_samples_label = [val[1] for val in train_samples]
    val_samples_text = [val[0] for val in val_samples]
    val_samples_label = [val[1] for val in val_samples]

    # Longest tokenized text (including the special tokens) over both splits.
    max_len = 0
    for text in train_samples_text+val_samples_text:
        encoded = tokenizer(text, add_special_tokens=True)
        max_len = max(max_len, len(encoded['input_ids']))

    logger.info('Max text length: ' + str(max_len))

    # Round up to the next power of two, capped at 512.
    max_len = pow(2, math.ceil(math.log2(max_len)))
    max_len = min(512, max_len)

    batch_size = args.batch_size

    (train_input_ids, train_attention_masks, train_samples_label_tensor) = make_smart_batches(train_samples_text, train_samples_label, batch_size, logger, tokenizer, max_len)
    (val_input_ids, val_attention_masks, val_samples_label_tensor) = make_smart_batches(val_samples_text, val_samples_label, batch_size, logger, tokenizer, max_len)

    logger.info('{:>5,} training samples'.format(train_size))
    logger.info('{:>5,} validation samples'.format(val_size))

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
        num_labels = no_of_labels, # One output per distinct label in the data.
        output_attentions = False, # Whether the model returns attentions weights.
        output_hidden_states = False, # Whether the model returns all hidden-states.
    )

    model = model.to(device=device)
    # model.cuda(device=device)

    optimizer = AdamW(model.parameters(),
                    lr = args.learning_rate, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                    eps = hparams["adam_epsilon"] # args.adam_epsilon  - default is 1e-8.
                    )
    epochs = args.n_epochs

    # One scheduler step per training batch, for every epoch.
    total_steps = len(train_input_ids) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0, # Default value in run_glue.py
                                                num_training_steps = total_steps)

    training_stats = []

    for epoch_i in range(0, epochs):

        logger.info("")
        logger.info('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        logger.info('Training...')

        total_train_loss = 0

        model.train()

        for step, batch in enumerate(zip(train_input_ids, train_attention_masks, train_samples_label_tensor)):

            # Progress line every 40 batches.
            if step % 40 == 0 and not step == 0:
                logger.info('  Batch {:>5,}  of  {:>5,}. '.format(step, len(train_input_ids)))

            b_input_ids = batch[0].to(device=device)
            b_input_mask = batch[1].to(device=device)
            b_labels = batch[2].to(device=device)

            # Converting labels to float32 because I was getting some runtime error.
            # Not sure why we need to make labels float32. Keeping it Long or int64 works in case of headlines.
            # b_labels = batch[2].to(device=device, dtype=torch.float32)

            model.zero_grad()

            # NOTE(review): tuple unpacking assumes the model returns
            # (loss, logits) -- confirm for the installed transformers version.
            loss, logits = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask,
                                labels=b_labels)

            total_train_loss += loss.detach().cpu().numpy()

            loss.backward()

            # Clip the gradient norm to 1.0 before the optimizer update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()

            scheduler.step()

        avg_train_loss = total_train_loss / len(train_input_ids)

        logger.info("")
        logger.info("Average training loss: {0:.2f}".format(avg_train_loss))

        logger.info("")
        logger.info("Running Validation...")

        model.eval()

        total_eval_accuracy = 0
        total_eval_loss = 0

        for batch in zip(val_input_ids, val_attention_masks, val_samples_label_tensor):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            with torch.no_grad():

                (loss, logits) = model(b_input_ids,
                                    token_type_ids=None,
                                    attention_mask=b_input_mask,
                                    labels=b_labels)

            total_eval_loss += loss.detach().cpu().numpy()

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Accuracy is accumulated per batch and averaged over batches.
            total_eval_accuracy += flat_accuracy(logits, label_ids)

        avg_val_accuracy = total_eval_accuracy / len(val_input_ids)
        logger.info("Accuracy: {0:.2f}".format(avg_val_accuracy))

        avg_val_loss = total_eval_loss / len(val_input_ids)

        logger.info("Validation Loss: {0:.2f}".format(avg_val_loss))

        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': avg_val_accuracy,
            }
        )

        # Save the full model object after every epoch.
        model_save_path = os.path.join(saves_path, "model_"+str(epoch_i+1)+"epochs")
        torch.save(model, model_save_path)

    logger.info("")
    logger.info("Training complete!")

    # Detach and close the log handlers so a later run in the same process
    # can configure logging again.
    handlers = logger.handlers[:]
    for handler in handlers:
        handler.close()
        logger.removeHandler(handler)
示例#9
0
    'epochs': 2000,
    'heldout': 100
}
# Script set-up (Python 2): parse the command line, load and split the
# samples, initialise A, and redirect stdout to a log file.
batch_size = config['batch_size']

# samples_path, output_path and topics are required; seed is optional.
if len(sys.argv) < 4:
    print "usage: python train_relu.py samples_path output_path topics seed"
    sys.exit()
samples_path = sys.argv[1]
output_path = sys.argv[2]
K = int(sys.argv[3])
if len(sys.argv) > 4:
    # NumPy is only seeded when a seed is given on the command line.
    config['seed'] = int(sys.argv[4])
    np.random.seed(config['seed'])

samples = utils.read_samples(samples_path)
N = samples.shape[1]

# The first config['heldout'] rows are set aside before the rest is shuffled.
samples_heldout = np.copy(samples[:config['heldout']])
samples = np.copy(samples[config['heldout']:])
np.random.shuffle(samples)
S = samples.shape[0]  # number of samples
S_heldout = samples_heldout.shape[0]  # number of heldout samples

A = utils.get_initial_A_gaussian(N, K)

# From here on every print goes to <output_path>/log.txt.
# NOTE(review): the file object is never closed explicitly in the visible code.
sys.stdout = open(output_path + '/' + 'log.txt', 'w')
print config

# Train.
for epoch in range(config['epochs']):
示例#10
0
def main(instructions=None, params=None, do_one_iteration=False):
    """Drive every sample in the instructions file through its workflow.

    Repeats up to 5000 times: renew the proxy if needed, re-read the
    instructions, create or update the Sample objects, advance each sample
    according to its type ("CMS3" or "BABY") and status, then write a
    summary to data.json.  Python 2 code.

    instructions     -- passed to u.read_samples and watched while sleeping;
                        the function returns at once when it is falsy.
    params           -- settings object; the `params` module is imported
                        when it is falsy.
    do_one_iteration -- when true, stop after the first pass instead of
                        sleeping and looping.
    """
    if not instructions:
        return

    if not params:
        params = __import__('params')

    data_json = "data.json"
    # actions.txt lives in the same directory as this file.
    actions_fname = os.path.abspath(__file__).rsplit("/",
                                                     1)[0] + "/actions.txt"

    u.copy_jecs()
    logger_name = u.setup_logger()
    logger = logging.getLogger(logger_name)

    # Carry over the time_stats history from an existing data.json, if any.
    time_stats = []
    if os.path.isfile(data_json):
        with open(data_json, "r") as fhin:
            data = json.load(fhin)
            if "time_stats" in data: time_stats = data["time_stats"]

    # Sample objects persist across iterations.
    all_samples = []
    for i in range(5000):

        if u.proxy_hours_left() < 60 and not params.FORSAKE_HEAVENLY_PROXY:
            u.proxy_renew()

        # Per-iteration summary; it shares the time_stats list created above.
        data = {"samples": [], "last_updated": None, "time_stats": time_stats}

        # read instructions file. if new sample found, add it to list
        # for existing samples, try to update params (xsec, kfact, etc.)
        # NOTE(review): `samp` comes from read_samples while all_samples
        # holds Samples.Sample objects, so `in` / `index` presumably rely on
        # Sample.__eq__ -- confirm.
        for samp in u.read_samples(instructions):
            samp["params"] = params
            if samp not in all_samples:
                s = Samples.Sample(**samp)
                all_samples.append(s)
            else:
                all_samples[all_samples.index(samp)].update_params(samp)

        n_done = 0
        n_samples = len(all_samples)
        for isample, s in enumerate(all_samples):

            # Any failure while handling one sample is logged and the loop
            # moves on to the next sample.
            try:
                stat = s.get_status()
                typ = s.get_type()

                # grab actions from a text file and act on them, consuming (removing) them if successful
                for dataset_name, action in u.get_actions(
                        actions_fname=actions_fname,
                        dataset_name=s["dataset"]):
                    if s.handle_action(action):
                        u.consume_actions(dataset_name=s["dataset"],
                                          action=action,
                                          actions_fname=actions_fname)

                # A sample failing the prechecks is skipped entirely: it is
                # neither saved nor added to data["samples"] this iteration.
                if not s.pass_tsa_prechecks(): continue

                if typ == "CMS3":

                    if stat == "new":
                        s.crab_submit()
                    elif stat == "crab":
                        s.crab_parse_status()
                        if s.is_crab_done():
                            s.make_miniaod_map()
                            s.make_merging_chunks()
                            s.submit_merge_jobs()
                    elif stat == "postprocessing":
                        if s.is_merging_done():
                            if s.check_output():
                                s.make_metadata()
                                s.copy_files()
                        else:
                            s.submit_merge_jobs()
                    elif stat == "done":
                        s.do_done_stuff()
                        n_done += 1

                elif typ == "BABY":

                    if stat == "new":
                        s.set_baby_inputs()
                        s.submit_baby_jobs()

                    elif stat == "condor" or stat == "postprocessing":
                        if params.open_datasets:
                            s.check_new_merged_for_babies()

                        if not params.open_datasets and s.is_babymaking_done():
                            s.set_status("done")
                        else:
                            s.sweep_babies()
                            s.submit_baby_jobs()

                    elif stat == "done":
                        # With open datasets a "done" BABY sample is never
                        # counted in n_done.
                        if params.open_datasets:
                            s.check_new_merged_for_babies()
                        else:
                            s.do_done_stuff()
                            n_done += 1

                s.save()
                data["samples"].append(s.get_slimmed_dict())

            except Exception, err:
                logger.info(
                    "send an (angry?) email to Nick with the Traceback below!!"
                )
                logger.info(traceback.format_exc())

        # Aggregate the crab and baby job breakdowns over all samples.
        breakdown_crab = u.sum_dicts([
            samp["crab"]["breakdown"] for samp in data["samples"]
            if "crab" in samp and "breakdown" in samp["crab"]
        ])
        # breakdown_baby = u.sum_dicts([{"baby_"+key:samp["baby"].get(key,0) for key in ["running", "sweepRooted"]} for samp in data["samples"] if samp["type"] == "BABY"])
        breakdown_baby = u.sum_dicts([{
            "running_babies":
            samp["baby"]["running"],
            "sweepRooted_babies":
            samp["baby"]["sweepRooted"]
        } for samp in data["samples"] if samp["type"] == "BABY"])
        tot_breakdown = u.sum_dicts([breakdown_crab, breakdown_baby])
        data["last_updated"] = u.get_timestamp()
        data["time_stats"].append((u.get_timestamp(), tot_breakdown))
        data["log"] = u.get_last_n_lines(fname=params.log_file, N=100)
        with open(data_json, "w") as fhout:
            # False sorts before True, so samples that are not "done" are
            # listed first.
            data["samples"] = sorted(
                data["samples"],
                key=lambda x: x.get("status", "done") == "done")
            json.dump(data, fhout, sort_keys=True, indent=4)
        u.copy_json(params)

        if params.exit_when_done and (n_done == n_samples):
            print ">>> All %i samples are done. Exiting." % n_samples
            break

        if not do_one_iteration:
            # 60 s after each of the first two iterations, 1200 s afterwards.
            sleep_time = 60 if i < 2 else 2 * 600
            logger.debug("sleeping for %i seconds..." % sleep_time)
            u.smart_sleep(sleep_time,
                          files_to_watch=[actions_fname, instructions])
        else:
            break
示例#11
0
# Pick the proxy
# The proxy file is forwarded to crabCommand as a keyword argument unless
# params.FORSAKE_HEAVENLY_PROXY is set, in which case nothing is passed.
proxy_file_dict = {}
if not params.FORSAKE_HEAVENLY_PROXY: proxy_file_dict = {"proxy": u.get_proxy_file()}
else: print ">>> You have chosen to forsake your heavenly proxy. Be wary of prompts for your password."

# Check write permissions
print BLUE,"Checking write permissions to UCSD...",ENDC
out = crabCommand('checkwrite', site="T2_US_UCSD", **proxy_file_dict)
print "Done. Status: %s" % out["status"]
print


# Take first dataset name in instructions.txt
print BLUE, "Taking the first sample in instructions.txt. If it's not a FullSim MC sample, then you're going to have a bad time!", ENDC
sample = u.read_samples()[0]
dataset_name = sample["dataset"]
gtag = sample["gtag"]
print "  --> %s" % dataset_name
print


# Find the smallest MINIAOD file
# Entries without a "sizeGB" field are treated as 999.0 GB and so sort last.
filelist = dis.query(dataset_name, detail=True, typ="files")
filelist = filelist["response"]["payload"]
filelist = sorted(filelist, key=lambda x: x.get("sizeGB", 999.0))
smallest_filename = filelist[0]["name"]
print BLUE, "Smallest file", ENDC
print "  --> %s" % smallest_filename
print 
示例#12
0
文件: run.py 项目: cmstas/NtupleTools
def main(instructions=None, params=None, do_one_iteration=False):
    """Drive every sample in the instructions file through its workflow.

    Repeats up to 5000 times: renew the proxy if needed, re-read the
    instructions, create or update the Sample objects, advance each sample
    according to its type ("CMS3" or "BABY") and status, then write a
    summary to data.json.  Python 2 code.

    instructions     -- passed to u.read_samples and watched while sleeping;
                        the function returns at once when it is falsy.
    params           -- settings object; the `params` module is imported
                        when it is falsy.
    do_one_iteration -- when true, stop after the first pass instead of
                        sleeping and looping.
    """
    if not instructions:
        return

    if not params:
        params = __import__('params')

    data_json = "data.json"
    # actions.txt lives in the same directory as this file.
    actions_fname = os.path.abspath(__file__).rsplit("/",1)[0]+"/actions.txt"

    u.copy_jecs()
    logger_name = u.setup_logger()
    logger = logging.getLogger(logger_name)


    # Carry over the time_stats history from an existing data.json, if any.
    time_stats = []
    if os.path.isfile(data_json):
        with open(data_json, "r") as fhin:
            data = json.load(fhin)
            if "time_stats" in data: time_stats = data["time_stats"]

    # Sample objects persist across iterations.
    all_samples = []
    for i in range(5000):

        if u.proxy_hours_left() < 60 and not params.FORSAKE_HEAVENLY_PROXY: u.proxy_renew()

        # Per-iteration summary; it shares the time_stats list created above.
        data = { "samples": [], "last_updated": None, "time_stats": time_stats }

        # read instructions file. if new sample found, add it to list
        # for existing samples, try to update params (xsec, kfact, etc.)
        # NOTE(review): `samp` comes from read_samples while all_samples
        # holds Samples.Sample objects, so `in` / `index` presumably rely on
        # Sample.__eq__ -- confirm.
        for samp in u.read_samples(instructions):
            samp["params"] = params
            if samp not in all_samples:
                s = Samples.Sample(**samp) 
                all_samples.append(s)
            else:
                all_samples[all_samples.index(samp)].update_params(samp)


        n_done = 0
        n_samples = len(all_samples)
        for isample, s in enumerate(all_samples):

            # Any failure while handling one sample is logged and the loop
            # moves on to the next sample.
            try:
                stat = s.get_status()
                typ = s.get_type()

                # grab actions from a text file and act on them, consuming (removing) them if successful
                for dataset_name, action in u.get_actions(actions_fname=actions_fname,dataset_name=s["dataset"]):
                    if s.handle_action(action):
                        u.consume_actions(dataset_name=s["dataset"],action=action, actions_fname=actions_fname)

                # A sample failing the prechecks is skipped entirely: it is
                # neither saved nor added to data["samples"] this iteration.
                if not s.pass_tsa_prechecks(): continue


                if typ == "CMS3":

                    if stat == "new":
                        s.crab_submit()
                    elif stat == "crab":
                        s.crab_parse_status()
                        if s.is_crab_done():
                            s.make_miniaod_map()
                            s.make_merging_chunks()
                            s.submit_merge_jobs()
                    elif stat == "postprocessing":
                        if s.is_merging_done():
                            if s.check_output():
                                s.make_metadata()
                                s.copy_files()
                        else:
                            s.submit_merge_jobs()
                    elif stat == "done":
                        s.do_done_stuff()
                        n_done += 1

                elif typ == "BABY":
                    
                    if stat == "new":
                        s.set_baby_inputs()
                        s.submit_baby_jobs()

                    elif stat == "condor" or stat == "postprocessing":
                        if params.open_datasets:
                            s.check_new_merged_for_babies()

                        if not params.open_datasets and s.is_babymaking_done():
                            s.set_status("done")
                        else:
                            # s.sweep_babies()
                            s.sweep_babies_parallel()
                            s.submit_baby_jobs()

                    elif stat == "done":
                        # With open datasets a "done" BABY sample is never
                        # counted in n_done.
                        if params.open_datasets:
                            s.check_new_merged_for_babies()
                        else:
                            s.do_done_stuff()
                            n_done += 1


                s.save()
                data["samples"].append( s.get_slimmed_dict() )

            except Exception, err:
                logger.info( "send an (angry?) email to Nick with the Traceback below!!")
                logger.info( traceback.format_exc() )

        # Aggregate the crab and baby job breakdowns over all samples.
        breakdown_crab = u.sum_dicts([samp["crab"]["breakdown"] for samp in data["samples"] if "crab" in samp and "breakdown" in samp["crab"]])
        # breakdown_baby = u.sum_dicts([{"baby_"+key:samp["baby"].get(key,0) for key in ["running", "sweepRooted"]} for samp in data["samples"] if samp["type"] == "BABY"])
        breakdown_baby = u.sum_dicts([{"running_babies":samp["baby"]["running"], "sweepRooted_babies":samp["baby"]["sweepRooted"]} for samp in data["samples"] if samp["type"] == "BABY"])
        tot_breakdown = u.sum_dicts([breakdown_crab, breakdown_baby])
        data["last_updated"] = u.get_timestamp()
        data["time_stats"].append( (u.get_timestamp(), tot_breakdown) )
        data["log"] = u.get_last_n_lines(fname=params.log_file, N=100)
        with open(data_json, "w") as fhout:
            # False sorts before True, so samples that are not "done" are
            # listed first.
            data["samples"] = sorted(data["samples"], key=lambda x: x.get("status","done")=="done")
            json.dump(data, fhout, sort_keys = True, indent = 4)
        u.copy_json(params)

        if params.exit_when_done and (n_done == n_samples):
            print ">>> All %i samples are done. Exiting." % n_samples
            break

        if not do_one_iteration:
            # 60 s after each of the first two iterations, 1200 s afterwards.
            sleep_time = 60 if i < 2 else 2*600
            logger.debug("sleeping for %i seconds..." % sleep_time)
            u.smart_sleep(sleep_time, files_to_watch=[actions_fname, instructions])
        else:
            break
示例#13
0
# Pick the proxy
# The proxy file is forwarded to crabCommand as a keyword argument unless
# params.FORSAKE_HEAVENLY_PROXY is set, in which case nothing is passed.
proxy_file_dict = {}
if not params.FORSAKE_HEAVENLY_PROXY:
    proxy_file_dict = {"proxy": u.get_proxy_file()}
else:
    print ">>> You have chosen to forsake your heavenly proxy. Be wary of prompts for your password."

# Check write permissions
print BLUE, "Checking write permissions to UCSD...", ENDC
out = crabCommand('checkwrite', site="T2_US_UCSD", **proxy_file_dict)
print "Done. Status: %s" % out["status"]
print

# Take first dataset name in instructions.txt
print BLUE, "Taking the first sample in instructions.txt. If it's not a FullSim MC sample, then you're going to have a bad time!", ENDC
sample = u.read_samples()[0]
dataset_name = sample["dataset"]
gtag = sample["gtag"]
print "  --> %s" % dataset_name
print

# Find the smallest MINIAOD file
# Entries without a "sizeGB" field are treated as 999.0 GB and so sort last.
filelist = dis.query(dataset_name, detail=True, typ="files")
filelist = filelist["response"]["payload"]
filelist = sorted(filelist, key=lambda x: x.get("sizeGB", 999.0))
smallest_filename = filelist[0]["name"]
print BLUE, "Smallest file", ENDC
print "  --> %s" % smallest_filename
print

# Use xrootd to get that file
def _truncate_token_ids(token_ids, strategy, cls_token_id, sep_token_id,
                        limit=512):
    """Cut an over-long ``[CLS] ... [SEP]`` id list down to ``limit`` ids.

    Args:
        token_ids: Token ids produced with ``add_special_tokens=True``, so the
            first id is ``[CLS]`` and the last is ``[SEP]``.
        strategy: ``"tail-only"`` keeps the end of the text,
            ``"head-and-tail"`` keeps the first 128 text tokens plus the end
            of the text, anything else keeps the start of the text.
        cls_token_id: Id used to re-insert ``[CLS]``.
        sep_token_id: Id used to re-insert ``[SEP]``.
        limit: Target length (512 by default).

    Returns:
        A new list of exactly ``limit`` ids that starts with ``[CLS]`` and
        ends with a single ``[SEP]``.
    """
    if strategy == "tail-only":
        # The slice already ends with the original trailing [SEP].
        return [cls_token_id] + token_ids[-(limit - 1):]
    if strategy == "head-and-tail":
        head_len = 128
        tail_len = limit - head_len - 2  # leave room for [CLS] and [SEP]
        # Skip the leading [CLS] in the head slice and the trailing [SEP] in
        # the tail slice so neither special token ends up duplicated.
        return ([cls_token_id] + token_ids[1:1 + head_len] +
                token_ids[-(tail_len + 1):-1] + [sep_token_id])
    # Default: head-only.
    return token_ids[:limit - 1] + [sep_token_id]


def _encode_texts(tokenizer, texts, truncation, logger, limit=512):
    """Tokenize ``texts`` into equally wide id and attention-mask tensors.

    The common width is the longest tokenized length rounded up to a power of
    two and capped at ``limit``. Texts longer than ``limit`` tokens are
    shortened with ``_truncate_token_ids``; all others are padded by the
    tokenizer.

    Returns:
        ``(input_ids, attention_masks)``, each with one row per text.
    """
    # Tokenize once up front: this gives both the longest length and the raw
    # ids needed for the custom truncation strategies.
    token_id_lists = [
        tokenizer(text, add_special_tokens=True)['input_ids']
        for text in texts
    ]
    longest = max(len(token_ids) for token_ids in token_id_lists)
    logger.info('Max text length: ' + str(longest))

    max_len = min(limit, pow(2, math.ceil(math.log2(longest))))

    input_ids = []
    attention_masks = []
    for text, token_ids in zip(texts, token_id_lists):
        # Compare the number of tokens; len() of the tokenizer output itself
        # would only count its dictionary keys.
        if len(token_ids) > limit:
            kept = _truncate_token_ids(token_ids, truncation,
                                       tokenizer.cls_token_id,
                                       tokenizer.sep_token_id, limit)
            input_ids.append(torch.tensor(kept).view(1, -1))
            # Nothing is padded here, so every position is attended to.
            attention_masks.append(
                torch.ones([1, len(kept)], dtype=torch.long))
        else:
            encoded = tokenizer(
                text,
                add_special_tokens=True,  # Add '[CLS]' and '[SEP]'.
                max_length=max_len,
                padding='max_length',  # Pad every row to max_len.
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            input_ids.append(encoded['input_ids'])
            attention_masks.append(encoded['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)


def train_model(args: dict, hparams: dict):
    """Fine-tune ``bert-base-uncased`` for sequence classification.

    Reads samples from ``args.dataset_filepath``, splits them 90/10 into
    training and validation sets, trains for up to ``args.n_epochs`` epochs
    and saves the whole model after every epoch. Training stops early once
    validation accuracy has failed to improve for two consecutive epochs.

    Side effects: creates ``saves/bert/<run name>/`` containing
    ``training.log``, a ``tensorboard_summary`` directory and one
    ``model_<N>epochs`` file per completed epoch, and reconfigures the root
    logger via ``logging.basicConfig``.

    Args:
        args: Run options. Despite the ``dict`` annotation it is read through
            attributes: ``dataset_filepath``, ``device_no``, ``nonpair_data``,
            ``lcr``, ``group_by_domestic``, ``truncation``, ``batch_size``,
            ``learning_rate`` and ``n_epochs``.
        hparams: Mapping providing ``"seed_val"`` and ``"adam_epsilon"``.

    Raises:
        ValueError: If the dataset file yields no samples.
    """
    dataset_path = args.dataset_filepath
    seed_val = hparams["seed_val"]
    device = utils.get_device(device_no=args.device_no)

    started_at = datetime.datetime.now()
    saves_path = os.path.join("saves/bert/", utils.get_filename(started_at))
    Path(saves_path).mkdir(parents=True, exist_ok=True)

    log_path = os.path.join(saves_path, "training.log")
    writer = SummaryWriter(os.path.join(saves_path, "tensorboard_summary"))

    logging.basicConfig(filename=log_path,
                        filemode='w',
                        format='%(name)s - %(levelname)s - %(message)s',
                        level=logging.DEBUG)
    logger = logging.getLogger()

    logger.info("File: " + str(dataset_path))
    logger.info("Parameters: " + str(args))
    logger.info("Hyperparameters: " + str(hparams))

    # Seed every RNG before anything random happens (dataset split,
    # classifier-head initialisation, batch shuffling) so runs are repeatable.
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    logger.info('Loading BERT tokenizer...')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)

    samples = utils.read_samples(dataset_path)
    if not samples:
        raise ValueError("No samples found in " + str(dataset_path))

    # Paired data: join the text columns with [SEP] so each row becomes
    # [text, label, article type].
    if not args.nonpair_data:
        if args.lcr:
            samples = [[
                val[0].lower() + " [SEP] " + val[1].lower() + " [SEP] " +
                val[2].lower(), val[3], val[4]
            ] for val in samples]
        else:
            samples = [[
                val[0].lower() + " [SEP] " + val[1].lower(), val[2], val[3]
            ] for val in samples]

    # Encode the article type as 0 (domestic) or 1 (anything else) so it can
    # travel through the DataLoader as a tensor.
    if args.group_by_domestic:
        samples = [[s[0], s[1], 0 if s[2] == "domestic" else 1]
                   for s in samples]

    samples_text = [val[0] for val in samples]
    samples_label = [val[1] for val in samples]

    no_of_labels = len(set(samples_label))
    logger.info("No of unique labels: " + str(no_of_labels))

    input_ids, attention_masks = _encode_texts(tokenizer, samples_text,
                                               args.truncation, logger)
    labels = torch.tensor(samples_label)

    if args.group_by_domestic:
        article_types = torch.tensor([val[2] for val in samples])
        dataset = TensorDataset(input_ids, attention_masks, labels,
                                article_types)
    else:
        dataset = TensorDataset(input_ids, attention_masks, labels)

    batch_size = args.batch_size
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    logger.info('{:>5,} training samples'.format(train_size))
    logger.info('{:>5,} validation samples'.format(val_size))

    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=batch_size)
    validation_dataloader = DataLoader(val_dataset,
                                       sampler=SequentialSampler(val_dataset),
                                       batch_size=batch_size)

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=no_of_labels,
        output_attentions=False,
        output_hidden_states=False,
    )
    model = model.to(device=device)

    optimizer = AdamW(model.parameters(),
                      lr=args.learning_rate,
                      eps=hparams["adam_epsilon"])
    epochs = args.n_epochs
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)

    training_stats = []
    best_stats = {
        'epoch': 0,
        'training_loss': -sys.maxsize,
        'training_accuracy': -sys.maxsize,
        'validation_loss': -sys.maxsize,
        'validation_accuracy': -sys.maxsize,
    }

    for epoch_i in range(epochs):
        # Early stopping: validation accuracy did not improve over the last
        # two epoch-to-epoch transitions.
        if len(training_stats) > 2:
            recent = [s['validation_accuracy'] for s in training_stats[-3:]]
            if recent[2] <= recent[1] and recent[1] <= recent[0]:
                break

        logger.info("")
        logger.info('======== Epoch {:} / {:} ========'.format(
            epoch_i + 1, epochs))
        logger.info('Training...')

        total_train_loss = 0
        total_train_accuracy = 0

        model.train()

        for step, batch in enumerate(train_dataloader):
            if step % 40 == 0 and step != 0:
                logger.info('  Batch {:>5,}  of  {:>5,}. '.format(
                    step, len(train_dataloader)))

            b_input_ids = batch[0].to(device=device)
            b_input_mask = batch[1].to(device=device)
            b_labels = batch[2].to(device=device)

            model.zero_grad()

            # Index the output rather than unpack it: this works whether the
            # model returns a plain tuple or a ModelOutput object.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss, logits = outputs[0], outputs[1]

            total_train_accuracy += flat_accuracy(
                logits.detach().cpu().numpy(),
                b_labels.to('cpu').numpy())
            total_train_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_accuracy = total_train_accuracy / len(train_dataloader)
        logger.info("")
        logger.info(
            "Average training accuracy: {0:.2f}".format(avg_train_accuracy))

        avg_train_loss = total_train_loss / len(train_dataloader)
        logger.info("")
        logger.info("Average training loss: {0:.2f}".format(avg_train_loss))

        logger.info("")
        logger.info("Running Validation...")

        model.eval()

        total_eval_accuracy = 0
        total_eval_loss = 0
        domestic_correct = 0
        domestic_total = 0

        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            with torch.no_grad():
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask,
                                labels=b_labels)
            loss, logits = outputs[0], outputs[1]

            total_eval_loss += loss.item()

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            total_eval_accuracy += flat_accuracy(logits, label_ids)

            if args.group_by_domestic:
                # Article type 0 marks a domestic sample.
                hits = np.argmax(logits, axis=1) == label_ids.reshape(-1)
                is_domestic = batch[3].numpy() == 0
                domestic_correct += int(np.sum(hits & is_domestic))
                domestic_total += int(np.sum(is_domestic))

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        logger.info(
            "Avg validation accuracy: {0:.2f}".format(avg_val_accuracy))

        if args.group_by_domestic:
            # The random split can leave the validation set without any
            # domestic rows; report that instead of dividing by zero.
            if domestic_total:
                logger.info("Domestic validation accuracy: {0:.2f}".format(
                    domestic_correct / domestic_total))
            else:
                logger.info("Domestic validation accuracy: n/a "
                            "(no domestic samples in validation split)")

        avg_val_loss = total_eval_loss / len(validation_dataloader)
        logger.info("Validation Loss: {0:.2f}".format(avg_val_loss))

        epoch_stats = {
            'epoch': epoch_i + 1,
            'training_loss': avg_train_loss,
            'training_accuracy': avg_train_accuracy,
            'validation_loss': avg_val_loss,
            'validation_accuracy': avg_val_accuracy,
        }
        training_stats.append(epoch_stats)
        if avg_val_accuracy > best_stats['validation_accuracy']:
            best_stats = dict(epoch_stats)

        writer.add_scalars(
            'losses_and_accuracies', {
                'training_loss': avg_train_loss,
                'training_accuracy': avg_train_accuracy,
                'validation_loss': avg_val_loss,
                'validation_accuracy': avg_val_accuracy,
            }, epoch_i + 1)

        model_save_path = os.path.join(saves_path,
                                       "model_" + str(epoch_i + 1) + "epochs")
        torch.save(model, model_save_path)

    logger.info("")
    logger.info("Training complete!")
    logger.info("Best stats")
    logger.info("training_accuracy: {}".format(
        best_stats['training_accuracy']))
    logger.info("training_loss: {}".format(best_stats['training_loss']))
    logger.info("validation_accuracy: {}".format(
        best_stats['validation_accuracy']))
    logger.info("validation_loss: {}".format(best_stats['validation_loss']))

    # Flush pending TensorBoard events to disk.
    writer.close()

    # Detach the file handler so a later run can configure logging afresh.
    for handler in logger.handlers[:]:
        handler.close()
        logger.removeHandler(handler)