Example #1
def main():
    dataset, version, nbfiles, pos_tags, tfidf, args = parse_args()

    corpus_type = "tfidf" if tfidf else "bow"

    logger = init_logging(name=f'MM_{dataset}_{corpus_type}',
                          basic=False,
                          to_stdout=True,
                          to_file=True)
    logg = logger.info if logger else print
    log_args(logger, args)

    texts, stats, nbfiles = make_texts(dataset, nbfiles, pos_tags, logg=logg)
    gc.collect()

    file_name = f'{dataset}{nbfiles if nbfiles else ""}_{version}'
    directory = join(LDA_PATH, version)
    if not exists(directory):
        makedirs(directory)

    # --- saving texts ---
    file_path = join(directory, f'{file_name}_texts.json')
    logg(f'Saving {file_path}')
    with open(file_path, 'w') as fp:
        json.dump(texts, fp, ensure_ascii=False)

    # --- saving stats ---
    file_path = join(directory, f'{file_name}_stats.json')
    logg(f'Saving {file_path}')
    with open(file_path, 'w') as fp:
        json.dump(stats, fp)

    # generate and save the dataset as bow or tfidf corpus in Matrix Market format,
    # including dictionary, texts (json) and some stats about corpus size (json)
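    # Assumption: texts2corpus presumably forwards filter_below/filter_above to
    # gensim's Dictionary.filter_extremes, dropping terms that appear in fewer
    # than 5 documents or in more than 50% of all documents.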
    corpus, dictionary = texts2corpus(texts,
                                      tfidf=tfidf,
                                      filter_below=5,
                                      filter_above=0.5,
                                      logg=logg)

    file_name += f'_{corpus_type}'
    directory = join(directory, corpus_type)

    # --- saving corpus ---
    file_path = join(directory, f'{file_name}.mm')
    logg(f'Saving {file_path}')
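    # Note: gensim's MmCorpus.serialize also writes a companion '<file>.mm.index'
    # next to the .mm file, which enables fast random access when reloading.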
    MmCorpus.serialize(file_path, corpus)

    # --- saving dictionary ---
    file_path = join(directory, f'{file_name}.dict')
    logg(f'Saving {file_path}')
    dictionary.save(file_path)
Example #2
def main():
    (dataset, version, corpus_type, metrics, params, nbtopics, topn, cores,
     coh, vec, weight, oop, evaluate, save, plot, args) = parse_args()

    # --- logging ---
    logger = init_logging(name=f'Reranking_{dataset}',
                          basic=False,
                          to_stdout=True,
                          to_file=True)
    logg = logger.info
    log_args(logger, args)
    t0 = time()

    reranker = Reranker(dataset=dataset,
                        version=version,
                        corpus_type=corpus_type,
                        params=params,
                        nbtopics=nbtopics,
                        nb_candidate_terms=topn,
                        nb_top_terms=10,
                        processes=cores,
                        logg=logg)
    if coh:
        reranker.rerank_coherence(metrics)
    if vec:
        reranker.rerank_w2v()
    if weight:
        reranker.weight_score()
    if oop:
        reranker.oop_score()
    if evaluate:
        reranker.evaluate()
    if save:
        reranker.save_results()
    if plot:
        reranker.plot()

    logg(f'final shape {reranker.topic_candidates.shape}')
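    # Hard-coded sanity check; 24975 is presumably the expected number of
    # topic candidates for the full evaluation setup.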
    assert len(reranker.topic_candidates) == 24975

    t1 = int(time() - t0)
    logg(f">>> done in {t1//3600:02d}:{(t1//60)%60:02d}:{t1%60:02d} <<<")
    return reranker
Example #3
def main(args: argparse.Namespace) -> None:
    """
    Main method for building tf examples from individual book (.npy) files

    :param args: ArgumentParser-parsed arguments
    :return: None
    """
    utils.log_args(args)

    if args.sent_per_book != -1:
        utils.warn("Using a max number of sentences per book")

    # Initialize the list of output files to write the examples to
    output_files = []
    for i_tf_ex in range(args.num_example_files):
        cur_tf_file_name = "%d_TfExample.tfrecord" % i_tf_ex
        output_files.append(os.path.join(args.output_dir, cur_tf_file_name))

    # Generate examples
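    # WriteAsTfExample is used as a context manager so all shard files are
    # closed on exit; it presumably distributes examples across output_files.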
    with tf_example_utils.WriteAsTfExample(output_files, args.vocab_file,
                                           args.max_num_tokens) as writer:
        generate_tf_example(args, writer)
Example #4
def main():
    parser = argparse.ArgumentParser()

    # Settings
    parser.add_argument('-d',
                        '--dataset',
                        choices=dataset_attributes.keys(),
                        required=True)
    parser.add_argument('-s',
                        '--shift_type',
                        choices=shift_types,
                        required=True)
    # Confounders
    parser.add_argument('-t', '--target_name')
    parser.add_argument('-c', '--confounder_names', nargs='+')
    # Resume?
    parser.add_argument('--resume', default=False, action='store_true')
    # Label shifts
    parser.add_argument('--minority_fraction', type=float)
    parser.add_argument('--imbalance_ratio', type=float)
    # Data
    parser.add_argument('--fraction', type=float, default=1.0)
    parser.add_argument('--root_dir', default=None)
    parser.add_argument('--subsample_to_minority',
                        action='store_true',
                        default=False)
    parser.add_argument('--reweight_groups',
                        action='store_true',
                        default=False)
    parser.add_argument('--augment_data', action='store_true', default=False)
    parser.add_argument('--val_fraction', type=float, default=0.1)
    # Objective
    parser.add_argument('--robust', default=False, action='store_true')
    parser.add_argument('--alpha', type=float, default=0.2)
    parser.add_argument('--generalization_adjustment', default="0.0")
    parser.add_argument('--automatic_adjustment',
                        default=False,
                        action='store_true')
    parser.add_argument('--robust_step_size', default=0.01, type=float)
    parser.add_argument('--use_normalized_loss',
                        default=False,
                        action='store_true')
    parser.add_argument('--btl', default=False, action='store_true')
    parser.add_argument('--hinge', default=False, action='store_true')

    # Model
    parser.add_argument('--model',
                        choices=model_attributes.keys(),
                        default='resnet50')
    parser.add_argument('--train_from_scratch',
                        action='store_true',
                        default=False)
    parser.add_argument('--resnet_width', type=int, default=None)

    # Optimization
    parser.add_argument('--n_epochs', type=int, default=4)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--scheduler', action='store_true', default=False)
    parser.add_argument('--weight_decay', type=float, default=5e-5)
    parser.add_argument('--gamma', type=float, default=0.1)
    parser.add_argument('--minimum_variational_weight', type=float, default=0)
    # Misc
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--show_progress', default=False, action='store_true')
    parser.add_argument('--log_dir', default='./logs')
    parser.add_argument('--log_every', default=50, type=int)
    parser.add_argument('--save_step', type=int, default=10)
    parser.add_argument('--save_best', action='store_true', default=False)
    parser.add_argument('--save_last', action='store_true', default=True)
    parser.add_argument('--student_width', type=int)
    parser.add_argument('--teacher_dir', type=str)
    parser.add_argument('--teacher_width', type=int)
    parser.add_argument('--gpu', type=str)
    parser.add_argument('--temp', type=str)

    args = parser.parse_args()
    gpu = args.gpu
    temp = args.temp
    check_args(args)
    teacher_dir = args.teacher_dir
    student_width = args.student_width
    teacher_width = args.teacher_width
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    def DistillationLoss(temperature):
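        # Hinton-style knowledge distillation: the total loss is the hard-label
        # cross-entropy plus the cross-entropy between the teacher's softened
        # distribution p_t = softmax(teacher/T) and the student's log-softmax.
        # The T**2 factor keeps the soft-target gradient magnitude roughly
        # independent of the temperature T.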
        cross_entropy = torch.nn.CrossEntropyLoss()

        def loss(student_logits, teacher_logits, target):
            last_dim = len(student_logits.shape) - 1
            p_t = nn.functional.softmax(teacher_logits / temperature,
                                        dim=last_dim)
            log_p_s = nn.functional.log_softmax(student_logits / temperature,
                                                dim=last_dim)
            return cross_entropy(student_logits, target) - (p_t * log_p_s).sum(
                dim=last_dim).mean() * temperature**2

        return loss

    # BERT-specific configs copied over from run_glue.py
    if args.model == 'bert':
        args.max_grad_norm = 1.0
        args.adam_epsilon = 1e-8
        args.warmup_steps = 0

    if os.path.exists(args.log_dir) and args.resume:
        resume = True
        mode = 'a'
    else:
        resume = False
        mode = 'w'

    ## Initialize logs
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)

    logger = Logger(os.path.join(args.log_dir, 'log.txt'), mode)
    # Record args
    log_args(args, logger)

    set_seed(args.seed)
    print("starting prep")
    # Data
    # Test data for label_shift_step is not implemented yet
    test_data = None
    test_loader = None
    if args.shift_type == 'confounder':
        train_data, val_data, test_data = prepare_data(args, train=True)
    elif args.shift_type == 'label_shift_step':
        train_data, val_data = prepare_data(args, train=True)
    print("done prep")
    loader_kwargs = {
        'batch_size': args.batch_size,
        'num_workers': 16,
        'pin_memory': True
    }
    train_loader = train_data.get_loader(train=True,
                                         reweight_groups=args.reweight_groups,
                                         **loader_kwargs)
    val_loader = val_data.get_loader(train=False,
                                     reweight_groups=None,
                                     **loader_kwargs)
    if test_data is not None:
        test_loader = test_data.get_loader(train=False,
                                           reweight_groups=None,
                                           **loader_kwargs)

    data = {}
    data['train_loader'] = train_loader
    data['val_loader'] = val_loader
    data['test_loader'] = test_loader
    data['train_data'] = train_data
    data['val_data'] = val_data
    data['test_data'] = test_data
    n_classes = train_data.n_classes

    log_data(data, logger)
    logger.flush()

    ## Define the objective
    if args.hinge:
        assert args.dataset in ['CelebA', 'CUB']  # Only supports binary

        def hinge_loss(yhat, y):
            # The torch loss takes in three arguments so we need to split yhat
            # It also expects classes in {+1.0, -1.0} whereas by default we give them in {0, 1}
            # Furthermore, if y = 1 it expects the first input to be higher instead of the second,
            # so we need to swap yhat[:, 0] and yhat[:, 1]...
            torch_loss = torch.nn.MarginRankingLoss(margin=1.0,
                                                    reduction='none')
            y = (y.float() * 2.0) - 1.0
            return torch_loss(yhat[:, 1], yhat[:, 0], y)

        criterion = hinge_loss
    else:
        criterion = torch.nn.CrossEntropyLoss(reduction='none')

    if resume:
        df = pd.read_csv(os.path.join(args.log_dir, 'test.csv'))
        epoch_offset = df.loc[len(df) - 1, 'epoch'] + 1
        logger.write(f'starting from epoch {epoch_offset}')
    else:
        epoch_offset = 0

    train_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'train.csv'),
                                      train_data.n_groups,
                                      mode=mode)
    val_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'val.csv'),
                                    train_data.n_groups,
                                    mode=mode)
    test_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'test.csv'),
                                     train_data.n_groups,
                                     mode=mode)
    strain_csv_logger = CSVBatchLogger(os.path.join(args.log_dir,
                                                    'strain.csv'),
                                       train_data.n_groups,
                                       mode=mode)
    sval_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'sval.csv'),
                                     train_data.n_groups,
                                     mode=mode)
    stest_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'stest.csv'),
                                      train_data.n_groups,
                                      mode=mode)

    teacher = resnet10vw(teacher_width, num_classes=n_classes)
    teacher_old = torch.load(teacher_dir + "/10_model.pth")
    for k, m in teacher_old.named_modules():
        m._non_persistent_buffers_set = set()  # pytorch 1.6.0 compatibility
    teacher.load_state_dict(teacher_old.state_dict())
    teacher = teacher.to('cuda')

    distill_criterion = DistillationLoss(float(temp))
    student = resnet10vw(int(student_width), num_classes=n_classes).to('cuda')

    train(teacher,
          student,
          criterion,
          distill_criterion,
          data,
          logger,
          train_csv_logger,
          val_csv_logger,
          test_csv_logger,
          strain_csv_logger,
          sval_csv_logger,
          stest_csv_logger,
          args,
          epoch_offset=epoch_offset)
    train_csv_logger.close()
    val_csv_logger.close()
    test_csv_logger.close()
    strain_csv_logger.close()
    sval_csv_logger.close()
    stest_csv_logger.close()
Example #5
def main():

    ti = time.time()

    args = parser.parse_args()
    if args.flocking_method == 'reynolds':
        from reynolds import Controller
    else:
        print("Wrong flocking controller specified.")
        sys.exit(1)
    from leader_controller import Leader_Controller

    path = os.path.join(args.optim_path, args.log_dir)
    timestamp = utils.log_args(path, args)
    logger = utils.get_logger(path, timestamp)

    flock = Flock(args)

    utils.log_init_state(logger, flock)

    controller_list = []
    leader_controller_list = []

    for drone in flock.drones:
        if drone.vehicle_name in flock.leader_list:
            controller = Leader_Controller(drone, flock.flock_list, args)
            leader_controller_list.append(controller)
        else:
            controller = Controller(drone, flock.flock_list, args)
        controller_list.append(controller)

    #airsim.wait_key('Press any key to takeoff')
    print("Taking-off")
    flock.take_off()

    #airsim.wait_key('Press any key to go to different altitudes')
    print("Going to different altitudes")
    flock.initial_altitudes()

    #airsim.wait_key('Press any key to start initial motion')
    print("Starting random motion")
    flock.initial_speeds()

    #airsim.wait_key('Press any key to start flocking')
    print("Now flocking")
    count = 0

    first_drone_name = flock.drones[0].vehicle_name
    init_sim_time = flock.client.getMultirotorState(
        vehicle_name=first_drone_name).timestamp

    while True:
        for controller in controller_list:
            controller.step()
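        # count % 1 is always 0, so kinematics are logged on every iteration;
        # presumably a stub for a configurable logging interval.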
        if count % 1 == 0:
            flock.log_flock_kinematics(logger, count)

        count += 1
        pygame.display.set_mode((1, 1))
        pygame.event.pump()
        keys = pygame.key.get_pressed()
        if keys[K_ESCAPE]:
            flock.reset()
            break
        curr_sim_time = flock.client.getMultirotorState(
            vehicle_name=first_drone_name).timestamp
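        # AirSim timestamps are in nanoseconds; convert the elapsed simulation
        # time to minutes before comparing against the configured duration.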
        if (curr_sim_time -
                init_sim_time) / 1e9 / 60 > args.single_sim_duration:
            tf = time.time()
            print("Real world time, ", (tf - ti) / 60)
            flock.reset()
            break
Example #6
def main():
    global LOGG
    (
        topics_file, labels_file, d2v_indices_file, w2v_indices_file,
        d2v_path, w2v_path, use_ftx,
        dataset, version, corpus_type, rerank,
        metrics, params, nbtopics, total_num_topics,
        max_title_length, min_doc_length, nb_labels, print_sample, args
    ) = parse_args()

    logger = init_logging(name=f'Labeling_{dataset}', basic=False, to_stdout=True, to_file=False)
    log_args(logger, args)
    LOGG = logger.info

    if topics_file is not None:
        topics = load_topics(
            topics_path=topics_file,
            metrics=metrics,
            params=params,
            nbtopics=nbtopics,
            print_sample=print_sample,
        )
    else:
        if rerank:
            topics = load('rerank', dataset, version, *params, *nbtopics, logger=logger)
            topics = topics.query('metric in @metrics')
            print(topics)
        else:
            topics = load('topics', dataset, version, corpus_type, *params, *nbtopics, logger=logger)

    d2v_docvecs, d2v_wv, w2v_wv = load_embeddings(
        d2v_path=d2v_path,
        w2v_path=w2v_path,
        use_ftx=use_ftx,
    )

    if d2v_indices_file and w2v_indices_file:
        with open(d2v_indices_file, 'rb') as fp:
            LOGG(f'Loading {d2v_indices_file}')
            d2v_indices = pickle.load(fp)
        with open(w2v_indices_file, 'rb') as fp:
            LOGG(f'Loading {w2v_indices_file}')
            w2v_indices = pickle.load(fp)
    else:
        d2v_indices, w2v_indices = get_indices(
            d2v_docvecs=d2v_docvecs,
            w2v_wv=w2v_wv,
            max_title_length=max_title_length,
            min_doc_length=min_doc_length
        )
    d2v_indices = sorted(set(d2v_indices))
    w2v_indices = sorted(set(w2v_indices))

    w2v_indexed = index_embeddings(
        d2v_docvecs=d2v_docvecs,
        d2v_wv=d2v_wv,
        w2v_wv=w2v_wv,
        d2v_indices=d2v_indices,
        w2v_indices=w2v_indices
    )

    t0 = time()
    labels = topics[:total_num_topics].apply(
        lambda row: get_labels(
            topic=row,
            nb_labels=nb_labels,
            d2v_docvecs=d2v_docvecs,
            d2v_wv=d2v_wv,
            w2v_wv=w2v_wv,
            w2v_indexed=w2v_indexed,
            d_indices=d2v_indices,
            w_indices=w2v_indices
        ),
        axis=1
    )
    t1 = int(time() - t0)
    LOGG(f"done in {t1//3600:02d}:{(t1//60) % 60:02d}:{t1 % 60:02d}")
    if print_sample:
        LOGG(f'\n{labels.head(10)}')

    # reformatting output files
    col2 = 'ftx' if use_ftx else 'w2v'
    col3 = 'comb_ftx' if use_ftx else 'comb'
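    # Each row holds three lists of labels (d2v, w2v/ftx, combined). Expand the
    # lists into columns, stack to one row per source, then expand each label
    # list into label0..labelN columns.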
    labels = (
        labels
        .apply(pd.Series)
        .rename(columns={0: 'd2v', 1: col2, 2: col3})
        .stack()
        .apply(pd.Series)
        .rename(columns=lambda x: f'label{x}')
    )
    if print_sample:
        LOGG(f'\n{labels.head(10)}')

    if exists(labels_file + '.csv'):
        labels_file = labels_file + '_' + str(time()) + '.csv'
    else:
        labels_file += '.csv'
    LOGG(f'Writing labels to {labels_file}')
    labels.to_csv(labels_file)
Example #7
def main():
    # --- arguments ---
    (dataset, version, _, _, nbs_topics, _, _, cache_in_memory, use_callbacks,
     tfidf, args) = parse_args()

    model_class = 'LSImodel'
    _split_ = "_split" if use_callbacks else ""

    data_name = f'{dataset}_{version}_{tfidf}'
    data_dir = join(LDA_PATH, version, tfidf)

    # --- logging ---
    logger = init_logging(name=data_name,
                          basic=False,
                          to_stdout=True,
                          to_file=True)
    logg = logger.info
    log_args(logger, args)

    # --- load dict ---
    logg('Loading dictionary')
    data_file = join(data_dir, f'{data_name}.dict')
    dictionary = Dictionary.load(data_file)

    # --- load corpus ---
    logg('Loading corpus')
    data_file = join(data_dir, f'{data_name}.mm')
    corpus = MmCorpus(data_file)
    if cache_in_memory:
        logg('Reading corpus into RAM')
        corpus = list(corpus)
    if use_callbacks:
        train, test = split_corpus(corpus)
    else:
        train, test = corpus, []
    logg(f'size of... train_set={len(train)}, test_set={len(test)}')

    # --- train ---
    topn = 20
    columns = [f'term{x}'
               for x in range(topn)] + [f'weight{x}' for x in range(topn)]
    for nbtopics in nbs_topics:
        gc.collect()

        logg(f'Running {model_class} with {nbtopics} topics')
        model = LsiModel(corpus=train, num_topics=nbtopics, id2word=dictionary)

        model_dir = join(LSI_PATH, version, tfidf, _split_)
        model_path = join(model_dir,
                          f'{dataset}_{model_class}{_split_}_{nbtopics}')
        if not exists(model_dir):
            makedirs(model_dir)

        # --- save topics ---
        topics = model.show_topics(num_words=topn, formatted=False)
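        # Flatten [(term, weight), ...] pairs into [term0..termN, weight0..weightN]
        # so that each topic matches the column layout defined above.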
        topics = [list(chain(*zip(*topic[1]))) for topic in topics]
        topics = pd.DataFrame(topics, columns=columns)
        logg(f'Saving topics to {model_path}.csv')
        topics.to_csv(f'{model_path}.csv')

        # --- save model ---
        logg(f'Saving model to {model_path}')
        model.save(model_path)

    # --- done ---
    logg(f'\n'
         f'----- end -----\n'
         f'----- {dataset.upper()} -----\n'
         f'{"#" * 50}\n')
Example #8
def main():
    (dataset, version, params, nbtopics, topn, cores, corpus_type,
     use_coherence, use_w2v, rerank, lsi, args) = parse_args()

    logger = init_logging(name=f'Eval_topics_{dataset}',
                          basic=False,
                          to_stdout=True,
                          to_file=True)
    log_args(logger, args)
    logg = logger.info

    purpose = 'rerank' if rerank else 'topics'
    topics = load(purpose,
                  dataset,
                  version,
                  corpus_type,
                  lsi,
                  *params,
                  *nbtopics,
                  logg=logg)
    if topn > 0:
        topics = topics[:topn]
    else:
        topn = topics.shape[1]
    logg(f'number of topics: {topics.shape}')
    unique_topics = topics.drop_duplicates()
    logg(f'number of unique topics: {unique_topics.shape}')
    wiki_dict = load('dict', 'dewiki', 'unfiltered', logg=logg)

    dfs = []
    if use_coherence:
        dictionary = load('dict', dataset, version, corpus_type, logg=logg)
        corpus = load('corpus', dataset, version, corpus_type, logg=logg)
        texts = load('texts', dataset, version, logg=logg)

        df = eval_coherence(
            topics=unique_topics,
            dictionary=dictionary,
            corpus=corpus,
            texts=texts,
            keyed_vectors=None,
            metrics=None,
            window_size=None,
            suffix='',
            cores=cores,
            logg=logg,
            topn=topn,
        )
        del dictionary, corpus, texts
        gc.collect()
        dfs.append(df)

        wiki_texts = load('texts', 'dewiki', logg=logg)
        df = eval_coherence(
            topics=unique_topics,
            dictionary=wiki_dict,
            corpus=None,
            texts=wiki_texts,
            keyed_vectors=None,
            metrics=None,
            window_size=None,
            suffix='_wikt',
            cores=cores,
            logg=logg,
            topn=topn,
        )
        gc.collect()
        dfs.append(df)

        df = eval_coherence(
            unique_topics,
            wiki_dict,
            corpus=None,
            texts=wiki_texts,
            keyed_vectors=None,
            metrics=['c_uci'],
            window_size=20,
            suffix='_wikt_w20',
            cores=cores,
            logg=logg,
            topn=topn,
        )
        del wiki_texts
        gc.collect()
        dfs.append(df)

    df_sims = None
    if use_w2v:
        d2v = load('d2v', logg=logg).docvecs
        w2v = load('w2v', logg=logg).wv
        ftx = load('ftx', logg=logg).wv
        # Dry run to make sure both indices are fully in RAM
        d2v.init_sims()
        _ = d2v.vectors_docs_norm[0]
        w2v.init_sims()
        _ = w2v.vectors_norm[0]
        ftx.init_sims()
        _ = ftx.vectors_norm[0]

        df = eval_coherence(
            topics=unique_topics,
            dictionary=wiki_dict,
            corpus=None,
            texts=None,
            keyed_vectors=w2v,
            metrics=None,
            window_size=None,
            suffix='_w2v',
            cores=cores,
            logg=logg,
            topn=topn,
        )
        gc.collect()
        dfs.append(df)

        df = eval_coherence(
            topics=unique_topics,
            dictionary=wiki_dict,
            corpus=None,
            texts=None,
            keyed_vectors=ftx,
            metrics=None,
            window_size=None,
            suffix='_ftx',
            cores=cores,
            logg=logg,
            topn=topn,
        )
        gc.collect()
        dfs.append(df)

        # apply custom similarity metrics
        kvs = {'d2v': d2v, 'w2v': w2v, 'ftx': ftx}
        ms = unique_topics.apply(lambda x: mean_similarity(x, kvs), axis=1)
        ps = unique_topics.apply(
            lambda x: pairwise_similarity(x, kvs, ignore_oov=True), axis=1)
        ps2 = unique_topics.apply(
            lambda x: pairwise_similarity(x, kvs, ignore_oov=False), axis=1)
        df_sims = pd.concat(
            {
                'mean_similarity': ms,
                'pairwise_similarity_ignore_oov': ps,
                'pairwise_similarity': ps2
            },
            axis=1)
        del d2v, w2v, ftx
        gc.collect()

    dfs = pd.concat(dfs, axis=1)
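    # Each coherence cell holds a (score, stdev, support) tuple; expand it into
    # separate columns per metric, then restore the original column layout.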
    dfs = dfs.stack().apply(pd.Series).rename(columns={
        0: 'score',
        1: 'stdev',
        2: 'support'
    }).unstack()
    if df_sims is not None:
        dfs = pd.concat([dfs, df_sims], axis=1)

    # restore scores for all topics from results of unique topics
    topics.columns = pd.MultiIndex.from_tuples([('terms', t)
                                                for t in list(topics.columns)])
    topic_columns = list(topics.columns)
    def fillna(grp):
        return grp.fillna(method='ffill') if len(grp) > 1 else grp
    dfs = (topics.join(dfs).groupby(topic_columns).apply(fillna).drop(
        topic_columns, axis=1))

    tpx_path = join(LDA_PATH, version, 'bow', 'topics')
    if rerank:
        file = join(tpx_path, f'{dataset}_reranker-eval.csv')
    else:
        file = join(
            tpx_path,
            f'{dataset}{"_"+lsi if lsi else ""}_{version}_{corpus_type}_topic-scores.csv'
        )
    if exists(file):
        file = file.replace('.csv', f'_{str(time()).split(".")[0]}.csv')

    logg(f'Writing {file}')
    dfs.to_csv(file)
    logg('done')

    return dfs
Example #9
def main():
    parser = argparse.ArgumentParser()

    # Settings
    parser.add_argument('-d',
                        '--dataset',
                        choices=dataset_attributes.keys(),
                        required=True)
    parser.add_argument('-s',
                        '--shift_type',
                        choices=shift_types,
                        required=True)
    # Confounders
    parser.add_argument('-t', '--target_name')
    parser.add_argument('-c', '--confounder_names', nargs='+')
    # Resume?
    parser.add_argument('--resume', default=False, action='store_true')
    # Label shifts
    parser.add_argument('--minority_fraction', type=float)
    parser.add_argument('--imbalance_ratio', type=float)
    # Data
    parser.add_argument('--fraction', type=float, default=1.0)
    parser.add_argument('--root_dir', default=None)
    parser.add_argument('--subsample_to_minority',
                        action='store_true',
                        default=False)
    parser.add_argument('--reweight_groups',
                        action='store_true',
                        default=False)
    parser.add_argument('--augment_data', action='store_true', default=False)
    parser.add_argument('--val_fraction', type=float, default=0.1)
    # Objective
    parser.add_argument('--robust', default=False, action='store_true')
    parser.add_argument('--alpha', type=float, default=0.2)
    parser.add_argument('--generalization_adjustment', default="0.0")
    parser.add_argument('--automatic_adjustment',
                        default=False,
                        action='store_true')
    parser.add_argument('--robust_step_size', default=0.01, type=float)
    parser.add_argument('--use_normalized_loss',
                        default=False,
                        action='store_true')
    parser.add_argument('--btl', default=False, action='store_true')
    parser.add_argument('--hinge', default=False, action='store_true')

    # Model
    parser.add_argument('--model',
                        choices=model_attributes.keys(),
                        default='resnet50')
    parser.add_argument('--train_from_scratch',
                        action='store_true',
                        default=False)
    parser.add_argument('--resnet_width', type=int, default=None)

    # Optimization
    parser.add_argument('--n_epochs', type=int, default=4)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--scheduler', action='store_true', default=False)
    parser.add_argument('--weight_decay', type=float, default=5e-5)
    parser.add_argument('--gamma', type=float, default=0.1)
    parser.add_argument('--minimum_variational_weight', type=float, default=0)
    # Misc
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--show_progress', default=False, action='store_true')
    parser.add_argument('--log_dir', default='./logs')
    parser.add_argument('--log_every', default=50, type=int)
    parser.add_argument('--save_step', type=int, default=10)
    parser.add_argument('--save_best', action='store_true', default=False)
    parser.add_argument('--save_last', action='store_true', default=False)
    parser.add_argument('--model_test', type=str)
    parser.add_argument('--gpu', type=str)

    args = parser.parse_args()
    check_args(args)
    model_test = args.model_test
    gpu = args.gpu
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    # BERT-specific configs copied over from run_glue.py
    if args.model == 'bert':
        args.max_grad_norm = 1.0
        args.adam_epsilon = 1e-8
        args.warmup_steps = 0

    if os.path.exists(args.log_dir) and args.resume:
        resume = True
        mode = 'a'
    else:
        resume = False
        mode = 'w'

    ## Initialize logs
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)

    logger = Logger(os.path.join(args.log_dir, model_test + '_log.txt'), mode)
    # Record args
    log_args(args, logger)

    set_seed(args.seed)

    # Data
    # Test data for label_shift_step is not implemented yet
    test_data = None
    test_loader = None
    if args.shift_type == 'confounder':
        train_data, val_data, test_data = prepare_data(args, train=True)
    elif args.shift_type == 'label_shift_step':
        train_data, val_data = prepare_data(args, train=True)

    loader_kwargs = {
        'batch_size': args.batch_size,
        'num_workers': 12,
        'pin_memory': True
    }
    train_loader = train_data.get_loader(train=True,
                                         reweight_groups=args.reweight_groups,
                                         **loader_kwargs)
    val_loader = val_data.get_loader(train=False,
                                     reweight_groups=None,
                                     **loader_kwargs)
    if test_data is not None:
        test_loader = test_data.get_loader(train=False,
                                           reweight_groups=None,
                                           **loader_kwargs)

    data = {}
    data['train_loader'] = train_loader
    data['val_loader'] = val_loader
    data['test_loader'] = test_loader
    data['train_data'] = train_data
    data['val_data'] = val_data
    data['test_data'] = test_data
    n_classes = train_data.n_classes

    log_data(data, logger)

    ## Initialize model
    pretrained = not args.train_from_scratch
    if resume:
        model = torch.load(os.path.join(args.log_dir, model_test))
        d = train_data.input_size()[0]
    elif model_attributes[args.model]['feature_type'] in ('precomputed',
                                                          'raw_flattened'):
        assert pretrained
        # Load precomputed features
        d = train_data.input_size()[0]
        model = nn.Linear(d, n_classes)
        model.has_aux_logits = False
    elif args.model == 'resnet50':
        model = torchvision.models.resnet50(pretrained=pretrained)
        d = model.fc.in_features
        model.fc = nn.Linear(d, n_classes)
    elif args.model == 'resnet34':
        model = torchvision.models.resnet34(pretrained=pretrained)
        d = model.fc.in_features
        model.fc = nn.Linear(d, n_classes)
    elif args.model == 'wideresnet50':
        model = torchvision.models.wide_resnet50_2(pretrained=pretrained)
        d = model.fc.in_features
        model.fc = nn.Linear(d, n_classes)
    elif args.model == 'resnet50vw':
        assert not pretrained
        assert args.resnet_width is not None
        model = resnet50vw(args.resnet_width, num_classes=n_classes)
    elif args.model == 'resnet18vw':
        assert not pretrained
        assert args.resnet_width is not None
        model = resnet18vw(args.resnet_width, num_classes=n_classes)
    elif args.model == 'resnet10vw':
        assert not pretrained
        assert args.resnet_width is not None
        model = resnet10vw(args.resnet_width, num_classes=n_classes)
    elif args.model == 'bert':
        assert args.dataset == 'MultiNLI'

        from pytorch_transformers import BertConfig, BertForSequenceClassification
        config_class = BertConfig
        model_class = BertForSequenceClassification

        config = config_class.from_pretrained('bert-base-uncased',
                                              num_labels=3,
                                              finetuning_task='mnli')
        model = model_class.from_pretrained('bert-base-uncased',
                                            from_tf=False,
                                            config=config)
    else:
        raise ValueError('Model not recognized.')

    logger.flush()

    ## Define the objective
    if args.hinge:
        assert args.dataset in ['CelebA', 'CUB']  # Only supports binary

        def hinge_loss(yhat, y):
            # The torch loss takes in three arguments so we need to split yhat
            # It also expects classes in {+1.0, -1.0} whereas by default we give them in {0, 1}
            # Furthermore, if y = 1 it expects the first input to be higher instead of the second,
            # so we need to swap yhat[:, 0] and yhat[:, 1]...
            torch_loss = torch.nn.MarginRankingLoss(margin=1.0,
                                                    reduction='none')
            y = (y.float() * 2.0) - 1.0
            return torch_loss(yhat[:, 1], yhat[:, 0], y)

        criterion = hinge_loss
    else:
        criterion = torch.nn.CrossEntropyLoss(reduction='none')

    if resume:
        df = pd.read_csv(os.path.join(args.log_dir, 'test.csv'))
        epoch_offset = df.loc[len(df) - 1, 'epoch'] + 1
        logger.write(f'starting from epoch {epoch_offset}')
    else:
        epoch_offset = 0
    train_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'train.csv'),
                                      train_data.n_groups,
                                      mode=mode)
    val_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'val.csv'),
                                    train_data.n_groups,
                                    mode=mode)
    test_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'test.csv'),
                                     train_data.n_groups,
                                     mode=mode)

    train(model,
          criterion,
          data,
          logger,
          train_csv_logger,
          val_csv_logger,
          test_csv_logger,
          args,
          epoch_offset=epoch_offset)

    train_csv_logger.close()
    val_csv_logger.close()
    test_csv_logger.close()
Example #10
def main():
    args = argparser.parse_args()
    log_args(args)

    input_dir = args.input_dir
    output_dir = args.output_dir
    base_model_dir = args.base_model_dir
    image_size = args.image_size
    crop_images = args.crop_images
    augment = args.augment
    use_progressive_image_sizes = args.use_progressive_image_sizes
    progressive_image_size_min = args.progressive_image_size_min
    progressive_image_size_step = args.progressive_image_size_step
    progressive_image_epoch_step = args.progressive_image_epoch_step
    batch_size = args.batch_size
    batch_iterations = args.batch_iterations
    num_workers = args.num_workers
    pin_memory = args.pin_memory
    epochs_to_train = args.epochs
    lr_scheduler_type = args.lr_scheduler
    lr_patience = args.lr_patience
    lr_min = args.lr_min
    lr_max = args.lr_max
    lr_min_decay = args.lr_min_decay
    lr_max_decay = args.lr_max_decay
    optimizer_type = args.optimizer
    loss_type = args.loss
    focal_loss_gamma = args.focal_loss_gamma
    use_class_weights = args.use_class_weights
    use_weighted_sampling = args.use_weighted_sampling
    model_type = args.model
    patience = args.patience
    sgdr_cycle_epochs = args.sgdr_cycle_epochs
    sgdr_cycle_epochs_mult = args.sgdr_cycle_epochs_mult
    sgdr_cycle_end_prolongation = args.sgdr_cycle_end_prolongation
    sgdr_cycle_end_patience = args.sgdr_cycle_end_patience
    max_sgdr_cycles = args.max_sgdr_cycles

    if optimizer_type == "adam":
        lr_scheduler_type = "adam"

    progressive_image_sizes = list(
        range(progressive_image_size_min, image_size + 1,
              progressive_image_size_step))

    train_data = TrainData(input_dir)

    train_set = TrainDataset(train_data.train_set_df, input_dir, 28,
                             image_size, crop_images, augment)

    balance_weights, balance_class_weights = calculate_balance_weights(
        train_data.df, train_data.train_set_df, 28)
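    # WeightedRandomSampler draws len(balance_weights) samples per epoch with
    # replacement, oversampling under-represented classes.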
    train_set_sampler = WeightedRandomSampler(balance_weights,
                                              len(balance_weights))

    train_set_data_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=not use_weighted_sampling,
        sampler=train_set_sampler if use_weighted_sampling else None,
        num_workers=num_workers,
        pin_memory=pin_memory)

    val_set = TrainDataset(train_data.val_set_df, input_dir, 28, image_size,
                           crop_images, False)
    val_set_data_loader = \
        DataLoader(val_set, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)

    if base_model_dir:
        for base_file_path in glob.glob("{}/*.pth".format(base_model_dir)):
            shutil.copyfile(
                base_file_path,
                "{}/{}".format(output_dir, os.path.basename(base_file_path)))
        model = create_model(type=model_type, num_classes=28).to(device)
        model.load_state_dict(
            torch.load("{}/model.pth".format(output_dir), map_location=device))
        optimizer = create_optimizer(optimizer_type, model, lr_max)
        if os.path.isfile("{}/optimizer.pth".format(output_dir)):
            try:
                optimizer.load_state_dict(
                    torch.load("{}/optimizer.pth".format(output_dir)))
                adjust_initial_learning_rate(optimizer, lr_max)
                adjust_learning_rate(optimizer, lr_max)
            except Exception:
                log("Failed to load the optimizer weights")
    else:
        model = create_model(type=model_type, num_classes=28).to(device)
        optimizer = create_optimizer(optimizer_type, model, lr_max)

    torch.save(model.state_dict(), "{}/model.pth".format(output_dir))

    ensemble_model_index = 0
    for model_file_path in glob.glob("{}/model-*.pth".format(output_dir)):
        model_file_name = os.path.basename(model_file_path)
        model_index = int(
            model_file_name.replace("model-", "").replace(".pth", ""))
        ensemble_model_index = max(ensemble_model_index, model_index + 1)

    epoch_iterations = ceil(len(train_set) / batch_size)

    log("train_set_samples: {}, val_set_samples: {}".format(
        len(train_set), len(val_set)))
    log()

    global_val_score_best_avg = float("-inf")
    sgdr_cycle_val_score_best_avg = float("-inf")

    lr_scheduler = CosineAnnealingLR(optimizer,
                                     T_max=sgdr_cycle_epochs,
                                     eta_min=lr_min)

    optim_summary_writer = SummaryWriter(
        log_dir="{}/logs/optim".format(output_dir))
    train_summary_writer = SummaryWriter(
        log_dir="{}/logs/train".format(output_dir))
    val_summary_writer = SummaryWriter(
        log_dir="{}/logs/val".format(output_dir))

    current_sgdr_cycle_epochs = sgdr_cycle_epochs
    sgdr_next_cycle_end_epoch = current_sgdr_cycle_epochs + sgdr_cycle_end_prolongation
    sgdr_iterations = 0
    sgdr_cycle_count = 0
    batch_count = 0
    epoch_of_last_improval = 0

    lr_scheduler_plateau = \
        ReduceLROnPlateau(optimizer, mode="max", min_lr=lr_min, patience=lr_patience, factor=0.5, threshold=1e-4)

    lr_scheduler_step = StepLR(optimizer, step_size=10, gamma=0.1)

    log('{"chart": "best_val_score", "axis": "epoch"}')
    log('{"chart": "val_score", "axis": "epoch"}')
    log('{"chart": "val_loss", "axis": "epoch"}')
    log('{"chart": "sgdr_cycle", "axis": "epoch"}')
    log('{"chart": "score", "axis": "epoch"}')
    log('{"chart": "loss", "axis": "epoch"}')
    log('{"chart": "lr_scaled", "axis": "epoch"}')
    log('{"chart": "mem_used", "axis": "epoch"}')
    log('{"chart": "epoch_time", "axis": "epoch"}')

    train_start_time = time.time()

    loss_weight = CLASS_WEIGHTS_TENSOR if use_class_weights else None
    criterion = create_criterion(loss_type, loss_weight, focal_loss_gamma)

    for epoch in range(epochs_to_train):
        epoch_start_time = time.time()

        log("memory used: {:.2f} GB".format(psutil.virtual_memory().used /
                                            2**30))

        if use_progressive_image_sizes:
            next_image_size = \
                progressive_image_sizes[min(epoch // progressive_image_epoch_step, len(progressive_image_sizes) - 1)]

            if train_set.image_size != next_image_size:
                log("changing image size to {}".format(next_image_size))
                train_set.image_size = next_image_size
                val_set.image_size = next_image_size

        model.train()

        train_loss_sum_t = zero_item_tensor()

        epoch_batch_iter_count = 0

        if lr_scheduler_type == "lr_finder":
            new_lr = lr_max * 0.5**(sgdr_cycle_epochs - min(
                sgdr_cycle_epochs, sgdr_iterations / epoch_iterations))
            adjust_learning_rate(optimizer, new_lr)

        all_predictions = []
        all_targets = []
        for b, batch in enumerate(train_set_data_loader):
            images, categories = \
                batch[0].to(device, non_blocking=True), \
                batch[1].to(device, non_blocking=True)

            if lr_scheduler_type == "cosine_annealing":
                lr_scheduler.step(
                    epoch=min(current_sgdr_cycle_epochs, sgdr_iterations /
                              epoch_iterations))

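            # Gradient accumulation: zero the gradients every
            # `batch_iterations` batches and step the optimizer after the same
            # number of backward passes.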
            if b % batch_iterations == 0:
                optimizer.zero_grad()

            prediction_logits = model(images)
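            # Note: this unconditionally overrides the weight selected via
            # --use_class_weights when the criterion was created above.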
            criterion.weight = CLASS_WEIGHTS_TENSOR
            loss = criterion(prediction_logits, categories)
            loss.backward()

            with torch.no_grad():
                train_loss_sum_t += loss
                all_predictions.extend(
                    torch.sigmoid(prediction_logits).cpu().data.numpy())
                all_targets.extend(categories.cpu().data.numpy())

            if (b + 1) % batch_iterations == 0 or (
                    b + 1) == len(train_set_data_loader):
                optimizer.step()

            sgdr_iterations += 1
            batch_count += 1
            epoch_batch_iter_count += 1

            optim_summary_writer.add_scalar("lr", get_learning_rate(optimizer),
                                            batch_count + 1)

        train_loss_avg = train_loss_sum_t.item() / epoch_batch_iter_count
        train_score_avg = f1_score_from_probs(torch.tensor(all_predictions),
                                              torch.tensor(all_targets))

        val_loss_avg, val_score_avg = evaluate(model, val_set_data_loader,
                                               criterion)

        if lr_scheduler_type == "reduce_on_plateau":
            lr_scheduler_plateau.step(val_score_avg)
        elif lr_scheduler_type == "step":
            lr_scheduler_step.step(epoch)

        model_improved_within_sgdr_cycle = check_model_improved(
            sgdr_cycle_val_score_best_avg, val_score_avg)
        if model_improved_within_sgdr_cycle:
            torch.save(
                model.state_dict(),
                "{}/model-{}.pth".format(output_dir, ensemble_model_index))
            sgdr_cycle_val_score_best_avg = val_score_avg

        model_improved = check_model_improved(global_val_score_best_avg,
                                              val_score_avg)
        ckpt_saved = False
        if model_improved:
            torch.save(model.state_dict(), "{}/model.pth".format(output_dir))
            torch.save(optimizer.state_dict(),
                       "{}/optimizer.pth".format(output_dir))
            np.save("{}/train_predictions.npy".format(output_dir),
                    all_predictions)
            np.save("{}/train_targets.npy".format(output_dir), all_targets)
            global_val_score_best_avg = val_score_avg
            epoch_of_last_improval = epoch
            ckpt_saved = True

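        # SGDR warm restart: once the current cosine cycle has ended and no
        # improvement occurred within the cycle-end patience window, reset the
        # schedule with a longer cycle and decayed learning-rate bounds.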
        sgdr_reset = False
        if (lr_scheduler_type == "cosine_annealing") \
                and (epoch + 1 >= sgdr_next_cycle_end_epoch) \
                and (epoch - epoch_of_last_improval >= sgdr_cycle_end_patience):
            sgdr_iterations = 0
            current_sgdr_cycle_epochs = int(current_sgdr_cycle_epochs *
                                            sgdr_cycle_epochs_mult)
            sgdr_next_cycle_end_epoch = epoch + 1 + current_sgdr_cycle_epochs + sgdr_cycle_end_prolongation

            ensemble_model_index += 1
            sgdr_cycle_val_score_best_avg = float("-inf")
            sgdr_cycle_count += 1
            sgdr_reset = True

            new_lr_min = lr_min * (lr_min_decay**sgdr_cycle_count)
            new_lr_max = lr_max * (lr_max_decay**sgdr_cycle_count)
            new_lr_max = max(new_lr_max, new_lr_min)

            adjust_learning_rate(optimizer, new_lr_max)
            lr_scheduler = CosineAnnealingLR(optimizer,
                                             T_max=current_sgdr_cycle_epochs,
                                             eta_min=new_lr_min)

        optim_summary_writer.add_scalar("sgdr_cycle", sgdr_cycle_count,
                                        epoch + 1)

        train_summary_writer.add_scalar("loss", train_loss_avg, epoch + 1)
        train_summary_writer.add_scalar("score", train_score_avg, epoch + 1)
        val_summary_writer.add_scalar("loss", val_loss_avg, epoch + 1)
        val_summary_writer.add_scalar("score", val_score_avg, epoch + 1)

        epoch_end_time = time.time()
        epoch_duration_time = epoch_end_time - epoch_start_time

        log("[%03d/%03d] %ds, lr: %.6f, loss: %.4f, val_loss: %.4f, score: %.4f, val_score: %.4f, ckpt: %d, rst: %d"
            %
            (epoch + 1, epochs_to_train, epoch_duration_time,
             get_learning_rate(optimizer), train_loss_avg, val_loss_avg,
             train_score_avg, val_score_avg, int(ckpt_saved), int(sgdr_reset)))

        log('{"chart": "best_val_score", "x": %d, "y": %.4f}' %
            (epoch + 1, global_val_score_best_avg))
        log('{"chart": "val_loss", "x": %d, "y": %.4f}' %
            (epoch + 1, val_loss_avg))
        log('{"chart": "val_score", "x": %d, "y": %.4f}' %
            (epoch + 1, val_score_avg))
        log('{"chart": "sgdr_cycle", "x": %d, "y": %d}' %
            (epoch + 1, sgdr_cycle_count))
        log('{"chart": "loss", "x": %d, "y": %.4f}' %
            (epoch + 1, train_loss_avg))
        log('{"chart": "score", "x": %d, "y": %.4f}' %
            (epoch + 1, train_score_avg))
        log('{"chart": "lr_scaled", "x": %d, "y": %.4f}' %
            (epoch + 1, 1000 * get_learning_rate(optimizer)))
        log('{"chart": "mem_used", "x": %d, "y": %.2f}' %
            (epoch + 1, psutil.virtual_memory().used / 2**30))
        log('{"chart": "epoch_time", "x": %d, "y": %d}' %
            (epoch + 1, epoch_duration_time))

        if (sgdr_reset or lr_scheduler_type in ("reduce_on_plateau", "step")) \
                and epoch - epoch_of_last_improval >= patience:
            log("early abort due to lack of improval")
            break

        if max_sgdr_cycles is not None and sgdr_cycle_count >= max_sgdr_cycles:
            log("early abort due to maximum number of sgdr cycles reached")
            break

    optim_summary_writer.close()
    train_summary_writer.close()
    val_summary_writer.close()

    train_end_time = time.time()
    log()
    log("Train time: %s" %
        str(datetime.timedelta(seconds=train_end_time - train_start_time)))

    model.load_state_dict(
        torch.load("{}/model.pth".format(output_dir), map_location=device))

    val_predictions, val_targets = predict(model, val_set_data_loader)
    np.save("{}/val_predictions.npy".format(output_dir), val_predictions)
    np.save("{}/val_targets.npy".format(output_dir), val_targets)

    best_threshold, best_threshold_score, all_threshold_scores = calculate_best_threshold(
        val_predictions, val_targets)
    log("All threshold scores: {}".format(all_threshold_scores))
    log("Best threshold / score: {} / {}".format(best_threshold,
                                                 best_threshold_score))

    test_data = TestData(input_dir)
    test_set = TestDataset(test_data.test_set_df, input_dir, image_size,
                           crop_images)
    test_set_data_loader = \
        DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)

    test_predictions, _ = predict(model, test_set_data_loader)
    np.save("{}/test_predictions.npy".format(output_dir), test_predictions)

    predicted_categories = calculate_categories_from_predictions(
        test_predictions, threshold=best_threshold)

    submission_df = test_data.test_set_df.copy()
    submission_df["Predicted"] = [
        " ".join(map(str, pc)) for pc in predicted_categories
    ]
    submission_df.to_csv("{}/submission.csv".format(output_dir))
Example #11
def main(args):
    if args.wandb:
        wandb.init(project=f"{args.project_name}_{args.dataset}")
        wandb.config.update(args)

    # BERT-specific configs copied over from run_glue.py
    if args.model.startswith("bert") and args.use_bert_params:
        args.max_grad_norm = 1.0
        args.adam_epsilon = 1e-8
        args.warmup_steps = 0

    if os.path.exists(args.log_dir) and args.resume:
        resume = True
        mode = "a"
    else:
        resume = False
        mode = "w"

    ## Initialize logs
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)

    logger = Logger(os.path.join(args.log_dir, "log.txt"), mode)
    # Record args
    log_args(args, logger)

    set_seed(args.seed)

    # Data
    # Test data for label_shift_step is not implemented yet
    test_data = None
    test_loader = None
    if args.shift_type == "confounder":
        train_data, val_data, test_data = prepare_data(
            args,
            train=True,
        )

    elif args.shift_type == "label_shift_step":
        raise NotImplementedError
        train_data, val_data = prepare_data(args, train=True)

    #########################################################################
    ###################### Prepare data for our method ######################
    #########################################################################

    # Should probably not be upweighting if folds are specified.
    assert not args.fold or not args.up_weight

    # Fold passed. Use it as train and valid.
    if args.fold:
        train_data, val_data = folds.get_fold(
            train_data,
            args.fold,
            cross_validation_ratio=(1 / args.num_folds_per_sweep),
            num_valid_per_point=args.num_sweeps,
            seed=args.seed,
        )

    if args.up_weight != 0:
        assert args.aug_col is not None
        # Get points that should be upsampled
        metadata_df = pd.read_csv(args.metadata_path)
        if args.dataset == "jigsaw":
            train_col = metadata_df[metadata_df["split"] == "train"]
        else:
            train_col = metadata_df[metadata_df["split"] == 0]
        aug_indices = np.where(train_col[args.aug_col] == 1)[0]
        print("len", len(train_col), len(aug_indices))
        if args.up_weight == -1:
            up_weight_factor = int(
                (len(train_col) - len(aug_indices)) / len(aug_indices)) - 1
        else:
            up_weight_factor = args.up_weight

        print(f"Up-weight factor: {up_weight_factor}")
        upsampled_points = Subset(train_data,
                                  list(aug_indices) * up_weight_factor)
        # Convert to DRODataset
        train_data = dro_dataset.DRODataset(
            ConcatDataset([train_data, upsampled_points]),
            process_item_fn=None,
            n_groups=train_data.n_groups,
            n_classes=train_data.n_classes,
            group_str_fn=train_data.group_str,
        )
    elif args.aug_col is not None:
        print("\n"*2 + "WARNING: aug_col is not being used." + "\n"*2)

    #########################################################################
    #########################################################################
    #########################################################################

    loader_kwargs = {
        "batch_size": args.batch_size,
        "num_workers": 4,
        "pin_memory": True,
    }
    train_loader = dro_dataset.get_loader(train_data,
                                          train=True,
                                          reweight_groups=args.reweight_groups,
                                          **loader_kwargs)

    val_loader = dro_dataset.get_loader(val_data,
                                        train=False,
                                        reweight_groups=None,
                                        **loader_kwargs)

    if test_data is not None:
        test_loader = dro_dataset.get_loader(test_data,
                                             train=False,
                                             reweight_groups=None,
                                             **loader_kwargs)

    data = {}
    data["train_loader"] = train_loader
    data["val_loader"] = val_loader
    data["test_loader"] = test_loader
    data["train_data"] = train_data
    data["val_data"] = val_data
    data["test_data"] = test_data

    n_classes = train_data.n_classes

    log_data(data, logger)

    ## Initialize model
    model = get_model(
        model=args.model,
        pretrained=not args.train_from_scratch,
        resume=resume,
        n_classes=train_data.n_classes,
        dataset=args.dataset,
        log_dir=args.log_dir,
    )
    if args.wandb:
        wandb.watch(model)

    logger.flush()

    ## Define the objective
    if args.hinge:
        assert args.dataset in ["CelebA", "CUB"]  # Only supports binary
        criterion = hinge_loss
    else:
        criterion = torch.nn.CrossEntropyLoss(reduction="none")

    if resume:
        # Resuming is not fully supported yet: the raise makes the epoch-offset
        # logic below unreachable; it is kept for reference only.
        raise NotImplementedError  # Check this implementation.
        df = pd.read_csv(os.path.join(args.log_dir, "test.csv"))
        epoch_offset = df.loc[len(df) - 1, "epoch"] + 1
        logger.write(f"starting from epoch {epoch_offset}")
    else:
        epoch_offset = 0

    
    train_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, "train.csv"),
                                      train_data.n_groups,
                                      mode=mode)
    val_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, "val.csv"),
                                    val_data.n_groups,
                                    mode=mode)
    test_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, "test.csv"),
                                     test_data.n_groups,
                                     mode=mode)
    train(
        model,
        criterion,
        data,
        logger,
        train_csv_logger,
        val_csv_logger,
        test_csv_logger,
        args,
        epoch_offset=epoch_offset,
        csv_name=args.fold,
        wandb=wandb if args.wandb else None,
    )

    train_csv_logger.close()
    val_csv_logger.close()
    test_csv_logger.close()
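
# A minimal sketch of a per-sample binary hinge loss matching the
# reduction="none" convention used above. The original hinge_loss is not shown
# in this example, so this is an illustrative assumption, not the codebase's
# actual definition; hinge_loss_sketch is a hypothetical name.
import torch

def hinge_loss_sketch(logits, y):
    # logits: (N, 2) class scores, y: (N,) labels in {0, 1}
    idx = torch.arange(len(y))
    margin = logits[idx, y] - logits[idx, 1 - y]  # true minus other class score
    return torch.clamp(1.0 - margin, min=0.0)     # one loss value per sample

# usage sketch: losses = hinge_loss_sketch(model(x), y); losses.mean().backward()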
Example #12
    parser.add_argument("--mode", "-m", choices=VALID_MODES, required=True)
    parser.add_argument("--verbosity",
                        "-v",
                        type=int,
                        default=logging.INFO,
                        help="""
                        Verbosity levels in python: 
                            NOTSET = 0
                            DEBUG = 10
                            INFO = 20
                            WARNING = 30 
                            WARN = WARNING
                            ERROR = 40 
                            CRITICAL = 50 
                            FATAL = CRITICAL         
                        """)
    # TODO: add verbosity so we know the book being filtered

    args = parser.parse_args()
    # generate_plaintext_corpus(args)
    logging.basicConfig(format='%(message)s')
    utils.log_args(args)
    logging.getLogger().setLevel(args.verbosity)
    """
    TODO:
    - apache spark?    
    - add some form of logging for each book
    """

    generate_textid_corpus(args)
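
# Every example on this page hinges on a log_args helper; call signatures vary
# across examples (log_args(args), log_args(logger, args), log_args(args, logger)).
# A minimal sketch matching the one-argument form used just above (hypothetical;
# the real utility may format its output differently):
import logging

def log_args(args):
    # dump every parsed argument as "name = value" at INFO level
    for name, value in sorted(vars(args).items()):
        logging.info("%s = %r", name, value)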
Example #13
def main():
    (dataset, version, params, nbtopics, topn, cores, corpus_type,
     use_coherence, use_w2v, rerank, lsi, args) = parse_args()

    logger = init_logging(name=f'Eval_topics_on_germanet_{dataset}',
                          basic=False,
                          to_stdout=True,
                          to_file=True)
    log_args(logger, args)
    logg = logger.info

    purpose = 'rerank' if rerank else 'topics'
    topics = load(purpose, dataset, version, corpus_type, lsi, *params,
                  *nbtopics)
    if topn > 0:
        topics = topics[:topn]
    else:
        topn = topics.shape[1]
    logg(f'Topics shape: {topics.shape}')

    logg('Getting SynSets for topic terms')
    sstopics = topics.applymap(gn.synsets)

    topics['lch'] = sstopics.progress_apply(similarities,
                                            axis=1,
                                            sim_func=Synset.sim_lch,
                                            agg_func=max,
                                            topn=topn)
    topics['lch_ignr_unkwn'] = sstopics.progress_apply(similarities,
                                                       axis=1,
                                                       sim_func=Synset.sim_lch,
                                                       agg_func=max,
                                                       topn=topn,
                                                       ignore_unknown=False)
    topics['res'] = sstopics.progress_apply(similarities,
                                            axis=1,
                                            sim_func=Synset.sim_res,
                                            agg_func=max,
                                            topn=topn)
    topics['res_ignr_unkwn'] = sstopics.progress_apply(similarities,
                                                       axis=1,
                                                       sim_func=Synset.sim_res,
                                                       agg_func=max,
                                                       topn=topn,
                                                       ignore_unknown=False)
    topics['jcn'] = sstopics.progress_apply(similarities,
                                            axis=1,
                                            sim_func=Synset.dist_jcn,
                                            agg_func=min,
                                            topn=topn)
    topics['jcn_ignr_unkwn'] = sstopics.progress_apply(
        similarities,
        axis=1,
        sim_func=Synset.dist_jcn,
        agg_func=min,
        topn=topn,
        ignore_unknown=False)
    topics['lin'] = sstopics.progress_apply(similarities,
                                            axis=1,
                                            sim_func=Synset.sim_lin,
                                            agg_func=max,
                                            topn=topn)
    topics['lin_ignr_unkwn'] = sstopics.progress_apply(similarities,
                                                       axis=1,
                                                       sim_func=Synset.sim_lin,
                                                       agg_func=max,
                                                       topn=topn,
                                                       ignore_unknown=False)

    topics = topics.iloc[:, topn:]
    tpx_path = join(LDA_PATH, version, 'bow', 'topics')
    if rerank:
        file = join(tpx_path, f'{dataset}_reranker-eval_germanet.csv')
    else:
        file = join(
            tpx_path,
            f'{dataset}{"_"+lsi if lsi else ""}_{version}_{corpus_type}_topic-scores_germanet.csv'
        )
    if exists(file):
        file = file.replace('.csv', f'_{str(time()).split(".")[0]}.csv')

    logg(f'Writing {file}')
    topics.to_csv(file)
    logg('done')
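
# A hedged sketch of the row-wise similarities function applied above. Its
# real implementation is not part of this example, so the aggregation logic
# below is an assumption inferred from the sim_func/agg_func/topn/
# ignore_unknown arguments:
from itertools import combinations

def similarities(row, sim_func, agg_func, topn, ignore_unknown=True):
    synsets_per_term = list(row[:topn])  # one list of synsets per topic term
    scores = []
    for a, b in combinations(synsets_per_term, 2):
        pair_scores = [sim_func(s, t) for s in a for t in b]
        if pair_scores:
            scores.append(agg_func(pair_scores))
        elif not ignore_unknown:
            scores.append(0.0)  # count terms without synsets as zero similarity
    return sum(scores) / len(scores) if scores else float('nan')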
Example #14
def main():
    global LOGG

    # --- arguments ---
    (dataset, version, cb_logger, params, nbs_topics, epochs, cores,
     cache_in_memory, use_callbacks, corpus_type, args) = parse_args()

    model_class = 'LDAmodel'
    _split = "_split" if use_callbacks else ""

    # --- logging ---
    logger = init_logging(
        name=f'LDA_{dataset}_{version}_{corpus_type}{_split}_ep{epochs}',
        basic=False,
        to_stdout=True,
        to_file=True)
    LOGG = logger.info
    log_args(logger, args)

    # --- load texts ---
    if use_callbacks:
        texts = load(dataset, version, 'texts', logger=logger)
    else:
        texts = []

    # --- load dict ---
    dictionary = load(dataset, version, corpus_type, 'dict', logger=logger)

    # --- load corpus ---
    corpus = load(dataset, version, corpus_type, 'corpus', logger=logger)
    if cache_in_memory:
        LOGG('Reading corpus into RAM')
        corpus = list(corpus)
    if use_callbacks:
        train, test = split_corpus(corpus)
    else:
        train, test = corpus, []
    LOGG(f'size of... train_set={len(train)}, test_set={len(test)}')

    # --- enable visdom ---
    vis = None
    if cb_logger == 'visdom':
        try:
            import visdom
            vis = visdom.Visdom()
        except Exception as e:
            LOGG(e)
            cb_logger = 'shell'

    # --- train ---
    topn = 20
    columns = [f'term{x}'
               for x in range(topn)] + [f'weight{x}' for x in range(topn)]
    metrics = []
    for param in params:
        env_id = f"{dataset}-{model_class}"
        for nbtopics in nbs_topics:
            gc.collect()

            callbacks = init_callbacks(dataset=dataset,
                                       callback_logger=cb_logger,
                                       documents=texts,
                                       training_corpus=train,
                                       test_corpus=test,
                                       processes=cores,
                                       version=version,
                                       param=param,
                                       nbtopics=nbtopics,
                                       tfidf=corpus_type)
            if not use_callbacks:
                callbacks = callbacks[-1:]
            else:
                LOGG('Initializing Callbacks')

            kwargs = get_parameterset(train,
                                      dictionary,
                                      callbacks=callbacks,
                                      nbtopics=nbtopics,
                                      parametrization=param,
                                      epochs=epochs)

            LOGG(
                f'Running {model_class} {corpus_type} "{param}{_split}" with {nbtopics} topics'
            )
            model = LdaModel(**kwargs)
            gc.collect()

            model_dir = join(LDA_PATH, version, corpus_type,
                             f'{param}{_split}')
            model_path = join(
                model_dir,
                f'{dataset}_LDAmodel_{param}{_split}_{nbtopics}_ep{epochs}')
            if not exists(model_dir):
                makedirs(model_dir)

            # --- save topics ---
            topics = model.show_topics(num_topics=-1,
                                       num_words=topn,
                                       formatted=False)
            topics = [list(chain(*zip(*topic[1]))) for topic in topics]
            topics = pd.DataFrame(topics, columns=columns)
            LOGG(f'Saving topics to {model_path}.csv')
            topics.to_csv(f'{model_path}.csv')

            # --- save metrics ---
            current_metrics = model.metrics
            metrics.append((env_id, current_metrics))
            with open(f'{model_path}_metrics.json', 'w') as fp:
                serializable_metrics = {}
                for k, v in current_metrics.items():
                    if k == dataset:
                        continue
                    if isinstance(v[0], np.ndarray):
                        serializable_metrics[k] = [x.tolist() for x in v]
                    else:
                        serializable_metrics[k] = [float(x) for x in v]
                LOGG(f'Saving metrics to {model_path}_metrics.json')
                json.dump(serializable_metrics, fp)

            # --- save model ---
            LOGG(f'Saving LDAmodel to {model_path}')
            model.callbacks = None
            model.save(model_path)

            # --- save visdom environment ---
            if vis is not None:
                vis.save([env_id])

            gc.collect()

    # --- done ---
    LOGG(f'\n'
         f'----- end -----\n'
         f'----- {dataset.upper()} -----\n'
         f'{"#" * 50}\n')
Example #15
                        action='store_true',
                        help='Call nn.DataParallel on model or not')
    parser.add_argument('--num_neg', default=None, type=int)

    args = parser.parse_args()
    assert args.num_neg is not None
    if args.model == 'resnet10vw':
        assert args.width is not None
    set_seed(args.seed)
    model_path, batch_size, epochs = args.model_path, args.batch_size, args.epochs

    # verbose mode lowers the console threshold so DEBUG messages are shown
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logger = utils.get_logger(
        name=__name__, filename=args.logpath,
        console_log_level=log_level)  # default we log everything to console
    log_args(args, logger)

    logger.info("Loading Data")
    train_data = STL10(root='data',
                       split='train',
                       transform=utils.train_transform)
    train_loader = DataLoader(train_data,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=2,
                              pin_memory=True)
    test_data = STL10(root='data',
                      split='test',
                      transform=utils.test_transform)
    test_loader = DataLoader(test_data,
                             batch_size=batch_size,
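
# A hedged sketch of the utils.get_logger helper called above (hypothetical;
# only the name/filename/console_log_level keywords come from the call site).
# It logs everything to the file and thresholds only the console:
import logging

def get_logger(name, filename, console_log_level=logging.INFO):
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)               # capture everything
    file_handler = logging.FileHandler(filename)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(console_log_level)  # filter only console output
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)
    return logger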
Example #16
def main():
    # --- argument parsing ---
    (
        model_name, epochs, min_count, cores, checkpoint_every,
        cache_in_memory, lowercase, fasttext, args
    ) = parse_args(default_model_name='w2v_default', default_epochs=100)

    # --- init logging ---
    logger = init_logging(name=model_name, basic=True, to_file=True, to_stdout=False)
    log_args(logger, args)

    input_dir = join(SMPL_PATH, 'dewiki')
    model_dir = join(EMB_PATH, model_name)
    if not exists(model_dir):
        makedirs(model_dir)
    logger.info('model dir: ' + model_dir)

    t0 = time()
    if cache_in_memory:
        # needs approx. 25GB of RAM
        logger.info('cache data in memory')
        sentences = list(Sentences(input_dir, logger, lowercase=lowercase))
    else:
        sentences = Sentences(input_dir, logger, use_file_cache=True, lowercase=lowercase)
    gc.collect()

    # Model initialization. Note: these are gensim < 4.0 argument names;
    # size and iter were renamed to vector_size and epochs in gensim 4.x.
    logger.info('Initializing new model')
    if fasttext:
        model = FastText(
            size=300,
            window=5,
            min_count=min_count,
            sample=1e-5,
            negative=5,
            sg=1,
            seed=42,
            iter=epochs,
            workers=cores,
            min_n=3,
            max_n=6,
        )
    else:
        model = Word2Vec(
            size=300,
            window=5,
            min_count=min_count,
            sample=1e-5,
            negative=5,
            sg=1,
            seed=42,
            iter=epochs,
            workers=cores,
        )
    logger.info('Building vocab')
    model.build_vocab(sentences, progress_per=100_000)

    # Model Training
    epoch_saver = EpochSaver(model_name, model_dir, checkpoint_every)
    epoch_logger = EpochLogger(logger)

    logger.info('Training {:d} epochs'.format(epochs))
    model.train(
        sentences,
        total_examples=model.corpus_count,
        epochs=model.epochs,
        report_delay=60,
        callbacks=[epoch_logger, epoch_saver],
    )

    # saving model
    file_path = join(model_dir, model_name)
    logger.info('Writing model to ' + file_path)
    model.callbacks = ()
    model.save(file_path)

    t1 = int(time() - t0)
    logger.info("all done in {:02d}:{:02d}:{:02d}".format(t1//3600, (t1//60) % 60, t1 % 60))
Example #17
def init_logging():
    logger.info("Representativeness experiments running ...")
    logger.info("python3 " + " ".join(sys.argv))
    log_args(params)
Example #18
def main():
    # --- argument parsing ---
    (model_name, epochs, min_count, cores, checkpoint_every, cache_in_memory,
     lowercase, _, args) = parse_args(default_model_name='d2v',
                                      default_epochs=20)

    # --- init logging ---
    logger = init_logging(name=model_name,
                          basic=True,
                          to_file=True,
                          to_stdout=False)
    log_args(logger, args)

    input_dir = join(SMPL_PATH, 'dewiki')
    model_dir = join(EMB_PATH, model_name)
    if not exists(model_dir):
        makedirs(model_dir)
    logger.info('model dir: ' + model_dir)

    t0 = time()
    documents = Documents(input_dir=input_dir,
                          logger=logger,
                          lowercase=lowercase)
    if cache_in_memory:
        documents = list(documents)
    gc.collect()

    # Model initialization
    logger.info('Initializing new model')
    model = Doc2Vec(
        vector_size=300,
        window=15,
        min_count=min_count,
        sample=1e-5,
        negative=5,
        hs=0,
        dm=0,
        dbow_words=1,
        dm_concat=0,
        seed=42,
        epochs=epochs,
        workers=cores,
    )
    logger.info('Building vocab')
    model.build_vocab(documents)

    # Model Training
    epoch_saver = EpochSaver(model_name, model_dir, checkpoint_every)
    epoch_logger = EpochLogger(logger)

    logger.info('Training {:d} epochs'.format(epochs))
    model.train(
        documents,
        total_examples=model.corpus_count,
        epochs=model.epochs,
        report_delay=60,
        callbacks=[epoch_logger, epoch_saver],
    )

    # saving model
    file_path = join(model_dir, model_name)
    logger.info('Writing model to ' + file_path)
    model.callbacks = ()
    model.save(file_path)

    t1 = int(time() - t0)
    logger.info("all done in {:02d}:{:02d}:{:02d}".format(
        t1 // 3600, (t1 // 60) % 60, t1 % 60))
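
# A hedged sketch of the Documents iterable consumed by Doc2Vec above; gensim
# expects TaggedDocument items, but the on-disk layout under input_dir is an
# assumption:
import os
from gensim.models.doc2vec import TaggedDocument

class Documents:
    def __init__(self, input_dir, logger, lowercase=False):
        self.input_dir, self.logger, self.lowercase = input_dir, logger, lowercase
    def __iter__(self):
        for name in sorted(os.listdir(self.input_dir)):
            with open(os.path.join(self.input_dir, name)) as fp:
                for i, line in enumerate(fp):
                    text = line.lower() if self.lowercase else line
                    yield TaggedDocument(words=text.split(), tags=[f'{name}_{i}'])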
Example #19
def main():
    args = make_parser().parse_args()
    log_args(args)

    global_step = tf.get_variable('global_step',
                                  initializer=0,
                                  trainable=False)
    training = tf.get_variable('training', initializer=False, trainable=False)

    with tf.name_scope('data_loading'):
        train_ds, test_ds = cifar10.make_dataset(args.dataset_path)
        train_ds, test_ds = (train_ds.shuffle(args.shuffle).batch(
            args.batch_size), test_ds.batch(args.batch_size))

    # output_shapes needs (None,) for the rank-1 label batch; a bare (None)
    # is just None, i.e. an unknown-rank shape. 'iterator' avoids shadowing
    # the built-in iter().
    iterator = tf.data.Iterator.from_structure((tf.float32, tf.int64),
                                               ((None, 32, 32, 3), (None,)))

    x, y = iterator.get_next()
    logits = densenet.densenet(x,
                               block_depth=args.block_depth,
                               growth_rate=args.growth_rate,
                               compression_factor=args.compression_factor,
                               bottleneck=True,
                               dropout=args.dropout,
                               weight_decay=args.weight_decay,
                               training=training)
    loss, update_loss = metrics.loss(logits=logits, labels=y)
    accuracy, update_accuracy = metrics.accuracy(logits=logits, labels=y)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        class_loss = objectives.loss(logits=logits,
                                     labels=y,
                                     top_k=args.hard_negatives)
        reg_loss = tf.losses.get_regularization_loss()
        train_step = tf.train.AdamOptimizer(args.learning_rate).minimize(
            class_loss + reg_loss, global_step=global_step)

    locals_init = tf.local_variables_initializer()

    train_init = tf.group(training.assign(True),
                          iterator.make_initializer(train_ds), locals_init)
    test_init = tf.group(training.assign(False),
                         iterator.make_initializer(test_ds), locals_init)

    with tf.name_scope('summary'):
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('accuracy', accuracy)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver()

    with tf.Session() as sess, tf.summary.FileWriter(
            os.path.join(args.experiment_path, 'train'),
            sess.graph) as train_writer, tf.summary.FileWriter(
                os.path.join(args.experiment_path, 'test'),
                sess.graph) as test_writer:
        restore_path = tf.train.latest_checkpoint(args.experiment_path)
        if restore_path:
            print(warning('Restoring from checkpoint'))
            saver.restore(sess, restore_path)
        else:
            print(warning('Initializing'))
            sess.run(tf.global_variables_initializer())

        for epoch in range(args.epochs):
            sess.run(train_init)
            for _ in tqdm(count(), desc='training'):
                try:
                    _, step = sess.run([(train_step, update_loss,
                                         update_accuracy), global_step])
                except tf.errors.OutOfRangeError:
                    break

            print(success('epoch: {}, step: {}'.format(epoch, step)))

            l, a, summary = sess.run([loss, accuracy, merged])
            print(
                success('(train) loss: {:.4f}, accuracy: {:.2f}'.format(
                    l, a * 100)))
            train_writer.add_summary(summary, step)
            train_writer.flush()

            sess.run(test_init)
            for _ in tqdm(count(), desc='evaluation'):
                try:
                    _, step = sess.run([(update_loss, update_accuracy),
                                        global_step])
                except tf.errors.OutOfRangeError:
                    break

            l, a, summary = sess.run([loss, accuracy, merged])
            print(
                success('(test) loss: {:.4f}, accuracy: {:.2f}'.format(
                    l, a * 100)))
            test_writer.add_summary(summary, step)
            test_writer.flush()

            save_path = saver.save(
                sess, os.path.join(args.experiment_path, 'model.ckpt'))
            print(warning('model saved: {}'.format(save_path)))
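
# A hedged sketch of the (value, update_op) metric pairs consumed above,
# assuming metrics.loss/metrics.accuracy wrap TF1's streaming metrics (the
# real module is not shown in this example):
import tensorflow as tf

def loss(logits, labels):
    per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits)
    return tf.metrics.mean(per_example)          # (mean_loss, update_op)

def accuracy(logits, labels):
    predictions = tf.argmax(logits, axis=-1)
    return tf.metrics.accuracy(labels=labels,    # (accuracy, update_op)
                               predictions=predictions)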
Example #20
def main():
    print("Loading and checking args...")
    args = parse_args()
    check_args(args)
    # BERT-specific configs copied over from run_glue.py
    if args.model.startswith('bert'):
        args.max_grad_norm = 1.0
        args.adam_epsilon = 1e-8
        args.warmup_steps = 0

    # 'w' mode (over)writes the log files; assumes no pre-existing logs
    mode = 'w'

    ## Initialize logs
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)

    logger = Logger(os.path.join(args.log_dir, 'log.txt'), mode)
    # Record args
    log_args(args, logger)
    set_seed(args.seed)

    # Data
    print("Preparing data")
    train_data, val_data, test_data = prepare_data(args, train=True)

    print("Setting up loader")
    loader_kwargs = {
        'batch_size': args.batch_size,
        'num_workers': 4,
        'pin_memory': True
    }
    train_loader = train_data.get_loader(train=True,
                                         reweight_groups=args.reweight_groups,
                                         **loader_kwargs)
    val_loader = val_data.get_loader(train=False,
                                     reweight_groups=None,
                                     **loader_kwargs)
    test_loader = test_data.get_loader(train=False,
                                       reweight_groups=None,
                                       **loader_kwargs)

    data = {}
    data['train_loader'] = train_loader
    data['val_loader'] = val_loader
    data['test_loader'] = test_loader
    data['train_data'] = train_data
    data['val_data'] = val_data
    data['test_data'] = test_data
    n_classes = train_data.n_classes

    log_data(data, logger)

    ## Initialize model
    if args.model == 'resnet50':
        model = torchvision.models.resnet50(pretrained=True)
        d = model.fc.in_features
        model.fc = nn.Linear(d, n_classes)
        if args.mc_dropout:
            model = add_dropout(model, 'fc')
    elif args.model == 'densenet121':
        model = torchvision.models.densenet121(pretrained=True)
        d = model.classifier.in_features
        model.classifier = nn.Linear(d, n_classes)
        if args.mc_dropout:
            model = add_dropout(model, 'classifier')
    elif args.model == 'bert-base-uncased':
        print("Loading bert")
        model = BertForSequenceClassification.from_pretrained(
            args.model, num_labels=n_classes)
    else:
        raise ValueError('Model not recognized.')

    logger.flush()
    criterion = torch.nn.CrossEntropyLoss(reduction='none')
    print("Getting loggers")
    train_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'train.csv'),
                                      train_data.n_groups,
                                      mode=mode)
    val_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'val.csv'),
                                    train_data.n_groups,
                                    mode=mode)
    test_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'test.csv'),
                                     train_data.n_groups,
                                     mode=mode)

    print("Starting to train...")
    train(model,
          criterion,
          data,
          logger,
          train_csv_logger,
          val_csv_logger,
          test_csv_logger,
          args,
          epoch_offset=0,
          train=True)

    train_csv_logger.close()
    val_csv_logger.close()
    test_csv_logger.close()

    if args.save_preds:
        save_preds(model, data, args)
        return
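
# A hedged sketch of the add_dropout helper used for MC dropout above
# (hypothetical; the dropout rate and wrapping strategy are assumptions):
import torch.nn as nn

def add_dropout(model, head_attr, p=0.5):
    # wrap the named classification head so dropout can stay active at
    # inference whenever the wrapped module is kept in train() mode
    head = getattr(model, head_attr)
    setattr(model, head_attr, nn.Sequential(nn.Dropout(p=p), head))
    return model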