예제 #1
0
def concatenate_page(base_dir, manuscript, page, columns, rows):
    '''
    Stitch the image blocks of one page into a single jpg.

    Blocks named <page>_<row>_*.jpg inside base_dir/manuscript/page are first
    tiled side by side into row_<row>.jpg strips; the strips are then stacked
    into base_dir/manuscript/<page>.jpg. A strip or page file that already
    exists is left untouched. `columns` is accepted but not used.
    '''
    # Shell equivalent of building one strip:
    # montage -mode concatenate -tile x1 `ls -1cr add_ms_24686_f044r_5_*` row_0.jpg
    for row_index in range(rows + 1):
        strip_path = J(base_dir, manuscript, page,
                       'row_{0}.jpg'.format(row_index))
        if not os.path.exists(strip_path):
            block_pattern = '{0}_{1}_*.jpg'.format(
                J(base_dir, manuscript, page, page), row_index)
            blocks = sorted(glob.glob(block_pattern), key=natural_keys)
            call(['montage', '-mode', 'concatenate', '-tile', 'x1'] +
                 blocks + [strip_path])
            put('.')

    # Shell equivalent of stacking the strips:
    # montage -mode concatenate -tile 1x `ls -1cr row_*` add_ms_24686_f044r.jpg
    page_path = J(base_dir, manuscript, page) + '.jpg'
    if not os.path.exists(page_path):
        strip_pattern = '{0}_*.jpg'.format(
            J(base_dir, manuscript, page, 'row'))
        strips = sorted(glob.glob(strip_pattern), key=natural_keys)
        call(['montage', '-mode', 'concatenate', '-tile', '1x'] + strips +
             [page_path])
        put('\n')
예제 #2
0
def convert_pages(base_dir, manuscript, pages):
    '''
    Turn each page jpg of a manuscript into a PDF of the same name.

    For every page, base_dir/manuscript/<page>.jpg is passed to the external
    `convert` command to produce base_dir/manuscript/<page>.pdf. Pages whose
    PDF already exists are skipped silently.
    '''
    for number, page in enumerate(pages, start=1):
        pdf_path = J(base_dir, manuscript, '{0}.pdf'.format(page))
        if not os.path.exists(pdf_path):
            jpg_path = J(base_dir, manuscript, '{0}.jpg'.format(page))
            print('Converting page {0} ({1}/{2})'.format(
                page, number, len(pages)))
            call(['convert', jpg_path, pdf_path])
예제 #3
0
def load_specifications(specification_dir):
    """Read every ``*.json`` experiment specification found in a directory.

    Args:
        specification_dir (str): Directory that holds the specification files.

    Returns:
        list(dict): The parsed JSON documents, in the order ``glob`` returned
        the files. Empty when the directory contains no ``*.json`` file (a
        warning is logged in that case).

    Raises:
        AssertionError: If ``specification_dir`` does not exist.
    """
    assert E(specification_dir), \
        "Specification directory {} does not exist".format(specification_dir)

    spec_paths = glob.glob(J(specification_dir, '*.json'))

    logger.info("Loading experiment specificaitons...")
    if len(spec_paths) == 0:
        logger.warning(
            "Could not find any experiment specifications in {}".format(
                specification_dir))

    def _read_json(path):
        with open(path, 'r') as handle:
            return json.load(handle)

    specs = [_read_json(path) for path in spec_paths]
    logger.info("Found {} experiment specifications".format(len(specs)))

    return specs
예제 #4
0
def fold_pages(base_dir, manuscript, pages, output_name):
    '''
    Build one PDF out of the per-page PDFs by appending them one at a time.

    The first page is copied to output_name (when that file does not exist
    yet); every following page is concatenated onto it with `pdftk`, going
    through a temporary file base_dir/<manuscript>.pdf.tmp that then replaces
    output_name.
    '''
    tmp_name = J(base_dir, manuscript + '.pdf.tmp')
    page_files = ['{0}.pdf'.format(page) for page in pages]
    for position, page_file in enumerate(page_files, start=1):
        print('Folding page {0} ({1}/{2})'.format(page_file, position,
                                                  len(pages)))
        page_path = J(base_dir, manuscript, page_file)
        if not os.path.exists(output_name):
            # Nothing to append to yet: the page becomes the document.
            shutil.copy2(page_path, output_name)
            continue
        call(['pdftk', output_name, page_path, 'cat', 'output', tmp_name])
        os.unlink(output_name)
        os.rename(tmp_name, output_name)
예제 #5
0
def convert_manuscript(resolution, base_dir, manuscript, pages):
    '''
    Convert every page of a manuscript to PDF, then merge the pages.

    The merged document is written to
    base_dir/<manuscript>-p<number of pages>-r<resolution>.pdf.
    '''
    convert_pages(base_dir, manuscript, pages)
    merged_path = J(
        base_dir,
        manuscript + '-p{0}-r{1}.pdf'.format(len(pages), resolution))
    fold_pages(base_dir, manuscript, pages, merged_path)
예제 #6
0
def start_server(name):
    """Check whether the server selected by ``name`` already has a process.

    ``name`` is compared against SERVER_REALMD and SERVER_WORLDD to pick the
    binary path under MANGOS_DIR. Matching processes are counted with a
    ``ps ax | grep`` shell pipeline; when at least one is found a warning is
    logged and the function returns.

    Any exception raised during the check is logged and its traceback is
    passed to ``mail_admins``; nothing is re-raised.

    NOTE(review): no code that actually launches a server is visible in this
    body - confirm whether the start logic lives elsewhere.
    """
    try:
        if name == SERVER_REALMD:
            process_name = J(MANGOS_DIR, REALMD_BIN)
        elif name == SERVER_WORLDD:
            process_name = J(MANGOS_DIR, MANGOSD_BIN)
        # NOTE(review): when `name` matches neither constant, process_name is
        # never bound; the resulting NameError subclass is swallowed by the
        # broad except below. The path is also interpolated unquoted into a
        # shell command line.
        count = int(
            os.popen("ps ax|grep %s | grep -v grep | wc -l" %
                     process_name).read().strip())
        if count >= 1:
            logger.warn(
                'Requested for start, but look for server already started. %s'
                % count)
            return
    except Exception, e:
        # Python 2 except syntax; report the failure instead of propagating.
        logger.error('%s' % e)
        mail_admins(traceback.format_exc())
예제 #7
0
def parameter_count(spec, experiment_directory):
    """Count the parameters of the model described by ``spec`` and save the total.

    The specification is completed with ``set_spec_default_values``, forced
    onto the CPU, used to build the sequence model, and the number of scalar
    parameters of ``sequence_model.get_model()`` is printed and stored in
    ``<experiment_directory>/parameters.npy`` as a one-element array.

    Args:
        spec (dict): Experiment specification. Must provide ``algorithm``
            ('transformer', 'lstm' or 'cnn') and the other keys checked below.
        experiment_directory (str): Output directory; created when missing.

    Raises:
        KeyError: A required key is missing (the spec is printed first).
        AssertionError: The directory already exists and
            ``c.EXPERIMENT_RUNNER_SHOULD_OVERWRITE`` is falsy.
        ValueError: ``spec['algorithm']`` is not a supported model type.
    """
    # Keys the experiment code expects; they are only validated here.
    required_keys = ('batch_size', 'bptt_len', 'hmm_hidden', 'max_step',
                     'name', 'sequence_dependence', 'vocab')
    try:
        spec = set_spec_default_values(spec)

        algorithm = spec["algorithm"]
        # Counting parameters never needs an accelerator.
        spec['device'] = 'cpu'
        for key in required_keys:
            if key not in spec:
                raise KeyError(key)
    except KeyError:
        print("Invalid experiment specification: {}".format(spec))
        raise

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger('exp_runner')

    logger.info("Starting the parameter counter!")
    logger.info(str(spec))

    # Create the directory
    if not os.path.exists(experiment_directory):
        os.makedirs(experiment_directory)
    else:
        assert c.EXPERIMENT_RUNNER_SHOULD_OVERWRITE, "Experiment directory {} already exists".format(
            experiment_directory)

    # Choose sequence model type
    if algorithm == 'transformer':
        sequence_model = TransformerXL(**spec)
    elif algorithm == 'lstm':
        sequence_model = LSTMModel(**spec)
    elif algorithm == 'cnn':
        sequence_model = GatedCNN(**spec)
    else:
        # Previously this branch only printed the spec and then crashed on
        # the unbound `sequence_model`; fail with an explicit message instead.
        raise ValueError(
            "Unknown algorithm {!r} in experiment specification: {}".format(
                algorithm, spec))

    # Total number of scalars = sum over parameters of the product of dims.
    model = sequence_model.get_model()
    total_parameters = 0
    for parameter in model.parameters():
        element_count = 1
        for dimension in parameter.size():
            element_count *= dimension
        total_parameters += element_count
    print(total_parameters)
    np.save(J(experiment_directory, 'parameters.npy'), [total_parameters])
예제 #8
0
def replace_practitioner(combined_dataset_path, id1, id2):
    """Merge every file of practitioner ``id1`` into the files of ``id2``.

    A file belongs to a practitioner when its name, with the text
    'practitioner' removed, starts with that id. For each ``id1`` file, every
    occurrence of ``id1`` in its text is replaced by ``id2``, the second
    entry of each bundle is overwritten with ``id2``'s practitioner object
    (taken from ``['entry'][0]['entry'][1]`` of the first ``id2`` file found),
    the bundles are appended to the matching ``id2`` file (or written as a
    new one), and the ``id1`` file is deleted.

    Args:
        combined_dataset_path (str): Directory holding the ``*.json`` files.
        id1 (str): Practitioner id to merge away.
        id2 (str): Practitioner id to merge into.

    Raises:
        ValueError: There are ``id1`` files to merge but no ``id2`` file to
            take the practitioner object from.
    """
    if id1 == id2:
        # Merging a practitioner into itself would append each file to itself
        # and then delete it, losing the data. Nothing to do.
        return

    files = [
        fname for fname in os.listdir(combined_dataset_path)
        if fname.endswith('json')
    ]

    # First, find the object for id2 practitioner.
    prac_object = None
    for fname in files:
        if fname.replace('practitioner', '').startswith(id2):
            with open(J(combined_dataset_path, fname), 'r',
                      encoding='utf-8') as f:
                prac_object = json.load(f)['entry'][0]['entry'][1]
            break

    sources = [
        fname for fname in files
        if fname.replace('practitioner', '').startswith(id1)
    ]
    if not sources:
        return
    if prac_object is None:
        # Without this check `null` would be written as the practitioner of
        # every merged bundle.
        raise ValueError(
            "No file found for practitioner {!r} in {}".format(
                id2, combined_dataset_path))

    for fname in sources:
        source_path = J(combined_dataset_path, fname)
        with open(source_path, 'r', encoding='utf-8') as f:
            new_obj = json.loads(f.read().replace(id1, id2))
        for bundle in new_obj['entry']:
            bundle['entry'][1] = prac_object

        target_path = J(combined_dataset_path, fname.replace(id1, id2))
        if os.path.exists(target_path):
            with open(target_path, 'r', encoding='utf-8') as f:
                merged = json.load(f)
            merged['entry'].extend(new_obj['entry'])
        else:
            merged = new_obj
        with open(target_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(merged, indent=2))
        os.remove(source_path)
예제 #9
0
def group_count_pracs(combined_dataset_path):
    """Count the stored bundles per practitioner in a combined dataset.

    Every ``*json`` file in ``combined_dataset_path`` must be named
    ``practitioner<id>_<anything>``; the length of its top-level ``'entry'``
    list is added to the total for ``<id>``.

    Args:
        combined_dataset_path (str): Directory holding the dataset files.

    Returns:
        dict: Practitioner id (str) -> number of entries over all its files.

    Raises:
        AttributeError: A json file name does not match the expected pattern.
    """
    filename_pattern = re.compile(r'practitioner(.+)_.*')
    pracs = {}
    for fname in os.listdir(combined_dataset_path):
        if not fname.endswith('json'):
            continue
        # The dataset files are written as UTF-8 elsewhere in this module, so
        # read them the same way instead of relying on the locale default.
        with open(J(combined_dataset_path, fname), 'r',
                  encoding='utf-8') as f:
            bundle_group = json.load(f)
        prac_id = filename_pattern.match(fname).group(1)
        pracs[prac_id] = pracs.get(prac_id, 0) + len(bundle_group['entry'])
    return pracs
예제 #10
0
def setup_config():
    """Load ``checker.conf`` from WORK_DIR and make sure it is complete.

    The parser is seeded with CFG_DEFAULTS, the existing file (if any) is
    read, the 'checker' and 'mangos' sections are added when absent, and the
    resulting configuration is written back to the same file.

    Returns:
        ConfigParser: The loaded configuration.
    """
    cfg = ConfigParser(CFG_DEFAULTS)
    conf_file = J(WORK_DIR, 'checker.conf')
    cfg.read(conf_file)
    for section in ('checker', 'mangos'):
        if not cfg.has_section(section):
            cfg.add_section(section)
    # `with` guarantees the handle is closed even when writing fails.
    with open(conf_file, 'wt') as fp:
        cfg.write(fp)
    return cfg
예제 #11
0
def rst2html(rst, theme=None, opts=None):
    """Render reStructuredText to HTML through ``publish_string``.

    The settings start from a copy of ``default_rst_opts``, are overlaid with
    ``opts`` when given, and always use the template
    'var/themes/template.txt'. The stylesheet list is 'basic.css' plus
    '<theme>/<theme>.css' when a theme is given, each under 'var/themes/'.

    Returns whatever ``publish_string`` returns for the 'html' writer.
    """
    settings = default_rst_opts.copy()
    settings.update(opts or {})
    settings['template'] = 'var/themes/template.txt'

    theme_css = ['%s/%s.css' % (theme, theme)] if theme else []
    css_paths = [J('var/themes/', name) for name in ['basic.css'] + theme_css]
    settings['stylesheet'] = ','.join(css_paths)

    return publish_string(rst, writer_name='html',
                          settings_overrides=settings)
예제 #12
0
def main(specification_dir, out_dir, num_gpus, exps_per_gpu):
    """Run every experiment specification on a pool of GPU-bound workers.

    Specifications are loaded from ``specification_dir``; ``out_dir`` is
    created when missing (a warning is logged when it is not empty). A pool
    of ``num_gpus * exps_per_gpu`` workers is started and a shared queue is
    filled with each GPU index ``exps_per_gpu`` times. Jobs are ordered by
    ``(1 + 10000 * depth) * width`` and handed to
    ``launch_experiment_on_device``.
    """
    # 1. Load the specifications
    specs = load_specifications(specification_dir)

    # 2. Create the output directory
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    if os.listdir(out_dir):
        logger.warning(
            "The output directory {} is not empty. Are you sure you want to continue?"
            .format(out_dir))

    def _job_cost(job):
        job_spec = job[0]
        return (1 + 10000 * job_spec['depth']) * job_spec['width']

    # 3. Create the workers with specific environment variables
    num_workers = num_gpus * exps_per_gpu

    with NonDaemonPool(num_workers) as pool:
        logger.info("Created {} workers".format(num_workers))

        # One queue slot per experiment that may run on a GPU at a time.
        manager = multiprocessing.Manager()
        available_devices = manager.Queue()
        for gpu_index in range(num_gpus):
            for _ in range(exps_per_gpu):
                available_devices.put(gpu_index)

        # 4. Create and distribute the workload
        workload = [(spec, J(out_dir, spec["name"]), available_devices)
                    for spec in specs]
        workload.sort(key=_job_cost)

        logger.info("Running {} jobs accross {} GPUs".format(
            len(workload), num_gpus))

        # 5. Launch the workers and wait for every job to finish.
        logger.info("Launching the workers using `run_experiment`.")
        for _ in pool.imap_unordered(launch_experiment_on_device, workload):
            pass

    logger.info("Success, all experiments completed!")
예제 #13
0
def main(specification_dir, start, end, info):
    """Write one JSON specification per hyperparameter combination.

    When ``info`` is truthy only ``print_info()`` is called. Otherwise the
    cartesian product of ``c.HYPERPARAMETERS`` is expanded; each combination
    is named ``<algorithm>_<index>``, overlaid with the algorithm-specific
    parameters, given ``embedding_dim = width``, completed with
    ``c.DEFAULT_VALUES_SPEC`` and written to
    ``<specification_dir>/<name>.json``. ``start`` and ``end`` are not used.
    """
    if info:
        print_info()
        return

    # First construct the cartesian product
    names = list(c.HYPERPARAMETERS)
    specs = [
        dict(zip(names, combination))
        for combination in itertools.product(*c.HYPERPARAMETERS.values())
    ]

    # Create the specification directory
    if E(specification_dir):
        if os.listdir(specification_dir):
            logger.warning(
                "Specification directory is not empty, "
                "are you sure you want to create it.")
    else:
        os.makedirs(specification_dir)

    logger.info("Making specifications.")

    for index, spec in enumerate(specs):
        algorithm = spec["algorithm"]
        file_name = "{}_{}".format(algorithm, index)
        spec["name"] = file_name

        overrides = c.ALGORITHM_SPECIFIC_PARAMETERS[algorithm]
        for key in overrides:
            spec[key] = overrides[key]

        spec["embedding_dim"] = spec["width"]

        # Defaults never replace a value that is already present.
        for key, value in c.DEFAULT_VALUES_SPEC.items():
            spec.setdefault(key, value)

        with open(J(specification_dir, file_name + ".json"), "w") as f:
            f.write(json.dumps(spec))

    logger.info("Specifications complete.")
예제 #14
0
def _save_train_test_plot(train_values, test_values, title, path):
    """Plot train and test curves on a fresh figure, save it and close it."""
    figure = plt.figure()
    plt.plot(train_values, label="Train")
    plt.plot(test_values, label="Test")
    plt.legend()
    plt.title(title)
    plt.savefig(path)
    plt.close(figure)


def main():
    """Render loss, perplexity and accuracy plots for every experiment.

    Takes the experiment output directory from the command line. For each
    entry inside it, ``losses.npy``, ``test_performance.npy`` and
    ``train_performance.npy`` are loaded; entries lacking any of them (or
    that are not directories) are skipped. ``losses.png``, ``perplexity.png``
    and ``accuracy.png`` are written next to the data.

    Raises:
        AssertionError: The output directory does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("output_directory",
                        type=str,
                        help="The experiment output directory")
    args = parser.parse_args()

    assert E(
        args.output_directory), "Output directory {} does not exist".format(
            args.output_directory)

    experiments = glob.glob(J(args.output_directory, "*"))

    for ex in tqdm.tqdm(experiments):
        try:
            losses = np.load(J(ex, "losses.npy"))
            test_perplexity, test_acc = zip(
                *np.load(J(ex, "test_performance.npy")))
            train_perplexity, train_acc = zip(
                *np.load(J(ex, "train_performance.npy")))
        except (FileNotFoundError, NotADirectoryError):
            # Incomplete experiment, or a plain file sitting in the directory.
            continue

        # Each plot gets its own figure that is closed after saving, so
        # figures do not pile up across experiments.
        figure = plt.figure()
        ls = moving_average(losses, 50)
        plt.scatter(range(len(ls)), ls, s=1)
        plt.title("Loss")
        plt.savefig(J(ex, 'losses.png'))
        plt.close(figure)

        _save_train_test_plot(train_perplexity, test_perplexity,
                              "Perplexity", J(ex, 'perplexity.png'))
        _save_train_test_plot(train_acc, test_acc, "Accuracy",
                              J(ex, 'accuracy.png'))
window = 10
epochs = 20
training = False

files = [
    "rd1_train.csv",
    "rd1_testA.csv",
    # "rd2_train.csv",
    # "rd1_testB.csv",
]

if __name__ == '__main__':

    rpt_list = []
    for f in files:
        df = pd.read_csv(J(data_path, f), index_col=0)
        rpt_list += [[c for c in t.strip().split()] for t in df.desc.tolist()]

    print("number of rpt:", len(rpt_list))

    # logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    # logging.root.setLevel(level=logging.INFO)

    wv_model = Word2Vec(size=embedding_size,
                        min_count=1,
                        sg=1,
                        workers=4,
                        window=window)
    wv_model.build_vocab(rpt_list)
    if training:
        wv_model.train(rpt_list,
예제 #16
0
import yaml
import pytest

from collections import defaultdict
from os import pardir
from os.path import join as J
from os.path import dirname, realpath

from pyBabyMaker.babymaker import BabyVariable, BabyMaker, BabyConfigParser, \
    BabyVariableResolver
from pyBabyMaker.base import UniqueList
from pyBabyMaker.io.NestedYAMLLoader import NestedYAMLLoader
from pyBabyMaker.io.TupleDump import PyTupleDump

# Directory containing this module, with symlinks resolved by realpath.
PWD = dirname(realpath(__file__))
# Parent of PWD, spelled as "<PWD>/.." (the path is not normalised).
PARDIR = J(PWD, pardir)
SAMPLE_YAML = J(PARDIR, 'samples', 'sample-babymaker.yml')
# NOTE(review): unlike the other sample paths these two are plain relative
# strings, not anchored at PARDIR - confirm what they are resolved against.
SAMPLE_ROOT = '../samples/sample.root'
SAMPLE_FRIEND = '../samples/sample_friend.root'
SAMPLE_TMPL = J(PARDIR, 'pyBabyMaker', 'cpp_templates', 'babymaker.cpp')
SAMPLE_CPP = J(PARDIR, 'samples', 'sample-babymaker.cpp')

######################
# Variable container #
######################


def test_BabyVariable_default():
    """A BabyVariable built from only a name reports ``input`` as False."""
    var = BabyVariable('stuff')

    assert var.input is False
예제 #17
0
def _entries_by_resource_type(bundle):
    """Group the entries of one bundle by their resource's ``resourceType``."""
    grouped = {}
    for entry in bundle['entry']:
        grouped.setdefault(entry['resource']['resourceType'], []).append(entry)
    return grouped


def read_dataset(dataset_path):
    """Load every ``*json`` file of a dataset into cross-linked resource maps.

    Each file holds a top-level ``'entry'`` list of bundles; each bundle's
    ``'entry'`` list holds resources tagged by ``resource.resourceType``. The
    first Organization, Practitioner and Patient of a bundle are wrapped in
    the corresponding classes, de-duplicated by id across bundles, and linked
    to each other and to the bundle's Encounters and Observations.
    Inconsistencies are reported with ``print`` and do not stop the load.

    Args:
        dataset_path (str): Directory holding the dataset files.

    Returns:
        dict: ``'organizations'``, ``'practitioners'``, ``'patients'``,
        ``'encounters'`` and ``'observations'``, each a dict keyed by id.

    Raises:
        IndexError: A bundle has no Organization, Practitioner or Patient.
    """
    json_files = [
        name for name in os.listdir(dataset_path) if name.endswith('json')
    ]

    organizations = {}
    practitioners = {}
    patients = {}
    encounters = {}
    observations = {}

    for file_name in json_files:
        print('Reading file', file_name)

        # Read as UTF-8, matching how the dataset files are written elsewhere.
        with open(J(dataset_path, file_name), 'r', encoding='utf-8') as f:
            bundle_group = json.load(f)['entry']

        for bundle in bundle_group:
            # One pass over the bundle instead of one filter per type.
            grouped = _entries_by_resource_type(bundle)
            raw_organizations = grouped.get('Organization', [])
            raw_practitioners = grouped.get('Practitioner', [])
            raw_patients = grouped.get('Patient', [])

            if len(raw_organizations) > 1:
                print("Oh no. More than one organization in bundle.")
            if len(raw_practitioners) > 1:
                print("Oh no. More than one practitioner in bundle.")
            if len(raw_patients) > 1:
                print("Oh no. More than one patient in bundle.")

            local_organization: Organization = Organization(
                raw_organizations[0])
            local_practitioner: Practitioner = Practitioner(
                raw_practitioners[0])
            local_patient: Patient = Patient(raw_patients[0])
            local_encounters: List[Encounter] = [
                Encounter(raw) for raw in grouped.get('Encounter', [])
            ]
            local_observations: List[Observation] = [
                Observation(raw) for raw in grouped.get('Observation', [])
            ]

            # Reuse the object already stored for a known id so that links
            # accumulate on a single instance.
            if local_organization.id not in organizations:
                organizations[local_organization.id] = local_organization
            else:
                local_organization = organizations[local_organization.id]

            if local_practitioner.id not in practitioners:
                practitioners[local_practitioner.id] = local_practitioner
            else:
                local_practitioner = practitioners[local_practitioner.id]

            if local_patient.id not in patients:
                patients[local_patient.id] = local_patient
            else:
                print("Oh no. Duplicate patient.")
                local_patient = patients[local_patient.id]

            local_encounters_ids = [enc.id for enc in local_encounters]

            local_patient.organization = local_organization.id
            local_patient.practitioner = local_practitioner.id
            local_patient.observations = [
                obs.id for obs in local_observations
            ]
            local_patient.encounters = list(local_encounters_ids)
            local_organization.patients.append(local_patient.id)
            local_practitioner.patients.append(local_patient.id)

            if local_practitioner.id not in local_organization.practitioners:
                local_organization.practitioners.append(local_practitioner.id)
            if local_organization.id not in local_practitioner.organizations:
                local_practitioner.organizations.append(local_organization.id)

            for local_encounter in local_encounters:
                if local_encounter.id in encounters:
                    print("Oh no. Duplicate encounter.")
                if local_encounter.patient != local_patient.id:
                    print(
                        "Oh no. Encounter patient id does not match with bundle patient."
                    )
                encounters[local_encounter.id] = local_encounter

            for local_observation in local_observations:
                if local_observation.id in observations:
                    # Was reported as a duplicate *encounter* (copy-paste).
                    print("Oh no. Duplicate observation.")
                if local_observation.patient != local_patient.id:
                    print(
                        "Oh no. Observation patient id does not match with bundle patient."
                    )
                if local_observation.encounter not in local_encounters_ids:
                    print("Oh no. Observation encounter not found in bundle.")

                observations[local_observation.id] = local_observation

    print(
        "Read data:\n{} Organizations\n{} Practitioners\n{} Patients\n{} Encounters\n{} Observations"
        .format(len(organizations), len(practitioners), len(patients),
                len(encounters), len(observations)))

    return {
        'organizations': organizations,
        'practitioners': practitioners,
        'patients': patients,
        'encounters': encounters,
        'observations': observations
    }
    "folder_id",
    "fold_start",
}

# Copy changed settings from a source run's config.json into a destination
# config. Usage: <script> dst src
if __name__ == "__main__":

    if len(sys.argv) < 3:
        print("args: dst src")
        sys.exit(-1)

    dst = sys.argv[1]
    path = sys.argv[2]

    # The source argument is a directory to search, not a json file itself.
    assert '.json' not in path

    # First config.json matched below `path`; raises IndexError when none is
    # found. NOTE(review): `**` presumably relies on a recursive glob
    # implementation imported elsewhere - confirm.
    src = glob(J(path, "**", "config.json"))[0]

    print("SRC:", src)
    print("DST:", dst)

    with open(src, 'r', encoding='utf-8') as f:
        conf_src = json.load(f)

    with open(dst, 'r', encoding='utf-8') as f:
        conf_dst = json.load(f)

    # Only keys present in both configs, not listed in `ignore`, and whose
    # values differ are taken over from the source.
    # NOTE(review): the visible code updates conf_dst in memory only; confirm
    # where it is written back.
    for k, v in conf_src.items():
        if k in conf_dst and k not in ignore and v != conf_dst[k]:
            print("CHANGE {}: {} -> {}".format(k, conf_dst[k], v))
            conf_dst[k] = v
예제 #19
0
import os
import sys
from glob2 import glob
from os.path import join as J
import shutil
import pandas as pd

# Keep only the first n folders listed in every info.csv below a root path
# and delete the folders listed after them.
if __name__ == "__main__":

    if len(sys.argv) < 3:
        print("please give path and n!")
        sys.exit(-1)

    root_path = sys.argv[1]
    n = int(sys.argv[2])

    for info_file in glob(J(root_path, "**", "info.csv")):
        # The first csv column names the folders next to the csv file.
        folder_names = pd.read_csv(info_file).iloc[:, 0].tolist()
        if len(folder_names) > n:
            parent_dir = os.path.split(info_file)[0]
            for folder_name in folder_names[n:]:
                target = J(parent_dir, folder_name)
                print("REMOVE", target)
                try:
                    shutil.rmtree(target)
                except Exception as error:
                    # Best effort: report and carry on with the next folder.
                    print(error)
예제 #20
0
from time import time, sleep
import datetime

from redis import Redis

import smtplib
from smtplib import SMTPSenderRefused
import logging
import traceback
import logging.handlers
import cPickle as pickle
from subprocess import PIPE, Popen
from multiprocessing import Process
from ConfigParser import ConfigParser, NoSectionError

WORK_DIR = J(os.environ['HOME'], '.mangop')

################## default settings for checker.conf ##############
#

CFG_DEFAULTS = {
    'time_to_wakeup': 90,
    'mangos_dir': '/home/mangos/bin/used_rev/bin/',
    'mangos_log_dir': '/var/log/mangos/',
    'run_socket_path': J(WORK_DIR, 'run.sock'),
    'mangosd_bin': 'mangosd',
    'realmd_bin': 'realmd',
    'redis_port': 6379,
    'redis_host': 'localhost',
    'smtp_host': 'localhost',
    'smtp_from': '*****@*****.**',
예제 #21
0
def main(args):
    '''
    Download every manuscript listed in args.names.

    Each one is stored below args.base_dir/<args.resolution>.
    '''
    for manuscript in args.names:
        target_dir = J(args.base_dir, str(args.resolution))
        download_manuscript(args.pages, args.resolution, target_dir,
                            manuscript)
예제 #22
0
    def train(self,
              F,
              model,
              dl_tr,
              dl_val=None,
              forward_batch_fun=None,
              get_loss_fun=None,
              eval_fun=None,
              step_fun=None,
              hold_best_model=False,
              optimizer=None,
              verbose=1,
              stop_cond=None,
              lr_scheduler=None,
              **kws):
        """Run the training loop described by the flag object ``F``.

        For each epoch from ``F.start_epoch`` to ``F.epochs`` the method calls
        ``step_fun`` on every batch of ``dl_tr``, evaluates with ``eval_fun``
        on ``dl_val``, logs the scores, hands the state to the saver and
        tracks the best ``F.primary_score``. The loop stops early when
        ``F.early_stop`` is set and the score has not improved for
        ``F.early_stop_num`` epochs, or when ``stop_cond(score)`` is truthy.
        The attributes of ``F`` are restored from a deep copy before
        returning.

        Args:
            F: Flag/config object; its attributes (lr, optimizer, epochs,
                resume_path, saving/logging options, SWA options, ...) drive
                the run and are temporarily modified.
            model: Model to train; must offer ``parameters()``, ``train()``,
                ``eval()`` and ``load_state_dict()`` - presumably a
                ``torch.nn.Module``.
            dl_tr: Training batches; iterated once per epoch, ``len()`` used.
            dl_val: Validation batches; defaults to ``dl_tr``.
            forward_batch_fun, get_loss_fun, eval_fun, step_fun: Optional
                overrides for the module-level ``_forward_batch``,
                ``_get_loss``, ``_eval_model`` and ``_train_step``.
            hold_best_model: Keep a deep copy of the best model in
                ``self.best_model``.
            optimizer: Ready-made optimizer; when None one is built from
                ``F.optimizer`` ("sgd", "adam" or "adamw").
            verbose: Passed to ``Logger``.
            stop_cond: Optional callable taking the epoch score dict.
            lr_scheduler: Optional scheduler; requires ``optimizer``.
            **kws: Forwarded to ``step_fun`` and ``eval_fun``.

        Returns:
            None. Results are left in ``self.best_score``,
            ``self.best_epoch`` and ``self.best_model``.

        NOTE(review): ``swa_scheduler`` is only created when ``F.use_swa`` is
        truthy; ``epoch < F.swa_start`` is also evaluated for the regular
        scheduler regardless of ``F.use_swa``; and ``score`` is read after the
        loop, so it is unbound if no epoch ran and ``stop_cond`` is given.
        """

        self.best_score = None
        self.best_epoch = None
        self.best_model = None
        self.swa_model = None
        self.n_avg = 0

        if lr_scheduler is not None:
            assert optimizer is not None

        if forward_batch_fun is None: forward_batch_fun = _forward_batch
        if get_loss_fun is None: get_loss_fun = _get_loss
        if eval_fun is None: eval_fun = _eval_model
        if step_fun is None: step_fun = _train_step

        # Snapshot of the flags, restored at the very end of the method.
        old_flag = copy.deepcopy(F.__dict__)

        dl_val = dl_tr if dl_val is None else dl_val

        # Optional file listing, one per line, the keys handed to the saver
        # as `ignore_keys`.
        ignore_keys = None
        if F.not_save_keys_file is not None:
            with open(F.not_save_keys_file, 'r') as f:
                ignore_keys = [
                    t.strip() for t in f.readlines() if len(t.strip()) > 0
                ]

        if optimizer is not None:
            F.optimizer = optimizer.__class__.__name__
        else:
            if F.optimizer.lower() == "sgd":
                optimizer = torch.optim.SGD(lr=F.lr,
                                            params=model.parameters(),
                                            momentum=F.momentum,
                                            weight_decay=F.weight_decay)
            elif F.optimizer.lower() == "adam":
                optimizer = torch.optim.Adam(lr=F.lr,
                                             params=model.parameters(),
                                             weight_decay=F.weight_decay)
            elif F.optimizer.lower() == "adamw":
                optimizer = torch.optim.AdamW(lr=F.lr,
                                              params=model.parameters(),
                                              weight_decay=F.weight_decay)
            else:
                # NOTE(review): terminates the whole process, not just the run.
                print("optimizer not found or not support!")
                sys.exit(-1)

        # Resume: reload optimizer/model/scheduler state and continue at the
        # epoch after the one recorded in info.json.
        if F.resume_path is not None:
            optimizer.load_state_dict(
                torch.load(J(F.resume_path, "optimizer.pth")))
            model.load_state_dict(torch.load(J(F.resume_path, "model.pth")),
                                  strict=False)
            # NOTE(review): the file opened here is never closed explicitly.
            F.start_epoch = json.load(
                open(J(F.resume_path, "info.json"), mode='r',
                     encoding='utf-8'))['epoch'] + 1
            if lr_scheduler is not None:
                lr_scheduler.load_state_dict(
                    torch.load(J(F.resume_path, "lr_scheduler.pth")))

        L = Logger(verbose=verbose)
        # Run folders are named "=<name>=" or "=<name>_<folder_id>=".
        if F.folder_id is None:
            F.folder_id = "={}=".format(get_name())
        else:
            F.folder_id = "={}_{}=".format(get_name(), F.folder_id)
        if F.resume_path is not None:
            # Reuse the run id found between the '=' marks of the resume path.
            logging_path = J(F.logging_path,
                             "={}=".format(F.resume_path.split('=')[1]))
        else:
            logging_path = J(F.logging_path, F.folder_id)
        if F.enable_logging:
            if not os.path.exists(logging_path):
                os.mkdir(logging_path)
            L.add_file_handler(J(logging_path, 'log.txt'), mode='a')

        if F.resume_path is not None:
            saving_path = os.path.split(F.resume_path)[0]
        else:
            saving_path = J(F.saving_path, F.folder_id)
        if F.enable_saving and not os.path.exists(saving_path):
            os.mkdir(saving_path)

        F.logging_path = logging_path
        F.saving_path = saving_path

        for k, v in F.__dict__.items():
            L.info("{} = {}".format(k, v))

        if F.enable_saving:
            save_config(F, J(F.saving_path, 'config.json'))

        if F.use_swa:
            swa_scheduler = SWALR(optimizer, swa_lr=F.swa_lr)

        with get_logger(logging_path) as L2, \
                    get_saver(saving_path, num_best=F.save_num_best, mode=F.save_mode,
                            every_epochs=F.save_every_epochs) as S:
            L2.disabled = not F.enable_logging
            S.disabled = not F.enable_saving
            # Best primary score so far, epochs since the last improvement,
            # and the epoch of the best score.
            _best = -np.inf if F.higher_better else np.inf
            _num = 0
            _best_epoch = 1
            for epoch in range(F.start_epoch, F.epochs + 1):
                model.train()
                for it, batch in enumerate(dl_tr):
                    loss, sc = step_fun(F, model, optimizer, batch,
                                        forward_batch_fun, get_loss_fun, **kws)
                    L.debug("[{}/{}][{}/{}] - {}".format(
                        epoch, F.epochs, it + 1, len(dl_tr), " - ".join([
                            "{}: {:.3f}".format(k, v) for k, v in sc.items()
                        ])))
                    L2.write(data=sc, step=(epoch - 1) * len(dl_tr) + it + 1)

                # Before the SWA phase the trained model is evaluated; from
                # F.swa_start on, self.swa_model is evaluated instead.
                if not F.use_swa or epoch < F.swa_start:
                    model.eval()
                    score = eval_fun(F, model, dl_val, forward_batch_fun,
                                     **kws)
                else:
                    # presumably folds the current weights into
                    # self.swa_model - confirm in update_parameters
                    self.update_parameters(model)
                    self.swa_model.eval()
                    score = eval_fun(F, self.swa_model, dl_val,
                                     forward_batch_fun, **kws)
                    swa_scheduler.step()
                    score['lr'] = swa_scheduler.get_lr()[0]

                if lr_scheduler is not None and epoch < F.swa_start:
                    # ReduceLROnPlateau needs the monitored metric.
                    if lr_scheduler.__class__.__name__ == "ReduceLROnPlateau":
                        lr_scheduler.step(score[F.primary_score])
                    else:
                        lr_scheduler.step()
                    score['lr'] = lr_scheduler.get_lr()[0]

                L.info("[{}/{}][{}/{}] - {}".format(
                    epoch, F.epochs, len(dl_tr), len(dl_tr), " - ".join([
                        "{}: {:.3f}".format(k, v) for k, v in score.items()
                    ])))
                L2.write(data=score, step=epoch * len(dl_tr))
                save_state = {
                    F.save_model_name:
                    model if not F.use_swa or epoch < F.swa_start else
                    self.swa_model,
                    'optimizer':
                    optimizer
                }
                if lr_scheduler is not None:
                    save_state['lr_scheduler'] = lr_scheduler
                save_info = {'epoch': epoch, **score}
                # The saver receives the primary score as a cost, negated
                # when a higher score is better.
                S.check(
                    save_state,
                    cost=-score[F.primary_score]
                    if F.higher_better else score[F.primary_score],
                    epoch=epoch,
                    info=save_info,
                    ignore_keys=ignore_keys,
                )
                if F.enable_saving and F.save_last:
                    S.save_model(
                        save_state,
                        "last",
                        info=save_info,
                        ignore_keys=ignore_keys,
                    )

                # Improvement test in the direction given by F.higher_better.
                if F.higher_better and score[
                        F.
                        primary_score] > _best or not F.higher_better and score[
                            F.primary_score] < _best:
                    _best = score[F.primary_score]
                    _num = 0
                    _best_epoch = epoch
                    self.best_score = score
                    self.best_epoch = _best_epoch
                    if hold_best_model:
                        self.best_model = copy.deepcopy(model)
                else:
                    _num += 1
                if F.early_stop and _num == F.early_stop_num:
                    L.info(
                        '>>>>>>>> Meet early-stopping, the best score is {} on epoch {} <<<<<<<<'
                        .format(_best, _best_epoch))
                    break
                if stop_cond is not None and stop_cond(score):
                    L.info(
                        '>>>>>>>> Meet cond-stopping, the best score is {} on epoch {} <<<<<<<<'
                        .format(_best, _best_epoch))
                    break

        # Report when a stopping rule was configured but never triggered.
        if F.early_stop and _num < F.early_stop_num:
            L.info(
                '>>>>>>>> Do not meet early-stopping! The best score is {} on epoch {} <<<<<<<<'
                .format(_best, _best_epoch))
        if stop_cond is not None and not stop_cond(score):
            L.info('>>>>>>>> Do not meet cond-stopping! <<<<<<<<')

        L.clear()
        # Undo every change made to F during the run.
        for k, v in old_flag.items():
            setattr(F, k, v)
예제 #23
0
def download_page(resolution, base_dir, manuscript, page):
    '''
    Fetch the image blocks of one page into base_dir/manuscript/page.

    Blocks are requested row by row and column by column, starting at
    (0, 0). Each one is stored in its own <page>_<row>_<column>.jpg file,
    so the blocks still have to be concatenated into a page afterwards.

    Returns the tuple (max_column, max_row): the highest column and row
    indices of the blocks that were not reported as invalid.
    '''
    mkpath(J(base_dir, manuscript, page))

    # Ask for a block that is far out of range first. It is handed to
    # download_block so that out-of-range answers (the edges of the page)
    # can be recognised.
    nil_block = _session.get(
        URL_IMAGE_BLOCK.format(manuscript_and_page=page,
                               resolution=resolution,
                               column=999,
                               row=999))

    row = column = 0
    max_row = max_column = 0

    while True:
        block_name = '{0}_{1}_{2}.jpg'.format(page, row, column)
        filename = J(base_dir, manuscript, page, block_name)
        url = URL_IMAGE_BLOCK.format(manuscript_and_page=page,
                                     resolution=resolution,
                                     column=column,
                                     row=row)

        try:
            download_block(url, filename, nil_block)
        except BlockAlreadyDownloaded:
            progress_mark = '.'
        except BlockInvalid:
            put('\n')
            if column != 0:
                # Ran past the end of the current row: start the next one.
                column = 0
                row += 1
                continue
            # Even the first column of this row is invalid: the page is done.
            print('End of the page')
            print('Page {0} has size row x column = {1} x {2}'.format(
                page, max_row, max_column))
            break
        except BlockMaxRetriesReached:
            # The block is given up on, but it still counts towards the size.
            progress_mark = 'X'
        else:
            progress_mark = '.'

        put(progress_mark)

        # Remember the furthest block reached, then move on to the next column.
        max_row = max(row, max_row)
        max_column = max(column, max_column)
        column += 1

    return max_column, max_row
예제 #24
0
def grid_search(
    F,
    T,
    model,
    dl_tr,
    dl_val,
    params,
    fast=True,
    out_path='.',
    verbose=-1,
    **kws,
):
    """Tune the flags named in ``params`` one after another.

    Every flag starts at the median of its candidate values. The flags are
    then visited in the iteration order of ``params``: each candidate value is
    trained with ``T.train`` and, once all candidates of a flag were tried,
    the flag is fixed to the value that produced the best score seen so far
    (or left at its starting value when no candidate improved on it).

    Args:
        F: Flag object. Must already carry every key of ``params`` as an
            instance attribute. It is mutated: ``enable_logging`` and
            ``enable_saving`` are switched off and the tuned flags are
            overwritten. ``higher_better`` and ``primary_score`` are read.
        T: Trainer exposing ``train(...)`` and, afterwards, ``best_score``.
        model: Forwarded to ``T.train`` unchanged.
        dl_tr: Forwarded to ``T.train`` unchanged.
        dl_val: Forwarded to ``T.train`` unchanged.
        params (dict): Maps a flag name to the candidate values to try.
        fast: Accepted but not used by this function.
        out_path: Directory that receives the ``<name>_record.csv`` file.
        verbose: Forwarded to ``T.train``.
        **kws: Extra keyword arguments forwarded to ``T.train``.
    """
    assert type(params) is dict
    for flag_name in params:
        assert flag_name in F.__dict__

    F.enable_logging = False
    F.enable_saving = False

    # Starting point of the search: the median candidate of every flag.
    mid_params = {name: np.median(cands) for name, cands in params.items()}
    best_params = copy.deepcopy(mid_params)
    curr_params = copy.deepcopy(mid_params)
    for name, start_value in mid_params.items():
        setattr(F, name, start_value)

    recd = []
    _best = -np.inf if F.higher_better else np.inf
    for key, values in params.items():
        for v in values:
            print(bcolors.OKGREEN +
                  "****** training when {} = {}".format(key, v) + bcolors.ENDC)

            curr_params[key] = v
            setattr(F, key, v)
            T.train(F, model, dl_tr, dl_val, verbose=verbose, **kws)

            run_score = T.best_score[F.primary_score]
            curr_params['score'] = run_score
            recd.append(copy.deepcopy(curr_params))

            print(bcolors.OKGREEN +
                  ">>>>>> finish when {} = {}, best = {}".format(
                      key, v, curr_params['score']) + bcolors.ENDC)

            # A run only counts when it strictly beats the best score so far,
            # across all flags tuned up to now.
            if F.higher_better:
                improved = run_score > _best
            else:
                improved = run_score < _best
            if improved:
                _best = run_score
                best_params[key] = v

        # Freeze this flag at its winning value before tuning the next one.
        winner = best_params[key]
        curr_params[key] = winner
        setattr(F, key, winner)

    print("=" * 60)
    for name, value in best_params.items():
        print(bcolors.OKCYAN + "{} = {}".format(name, value) + bcolors.ENDC)
    print(bcolors.OKCYAN + "best = {}".format(_best) + bcolors.ENDC)

    # One row per training run, best score first.
    record = pd.DataFrame(recd)
    record = record.sort_values('score', ascending=not F.higher_better)
    recd = record.reset_index(drop=True)
    recd.to_csv(J(out_path, "{}_record.csv".format(get_name())))
예제 #25
0
from os.path import join as J


def we_are_frozen():
    """Report whether ``sys`` carries a ``frozen`` attribute.

    The attribute is present when all of the modules are built into the
    interpreter, e.g. by py2exe.
    """
    is_frozen = hasattr(sys, "frozen")
    return is_frozen


def module_path():
    """Return the directory the program is running from.

    When ``we_are_frozen()`` is true this is the directory of
    ``sys.executable``; otherwise it is the directory of this source file.
    """
    if we_are_frozen():
        return os.path.dirname(sys.executable)
    return os.path.dirname(__file__)


# Short aliases for the two path helpers used below.
J = os.path.join
E = os.path.exists

# The 'assets' directory sits one level above the directory of this file.
ASSETS_DIR = os.path.abspath(J(os.path.dirname(__file__), '..', 'assets'))

# Root of the data tree: the MINERL_OUTPUT_ROOT environment variable wins,
# otherwise ~/minerl.data is used.
BASE_DIR = os.environ.get(
    'MINERL_OUTPUT_ROOT',
    os.path.expanduser(J('~', 'minerl.data')),
)
OUTPUT_DIR = J(BASE_DIR, 'output')
DOWNLOAD_DIR = J(BASE_DIR, 'downloaded_sync')

RENDERERS_DIR = os.path.expanduser(J('~', 'renderers'))

# NOTE(review): presumably the number of Minecraft instances to run and the
# name of a storage bucket - confirm against the code that uses them.
NUM_MINECRAFTS = 28
BUCKET_NAME = 'pizza-party'
예제 #26
0
import os
import sys
import json
import numpy as np
from glob2 import glob
from os.path import join as J

if __name__ == "__main__":
    # Average every metric of the info.json files found below a root
    # directory, store the result as <root>/mean_score.json and print it.

    if len(sys.argv) < 2:
        print("please give path!")
        sys.exit(-1)

    root_path = sys.argv[1]

    # glob comes from glob2 here; the "**" component is meant to match
    # info.json files in nested sub-directories of root_path.
    files = glob(J(root_path, "**", "info.json"))
    if not files:
        # Without this guard the d[0] lookup below fails with a bare
        # IndexError that does not say what went wrong.
        print("no info.json found under {}!".format(root_path))
        sys.exit(-1)

    d = []
    for info_path in files:
        with open(info_path) as info_file:
            d.append(json.load(info_file))

    # The keys of the first file define which metrics are averaged; every
    # other file has to provide the same keys.
    mean_score = {k: np.mean([t[k] for t in d]) for k in d[0].keys()}

    with open(J(root_path, "mean_score.json"), 'w') as out_file:
        json.dump(mean_score, out_file, indent=4)
    for k, v in mean_score.items():
        print("{} = {}".format(k, v))
예제 #27
0
def run_experiment(spec, experiment_directory):
    """Runs an experiment based on the desired experiment specification.
    This process will record the desired response variables and write them to the experiment directory.

    The specification is completed with default values, a copy of it is
    written to ``params.json``, the selected sequence model is trained for
    ``spec['max_step']`` steps, and the loss/performance arrays are saved to
    the experiment directory every 500 steps.

    Args:
        spec (dict): The JSON object specifying the experiment to run.
        experiment_directory (str):  The directory path to which to write the response variables.

    Raises:
        KeyError: If a required key is missing from the specification (the
            specification is printed before the error is re-raised).
        AssertionError: If the experiment directory already exists and
            ``c.EXPERIMENT_RUNNER_SHOULD_OVERWRITE`` is falsy.
        ValueError: If ``spec['algorithm']`` is not one of 'transformer',
            'lstm' or 'cnn'.
        KeyboardInterrupt: Re-raised after being logged when training is
            interrupted.
    """
    # spec, experiment_directory = args

    # Unpack some of the specification information
    try:
        spec = set_spec_default_values(spec)

        algorithm = spec["algorithm"]
        batch_size = spec['batch_size']
        bptt_len = spec['bptt_len']
        device = spec['device']
        hmm_hidden = spec['hmm_hidden']
        max_step = spec['max_step']
        name = spec['name']
        sequence_dependence = spec['sequence_dependence']
        vocab = spec['vocab']
        # Unpack additional arguments <here>

    except KeyError:
        print("Invalid experiment specification: {}".format(spec))
        raise

    logging.basicConfig(level=logging.DEBUG)
    # filename=J(experiment_directory, 'out.log'),
    # filemode='w')
    logger = logging.getLogger('exp_runner')

    logger.info("Starting the experiment!")
    logger.info(str(spec))

    # Create the directory
    if not os.path.exists(experiment_directory):
        os.makedirs(experiment_directory)
    else:
        assert c.EXPERIMENT_RUNNER_SHOULD_OVERWRITE, "Experiment directory {} already exists".format(
            experiment_directory)

    # Output a copy of the experiment specification
    with open(J(experiment_directory, 'params.json'), 'w') as f:
        json.dump(spec, f)

    # Choose sequence model type
    if algorithm == 'transformer':
        sequence_model = TransformerXL(**spec)
    elif algorithm == 'lstm':
        sequence_model = LSTMModel(**spec)
    elif algorithm == 'cnn':
        sequence_model = GatedCNN(**spec)
    else:
        # Fail here with a clear message; otherwise sequence_model stays
        # unbound and the function dies later with a NameError.
        raise ValueError(
            "Unknown algorithm {!r} in experiment specification: {}".format(
                algorithm, spec))

    # TODO: loop over training files/algorithm specification
    ROOT_PATH = 'generated_data'
    DATA_FILE = 'V{}_hmm_hidden_{}_lag_{}_vocab_{}.txt'.format(
        c.DATA_GENERATION_VERSION, hmm_hidden, sequence_dependence, vocab)
    train_file = 'train_' + DATA_FILE
    test_file = 'test_' + DATA_FILE
    device = torch.device(device)

    # Create dataset iterators
    train_iter, test_iter = torchtext_batch_iterators(ROOT_PATH,
                                                      train_file,
                                                      test_file,
                                                      batch_size=batch_size,
                                                      bptt_len=bptt_len,
                                                      device=device,
                                                      batch_first=True,
                                                      repeat=False)

    # A second, independent pair of iterators that is only used for the
    # periodic evaluation below.
    train_perplex_iter, test_perplex_iter = torchtext_batch_iterators(
        ROOT_PATH,
        train_file,
        test_file,
        batch_size=batch_size,
        bptt_len=bptt_len,
        device=device,
        batch_first=True,
        repeat=False)

    # Model
    model = sequence_model.get_model()
    optimizer = sequence_model.get_optimizer()
    scheduler = sequence_model.get_scheduler()

    max_step = spec['max_step']
    eval_steps = spec["eval_steps"]
    train_step = 0
    train_loss = 0
    best_val_loss = None

    losses = []
    test_performance = []
    train_performance = []
    step_to_performance = []

    num_steps = 0
    # Training Loop

    tqdm_out = TqdmLogger(logger, level=logging.INFO)
    progress = tqdm.tqdm(total=max_step, )

    try:
        # Epochs repeat until num_steps reaches max_step.
        for epoch in itertools.count(start=1):
            model.train()
            mems = tuple()
            print()

            for train_step, batch in enumerate(train_iter):
                num_steps += 1
                progress.update()
                loss = sequence_model.train_step(batch.text,
                                                 batch.target,
                                                 mems=mems)
                losses.append(loss)
                progress.set_description("Loss {:.4f}".format(loss))

                # Update scheduler
                sequence_model.update_scheduler(num_steps)

                if num_steps % 500 == 0:
                    progress.write("Saving loss performance!")
                    np.save(J(experiment_directory, 'losses.npy'), losses)
                    np.save(J(experiment_directory, 'test_performance.npy'),
                            test_performance)
                    np.save(J(experiment_directory, 'train_performance.npy'),
                            train_performance)
                    np.save(J(experiment_directory, 'step_to_performance.npy'),
                            step_to_performance)

                if num_steps % 1000 == 0:
                    # Calculate perplexity
                    progress.write("-" * 100)
                    progress.write("Model Performance:")
                    test_performance.append(
                        evaluate_model(sequence_model, test_perplex_iter, 2000,
                                       vocab))
                    train_performance.append(
                        evaluate_model(sequence_model, train_perplex_iter,
                                       1000, vocab))
                    step_to_performance.append(num_steps)
                    progress.write(
                        "Test (Perplex, Accuracy): {:.6f}, {:.6f}".format(
                            *test_performance[-1]))
                    progress.write(
                        "Train (Perplex, Accuracy): {:.6f}, {:.6f}".format(
                            *train_performance[-1]))
                    progress.write("Average loss (past 1000): {}".format(
                        np.mean(losses[-1000:])))

                if num_steps >= max_step:
                    break

            # The inner break only leaves the batch loop; leave the epoch
            # loop as well once the step budget is used up.
            if num_steps >= max_step:
                progress.write('-' * 100)
                progress.write('End of training')
                break

            # if val_loss is None or val_loss < best_val_loss:
            #     best_val_loss = val_loss
            #     # TODO: save the best performing model so far(and its stats)

    except KeyboardInterrupt:
        logger.info('-' * 100)
        logger.info('Exiting from training early')
        raise
예제 #28
0
        lab = df.label.map(lambda t: func(t, NUM_LABEL))
        lab = pd.DataFrame(np.stack(lab.values), index=lab.index)
        ret['label'] = lab
        df.drop('label', inplace=True, axis=1)
    if "label2" in df.columns:
        lab = df.label2.map(lambda t: func(t, NUM_LABEL2))
        lab = pd.DataFrame(np.stack(lab.values), index=lab.index)
        ret['label2'] = lab
        df.drop('label2', inplace=True, axis=1)
    ret['desc'] = df
    return ret


# Convert every raw input file with process_data and write the resulting
# frames to out_path as CSV. The 'desc' entry is written for every file;
# 'label' / 'label2' are only written for the training files.
print("preprocessing data ...")

# rd1 training file: description frame plus the 'label' frame.
ret = process_data(J(data_path, rd1_train_name))
ret['desc'].to_csv(J(out_path, "rd1_train.csv"))
ret['label'].to_csv(J(out_path, "rd1_train_label.csv"))

# rd1 test files A and B: only the description frame is written.
ret = process_data(J(data_path, rd1_testA_name))
ret['desc'].to_csv(J(out_path, "rd1_testA.csv"))

ret = process_data(J(data_path, rd1_testB_name))
ret['desc'].to_csv(J(out_path, "rd1_testB.csv"))

# rd2 training file: description frame plus both label frames. 'label2' is
# only present in the result when the input has a "label2" column.
ret = process_data(J(data_path, rd2_train_name))
ret['desc'].to_csv(J(out_path, "rd2_train.csv"))
ret['label'].to_csv(J(out_path, "rd2_train_label.csv"))
ret['label2'].to_csv(J(out_path, "rd2_train_label2.csv"))

try:
예제 #29
0
def getdatalist(train = True):
    """Return the lines of the train list file or of the test list file.

    Args:
        train: When truthy the file named by ``trainlist`` is read,
            otherwise the one named by ``testlist``.

    Returns:
        list[str]: The lines of the file, line endings included.
    """
    listpath = trainlist if train else testlist
    # The with-block closes the handle; a bare open(...).readlines() leaks it.
    with open(listpath, 'r') as listfile:
        return listfile.readlines()

def getMapper(idxpath = "/data/keshav/ucf/ucflist/classInd.txt"):
    """Build a bidict from the index file at ``idxpath``.

    Every non-blank line must consist of two whitespace-separated tokens,
    an integer followed by a name. The name becomes the key and a
    one-element tensor holding ``integer - 1`` becomes the value.

    Args:
        idxpath: Path of the index file.

    Returns:
        bidict: name -> ``torch.tensor([index - 1])``.

    Raises:
        ValueError: If a non-blank line does not have exactly two tokens or
            its first token is not an integer.
    """
    with open(idxpath, 'r') as idxfile:
        # Blank lines (e.g. a trailing newline at the end of the file) are
        # skipped so they do not break the two-value unpacking below.
        entries = [line.split() for line in idxfile if line.strip()]
    return bidict({name: torch.tensor([int(idx) - 1]) for idx, name in entries})

# Name -> index-tensor bidict, built once at import time from getMapper()'s
# default index file (so importing this module reads that file).
mapper = getMapper()


def randomSequenceChunk(x, n):
    """Return n consecutive items of x, starting at a random offset.

    The offset is drawn uniformly from 0 to len(x) - n inclusive, so the
    chunk always lies completely inside x.
    """
    offset = random.randint(0, len(x) - n)
    return x[offset:offset + n]

def getactualtestpath(testpath):
    """Turn one line of the test list into a path below datapath.

    Surrounding whitespace and every '.avi' occurrence are removed first.
    """
    return J(datapath, testpath.strip().replace('.avi',''))


def getactualtrainpath(trainpath):
    """Turn one line of the train list into a path below datapath.

    Only the part of the line in front of the first '.avi ' is used.
    """
    return J(datapath, trainpath.split('.avi ')[0])


def getframesfrompath(x, n, pathgetter):
    """Return n consecutive, randomly positioned .jpg paths for entry x.

    pathgetter maps x to a directory; the .jpg files found directly in it
    are passed through order() before the chunk is drawn.
    """
    jpgs = [*Path(pathgetter(x)).glob("*.jpg")]
    return randomSequenceChunk(order(jpgs), n)


# Selects the path builder by the boolean train flag.
getactualpath = {True:getactualtrainpath, False:getactualtestpath}

# path : one entry of the train list or of the test list
# Picks n consecutive frames at a random position within one video
def getXorY(path,n = 10, train = True):
    """Build a data(frames=..., label=...) sample for one list entry.

    Args:
        path: One line of the train list (train=True) or test list.
        n: Number of consecutive frames to pick.
        train: Selects the path builder from getactualpath.
    """
    pathgetter = getactualpath.get(train)
    frames = getframesfrompath(path, n, pathgetter)
    # The name of the frame's grandparent directory is the key looked up in
    # mapper.
    classname = frames[0].parent.parent.name
    label = mapper.get(classname)
    return data(frames = frames, label = label)
예제 #30
0
                seq_pad_meth=F.seq_pad_meth,
                seq_mask_ratio=0,
                seq_rep_prob=0,
                token_range=token_range,
            ),
        )

        model = crt_model(F).to(device)

        if F.pretrain_model_file is not None:
            if not os.path.isdir(F.pretrain_model_file):
                model.load_state_dict(torch.load(F.pretrain_model_file),
                                      strict=False)
            else:
                model.load_state_dict(torch.load(
                    glob(J(F.pretrain_model_file, "**", "last",
                           "model.pth"))[0]),
                                      strict=False)

        # base_opt = torch.optim.AdamW(lr=F.lr, params=model.parameters(), weight_decay=F.weight_decay)
        # lookahead = Lookahead(base_opt, k=5, alpha=0.5)
        # lr_scheduler = LambdaLR(base_opt, lr_lambda=lambda epoch: warmup_only(epoch))
        # lr_scheduler = CosineAnnealingWarmRestarts(base_opt, T_0=F.T_0, T_mult=1)

        T.train(
            F,
            model,
            dl_tr,
            dl_val,
            forward_batch_fun=forward_batch_fun,
            hold_best_model=False,
            stop_cond=lambda sc: sc['val_score'] > F.val_score_limit,