Exemplo n.º 1
0
    def get(self):
        """Write the stream database and its last update time as JSON.

        Both values are looked up in memcache first. On a miss a warning is
        logged, the value is loaded from the datastore and stored back in
        memcache.
        """
        increment_hit_counter(datastore_key_hits_streams_json)

        # Stream database: cached dict, or the decoded datastore JSON on a miss
        streams_db = memcache.get(memcache_key_database)
        if streams_db is None:
            logger.warn(
                'memcache failed on key: {}'.format(memcache_key_database))
            stored_db = ndb_get_entity(JsonDatabase, datastore_key_database)
            streams_db = utils.json_to_dict(stored_db.value)
            memcache.set(memcache_key_database, streams_db)

        # Last update time: same cache-then-datastore lookup
        updated_at = memcache.get(memcache_key_last_update)
        if updated_at is None:
            logger.warn(
                'memcache failed on key: {}'.format(memcache_key_last_update))
            stored_time = ndb_get_entity(Time, datastore_key_last_update)
            updated_at = stored_time.value
            memcache.set(memcache_key_last_update, updated_at)

        payload = utils.dict_to_json(
            {'streams': streams_db, 'last_update': updated_at})
        self.response.headers['Content-Type'] = 'application/json'
        self.response.out.write(payload)
Exemplo n.º 2
0
def read_spec(spec, spec_dir):
    """Load the specification `<spec_dir>/<spec>.json`.

    Asserts that the file exists, then returns whatever
    utils.json_to_dict produces for that path.
    """
    spec_path = os.path.join(spec_dir, "{}.json".format(spec))
    missing_msg = "Specification file '{}' does not exist".format(spec_path)
    assert os.path.exists(spec_path), missing_msg

    return utils.json_to_dict(spec_path)
Exemplo n.º 3
0
def backup_database(backup_key):
    """Store a JSON copy of the current database under `backup_key`.

    The database dict is read from memcache. On a miss a warning is logged
    and the dict is reloaded from the datastore and cached again before the
    backup is written.
    """
    current_db = memcache.get(memcache_key_database)
    if current_db is None:
        logger.warn('memcache failed on key: {}'.format(memcache_key_database))
        stored_db = ndb_get_entity(JsonDatabase, datastore_key_database)
        current_db = utils.json_to_dict(stored_db.value)
        memcache.set(memcache_key_database, current_db)

    serialized = utils.dict_to_json(current_db)
    logger.info('Backup database to key: {}'.format(backup_key))
    ndb_set_value(JsonDatabase, backup_key, serialized)
Exemplo n.º 4
0
def update_database():
    """Refresh the stored database with the current streams.

    Loads the database from the datastore, passes it together with the
    result of streams.get_current_streams() to streams.update_database,
    then writes the result to the datastore and to memcache.
    """
    stored_db = ndb_get_entity(JsonDatabase, datastore_key_database)
    previous_db = utils.json_to_dict(stored_db.value)
    fetched_streams = streams.get_current_streams()

    refreshed_db = streams.update_database(previous_db, fetched_streams)
    refreshed_json = utils.dict_to_json(refreshed_db)

    ndb_set_value(JsonDatabase, datastore_key_database, refreshed_json)
    # Drop the old cache entry before caching the refreshed dict
    memcache.delete(memcache_key_database)
    memcache.set(memcache_key_database, refreshed_db)
Exemplo n.º 5
0
def edit_database(db_json):
    """Replace the stored database with `db_json`, backing up the old one.

    The current database is first backed up under a key built from the
    current UTC time; then `db_json` is written to the datastore and its
    decoded dict is placed in memcache.
    """
    new_db = utils.json_to_dict(db_json)

    # Timestamped key so each edit gets its own backup entry
    stamp = datetime.datetime.utcnow().strftime('%Y-%m-%d_%H-%M_%S')
    backup_database('backup_edit_{}'.format(stamp))

    ndb_set_value(JsonDatabase, datastore_key_database, db_json)
    memcache.delete(memcache_key_database)
    memcache.set(memcache_key_database, new_db)
Exemplo n.º 6
0
def get_data(infile, x_params, y_params, sport_types):
    """
    Collect one series of (x, y) points per requested parameter pair.

    For each i, a DataForPlot is created for x_params[i], y_params[i] and
    sport_types[i]. Every line of infile is parsed as one workout; when the
    workout's sport equals sport_types[i] and the workout contains both
    x_params[i] and y_params[i], that pair of values is added to the i-th
    DataForPlot. Thus each point in the plot corresponds to one workout.

    infile       path to a ".gz" or ".txt" file, one JSON workout per line
    x_params     list of parameters
    y_params     list of parameters
    sport_types  list of sports
    Must be true : len(x_params) == len(y_params) == len(sport_types) > 0

    Returns the list of DataForPlot objects, in the order of the inputs.
    Raises Exception if the file extension is neither ".gz" nor ".txt".
    """
    assert(len(x_params) == len(y_params))
    assert(len(x_params) == len(sport_types))
    assert(len(x_params) > 0)
    assert(len(y_params) > 0)

    # one data object per (x, y, sport) triple
    objs = [DataForPlot(xparam=xp, yparam=yp, sport=sp)
            for xp, yp, sp in zip(x_params, y_params, sport_types)]

    print("X parameters : " + str(x_params))
    print("Y parameters : " + str(y_params))
    print("Sports : " + str(sport_types))

    ext = os.path.splitext(infile)[1]
    if ext == ".gz":
        f = gzip.open(infile)
    elif ext == ".txt":
        f = open(infile)
    else:
        raise Exception("File format not recognized")

    nw = 0
    # try/finally so the handle is released even if a line fails to parse
    try:
        for line in f:
            # each line is a workout
            w = utils.json_to_dict(line.strip())
            sport = w["sport"]
            for i, obj in enumerate(objs):
                if sport != sport_types[i]:
                    continue
                xp, yp = x_params[i], y_params[i]   # x and y axis parameters
                if xp in w and yp in w:
                    obj.add_point(w[xp], w[yp])
            nw += 1
            if nw % 100000 == 0:
                print("Done processing %s workouts" % nw)
    finally:
        f.close()

    return objs
def condense_and_clean_data(infile, outfile):
    """
    infile must be a .gz file generated by the sql_to_json_parser.py
    condense_and_clean_data will do the following:
        - replace trace data (list values) by their average, stored under
          the original key with "(avg)" appended
        - replace strings like '2.35 mi' to 2.35
    Values and parameters rejected by ParamFormatter.to_number are left out
    of the output and reported at the end.

    One JSON line per workout is written to outfile (gzip).
    """
    t1 = time.time()
    precision = 6   # 6 digits after decimal
    param_formatter = ParamFormatter(precision=precision)
    n = 0
    n_params_ignored = 0
    n_values_ignored = 0
    ignored_params = set()
    ignored_values = set()
    # Input is opened first so a missing infile does not truncate outfile;
    # the with-block closes both handles even when a line fails.
    with gzip.open(infile) as fi, gzip.open(outfile, "w") as fo:
        for line in fi:
            d = {}
            w = utils.json_to_dict(line.strip())
            for k, v in w.items():
                if isinstance(v, list):
                    # replace trace data by averages
                    cleaned = utils.remove_null_values_single(v)
                    d[k + "(avg)"] = round(numpy.mean(cleaned), precision)
                else:
                    # convert and replace units - for example, convert
                    # '2.35 mi' to 2.35
                    try:
                        d[k] = param_formatter.to_number(k, v)
                    except InvalidValueException as e:
                        n_values_ignored += 1
                        ignored_values.add(e.value)
                    except InvalidParamException as e:
                        n_params_ignored += 1
                        ignored_params.add(e.param)
            fo.write(utils.dict_to_json(d) + "\n")
            n += 1
            if n % 10000 == 0:
                print("Written %d workouts.." % n)
    t2 = time.time()
    print("Time taken = " + str(t2 - t1) + " seconds")
    print("%d params ignored" % n_params_ignored)
    print("List of ignored parameters : " + str(ignored_params))
    print("%d values ignored" % n_values_ignored)
    print("List of ignored values : " + str(ignored_values))
    print("Total %d workouts written" % n)
Exemplo n.º 8
0
def get_stats(infile):
    """
    Print workout counts per parameter, per sport and per user.

    infile is a gzip file with one JSON workout per line. Every workout
    must contain "user_id"; "sport" is optional. After the three tables
    (via print_stats) the total number of users is printed, followed by the
    number of users with more than i workouts for i = 10, 40, ..., 370.
    """
    workouts_for_param = {}
    workouts_for_sport = {}
    workouts_for_user = {}
    with gzip.open(infile) as f:
        nlines = 0
        for line in f:
            workout = utils.json_to_dict(line)

            # workouts per param
            for k in workout:
                workouts_for_param[k] = workouts_for_param.get(k, 0) + 1

            # workouts per sport type
            if "sport" in workout:
                sport = workout["sport"]
                workouts_for_sport[sport] = workouts_for_sport.get(sport, 0) + 1

            # workouts per user
            user = workout["user_id"]
            workouts_for_user[user] = workouts_for_user.get(user, 0) + 1

            nlines += 1
            if nlines % 100000 == 0:
                print("Done with %d workouts.." % nlines)

    # print stats
    print_stats(workouts_for_param, 100, "Parameter", "# Workouts")
    print_stats(workouts_for_user, 100, "User ID", "# Workouts")
    print_stats(workouts_for_sport, 100, "Sport", "# Workouts")

    n_users = len(workouts_for_user)
    print("Total number of users :  %d" % n_users)
    counts = sorted(workouts_for_user.values())
    for i in range(10, 400, 30):
        # side="right" gives the number of users with <= i workouts, so the
        # difference is strictly "more than i" as the message states
        # (the default side="left" would count "at least i").
        n_at_most = np.searchsorted(counts, i, side="right")
        print("Number of users with more than %d workouts : %d"
              % (i, n_users - n_at_most))
Exemplo n.º 9
0
    def get(self):
        """Write the afreeca streams as JSON in Snipealot formatting.

        The response maps each afreeca stream id to [nickname, race], one
        entry per line, sorted by stream id. The database is read from
        memcache, falling back to the datastore on a miss.
        """
        increment_hit_counter(datastore_key_hits_streams_json)

        # Get database
        db = memcache.get(memcache_key_database)
        if db is None:
            logger.warn(
                'memcache failed on key: {}'.format(memcache_key_database))
            db_json = ndb_get_entity(JsonDatabase,
                                     datastore_key_database).value
            db = utils.json_to_dict(db_json)
            memcache.set(memcache_key_database, db)

        # Get last update time. It is not part of the response; the lookup
        # is kept because it re-populates memcache on a miss.
        last_update_time = memcache.get(memcache_key_last_update)
        if last_update_time is None:
            logger.warn(
                'memcache failed on key: {}'.format(memcache_key_last_update))
            last_update_time = ndb_get_entity(Time,
                                              datastore_key_last_update).value
            memcache.set(memcache_key_last_update, last_update_time)

        json_obj = dict()
        for key, value in db.items():
            stream_type, stream_id = streams.database_type_and_id(key)
            if stream_type != 'afreeca':
                continue
            race = value['game_info']['race']
            nickname = value['nickname']
            json_obj[stream_id] = [nickname, race]

        # Output in Snipealot formatting.
        # Joining the entries (instead of appending ",\n" and slicing the
        # last two characters off) keeps the opening brace when there are
        # no afreeca streams, so the empty case is still valid JSON.
        # NOTE(review): values are inserted without JSON escaping; a quote
        # or backslash in a nickname would break the output - confirm the
        # data cannot contain them.
        entries = [
            '    "{}": [ "{}", "{}" ]'.format(key, value[0], value[1])
            for key, value in sorted(json_obj.items())
        ]
        json_str = '{\n' + ',\n'.join(entries) + '\n}\n'

        self.response.headers['Content-Type'] = 'application/json'
        self.response.out.write(json_str)
def read_data_as_lists(infile, sport, params, min_distance = 1.0, max_distance = 100.0, min_data_points = 200, min_duration = 100.0, max_duration = 172800.0):   # min duration is 100 s
    """
    Read workouts of one sport from a gzip file as lists of parameter values.

    infile           gzip file with one JSON workout per line
    sport            only workouts whose "sport" equals this are kept
    params           keys to extract; a workout missing any of them is ignored
    min_distance, max_distance   allowed range for the "Distance" value
    min_duration, max_duration   allowed range for the "Duration" value
    min_data_points  minimum length of the "hr" list, when "hr" is present

    Returns a list with one entry per accepted workout; each entry is the
    list of the workout's values for params, in the order of params.
    """
    print("Infile :  %s" % (infile,))
    print("params :  %s" % (params,))
    sport_missing = 0
    param_missing = 0
    n_ignore = 0
    n = 0
    data = []
    formatter = ParamFormatter()
    with gzip.open(infile) as f:
        for line in f:
            # cheap substring checks first, to avoid parsing lines that
            # cannot match
            if sport not in line:
                ignore = True
                sport_missing += 1
            elif "hr" not in line or "distance" not in line or "duration" not in line:
                ignore = True
                param_missing += 1
            else:
                d = utils.json_to_dict(line)
                example = []
                ignore = False
                # -inf is below every minimum, so a missing or unparsable
                # value gets the workout filtered out below
                distance = float("-inf")
                duration = float("-inf")
                if "Distance" in d:
                    try:
                        distance = formatter.to_number("Distance", d["Distance"])
                    except Exception:
                        # deliberately best-effort: keep -inf
                        pass
                if "Duration" in d:
                    try:
                        duration = formatter.to_number("Duration", d["Duration"])
                    except Exception:
                        # deliberately best-effort: keep -inf
                        pass
                if d["sport"] != sport:
                    ignore = True
                    sport_missing += 1
                elif (distance < min_distance or
                        duration < min_duration or
                        ("hr" in d and len(d["hr"]) < min_data_points) or
                        duration > max_duration or
                        distance > max_distance):
                    ignore = True
                else:
                    for k in params:
                        if k not in d:
                            param_missing += 1
                            ignore = True
                            break
                        example.append(d[k])
            if ignore:
                n_ignore += 1
            else:
                data.append(example)
            n += 1
            if n % 100000 == 0:
                print("%d workouts read.." % n)

    print("%d workouts did not match the sport" % sport_missing)
    print("%d workouts did not contain one or more parameters" % param_missing)
    print("%d workouts ignored.." % n_ignore)
    print("%d workouts successfully returned.." % len(data))
    return data
Exemplo n.º 11
0
def generate_seed(project, bugnumber, output):
    """ generates a file that contains json info to run d4j and lithium

    project    one of "Chart", "Lang", "Closure", "Math", "Mockito", "Time"
    bugnumber  string with comma-separated bug numbers; '0' selects every
               file found in data/<project>
    output     path of the seed file to write

    For each selected bug, the ranked classes of its json report (at most
    max_files_per_bug distinct ones) are joined with ",", and one line
    "<project> <bug> <testcase> <classes> <expected message path>" is
    written for every line containing '---' in the bug's oracle file.

    Raises Exception for an unknown project, a missing project directory
    or missing json files, and IndexError when a report has no rankings.
    """
    initial_projects = ["Chart", "Lang", "Closure", "Math", "Mockito", "Time"]
    if project not in initial_projects:
        raise Exception("Project {} invalid. Please select one of {}".format(
            project, initial_projects))

    project_path = os.path.join(os.getcwd(), "data", project)

    if not os.path.isdir(project_path):
        print("FAILED")  # should print to stop the main script
        raise Exception("Project {} directory not found".format(project_path))

    # Solves the issue of different source paths for the same project
    # NOTE(review): int(bugnumber) runs before the comma split below, so a
    # comma-separated list raises ValueError for Lang and Math, and a single
    # source path is used for all selected bugs.
    if project == 'Lang' and int(bugnumber) < 36:
        source_path = get_source_path(project + '2')
    elif project == 'Math' and int(bugnumber) > 84:
        source_path = get_source_path(project + '2')
    else:
        source_path = get_source_path(project)

    # get only bugs choosen by user
    bugnumber = bugnumber.split(",")

    if not is_input_number_valid(bugnumber, project_path):
        print("FAILED")  # should print to stop the main script
        raise Exception(
            "one or more json files({}) are not found in path {}".format(
                bugnumber, project_path))

    if '0' in bugnumber:  # 0 similar to "all" bugs
        bugnumbers = os.listdir(project_path)
    else:
        wanted = ['{}.json'.format(bug) for bug in bugnumber]
        bugnumbers = [
            doc for doc in os.listdir(project_path) if doc in wanted
        ]

    with open(output, "w") as seed_file:
        # for each bug
        for bug in bugnumbers:
            data = json_to_dict(os.path.join(project_path, bug))
            bug_number = bug.replace(".json", "")
            classes = []
            # get rankings from morpho's report, keeping the top-k
            # distinct classes in ranking order
            for item in data["rankings"]:
                java_file = os.path.join(source_path, item["class"])
                if java_file not in classes:
                    classes.append(java_file)
                    if len(classes) == max_files_per_bug:
                        break

            if not classes:
                # same exception type the bare classes[0] used to raise,
                # but with a message that names the bug
                raise IndexError(
                    "no ranked classes found for bug {}".format(bug_number))
            # converts [classA, classB] to classA,classB
            classes = ",".join(classes)

            # NOTE(review): project_name is not defined in this function;
            # presumably a module-level global equal to `project` - confirm.
            expected_dir = 'oracle/' + project_name + '/'
            expected_msg_path = expected_dir + bug_number

            with open(expected_msg_path) as expected_file:
                for l in expected_file:
                    if '---' in l:
                        testcase = l.strip().split(' ')[1]
                        seed_file.write("{} {} {} {} {}\n".format(
                            project, bug_number, testcase, classes,
                            expected_msg_path))
Exemplo n.º 12
0
import logging
import os

from utils import json_to_dict
from mongoengine import connect


def healthcheck(db_client):
    """Exit the process with status 0 if the database answers, 1 otherwise.

    Runs the 'ismaster' admin command on db_client. Any exception is logged
    with its traceback and turned into exit status 1. This function never
    returns normally.
    """
    try:
        db_client.admin.command('ismaster')
    except Exception:
        logging.exception('Error while checking health')
        # SystemExit is raised directly: the exit() builtin is installed by
        # the site module for interactive use and may be unavailable.
        raise SystemExit(1)

    raise SystemExit(0)


if __name__ == "__main__":
    # Script entry point: read the config file named by CONFIG_PATH (or the
    # default path), connect with its 'MONGO' section and run the check.
    logging.basicConfig()
    config_path = os.getenv('CONFIG_PATH', 'config/karmaconf.json')
    mongo_settings = json_to_dict(config_path)['MONGO']
    healthcheck(connect(**mongo_settings))
Exemplo n.º 13
0
def main():
    """Generate one test config per group from the best run of a sweep.

    Runs of the wandb sweep given by --id are grouped by the values of the
    --grouping parameters. For each group, the run with the highest
    "log_likelihood" summary value is selected and a config file
    <out_dir>/<group>.json is written. The file is a copy of the base
    config with dataset, model, restore, plot_prefix, the --fields and the
    grouping parameters taken from that run.
    """
    parser = argparse.ArgumentParser(description='Sweep test config generator')
    parser.add_argument("--id", type=str, help="id of wandb sweep")
    parser.add_argument("--base_config",
                        type=str,
                        help="Base config file for test setup")
    parser.add_argument(
        "--fields",
        type=str,
        default="",
        help="Fields to carry over from training config to test config")
    parser.add_argument(
        "--grouping",
        type=str,
        default="dataset,cgan_type",
        help=
        "Parameters to group by, only the best (in validation) model is tested"
    )
    parser.add_argument("--out_dir",
                        type=str,
                        help="Config file output directory")
    args = parser.parse_args()

    assert args.id, "Must specify id"
    assert args.out_dir, "Must specify output dir"
    assert args.base_config, "Must specify base test config"

    base_config = utils.json_to_dict(args.base_config)
    extra_fields = args.fields.split(",") if args.fields else []
    groupings = args.grouping.split(",")

    runs = utils.get_sweep_runs(args.id)

    # Groups are mapped using a string: grouping1_grouping2_grouping3 etc.
    best_in_groups = {}  # Dict of group key to (ll, run)

    for run in runs:
        key = "_".join(str(run.config[g]) for g in groupings)

        # Exclude crashed runs
        if "log_likelihood" not in run.summary:
            continue
        ll = run.summary["log_likelihood"]

        # Skip string-valued entries (the original check for NaN or -inf)
        if isinstance(ll, str):
            continue

        if key not in best_in_groups or ll > best_in_groups[key][0]:
            best_in_groups[key] = (ll, run)

    for grouping, (ll, run) in best_in_groups.items():
        # Create configs. Work on a copy so the loaded base config is not
        # mutated and shared between groups.
        config = dict(base_config)

        # Always carry over dataset and model
        config["dataset"] = run.config["dataset"]
        config["model"] = run.config["model"]

        # Need to clean options with a /
        file_name = grouping.replace("/", "_")

        config["restore"] = run.id
        config["plot_prefix"] = "{}_".format(file_name)

        for field in extra_fields + groupings:
            config[field] = run.config[field]

        config_path = os.path.join(args.out_dir, "{}.json".format(file_name))
        with open(config_path, 'w') as fp:
            json.dump(config, fp, indent=0)

        print("Created config for {}".format(grouping))

    print("done")
Exemplo n.º 14
0
def test_get_database(json_file):
    """Read `json_file` and return its content parsed by utils.json_to_dict."""
    with open(json_file, 'r') as handle:
        raw_json = handle.read()
        return utils.json_to_dict(raw_json)
Exemplo n.º 15
0
def get_config():
    """Build the run configuration from command-line arguments.

    All options are declared on an argparse parser and parsed from the
    command line; the result is turned into a dict. If --config is given,
    the file is loaded with utils.json_to_dict and its entries are applied
    on top of the parsed values (config.update), after asserting that the
    file exists and that every key in it is a known option.

    Asserts that "dataset" and "model" are set and are contained in
    dataset_list.sets and models respectively.

    The list-like string options (plot_pdf, plot_pdf_index, cond_scatter,
    mmd_scales, mmd_scales_x, mmd_scales_y, proposal_scales,
    cond_plot_trajectories) are converted when non-empty: a value
    containing "(" is evaluated with ast.literal_eval into a list of lists
    of floats, any other value is split on "," into a list of floats.

    Returns the config dict.
    """
    parser = argparse.ArgumentParser(description='Train model')
    # If config file should be used
    parser.add_argument("--config",
                        type=str,
                        help="Config file to read run config from")

    # General
    parser.add_argument("--dataset", type=str, help="Which dataset to use")
    parser.add_argument("--model", type=str, help="Which type of model to use")
    parser.add_argument(
        "--test",
        type=int,
        default=0,
        help="If model should be tested (at the end of possible training)")
    parser.add_argument("--train",
                        type=int,
                        default=1,
                        help="If model should be trained")
    parser.add_argument("--name", type=str, help="Name of the run for WandB")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="Seed for random number generator")
    parser.add_argument("--cpu",
                        type=int,
                        default=0,
                        help="Force to run on CPU")

    # Evaluation
    parser.add_argument("--test_runs",
                        type=int,
                        default=10,
                        help="Testing runs to average score for")
    parser.add_argument(
        "--restore",
        type=str,
        help="WandB run_id to restore parameters from (requires wandb logging)"
    )
    parser.add_argument("--restore_file",
                        type=str,
                        help="Path to file to restore parameters from")
    parser.add_argument("--eval_div",
                        type=str,
                        help="Evaluate model by estimating a divergence")
    parser.add_argument(
        "--eval_cgan",
        type=str,
        help="CGAN (network architecture) to use for evaluation")

    # Plotting
    parser.add_argument(
        "--scatter",
        type=int,
        default=0,
        help="If scatter-plots should be created during validation/testing")
    parser.add_argument(
        "--cond_scatter",
        type=str,
        help="Create scatter plot for conditional distribution at given x:s")
    parser.add_argument(
        "--plot_pdf",
        type=str,
        help="List of x-values to plot pdf at during validation/testing")
    parser.add_argument(
        "--plot_pdf_index",
        type=str,
        help="List of test/validation set indexes to plot pdf for")
    parser.add_argument(
        "--plot_functions",
        type=int,
        default=0,
        help=
        "Plot some sampled functions by varying x and keeping noise constant")
    parser.add_argument("--plot_gt",
                        type=int,
                        default=0,
                        help="Plot ground truth only, instead of model")
    parser.add_argument("--plot_prefix",
                        type=str,
                        help="Prefix to be prepended to plot file names")
    parser.add_argument(
        "--cond_plot_trajectories",
        type=str,
        help="""(For trajectories datasets) Plot 2D trajectory samples.
            If an index is given, plots trajectories for corresponding test sample.
            If a tuple is given, trajectories are conditioned on the tuple as x-value.
            """)
    parser.add_argument("--plot_trajectories",
                        type=int,
                        default=20,
                        help="Amount of trajectories to plot.")

    # Batched training models (i.e. neural network based)
    parser.add_argument("--epochs",
                        type=int,
                        help="How many epochs to train for",
                        default=10)
    parser.add_argument("--val_interval",
                        type=int,
                        default=10,
                        help="Evaluate model every eval_interval:th epoch")
    parser.add_argument("--batch_size",
                        type=int,
                        help="Batch size for training",
                        default=128)
    parser.add_argument(
        "--eval_batch_size",
        type=int,
        help="Batch size to use outside training, in validation etc.",
        default=1000)
    parser.add_argument("--lr", type=float, help="Learning rate", default=1e-3)
    parser.add_argument("--lr_decay",
                        type=float,
                        help="Multiplicative learning rate decay",
                        default=1.0)
    parser.add_argument("--optimizer",
                        type=str,
                        help="Optimizer to use for training",
                        default="rmsprop")

    # KDE
    parser.add_argument(
        "--kernel_scales",
        type=int,
        default=50,
        help="Amount of kernel scale parameters in KDE to try for validation")
    parser.add_argument(
        "--kernel_scale_min",
        type=float,
        default=0.001,
        help="Lower bound of allowed kernel scale range for KDE")
    parser.add_argument(
        "--kernel_scale_max",
        type=float,
        default=0.5,
        help="Upper bound of allowed kernel scale range for KDE")
    parser.add_argument(
        "--eval_samples",
        type=int,
        default=200,
        help="How many samples to draw for estimating KDE in evaluation")
    parser.add_argument("--kde_val",
                        type=int,
                        default=0,
                        help="Get KDE estimate also in validation.")
    parser.add_argument(
        "--kde_batch_size",
        type=int,
        default=10,
        help="How many kernels scales to compute KDE for at the same time")

    # CGAN
    parser.add_argument(
        "--cgan_nets",
        type=str,
        help="""Name of CGAN network specification, available specs can be
                found in cgan_specs directory.""")
    parser.add_argument("--cgan_type",
                        type=str,
                        default="standard",
                        help="""Version of CGAN training objective to use,
                see models/cgan_versions for a list""")
    parser.add_argument("--noise_dim",
                        type=int,
                        default=1,
                        help="Dimensionality of noise vector fed to generator")
    parser.add_argument("--noise_dist",
                        type=str,
                        default="gaussian",
                        help="Distribution to sample noise vector from")
    parser.add_argument("--gen_lr", type=float, help="Generator learning rate")
    parser.add_argument("--disc_lr",
                        type=float,
                        help="Discriminator learning rate")
    parser.add_argument(
        "--gen_lr_decay",
        type=float,
        help="Multiplicative learning rate decay for generator)")
    parser.add_argument(
        "--disc_lr_decay",
        type=float,
        help="Multiplicative learning rate decay for discriminator)")
    parser.add_argument("--gen_optimizer",
                        type=str,
                        help="Optimizer to use for generator training")
    parser.add_argument("--disc_optimizer",
                        type=str,
                        help="Optimizer to use for discriminator training")
    parser.add_argument(
        "--clip_grad",
        type=float,
        default=0.,
        help="Value to clip gradients at (clipping by norm). 0 is no clipping."
    )
    parser.add_argument(
        "--gen_samples",
        type=int,
        default=1,
        help=
        "How many generator samples to draw for each x in generator training")

    # GMMN (and CGMMN)
    parser.add_argument("--mmd_scales",
                        type=str,
                        default="1,5,10,20",
                        help="""Scale parameter to use in MMD-based loss
                (if specific values for x and y are not set)""")
    parser.add_argument("--mmd_scales_x",
                        type=str,
                        help="MMD scale parameter for kernel applied on x")
    parser.add_argument("--mmd_scales_y",
                        type=str,
                        help="MMD scale parameter for kernel applied on y")
    parser.add_argument(
        "--kernel_lr",
        type=float,
        default=0.01,
        help="(only GMMN) Learning rate for kernel parameter tuning")
    parser.add_argument(
        "--mmd_lambda",
        type=float,
        default=1.0,
        help=
        "(only CGMMN) Regularizer lambda to stabilize matrix inversions in MMD"
    )
    parser.add_argument("--sqrt_loss",
                        type=int,
                        default=1,
                        help="""(only CGMMN) Use square root of the loss,
                can yield better results, see Li et al.""")

    # NN-based models (mdn, nn_reg, nn_het, dctd, cgmmn, gmmn)
    parser.add_argument(
        "--network",
        type=str,
        help="""Name of network specification to use, available specs can be
            found in nn_specs directory.""")
    parser.add_argument(
        "--l2_reg",
        type=float,
        default=0.0,
        help="L2-regularization added to cost function (aka weight decay)")

    # MDN
    parser.add_argument("--mixture_comp",
                        type=int,
                        default=5,
                        help="Amount of mixture components in MDN")
    parser.add_argument(
        "--log_coefficients",
        type=int,
        default=0,
        help="If mixture coefficients should be logged to wandb")

    # GP
    parser.add_argument("--gp_kernel",
                        type=str,
                        default="rbf",
                        help="Which kernel type to use in GP")
    parser.add_argument(
        "--opt_restarts",
        type=int,
        default=0,
        help="Restarts in kernel hyperparameter optimization process")

    # DCTD
    parser.add_argument(
        "--imp_samples",
        type=int,
        default=500,
        help="Amount of importance samples used to estimate normalization Z")
    parser.add_argument(
        "--proposal_scales",
        type=str,
        default="0.5,1,5",
        help="Scales of gaussians in mixture proposal distribution")
    parser.add_argument(
        "--mode_find_steps",
        type=int,
        default=100,
        help=
        ("Amount of optimization steps in mode finding for DCTD proposal distribution"
         ))
    parser.add_argument(
        "--mode_find_lr",
        type=float,
        default=1e-2,
        help="Learning rate in mode finding for DCTD proposal distribution")
    parser.add_argument(
        "--plot_dctd_modes",
        type=int,
        default=0,
        help="Create additional scatter plot with modes of DCTD model")

    args = parser.parse_args()
    config = vars(args)

    # Read additional config from file
    if args.config:
        assert os.path.exists(args.config), "No config file: {}".format(
            args.config)
        config_from_file = utils.json_to_dict(args.config)

        # Make sure all options in config file also exist in argparse config.
        # Avoids choosing wrong parameters because of typos etc.
        unknown_options = set(config_from_file.keys()).difference(
            set(config.keys()))
        unknown_error = "\n".join([
            "Unknown option in config file: {}".format(opt)
            for opt in unknown_options
        ])
        assert (not unknown_options), unknown_error

        # Values from the file replace the parsed command-line values
        config.update(config_from_file)

    assert config["dataset"], "No dataset specified"
    assert config["dataset"] in dataset_list.sets, (
        "Unknown dataset: {}".format(config["dataset"]))

    assert config["model"], "No model specified"
    assert config["model"] in models, "Unknown model '{}'".format(
        config["model"])

    # Convert the list-like string options into lists of floats
    for split_option in [
            "plot_pdf",
            "plot_pdf_index",
            "cond_scatter",
            "mmd_scales",
            "mmd_scales_x",
            "mmd_scales_y",
            "proposal_scales",
            "cond_plot_trajectories",
    ]:
        opt_value = config[split_option]
        if opt_value:
            if "(" in opt_value:
                # entries are tuples (e.g. multi-dimensional x)
                # extra "," to always get a tuple of tuples
                parsed = ast.literal_eval(opt_value + ",")

                # Make into list of floats
                config[split_option] = [[float(e) for e in v] for v in parsed]
            else:
                # entries are single floats
                config[split_option] = [float(s) for s in opt_value.split(",")]

    return config
import gzip
import utils
import sys

# Report every workout id that occurs more than once in the gzip file given
# as the first command-line argument (one JSON workout per line).
wids = set()
with gzip.open(sys.argv[1]) as workouts:
    n = 0
    for raw_line in workouts:
        workout_id = utils.json_to_dict(raw_line)["workout_id"]
        if workout_id in wids:
            print("DUPLICATE FOUND.. workout id = " + str(workout_id))
        wids.add(workout_id)
        n += 1
        if n % 100000 == 0:
            print("Done with %d workouts.." % n)
Exemplo n.º 17
0
def generate_seed(project, bugnumber):
    """ generates the expected-message (oracle) file of each selected bug

    project    one of "Chart", "Lang", "Closure", "Math", "Mockito", "Time"
    bugnumber  string with comma-separated bug numbers; '0' selects every
               file found in data/<project>

    For each bug, run_input_test.sh is invoked on a fresh temporary
    directory, and the content of the 'failing_tests' file found there (if
    any) is written to oracle/<project_name>/<bug>.
    """
    known_projects = ["Chart", "Lang", "Closure", "Math", "Mockito", "Time"]
    if project not in known_projects:
        raise Exception("Project {} invalid. Please select one of {}".format(
            project, known_projects))

    project_path = os.path.join(os.getcwd(), "data", project)

    if not os.path.isdir(project_path):
        print("FAILED")  # should print to stop the main script
        raise Exception("Project {} directory not found".format(project_path))

    # Solves the issue of different source paths for the same project
    uses_alternate_layout = (
        (project == 'Lang' and int(bugnumber) < 36)
        or (project == 'Math' and int(bugnumber) > 84))
    if uses_alternate_layout:
        source_path = get_source_path(project + '2')
    else:
        source_path = get_source_path(project)

    # get only bugs choosen by user
    bugnumber = bugnumber.split(",")

    if not is_input_number_valid(bugnumber, project_path):
        print("FAILED")  # should print to stop the main script
        raise Exception(
            "one or more json files({}) are not found in path {}".format(
                bugnumber, project_path))

    requested_files = ['{}.json'.format(bug) for bug in bugnumber]

    if '0' in bugnumber:  # 0 similar to "all" bugs
        bug_files = os.listdir(project_path)
    else:
        bug_files = [
            doc for doc in os.listdir(project_path) if doc in requested_files
        ]

    # for each bug
    for bug_file in bug_files:
        # the parsed report is not used below; the call is kept so that an
        # unreadable report still fails at this point
        json_to_dict(os.path.join(project_path, bug_file))
        bug_number = bug_file.replace(".json", "")

        # getting the expected message
        expected_dir = 'oracle/' + project_name + '/'
        if not os.path.exists(expected_dir):
            os.makedirs(expected_dir)
        expected_msg_path = expected_dir + bug_number

        project_dir = tempfile.mkdtemp(prefix="lithium-slicer_")
        output_filepath = project_dir + '/failing_tests'
        print('output_filepath=', output_filepath)

        cmd_str = "bash run_input_test.sh {PROJECTDIR} {PROJECT} {BUG}".format(
            PROJECTDIR=project_dir,
            PROJECT=project_name,
            BUG=bug_number + 'b')
        call_cmd(cmd_str)  # call shell script

        # stays empty when the script produced no failing_tests file
        failing_lines = []
        if os.path.isfile(output_filepath):
            with open(output_filepath) as out_fail:
                failing_lines = out_fail.readlines()

        with open(expected_msg_path, "w+") as expected:
            expected.write(''.join(failing_lines))