Example #1
import argparse
import json
import logging
import os

# repo-local helpers assumed to come from the surrounding project:
# file_tqdm, get_dfs, separate_dps
def get_value(line, input_type):
    if input_type == "ast":
        return get_dfs(line)
    elif input_type == "leaf":
        return get_dfs(line, only_leaf=True)
    elif input_type == "source_code":
        return line[0]
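# A minimal usage sketch (hypothetical datapoint; get_dfs is assumed to be a
# repo-local helper that flattens an AST in DFS order):
#   ast_tokens = get_value(dp, "ast")            # full DFS traversal
#   leaf_tokens = get_value(dp, "leaf")          # leaf tokens only
#   source = get_value(dp, "source_code")        # raw source string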
def main():
    parser = argparse.ArgumentParser(description="Generate datapoints from AST")
    parser.add_argument("--ast_fp", "-a", help="Filepath with the ASTs to be parsed")
    parser.add_argument(
        "--out_fp", "-o", default="/tmp/dps.txt", help="Filepath for the output dps"
    )
    parser.add_argument(
        "--n_ctx", "-c", type=int, default=1000, help="Number of contexts for each dp"
    )
    args = parser.parse_args()
    if os.path.exists(args.out_fp):
        os.remove(args.out_fp)
    logging.info("Number of context: {}".format(args.n_ctx))

    num_dps = 0
    logging.info("Loading asts from: {}".format(args.ast_fp))
    with open(args.ast_fp, "r") as f, open(args.out_fp, "w") as fout:
        for line in file_tqdm(f):
            dp = json.loads(line.strip())
            asts = separate_dps(dp, args.n_ctx)
            for ast, extended in asts:
                if len(ast) > 1:
                    json.dump([get_dfs(ast), extended], fp=fout)
                    fout.write("\n")
                    num_dps += 1

    logging.info("Wrote {} datapoints to {}".format(num_dps, args.out_fp))
Example #3
import argparse
import json
import logging
import os
from collections import Counter, defaultdict

# repo-local helpers assumed: file_tqdm, get_ud_masks, get_udc_masks,
# group_by_age, group_by_gender, group_by_genre, group_by_occupation,
# group_by_power, group_by_state, rq6_separate_dps, separate_dps,
# separate_rel_mask. Note that get_dfs is used both to load dataframes
# (parse, main) and to flatten ASTs (get_dp), so these snippets appear
# to mix two different projects.
def parse():
    """
    Parse args and handle splitting users into groups
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--grouping')
    parser.add_argument('--dataset', default='ml-1m')
    args = parser.parse_args()
    dfs = get_dfs(args.dataset)
    if args.grouping == 'genre':
        groups = group_by_genre(dfs['users'], dfs['ratings'], dfs['movies'],
                                args.dataset)
    elif args.grouping == 'power':
        groups = group_by_power(dfs['users'], dfs['ratings'], args.dataset)
    elif args.grouping == 'state':
        groups = group_by_state(dfs['users'], args.dataset)
    else:
        grouping_to_func = {
            'gender': group_by_gender,
            'age': group_by_age,
            'occupation': group_by_occupation,
        }
        groups = grouping_to_func[args.grouping](dfs['users'])
    for group in groups:
        print(group['name'], len(group['df'].index))
    print(len(groups))
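# Assumed entry point (hypothetical; the snippet does not show how parse() is run):
#   if __name__ == '__main__':
#       parse()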
def get_dp(dp, n_ctx, child=False):
    get_mask = get_udc_masks if child else get_ud_masks
    asts = separate_dps(dp, n_ctx)
    rel_masks = separate_rel_mask(get_mask(dp, n_ctx), n_ctx)
    aug_dps = []
    for (ast, ext), mask in zip(asts, rel_masks):
        aug_dps.append([get_dfs(ast), ext, mask])
    return aug_dps
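# Hypothetical usage, mirroring the serialization pattern used elsewhere in this
# file: each element returned by get_dp is a [dfs_tokens, extended, rel_mask] triple.
#   for aug_dp in get_dp(json.loads(line), n_ctx=1000):
#       json.dump(aug_dp, fp=fout)
#       fout.write("\n")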
def main(args):
    """driver"""
    dfs = get_dfs(args.dataset)
    ratings_df = dfs['ratings']
    movies_df = dfs['movies']

    # count the number of ratings per movie
    d = defaultdict(int)
    for i, row in ratings_df.iterrows():
        d[row.movie_id] += 1

    # most_common(len(d) // 20) keeps the top 5% most-rated movies
    c = Counter(d)
    top_five_percent = c.most_common(len(d) // 20)
    print(top_five_percent)
    movie_ids = [movie_id for movie_id, _count in top_five_percent]
    with open('boycott_files/{}_top_five_percent_movies.csv'.format(args.dataset), 'w') as outfile:
        outfile.write(','.join([str(x) for x in movie_ids]))
    for movie_id in movie_ids:
        print(movies_df[movies_df.movie_id == movie_id].movie_title)
def external(file_path, suffix, context_size, overlap):
    outfile = "output/{}_dps.txt".format(suffix)
    if os.path.exists(outfile):
        os.remove(outfile)
    logging.info("Number of context: {}".format(context_size))

    num_dps = 0
    logging.info("Loading asts from: {}".format(file_path))
    with open(file_path, "r") as f, open(outfile, "w") as fout:
        for line in file_tqdm(f):
            dp = json.loads(line.strip())
            asts = rq6_separate_dps(dp, context_size, overlap)
            for ast, extended in asts:
                if len(ast) > 1:
                    json.dump([get_dfs(ast), extended], fp=fout)
                    fout.write("\n")
                    num_dps += 1

    logging.info("Wrote {} datapoints to {}".format(num_dps, outfile))
Example #7
import json
import os
import time
from collections import OrderedDict, defaultdict

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from surprise import Dataset, Reader

# repo-local names assumed: ALGOS, ALGOS_FOR_STANDARDS, MEASURES, NUM_FOLDS,
# MovieMean, GlobalMean, concat_output_filename, cross_validate_custom,
# get_dfs, get_metric_names, group_by_* helpers, load_head_items,
# prepare_boycott_task, task
def main(args):
    """
    Run the sandbox experiments
    """
    out_prefix = 'out/' if args.send_to_out else ""
    times = OrderedDict()
    times['start'] = time.time()
    algos = ALGOS
    if args.movie_mean:
        algos = {
            'MovieMean': MovieMean(),
            'GlobalMean': GlobalMean(),
        }
    algos_for_standards = ALGOS_FOR_STANDARDS
    dfs = get_dfs(args.dataset)
    head_items = load_head_items(args.dataset)
    times['dfs_loaded'] = time.time() - times['start']
    print('Got dataframes, took {} seconds'.format(times['dfs_loaded']))
    print('Total examples: {}'.format(len(dfs['ratings'].index)))

    ratings_df, users_df, movies_df = dfs['ratings'], dfs['users'], dfs['movies']
    if args.mode == 'info':
        print(ratings_df.memory_usage(index=True))
        print(users_df.memory_usage(index=True))
        print(movies_df.memory_usage(index=True))

        print(ratings_df.info())
        print(users_df.info())
        return
    data = Dataset.load_from_df(
        ratings_df[['user_id', 'movie_id', 'rating']],
        reader=Reader()
    )
    # note: times['dfs_loaded'] is a duration, not a timestamp, so subtract it
    # together with the start time to get this step's elapsed time
    times['data_constructed'] = time.time() - times['start'] - times['dfs_loaded']

    # Note to the reader: precision, recall, and ndcg are stuffed together into one
    # measure string so that they are all computed at once; the evaluation code
    # splits them back up for presentation.
    # The hand-rolled expansion of compound measures (splitting e.g. a
    # 'precision_recall' measure into individual names plus '_frac' and 'tail'
    # variants) was dead code: its result was immediately overwritten by the
    # call below, which is assumed to perform the same expansion.
    metric_names = get_metric_names()
    if args.compute_standards:
        standard_results = defaultdict(list)
        for algo_name in algos_for_standards:
            for _ in range(args.num_standards):
                filename_ratingcv_standards = out_prefix + 'standard_results/{}_ratingcv_standards_for_{}.json'.format(
                    args.dataset, algo_name)

                print('Computing standard results for {}'.format(algo_name))
                if args.save_path is False:
                    save_path = None
                elif args.save_path is None:
                    save_path = os.getcwd() + '/' + out_prefix + 'predictions/standards/{}_{}_'.format(args.dataset, algo_name)
                else:
                    save_path = args.save_path

                if 'KNN' in algo_name and args.dataset == 'ml-20m':
                    # running this in parallel runs out of memory with KNN
                    results = cross_validate_custom(
                        algos_for_standards[algo_name], data, Dataset.load_from_df(pd.DataFrame(),
                        reader=Reader()), [], [], MEASURES, NUM_FOLDS, n_jobs=1, head_items=head_items,
                        save_path=save_path)
                else:
                    results = cross_validate_custom(
                        algos_for_standards[algo_name], data, Dataset.load_from_df(pd.DataFrame(),
                        reader=Reader()), [], [], MEASURES, NUM_FOLDS, head_items=head_items,
                        save_path=save_path)
                saved_results = {}
                for metric in metric_names:
                    saved_results[metric] = np.mean(results[metric + '_all'])
                    # frac_key = metric + '_frac_all'
                    # if frac_key in results:
                    #     saved_results[frac_key] = np.mean(results[frac_key])

                with open(filename_ratingcv_standards, 'w') as f:
                    json.dump(saved_results, f)
                    
                standard_results[algo_name].append(saved_results)
            standard_results_df = pd.DataFrame(standard_results[algo_name])
            print(standard_results_df.mean())
            standard_results_df.mean().to_csv(
                filename_ratingcv_standards.replace(
                    '.json', '_{}.csv'.format(args.num_standards)))

    experiment_configs = []
    if args.grouping == 'individual_users':
        experiment_configs += [{'type': 'individual_users', 'size': None}]
    elif args.grouping == 'sample':
        if args.sample_sizes:
            experiment_configs += [
                {
                    'type': 'sample_users', 'size': sample_size
                } for sample_size in args.sample_sizes]
        else:
            raise ValueError(
                'When using grouping="sample", you must provide a set of sample sizes'
            )
    elif args.grouping in [
        'gender', 'age', 'power', 'state', 'genre', 'genre_strict', 'occupation', 
    ]:
        experiment_configs += [{'type': args.grouping, 'size': None}]
    else:
        experiment_configs = []


    uid_to_error = {}
    experimental_iterations = []
    seed_base = args.indices[0]
    for config in experiment_configs:
        outname = out_prefix + concat_output_filename(
            args.dataset, config['type'], args.userfrac,
            args.ratingfrac,
            config['size'], args.num_samples, args.indices
        )
        if config['type'] == 'individual_users':
            experimental_iterations = list(users_df.iterrows())
        elif config['type'] == 'sample_users':
            experimental_iterations = [{
                'df': users_df.sample(config['size'], random_state=seed_base+index), # .sample() returns a copy of users_df
                'name': '{} user sample'.format(config['size'])
            } for index in range(args.num_samples)]
        elif config['type'] == 'gender':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_gender(users_df)
        elif config['type'] == 'age':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_age(users_df)
        elif config['type'] == 'state':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_state(users_df, dataset=args.dataset)
        elif config['type'] == 'genre':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_genre(
                    users_df=users_df, ratings_df=ratings_df, movies_df=movies_df,
                    dataset=args.dataset)
        elif config['type'] == 'genre_strict':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_genre_strict(
                    users_df=users_df, ratings_df=ratings_df, movies_df=movies_df,
                    dataset=args.dataset)
        elif config['type'] == 'power':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_power(users_df=users_df, ratings_df=ratings_df, dataset=args.dataset)
        elif config['type'] == 'occupation':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_occupation(users_df)

        experiment_identifier_to_uid_sets = {}
        for algo_name in algos:
            prep_boycott_tasks = (
                delayed(prepare_boycott_task)(
                    i, experimental_iteration, args, config,
                    ratings_df, seed_base,
                    outname, algo_name, algos[algo_name], head_items, data
                ) for i, experimental_iteration in enumerate(experimental_iterations)
            )
            simulate_boycott_tasks = []
            tic = time.time()
            out = Parallel(n_jobs=-1, verbose=5, max_nbytes=None)(prep_boycott_tasks)
            for task_args, d in out:
                simulate_boycott_tasks.append(delayed(task)(*task_args))
                experiment_identifier_to_uid_sets.update(d)
            print('parallelized prep_boycott_task took {} seconds'.format(time.time() - tic))
            print('About to run Parallel() with {} tasks'.format(len(simulate_boycott_tasks)))
            out_dicts = Parallel(n_jobs=-1, verbose=5)(simulate_boycott_tasks)
            for d in out_dicts:
                res = d['subset_results']
                uid = str(d['identifier']) + '_' + d['algo_name']
                uid_to_error[uid] = {
                    'num_ratings': d['num_ratings'],
                    'num_users': d['num_users'],
                    'num_movies': d['num_movies'],
                    'name': d['name'],
                    'algo_name': d['algo_name'],
                }
                for metric in metric_names + ['fit_time', 'test_times', 'num_tested']:
                    for group in ['all', 'non-boycott', 'boycott', 'like-boycott', 'all-like-boycott']:
                        key = '{}_{}'.format(metric, group)
                        # if group in ['boycott', ]:
                        #     val = np.nanmean(res[key])
                        vals = res.get(key)
                        if vals:
                            val = np.mean(res[key])
                            uid_to_error[uid].update({
                                key: val,
                            })
                        standards_key = 'standards_' + key
                        standards_vals = res.get(standards_key)
                        if standards_vals:
                            standards_val = np.mean(res[standards_key])
                            uid_to_error[uid].update({
                                standards_key: standards_val,
                            })
        err_df = pd.DataFrame.from_dict(uid_to_error, orient='index')
        uid_sets_outname = outname.replace('results/', 'uid_sets/uid_sets_')
        pd.DataFrame.from_dict(experiment_identifier_to_uid_sets, orient='index').to_csv(uid_sets_outname)
        if args.movie_mean:
            outname = outname.replace('results/', 'results/MOVIEMEAN_')
        err_df.to_csv(outname)
        print('Full runtime was: {} for {} experimental iterations'.format(time.time() - times['start'], len(experimental_iterations)))
Example #8
import argparse
import datetime
import json
import os
import time

import pandas as pd
from surprise import Dataset, Reader

# repo-local names assumed: ALGOS, MEASURES, NUM_FOLDS, batch,
# cross_validate_many, get_dfs, load_head_items
def main(args):
    """
    Calculate standards

    Configuration requqired for this function:
      you must have uid_sets files in the directory specified by the pathto argument
      uid_sets files are CSV files with:
        an iteration number (index) in one column, a list of boycott uids in 2nd column, and a list of like-boycott uids in a 3rd column
        uid lists are stored as strings delimited by semi-colon (;)
    """
    starttime = time.time()
    dfs = get_dfs(args.dataset)
    head_items = load_head_items(args.dataset)
    ratings_df = dfs['ratings']
    data = Dataset.load_from_df(ratings_df[['user_id', 'movie_id', 'rating']],
                                reader=Reader())

    files = os.listdir(args.pathto)

    boycott_uid_sets = {}
    like_boycotters_uid_sets = {}

    for file in files:
        if 'uid_sets' not in file or '.csv' not in file:
            continue
        if args.dataset not in file:
            #print('skip {} b/c dataset'.format(file))
            continue
        if args.name_match and args.name_match not in file:
            continue
        print(file)
        uid_sets_df = pd.read_csv(args.pathto + '/' + file, dtype=str)
        for i, row in uid_sets_df.iterrows():
            identifier_num = row.iloc[0]  # first column holds the iteration number
            try:
                boycott_uid_set = set(
                    [int(x) for x in row['boycott_uid_set'].split(';')])
            except AttributeError:
                boycott_uid_set = set([])
            try:
                like_boycotters_uid_set = set([
                    int(x) for x in row['like_boycotters_uid_set'].split(';')
                ])
            except AttributeError:
                like_boycotters_uid_set = set([])

            full_identifier = file.replace('uid_sets_',
                                           '') + '__' + identifier_num
            boycott_uid_sets[full_identifier] = boycott_uid_set
            like_boycotters_uid_sets[full_identifier] = like_boycotters_uid_set

    # now boycott_uid_sets and co. are filled up!
    if args.algo_name:
        algo_names = [args.algo_name]
    else:
        algo_names = list(ALGOS.keys())
    out = {}
    for algo_name in algo_names:
        # why do we batch this? otherwise we could run out of memory when doing many experiments in one script run
        for batch_num, key_batch in enumerate(
                batch(list(boycott_uid_sets.keys()), 100)):
            print('On key batch {} of {} keys'.format(batch_num,
                                                      len(boycott_uid_sets)))
            batch_b = {}
            batch_l = {}
            for key in key_batch:
                print(key)
                batch_b[key] = boycott_uid_sets[key]
                batch_l[key] = like_boycotters_uid_sets[key]

            # ideally we don't need to re-train the algorithm: we have the actual predictions
            # saved for each rating within each crossfold! If for some reason these were lost
            # (or were never saved, e.g. under the pre-July-2018 version of this code) we can
            # re-train, which will take much longer
            if args.load_path == 'False':  # the CLI presumably passes the literal string 'False' to disable loading
                load_path = None
            elif args.load_path is None:
                load_path = os.getcwd(
                ) + '/predictions/standards/{}_{}_'.format(
                    args.dataset, algo_name)
            else:
                load_path = args.load_path + '/standards/{}_{}_'.format(
                    args.dataset, algo_name)
            res = cross_validate_many(ALGOS[algo_name],
                                      data,
                                      Dataset.load_from_df(pd.DataFrame(),
                                                           reader=Reader()),
                                      batch_b,
                                      batch_l,
                                      MEASURES,
                                      NUM_FOLDS,
                                      verbose=False,
                                      head_items=head_items,
                                      load_path=load_path)
            out.update(res)
            dtstr = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
            with open(
                    'standard_results/{}_{}_{}.json'.format(
                        args.dataset, algo_name, dtstr), 'w') as f:
                json.dump(out, f)
    print('Total runtime: {} seconds'.format(time.time() - starttime))
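
# Hypothetical entry point: the snippet does not show how main(args) is invoked,
# so the argument names below are inferred from the attribute accesses above.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', default='ml-1m')
    parser.add_argument('--pathto', default='uid_sets')
    parser.add_argument('--name_match')
    parser.add_argument('--algo_name')
    parser.add_argument('--load_path')
    main(parser.parse_args())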