Example #1
def make_processed_list(s):
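    # Read the persisted processed-status list from S3; seed=[] is presumably the fallback value returned when the object does not exist yet.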
    processed_list = utils.read_from_s3("processed_list",
                                        seed=[],
                                        directory=s["s3dir"])
    # if not processed_list:
    #     statuses_enhanced = utils.read_from_s3(utils.file_name( sufix = "_enhanced"), directory=s["s3dir"])
    #     processed_list = [int(status["id"]) for status in statuses_enhanced]
    return processed_list
Example #2
def control(batch_enhanced, s):
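    # Load the accumulated enhanced batch from S3; the merge and write-back steps are commented out below.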
    # statuses_enhanced = utils.read_from_s3(utils.file_name( sufix = "_enhanced"), directory='data-aws/gen_two/')
    batch_enhanced_full = utils.read_from_s3(
        utils.file_name(sufix="_batch_enhanced_full"),
        seed=[],
        directory=s["s3dir"])
    print("Length of enhanced pre batch: " + str(len(batch_enhanced_full)))
    # enhanced_w_batch = add_batch(batch_enhanced_full, batch_enhanced)
    # utils.write_to_s3(enhanced_w_batch, utils.file_name( sufix = "_batch_enhanced_full"), directory=s["s3dir"])
    # utils.write_to_s3(processed_list, utils.file_name( sufix = "_processed_list"), directory=s["s3dir"])

    filtered_label_dict = make_filtered_label_dict(batch_enhanced_full)

    # utils.write_to_s3(filtered_label_dict, utils.file_name( prefix = "_batch_filt_label_dict_enhanced_"), directory=s["s3dir"])
    return filtered_label_dict
Example #3
def control(batch_enhanced, s):
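    # Merge the new batch into the persisted full batch and write it back, then build the label dict (note: built from the pre-merge list, as written).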
    batch_enhanced_full = utils.read_from_s3(
        utils.file_name(sufix="_batch_enhanced_full"),
        seed=[],
        directory=s["s3dir"])
    print("Length of enhanced pre batch: " + str(len(batch_enhanced_full)))
    enhanced_w_batch = add_batch(batch_enhanced_full, batch_enhanced)
    utils.write_to_s3(enhanced_w_batch,
                      utils.file_name(sufix="_batch_enhanced_full"),
                      directory=s["s3dir"])

    filtered_label_dict = make_filtered_label_dict(batch_enhanced_full)
    return filtered_label_dict
Example #4
    # all_users_lookup = [u for u in all_users if u not in user_data]
    ## Cache code ##
    user_data, all_users_lookup = utils.get_from_cache_m(
        all_users, "user_data")
    utils.log(len(all_users), "Number total users in set: ")
    utils.log(len(all_users_lookup), "Number users needing lookup: ")
    if len(all_users_lookup) > 0:
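        # Look up the uncached users in chunks of 100 (presumably a per-request limit of the lookup API).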
        user_chunks = make_user_chunks(all_users_lookup, 100)
        for this_lookup in user_chunks:
            user_dict_lookup = do_user_lookup(this_lookup, s)
            user_data.update(user_dict_lookup)
    return user_data, filtered_label_dict


def control(filtered_label_dict, s):
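    # Refresh user info for the label dict, fold it in, and persist the enhanced dict back to S3.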
    user_data, filtered_label_dict = update_user_info(filtered_label_dict, s)
    filtered_label_dict = update_dict_with_user_info(filtered_label_dict,
                                                     user_data)
    utils.write_to_s3(
        filtered_label_dict,
        utils.file_name(prefix='_batch_filt_label_dict_enhanced_'),
        directory=s["s3dir"])
    return filtered_label_dict


if __name__ == "__main__":
    sd = utils.getDefaultSettings()
    filtered_label_dict = utils.read_from_s3(
        utils.file_name(prefix='_batch_filt_label_dict_enhanced_'),
        directory=sd["s3dir"])
    control(filtered_label_dict, sd)
Example #5
        # if "thumb_fetch_successful" not in titles[url]["news_image"] and 'media_url_news' in titles[url]["news_image"]:
        if 'media_url_news' in titles[url]["news_image"]:
            top_image = titles[url]["news_image"]['media_url_news']
            print("Need image for " + top_image + ", " + url)
            ## CACHE CHECKING
            f_base = utils.get_from_cache(top_image, "thumbs", print_hits=True)
            if f_base is None:
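                # Cache miss: fetch the image and build the thumbnail (make_thumb is presumably defined earlier in this module).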
                f_base = make_thumb(top_image)
            titles[url]["news_image"] = f_base
        else:
            print("already tried fetching thumb for " + url)
    return titles


def control(label_to_titles, s):
    utils.log(str(len(label_to_titles)) + " titles loaded")
    label_to_titles = add_image_thumbs(label_to_titles, s)

    utils.write_to_s3(label_to_titles,
                      'batch_titles_fd_' + utils.file_date() + '.json',
                      directory=s["s3dir"])
    return label_to_titles


if __name__ == "__main__":
    sd = utils.getDefaultSettings()
    label_to_titles = utils.read_from_s3('batch_titles_fd_' +
                                         utils.file_date() + '.json',
                                         directory=sd["s3dir"])
    control(label_to_titles, sd)
Example #6
                max_embed = this_embed
            # For links, add to "labels" so we can do the short-URL processing afterwards.
            # For refs, create the entry "labels_proc" so we preserve the original ref and can store the quoted status.
            root["satellite_enhanced"]["labels"]["quoted_labels_links_deep"] = max_url
            root["satellite_enhanced"]["labels"]["quoted_labels_twrefs_deep"] = max_embed
            if max_embed is not None and max_embed != []:
                root["satellite_enhanced"]["labels"]["quoted_labels_twrefs_deep_status"] = qt
        else:
            existing_cnt += 1
    utils.log(existing_cnt, "Existing Count: ")
    return statuses_enhanced


def control(batch_enhanced, s):
    utils.log("", "Starting deep trace")
    # batch_enhanced = utils.read_from_s3(utils.file_name( sufix = "_enhanced"), directory="data-aws/gen_two/")
    batch_enhanced = trace_links_down(batch_enhanced, s)
    # utils.write_to_s3(
    #     batch_enhanced,
    #     utils.file_name(batch_enhanced, sufix="_batch_enhanced_c"),
    #     directory=s["s3dir"])
    return batch_enhanced


if __name__ == "__main__":
    sd = utils.getDefaultSettings()
    batch_enhanced = utils.read_from_s3(
        utils.file_name(sufix="_batch_enhanced"), directory=sd["s3dir"])

    control(batch_enhanced, sd)
Example #7
    label_data = process_label_dict(filt_label_dict, label_to_titles)
    sorted_label_data_list = label_data_to_list(label_data)
    meta = make_meta(label_data)
    final_data = {"label_data": sorted_label_data_list, "meta_data": meta}
    # name = "d3-" + utils.file_date() + "-label_format" + ".json"
    # utils.write_to_s3(
    #     json.dumps(final_data),
    #     name,
    #     directory=s["s3dir"] + 'production/',
    #     public=True)
    
    normalized_name = s["name"] + ".json"
    utils.write_to_s3(
        json.dumps(final_data),
        normalized_name,
        directory='data-aws/shared_data/production/' + utils.file_date() + "/",
        public=True)
    return None


if __name__ == "__main__":
    debug = True
    sd = utils.getDefaultSettings()
    filt_label_dict = utils.read_from_s3(
        utils.file_name(prefix='_batch_filt_label_dict_enhanced_'),
        directory=sd["s3dir"])
    label_to_titles = utils.read_from_s3(
        'batch_titles_fd_' + utils.file_date() + '.json',
        directory=sd["s3dir"])
    control(filt_label_dict, label_to_titles, sd)
Example #8
def filter_on_day(stat):
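    # Keep only statuses created on the current local day; the key must match the format of utils.file_date() ("day-month-year" assumed).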
    current_day_key = utils.file_date()
    dd = utils.make_local(stat["created_at"])
    key = str(dd.day) + "-" + str(dd.month) + "-" + str(dd.year)
    return key == current_day_key

def enhance(batch_enhanced):
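    # Attach enhancement data under "satellite_enhanced" and keep only statuses that yielded at least one label.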
    filtered_enhanced_batch = []
    utils.log(len(batch_enhanced), "Number batch statuses: ")
    for stat in batch_enhanced:
        enhanced = {}
        enhanced["created_at_tz"] = add_time_zone_date(stat)
        enhanced["labels"] = get_combined_labels(stat)
        ref_cnt = 0
        for key, val in enhanced["labels"].items():
            ref_cnt += len(val) 
        if ref_cnt > 0:
            stat["satellite_enhanced"] = enhanced
            filtered_enhanced_batch.append(stat)
    utils.log(len(filtered_enhanced_batch), "Number enhanced batch statuses: ")
    return filtered_enhanced_batch

def control(date_filtered_batch, s):
    batch_enhanced = enhance(date_filtered_batch)
    return batch_enhanced

if __name__ == "__main__":
    sd = utils.getDefaultSettings()
    date_filtered_batch = utils.read_from_s3(utils.file_name(sufix="_batch_enhanced_full"), directory=sd["s3dir"])
    control(date_filtered_batch, sd)
Example #9
def control(batch_enhanced, s):
    filtered_label_dict = utils.read_from_s3(
        utils.file_name(prefix="_batch_filt_label_dict_enhanced_fld"),
        directory=s["s3dir"])
    filtered_label_dict, processed_list = make_filtered_label_dict(
        batch_enhanced, filtered_label_dict)
    utils.write_to_s3(
        filtered_label_dict,
        utils.file_name(prefix="_batch_filt_label_dict_enhanced_fld"),
        directory=s["s3dir"])
    utils.write_to_s3(
        processed_list,
        utils.file_name(sufix="_processed_list_fld"),
        directory=s["s3dir"])
    return filtered_label_dict
Example #10
def report(job_id, x_axis, y_axis, min_members=None):
    """
    Generate report for a job

    Parameters
    ----------
    job_id: str
    x_axis: str
        Name of column from user dataset to be used for the x axis of the plot
    y_axis: str
        Name of column from user dataset to be used for the y axis of the plot
    min_members: int, optional
        Minimum number of members required in all clusters in an experiment to consider the experiment for the report.

    Returns
    -------
    dict or str
        Report context used to render the HTML report, or an error message
        string if the job has no tasks or not all tasks are done.
    """
    # Validate job_id: a job with no tasks is treated as not found
    tasks = dynamo_get_tasks(job_id)
    if len(tasks) == 0:
        return 'No task found'
    n_tasks_done = len([x for x in tasks if x['task_status'] == 'done'])
    if len(tasks) != n_tasks_done:
        return 'Not all tasks are completed yet for job ID: {} {}/{}'.format(
            job_id, n_tasks_done, len(tasks))

    # all tasks are done
    if min_members is None:
        min_members = 10
    tasks = filter_by_min_members(tasks, min_members=min_members)
    start_time_date, start_time_clock = format_date_time(
        tasks[0]['created_time'])

    covar_types, covar_tieds, ks, labels, bics, task_ids = tasks_to_best_results(
        tasks)

    if x_axis is None or y_axis is None:
        # Visualize the first two columns that are not on the exclude list
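        # NOTE: `job` is not defined in this snippet; presumably a job record
        # with a 'columns' list is fetched earlier in the full module.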
        viz_columns = [
            c for c in job['columns']
            if c.lower().strip() not in EXCLUDE_COLUMNS
        ][:2]
    else:
        viz_columns = [x_axis, y_axis]

    data, columns = read_from_s3(job_id, 0, S3_BUCKET, tasks[0]['s3_file_key'])
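    # Select up to two spatial columns (per SPATIAL_COLUMNS), presumably for map-based plots.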
    spatial_columns = [c for c in columns if c.lower() in SPATIAL_COLUMNS][:2]

    # recommendations for all covariance types
    covar_type_tied_k = {}
    for covar_type in covar_types:
        covar_type_tied_k[covar_type.capitalize()] = {}

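    # Map each covariance type to its recommended k for the 'Tied' and 'Untied' variants.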
    for covar_type, covar_tied, k in zip(covar_types, covar_tieds, ks):
        covar_type_tied_k[covar_type.capitalize()][['Untied',
                                                    'Tied'][covar_tied]] = k

    # task_id for all recommended assignments
    covar_type_tied_task_id = {}
    for covar_type in covar_types:
        covar_type_tied_task_id[covar_type.capitalize()] = {}

    for covar_type, covar_tied, task_id in zip(covar_types, covar_tieds,
                                               task_ids):
        covar_type_tied_task_id[covar_type.capitalize()][[
            'Untied', 'Tied'
        ][covar_tied]] = task_id

    result = dict(job_id=job_id,
                  min_members=min_members,
                  covar_type_tied_k=covar_type_tied_k,
                  covar_type_tied_task_id=covar_type_tied_task_id,
                  columns=columns,
                  viz_columns=viz_columns,
                  spatial_columns=spatial_columns,
                  start_time_date=start_time_date,
                  start_time_clock=start_time_clock)
    return result
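
For illustration, a minimal sketch of calling report directly; the job ID and column names below are hypothetical:

    context = report('job-1234', x_axis='longitude', y_axis='latitude', min_members=10)
    if isinstance(context, str):
        print(context)  # error message: no tasks found, or tasks still running
    else:
        print(context['covar_type_tied_k'])  # recommended k per covariance type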