# Example #1 (0)
def run_job(input_data_path=''):
    """Collect the latest known version of each npm package listed in the
    input JSON file and write the package->version mapping back to S3.

    :param input_data_path: S3 path of the form ``s3://bucket/key`` pointing
        at a JSON list of package names; when empty the job exits gracefully.
    """
    if not input_data_path:
        logger.warning("No input data path given, gracefully exiting.")
        return
    # "s3://bucket/key..." -> bucket name and the remaining object key.
    input_bucket_name, key = input_data_path.split('/', 2)[-1].split('/', 1)
    input_bucket = S3DataStore(src_bucket_name=input_bucket_name,
                               access_key=config.AWS_S3_ACCESS_KEY_ID,
                               secret_key=config.AWS_S3_SECRET_ACCESS_KEY)

    core_data = S3DataStore(src_bucket_name='prod-bayesian-core-data',
                            access_key=config.AWS_S3_ACCESS_KEY_ID,
                            secret_key=config.AWS_S3_SECRET_ACCESS_KEY)

    packages = input_bucket.read_json_file(key)

    version_info = {}

    for count, package in enumerate(packages, start=1):
        # Folder names sort descending so the first entry is the newest version.
        versions = sorted(core_data.list_folders(prefix=os.path.join('npm', package)),
                          reverse=True)
        if not versions:
            logger.warning("[MISSING_DATA] Do not have data for any "
                           "versions of package {}".format(package))
            continue
        try:
            version = versions[0].split('/')[2]
        except IndexError:
            # BUG FIX: previously this branch fell through, so `version`
            # either reused the previous iteration's value or raised
            # NameError on the first package. Skip the package instead.
            logger.warning("[KEY_FORMAT] Could not get version for {}".format(package))
            continue
        version_info[package] = version
        print(count)
    input_bucket.write_json_file('tagging/npm/missing_data/missing_version_info.json',
                                 version_info)
# Example #2 (0)
def generate_evaluate_test_s3(training_url, result_id):
    """Generate the test data on S3 and run the Kronos test against it.

    :param training_url: S3 training URL the bucket/path names derive from.
    :param result_id: identifier under which the test results are reported.
    """
    input_bucket_name, output_bucket_name, additional_path = get_path_names(
        training_url)
    input_data_store = S3DataStore(src_bucket_name=input_bucket_name,
                                   access_key=AWS_S3_ACCESS_KEY_ID,
                                   secret_key=AWS_S3_SECRET_ACCESS_KEY)
    # BUG FIX: the output store was built from input_bucket_name, leaving
    # output_bucket_name computed but unused.
    output_data_store = S3DataStore(src_bucket_name=output_bucket_name,
                                    access_key=AWS_S3_ACCESS_KEY_ID,
                                    secret_key=AWS_S3_SECRET_ACCESS_KEY)
    generate_test(input_data_store, output_data_store, additional_path)
    test_kronos(training_url, result_id, input_data_store, output_data_store,
                additional_path)
def generate_evaluate_test_s3(training_url, result_id):
    """Generate the test, save it, and then call all relevant checkers."""
    input_bucket_name, output_bucket_name, additional_path = get_path_names(
        training_url)
    input_data_store = S3DataStore(src_bucket_name=input_bucket_name,
                                   access_key=AWS_S3_ACCESS_KEY_ID,
                                   secret_key=AWS_S3_SECRET_ACCESS_KEY)
    # BUG FIX: the output store was built from input_bucket_name, leaving
    # output_bucket_name computed but unused.
    output_data_store = S3DataStore(src_bucket_name=output_bucket_name,
                                    access_key=AWS_S3_ACCESS_KEY_ID,
                                    secret_key=AWS_S3_SECRET_ACCESS_KEY)
    generate_test(input_data_store, output_data_store, additional_path)
    perform_kronos_test(training_url, result_id, input_data_store,
                        output_data_store, additional_path)
# Example #4 (0)
def generate_and_save_gnosis_package_topic_model_s3(training_data_url):
    """Generate and save Gnosis package topic model into S3."""
    in_bucket, out_bucket, extra_path = get_path_names(training_data_url)

    def connect(bucket_name):
        # Every store shares the same AWS credentials from config.
        return S3DataStore(src_bucket_name=bucket_name,
                           access_key=config.AWS_S3_ACCESS_KEY_ID,
                           secret_key=config.AWS_S3_SECRET_ACCESS_KEY)

    generate_and_save_gnosis_package_topic_model(
        input_data_store=connect(in_bucket),
        output_data_store=connect(out_bucket),
        additional_path=extra_path)
def train_and_save_pruned_tag_list_s3(training_data_url):
    """Return the clean package_topic present in the given s3 training URL.

    :param training_data_url: The Location where data is read from and
        written to.
    """
    in_bucket, out_bucket, extra_path = get_path_names(training_data_url)
    creds = {'access_key': config.AWS_S3_ACCESS_KEY_ID,
             'secret_key': config.AWS_S3_SECRET_ACCESS_KEY}
    source_store = S3DataStore(src_bucket_name=in_bucket, **creds)
    target_store = S3DataStore(src_bucket_name=out_bucket, **creds)
    return TagListPruner.prune_tag_list(source_store, target_store, extra_path)
# Example #6 (0)
def generate_and_save_gnosis_package_topic_model_s3(training_data_url):
    """Build the Gnosis package-topic model from S3 data and store it back.

    Bucket names and the extra path are sliced out of the URL with
    ``trunc_string_at`` (input and output buckets use the same slice).
    """
    source_bucket = trunc_string_at(training_data_url, "/", 2, 3)
    target_bucket = trunc_string_at(training_data_url, "/", 2, 3)
    extra_path = trunc_string_at(training_data_url, "/", 3, -1)

    def storage(bucket):
        return S3DataStore(src_bucket_name=bucket,
                           access_key=config.AWS_S3_ACCESS_KEY_ID,
                           secret_key=config.AWS_S3_SECRET_ACCESS_KEY)

    generate_and_save_gnosis_package_topic_model(
        input_data_store=storage(source_bucket),
        output_data_store=storage(target_bucket),
        additional_path=extra_path)
def load_package_frequency_dict_s3(bucket_name, additional_path):
    """Read the package frequency dictionary from the given S3 bucket."""
    store = S3DataStore(src_bucket_name=bucket_name,
                        access_key=config.AWS_S3_ACCESS_KEY_ID,
                        secret_key=config.AWS_S3_SECRET_ACCESS_KEY)
    return load_package_frequency_dict(store, additional_path)
def load_user_eco_to_kronos_model_dict_s3(bucket_name, additional_path):
    """Load the user-ecosystem to Kronos model mapping from S3."""
    kronos_store = S3DataStore(src_bucket_name=bucket_name,
                               access_key=config.AWS_S3_ACCESS_KEY_ID,
                               secret_key=config.AWS_S3_SECRET_ACCESS_KEY)
    return load_user_eco_to_kronos_model_dict(
        input_kronos_data_store=kronos_store,
        additional_path=additional_path)
def load_eco_to_kronos_dependency_dict_s3(bucket_name, additional_path):
    """Load the Kronos dependency dictionary from the AWS S3 storage."""
    dependency_store = S3DataStore(src_bucket_name=bucket_name,
                                   access_key=config.AWS_S3_ACCESS_KEY_ID,
                                   secret_key=config.AWS_S3_SECRET_ACCESS_KEY)
    return load_eco_to_kronos_dependency_dict(
        input_kronos_dependency_data_store=dependency_store,
        additional_path=additional_path)
# Example #10 (0)
def generate_and_save_kronos_dependency_s3(training_data_url):
    """Build the Kronos dependency structure from S3 inputs and save it."""
    in_bucket, out_bucket, extra_path = get_path_names(training_data_url)

    def make_store(bucket):
        # All stores share the same AWS credentials from config.
        return S3DataStore(src_bucket_name=bucket,
                           access_key=config.AWS_S3_ACCESS_KEY_ID,
                           secret_key=config.AWS_S3_SECRET_ACCESS_KEY)

    generate_and_save_kronos_dependency(
        input_gnosis_data_store=make_store(in_bucket),
        input_package_topic_data_store=make_store(in_bucket),
        output_data_store=make_store(out_bucket),
        additional_path=extra_path)
# Example #11 (0)
def generate_and_save_cooccurrence_matrices_s3(training_data_url):
    """Compute co-occurrence matrices from S3 inputs and write them to S3."""
    in_bucket, out_bucket, extra_path = get_path_names(training_data_url)

    def open_store(bucket):
        # Shared AWS credentials from config for every store.
        return S3DataStore(src_bucket_name=bucket,
                           access_key=config.AWS_S3_ACCESS_KEY_ID,
                           secret_key=config.AWS_S3_SECRET_ACCESS_KEY)

    generate_and_save_cooccurrence_matrices(
        input_kronos_dependency_data_store=open_store(in_bucket),
        input_manifest_data_store=open_store(in_bucket),
        output_data_store=open_store(out_bucket),
        additional_path=extra_path)
def train_and_save_kronos_list_s3(training_data_url):
    """Train the Kronos and save the results into the AWS S3 storage."""
    in_bucket, out_bucket, extra_path = get_path_names(training_data_url)
    credentials = {'access_key': config.AWS_S3_ACCESS_KEY_ID,
                   'secret_key': config.AWS_S3_SECRET_ACCESS_KEY}
    dependency_store = S3DataStore(src_bucket_name=in_bucket, **credentials)
    cooccurrence_store = S3DataStore(src_bucket_name=in_bucket, **credentials)
    result_store = S3DataStore(src_bucket_name=out_bucket, **credentials)
    train_and_save_kronos_list(
        input_kronos_dependency_data_store=dependency_store,
        input_co_occurrence_data_store=cooccurrence_store,
        output_data_store=result_store,
        additional_path=extra_path)
# Example #13 (0)
def train_and_save_kronos_list_s3(training_data_url):
    """Train Kronos from data under *training_data_url* and save the result.

    Bucket names and the extra path are cut out of the URL with
    ``trunc_string_at`` (input and output buckets use the same slice).
    """
    source_bucket = trunc_string_at(training_data_url, "/", 2, 3)
    target_bucket = trunc_string_at(training_data_url, "/", 2, 3)
    extra_path = trunc_string_at(training_data_url, "/", 3, -1)

    def s3_store(bucket):
        return S3DataStore(src_bucket_name=bucket,
                           access_key=config.AWS_S3_ACCESS_KEY_ID,
                           secret_key=config.AWS_S3_SECRET_ACCESS_KEY)

    train_and_save_kronos_list(
        input_kronos_dependency_data_store=s3_store(source_bucket),
        input_co_occurrence_data_store=s3_store(source_bucket),
        output_data_store=s3_store(target_bucket),
        additional_path=extra_path)
# Example #14 (0)
def load_model():
    """Load the movie recommendation model from S3 and attach it to the app.

    Reads bucket name and credentials from ``config`` and stores the loaded
    model on ``app.movie_reco_model``.

    :raises RuntimeError: if the model fails to load.
    """
    # BUG FIX: the original used Python 2 `print` statements, which are
    # syntax errors under Python 3 (the dialect the rest of this file uses).
    print("Loading model ...")
    bucket_name = config.AWS_BUCKET
    access_key_id = config.AWS_ACCESS_KEY_ID
    secret_access_key = config.AWS_SECRET_ACCESS_KEY
    # SECURITY FIX: do not echo the raw AWS access key id to stdout.
    print("Read the config values ...")
    # .strip() guards against stray whitespace in the configured values.
    model_data_store = S3DataStore(src_bucket_name=bucket_name.strip(),
                                   access_key=access_key_id.strip(),
                                   secret_key=secret_access_key.strip())

    app.movie_reco_model = MovieRecommender.load_from_data_store(data_store=model_data_store)
    # Fail fast with a real exception instead of `assert`, which is
    # stripped when Python runs with -O.
    if app.movie_reco_model is None:
        raise RuntimeError("movie recommendation model failed to load")

    app.logger.info("movie recommendation model got loaded successfully!")
# Example #15 (0)
def load_rec_model_s3():
    """Fetch the IMDB recommendation model from its configured S3 bucket."""
    store = S3DataStore(src_bucket_name=AWS_BUCKET_NAME,
                        access_key=AWS_S3_ACCESS_KEY_ID,
                        secret_key=AWS_S3_SECRET_ACCESS_KEY)
    return ImdbRecSys.load(data_store=store)