Example #1
def main():
    save_dir = "../data"
    image_file = "../data/test/karyotype.bmp"
    model_path = "../model/default_inference.h5"
    Pipeline.run(image_file=image_file,
                 save_dir=save_dir,
                 model_path=model_path)
Example #2
def test_pipeline_produced_expected_data() -> None:
    delete_existing_outputs(STORAGE_CONFIG)

    filename = os.path.basename(EXPECTED_FILE)
    pipeline = Pipeline(PIPELINE_CONFIG, STORAGE_CONFIG)
    pipeline.run(EXAMPLE_FILE)

    # Retrieve the output data file
    loc_id = pipeline.config.pipeline_definition.location_id
    datastream = DSUtil.get_datastream_name(config=pipeline.config)
    root: str = pipeline.storage._root
    output_file = os.path.join(root, loc_id, datastream, filename)

    # Assert that the pipeline wrote an output file at the expected location
    assert os.path.isfile(output_file)

    # Compare data and optionally attributes to ensure everything matches.
    ds_out: xr.Dataset = xr.open_dataset(output_file)
    ds_exp: xr.Dataset = xr.open_dataset(EXPECTED_FILE)

    xr.testing.assert_allclose(ds_out, ds_exp)
Example #3
def main():
    inputs = {
        'channel_id': CHANNEL_ID,
        'search_word': 'incredible',
        'limit': 20,
    }
    steps = [
        Preflight(),
        GetVideoList(),  # one step per line for readability (a trailing comma after the last item is recommended)
        InitializeYT(),
        DownloadCaptions(),
        ReadCaption(),
        Search(),
        DownloadVideos(),
        EditVideo(),
        Postflight(),
    ]

    utils = Utils()
    p = Pipeline(steps)
    p.run(inputs, utils)
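Example #3 only shows the call site: the step classes and the Pipeline they are passed to are not part of the listing. The sketch below is one way such a step-based pipeline can be wired up; the Step base class, the process method name, and the way data is threaded from one step to the next are illustrative assumptions, not the actual classes used by that project.
class Step:
    """Assumed base class: each step transforms data using the shared inputs and utils."""

    def process(self, data, inputs, utils):
        raise NotImplementedError


class Pipeline:
    """Runs the steps in order, feeding each step the previous step's output."""

    def __init__(self, steps):
        self.steps = steps

    def run(self, inputs, utils):
        data = None
        for step in self.steps:
            # Each step receives the accumulated data plus the shared context objects
            data = step.process(data, inputs, utils)
        return data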
    """)

    cursor.execute("""
        CREATE USER vsmith WITH PASSWORD 'temppass123' NOSUPERUSER IN GROUP data_analyst;
    """)

    db_conn.commit()

    print("CREATED DB GROUP AND USERS")

    return db_conn


@pipeline.task(depends_on=create_db_users_and_groups)
def close_db_connection(db_conn):
    """After the work is done, close the database connection."""
    db_conn.close()


def get_table_row_count(db_conn, table_name):
    """Get basic table row count."""
    cursor = db_conn.cursor()

    cursor.execute("SELECT COUNT(1) FROM {}".format(table_name))

    return cursor.fetchone()[0]


if __name__ == '__main__':
    pipeline.run()
Example #5
def main():
    config = Config()
    parser = argparse.ArgumentParser(
        description='Code for building the Gutenberg Dialog Dataset')
    parser.add_argument('-dg',
                        '--dialog_gap',
                        default=config.dialog_gap,
                        help='Min. number of characters between two dialogs ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument(
        '-isn',
        '--include_surrounding_narratives',
        default=config.include_surrounding_narratives,
        help='Whether to include surrounding narratives in the output dataset',
        action='store_true')
    parser.add_argument('-mnl',
                        '--max_narrative_length',
                        default=config.max_narrative_length,
                        help='Max. number of words in 1 narrative ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument(
        '-minl',
        '--min_intermediate_narrative_length',
        default=config.min_intermediate_narrative_length,
        help='Min. number of words in 1 intermediate narrative (a narrative ' +
        'which occurs in-line with dialog) (default: %(default)s)',
        metavar='',
        type=int)
    parser.add_argument('-mul',
                        '--max_utterance_length',
                        default=config.max_utterance_length,
                        help='Max. number of words in 1 utterance ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-mb',
                        '--max_books',
                        default=config.max_books,
                        help='Limit the number of books in final dataset ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-md',
                        '--min_delimiters',
                        default=config.min_delimiters,
                        help='Min delimiters / 10000 words needed in a book ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-mdd',
                        '--min_double_delim',
                        default=config.min_double_delim,
                        help='Double delimiter threshold (romance languages) ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-kl',
                        '--kl_threshold',
                        default=config.kl_threshold,
                        help='KL divergence threshold for filtering books ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-st',
                        '--size_threshold',
                        default=config.size_threshold,
                        help='#words threshold for filtering with KL ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-cd',
                        '--clean_dialogs',
                        default=config.clean_dialogs,
                        help='Whether to run pre-processing on dialogs',
                        action='store_true')
    parser.add_argument('-vt',
                        '--vocab_threshold',
                        default=config.vocab_threshold,
                        help='Ratio of unknown words allowed in a dialog ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-l',
                        '--languages',
                        default=config.languages,
                        help='Comma separated language codes ' +
                        'for which to build datasets',
                        metavar='',
                        type=str)
    parser.add_argument('-d',
                        '--download',
                        default=config.download,
                        help='Whether to run download step',
                        action='store_true')
    parser.add_argument('-f1',
                        '--pre_filter',
                        default=config.pre_filter,
                        help='Whether to run pre-filter step',
                        action='store_true')
    parser.add_argument('-e',
                        '--extract',
                        default=config.extract,
                        help='Whether to run extracting step',
                        action='store_true')
    parser.add_argument('-f2',
                        '--post_filter',
                        default=config.post_filter,
                        help='Whether to run post filter step',
                        action='store_true')
    parser.add_argument('-c',
                        '--create_dataset',
                        default=config.create_dataset,
                        help='Whether to run create dataset step',
                        action='store_true')
    parser.add_argument('-a',
                        '--run_all',
                        default=config.run_all,
                        help='Whether to run all steps',
                        action='store_true')
    parser.add_argument('-dir',
                        '--directory',
                        default=config.directory,
                        help='Directory where the language folders are',
                        metavar='',
                        type=str)

    parser.parse_args(namespace=config)
    p = Pipeline(config)
    p.run()
Example #6
def build_keyword_dictionary(titles):
    """Count how many times each non-excluded word appears across the titles."""
    keywords = {}
    for title in titles:
        for word in title.split():
            if word and word not in exclude_words:
                if word not in keywords:
                    keywords[word] = 0
                keywords[word] += 1
    return keywords


@pipeline.task(depends_on=build_keyword_dictionary)
def extract_top_keywords(keywords):
    top_keywords = []
    for word, count in sorted(keywords.items(),
                              key=lambda item: item[1],
                              reverse=True):
        top_keywords.append((word, count))

    return top_keywords[:100]


@pipeline.task(depends_on=extract_top_keywords)
def save_final_csv_file(keywords):
    output_csv_file = open('top_keywords.csv', 'w', newline='')
    return csv_helper.build_csv_file(keywords,
                                     file=output_csv_file,
                                     header=['keyword', 'count'])


output = pipeline.run()
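Examples #4 and #6 register their tasks with @pipeline.task(depends_on=...) and then call pipeline.run() with no arguments, but the Pipeline object itself is not shown in either listing. The following is a minimal sketch of one way such a decorator-driven task runner can work; the class, the results-dictionary return value, and the toy tasks at the end are assumptions for illustration, not the implementation those examples actually use.
class Pipeline:
    """Assumed task runner: executes tasks in registration order, wiring outputs to dependents."""

    def __init__(self):
        self.tasks = []  # (function, dependency) pairs in registration order

    def task(self, depends_on=None):
        def register(func):
            self.tasks.append((func, depends_on))
            return func
        return register

    def run(self):
        results = {}
        for func, dependency in self.tasks:
            if dependency is None:
                results[func] = func()
            else:
                # Pass the dependency's output as the task's single argument
                results[func] = func(results[dependency])
        return results


pipeline = Pipeline()


@pipeline.task()
def load_numbers():
    return [3, 1, 2]


@pipeline.task(depends_on=load_numbers)
def sort_numbers(values):
    return sorted(values)


print(pipeline.run()[sort_numbers])  # [1, 2, 3]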