Example #1
def main():
    save_dir = "../data"
    image_file = "../data/test/karyotype.bmp"
    model_path = "../model/default_inference.h5"
    Pipeline.run(image_file=image_file,
                 save_dir=save_dir,
                 model_path=model_path)
Example #2
 def test_detect_interesting_points(self):
     image_file = data_dir + "/test/karyotype.bmp"
     image = image_utils.read_image(image_file)
     chromosomes = Pipeline.extract_chromosomes(image)
     straightened_chromosomes = Pipeline.straighten_chromosomes(chromosomes)
     _ = Pipeline.detect_interesting_points(straightened_chromosomes,
                                            verbose=True)
Example #3
def test_shell_command_exit():
    """Test that a single shell command functions properly
    """
    actions = [TaskAction("shell_command", name="exiter", commands=["exit 1"])]
    executor = Pipeline(actions)
    source = DummySource()

    result = executor.schedule(source).get()
    assert result.results["exiter"].returncode == 1
Example #4
 def test_organize_chromosomes(self):
     image_file = data_dir + "/test/karyotype.bmp"
     image = Pipeline.read_image(image_file)
     chromosomes = Pipeline.extract_chromosomes(image)
     straightened_chromosomes = Pipeline.straighten_chromosomes(chromosomes)
     # interesting_points = Pipeline.detect_interesting_points(straightened_chromosomes)
     interesting_points = None
     classified_chromosomes = Pipeline.classify_chromosomes(
         straightened_chromosomes, interesting_points)
     karyotyping_image = Pipeline.organize_chromosomes(
         classified_chromosomes)
     image_utils.show_image(karyotyping_image)
Example #5
def test_pipeline_produced_expected_data() -> bool:
    delete_existing_outputs(STORAGE_CONFIG)

    filename = os.path.basename(EXPECTED_FILE)
    pipeline = Pipeline(PIPELINE_CONFIG, STORAGE_CONFIG)
    pipeline.run(EXAMPLE_FILE)

    # Retrieve the output data file
    loc_id = pipeline.config.pipeline_definition.location_id
    datastream = DSUtil.get_datastream_name(config=pipeline.config)
    root: str = pipeline.storage._root
    output_file = os.path.join(root, loc_id, datastream, filename)

    # Assert that an output file with the expected basename was produced
    assert os.path.isfile(output_file)

    # Compare data and optionally attributes to ensure everything matches.
    ds_out: xr.Dataset = xr.open_dataset(output_file)
    ds_exp: xr.Dataset = xr.open_dataset(EXPECTED_FILE)

    xr.testing.assert_allclose(ds_out, ds_exp)
Example #6
def main():
    inputs = {
        'channel_id': CHANNEL_ID,
        'search_word': 'incredible',
        'limit': 20,

    }
    steps = [
        Preflight(),
        GetVideoList(),  # one step per line for readability (a trailing comma after the last item is recommended)
        InitializeYT(),
        DownloadCaptions(),
        ReadCaption(),
        Search(),
        DownloadVideos(),
        EditVideo(),
        Postflight(),
    ]

    utils = Utils()
    p = Pipeline(steps)
    p.run(inputs, utils)
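In this example `Pipeline` is handed an ordered list of step objects and `run` walks them in sequence with a shared `inputs` dict and a `Utils` helper. The step interface itself is not shown; a minimal sketch of that pattern, assuming each step exposes a hypothetical `process(inputs, utils)` method, could look like this:

# Illustrative sketch only: the process() step interface is an assumption,
# not the contract of the project shown above.
class StepSketch:
    def process(self, inputs, utils):
        raise NotImplementedError


class PipelineSketch:
    def __init__(self, steps):
        self.steps = steps

    def run(self, inputs, utils):
        # call every step in order, sharing the same inputs dict and helper object
        for step in self.steps:
            step.process(inputs, utils)


class Greet(StepSketch):
    def process(self, inputs, utils):
        print("searching for:", inputs["search_word"])


PipelineSketch([Greet()]).run({"search_word": "incredible"}, utils=None)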
Example #7
def test_source_acquired():
    """Test that a single shell command is executed.
    This will acquire a source that installs flake8-diff as part
    of its acquisition instructions, and then verify that
    flake8-diff is installed in the task itself.
    """
    actions = [
        TaskAction(
            "shell_command",
            name="installer",
            # workspace= 'python3',
            # workspace_kwargs= {
            #     "delete": False
            # },
            commands=["pip freeze |grep flake8-diff"],
        )
    ]
    executor = Pipeline(actions)
    source = DummySource()

    result = executor.schedule(source).get()
    assert result.results["installer"].returncode == 0
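Both shell-command tests treat `Pipeline` as an executor: it schedules `TaskAction`s against a source, and the result exposes one entry per action name carrying that action's shell return code. As a rough illustration only (the real `TaskAction`/`Pipeline` internals are not shown on this page), the behaviour the assertions rely on can be sketched with `subprocess`:

import subprocess
from types import SimpleNamespace

# Rough stand-in for the behaviour the tests rely on: an assumption for
# illustration, not the real Pipeline/TaskAction implementation.
class ShellPipelineSketch:
    def __init__(self, actions):
        self.actions = actions  # each action carries .name and .commands

    def run(self):
        results = {}
        for action in self.actions:
            # chain the commands in one shell invocation; the shell's exit
            # status becomes the action's return code
            proc = subprocess.run(" && ".join(action.commands), shell=True)
            results[action.name] = SimpleNamespace(returncode=proc.returncode)
        return SimpleNamespace(results=results)


# mirrors the assertions above: "exit 1" yields returncode 1
sketch = ShellPipelineSketch(
    [SimpleNamespace(name="exiter", commands=["exit 1"])])
assert sketch.run().results["exiter"].returncode == 1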
Example #8
 def test_straighten_chromosomes(self):
     image_file = data_dir + "/test/karyotype.bmp"
     image = image_utils.read_image(image_file)
     chromosomes = Pipeline.extract_chromosomes(image)
     _ = Pipeline.straighten_chromosomes(chromosomes, debug=True)
Example #9
from pipeline.pipeline import Pipeline
# from interface.interface import Interface

# Pipeline.getInstance().run_pipeline(".", img_path="/Users/bauera/work/airsurf/wheat/DFW_images/DFW_Early_19/19_05_29/DFW_Early_190529_transformed_small.png",hmap_path="/Users/bauera/work/airsurf/wheat/DFW_images/DFW_Early_19/19_05_29/DFW_Early_190529Height_Map_trans.png")
# Pipeline.getInstance().run_pipeline(".", parent_dir="/Users/bauera/work/airsurf/wheat/DFW_images/DFW_Early_19", seg_path="/Users/bauera/work/airsurf/wheat/DFW_images/DFW_Early_19/19_06_05")
# Pipeline.getInstance().run_pipeline(".", parent_dir="/Users/bauera/work/airsurf/wheat/DFW_images/DFW_Mid_19", seg_path="/Users/bauera/work/airsurf/wheat/DFW_images/DFW_Mid_19/19_05_29")
Pipeline.getInstance().run_pipeline(".", parent_dir="/Users/bauera/work/airsurf/wheat/DFW_images/DFW_Late_19", seg_path="/Users/bauera/work/airsurf/wheat/DFW_images/DFW_Late_19/19_05_29")
# Interface.getInstance().run()
Example #10
    args = parse_args()
    prepare_libraries(args)
    settings = Settings()
    settings.update(args)
    # Set up logging
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    log_file = settings.get("log_file")
    if not log_file or log_file == "NONE":
        handler = logging.StreamHandler(sys.stdout)
    else:
        handler = logging.FileHandler(settings.get("log_file"))
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    settings.set("logger", logger)
    # Print out the settings
    logger.info("**--**--**--**--**--**--**--**--**--**--**--**--**--**--**--**--**")
    logger.info("Settings used for this run of ScaffMatch are:")
    for s, v in settings.iteritems():
        if s in ["std_dev", "ins_size", "pair_mode"]:
            continue
        logger.info("    %s  -- %s" % (s, v)) 
    # Feed the settings to the scaffolder pipeline
    scaffolder = Pipeline()
    scaffolder.set_settings(settings)
    # Go!
    scaffolder.scaffold()
    logger.info("Done!")
Example #11
def main():
    config = Config()
    parser = argparse.ArgumentParser(
        description='Code for building the Gutenberg Dialog Dataset')
    parser.add_argument('-dg',
                        '--dialog_gap',
                        default=config.dialog_gap,
                        help='Min. number of characters between two dialogs ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument(
        '-isn',
        '--include_surrounding_narratives',
        default=config.include_surrounding_narratives,
        help='Whether to include surrounding narratives in the output dataset',
        action='store_true')
    parser.add_argument('-mnl',
                        '--max_narrative_length',
                        default=config.max_narrative_length,
                        help='Max. number of words in 1 narrative ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument(
        '-minl',
        '--min_intermediate_narrative_length',
        default=config.min_intermediate_narrative_length,
        help=
        'Min. number of words in 1 intermediate narrative (a narrative which occurs in-line with dialog) '
        + '(default: %(default)s)',
        metavar='',
        type=int)
    parser.add_argument('-mul',
                        '--max_utterance_length',
                        default=config.max_utterance_length,
                        help='Max. number of words in 1 utterance ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-mb',
                        '--max_books',
                        default=config.max_books,
                        help='Limit the number of books in final dataset ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-md',
                        '--min_delimiters',
                        default=config.min_delimiters,
                        help='Min delimiters / 10000 words needed in a book ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-mdd',
                        '--min_double_delim',
                        default=config.min_double_delim,
                        help='Double delimiter threshold (romance languages) ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-kl',
                        '--kl_threshold',
                        default=config.kl_threshold,
                        help='KL divergence threshold for filtering books ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-st',
                        '--size_threshold',
                        default=config.size_threshold,
                        help='#words threshold for filtering with KL ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-cd',
                        '--clean_dialogs',
                        default=config.clean_dialogs,
                        help='Whether to run pre-processing on dialogs',
                        action='store_true')
    parser.add_argument('-vt',
                        '--vocab_threshold',
                        default=config.vocab_threshold,
                        help='Ratio of unknown words allowed in a dialog ' +
                        '(default: %(default)s)',
                        metavar='',
                        type=int)
    parser.add_argument('-l',
                        '--languages',
                        default=config.languages,
                        help='Comma separated language codes ' +
                        'for which to build datasets',
                        metavar='',
                        type=str)
    parser.add_argument('-d',
                        '--download',
                        default=config.download,
                        help='Whether to run download step',
                        action='store_true')
    parser.add_argument('-f1',
                        '--pre_filter',
                        default=config.pre_filter,
                        help='Whether to run pre-filter step',
                        action='store_true')
    parser.add_argument('-e',
                        '--extract',
                        default=config.extract,
                        help='Whether to run extracting step',
                        action='store_true')
    parser.add_argument('-f2',
                        '--post_filter',
                        default=config.post_filter,
                        help='Whether to run post filter step',
                        action='store_true')
    parser.add_argument('-c',
                        '--create_dataset',
                        default=config.create_dataset,
                        help='Whether to run create dataset step',
                        action='store_true')
    parser.add_argument('-a',
                        '--run_all',
                        default=config.run_all,
                        help='Whether to run all steps',
                        action='store_true')
    parser.add_argument('-dir',
                        '--directory',
                        default=config.directory,
                        help='Directory where the language folders are',
                        metavar='',
                        type=str)

    parser.parse_args(namespace=config)
    p = Pipeline(config)
    p.run()
Example #12
sys.path.append(repo_path +
                '/logparser/logparser/LenMa/')  #for lenma __init__.py
sys.path.append(repo_path +
                '/logparser/logparser/LenMa/templateminer')  #for lenma
from pipeline.pipeline import Pipeline

input_dir = repo_path + '/'  # The input directory of log file
output_dir = repo_path + '/'  # The output directory of parsing results
log_file = 'dayco_log.log'  # The input log file name
log_format = '<smonth> <sday> <shour> <ip> <id> <id2> <month> <day> <hour> <city> <type> <Content>'  #dayco/rsyslog

pipeline = Pipeline(parser_algorithm='drain',
                    input_dir=input_dir,
                    parser_output_dir=output_dir,
                    log_file=log_file,
                    parser_regex=log_format,
                    feature_extractor='fixed_window',
                    log_analizer_algorithm='mining_invariants',
                    data_type='time_based',
                    elasticsearch_index_name='deepia')

para = {
    'path': repo_path + '/',  # directory for input data
    'log_file_name': 'dayco_log.log',  # filename for log data file
    'log_event_mapping':
    'dayco_log.logTemplateMap.csv',  # filename for log-event mapping. A list of event index, where each row represents a log
    'save_path':
    './time_windows/',  # dir for saving sliding window data files to avoid splitting
    #'select_column':[0,4],                      # select the corresponding columns (label and time) in the raw log file
    'select_column': [
        0, 1, 2
Example #13
import json
import io
import csv
import string
from datetime import datetime

from pipeline.pipeline import Pipeline
from pipeline.csv_helper import CsvHelper

exclude_words = ('the', 'to', 'a', 'of', 'for', 'in', 'and', 'is', '–', 'on',
                 'hn:', 'an', 'at', 'not', 'with', 'why', 'how', 'your',
                 'from', 'new', 'you', 'i', 'by', 'what', 'my', 'are', 'as',
                 'that', 'we', 'it', 'be', 'now', 'using', 'has')

pipeline = Pipeline()
csv_helper = CsvHelper()


@pipeline.task()
def file_to_json():
    with open('hn_stories_2014.json', 'r') as file:
        data_dict = json.load(file)
        stories = data_dict['stories']
    return stories


@pipeline.task(depends_on=file_to_json)
def filter_stories(stories):
    def is_popular(story):
        return (story['points'] > 50 and story['num_comments'] > 1
                and not story['title'].startswith('Ask HN'))
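This snippet registers tasks on a `Pipeline` instance with `@pipeline.task()` and chains them with `depends_on=`, so each task receives the previous task's output. A bare-bones sketch of how such a dependency-chained runner could work, offered as an illustration rather than the actual `pipeline.pipeline.Pipeline`:

# Bare-bones sketch of a decorator-registered, dependency-chained runner.
# It is an assumption about how such a Pipeline could behave, not the
# implementation behind the snippet above.
class TaskPipelineSketch:
    def __init__(self):
        self.tasks = []  # (function, dependency) pairs in registration order

    def task(self, depends_on=None):
        def register(func):
            self.tasks.append((func, depends_on))
            return func
        return register

    def run(self):
        outputs = {}
        for func, dep in self.tasks:
            # feed each task the output of the task it depends on
            outputs[func] = func(outputs[dep]) if dep else func()
        return outputs


sketch = TaskPipelineSketch()


@sketch.task()
def numbers():
    return [10, 60, 80]


@sketch.task(depends_on=numbers)
def keep_large(values):
    return [v for v in values if v > 50]


print(sketch.run()[keep_large])  # [60, 80]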
Example #14
"""
    Script for running the pipeline
"""
#pylint: disable-all
import os, sys, inspect

CURRENT_DIR = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe())))
PARENT_DIR = os.path.dirname(CURRENT_DIR)
sys.path.insert(0, PARENT_DIR)
from pipeline.pipeline import Pipeline

# init filterbank filename
fil_name = os.path.abspath("./pspm32.fil")
# init filterbank sample size
sample_size = 192
# init times the pipeline should run
n_times = 10

# run the filterbank n times
for i in range(n_times):
    # read static
    Pipeline(filename=fil_name, size=sample_size)
    # read stream, row per row
    Pipeline(filename=fil_name, as_stream=True)
    # read stream, n rows
    Pipeline(filename=fil_name, as_stream=True, n=sample_size)
Example #15
    def make_pipeline(self):
        pipe = Pipeline('my_pipeline')
        pipe.add_factor('returns', Returns(window_length=150))

        return pipe
Example #16
 def __init__(
     self, pipeline_name="default", input_file="input.mp4", runtime_config=None
 ):
     update_preset_pipelines(input_file=input_file, runtime_config=runtime_config)
     Pipeline.__init__(self, preset_pipelines[pipeline_name])
Example #17
plugin_base = PluginBase(package='pipeline.modules')
modules = plugin_base.make_plugin_source(searchpath=[
    './pipeline/modules',
])


def setup_workspace():
    os.makedirs(workspace_location, exist_ok=True)
    os.makedirs(log_location, exist_ok=True)
    os.makedirs(output_location, exist_ok=True)


def fake_pipeline():
    open(pipeline_file, "w+").writelines(open(".pipeline").readlines())


def setup_docker():
    import docker
    return docker.from_env()


def test():
    setup_workspace()
    fake_pipeline()


if __name__ == "__main__":
    test()
    docker_client = setup_docker()
    pipeline = Pipeline(pipeline_file, docker_client, modules)
Example #18
import pickle

import pandas as pd

from pipeline.pipeline import Pipeline

# read datafame
df = pd.read_pickle("./datasets/h1b_2019.pkl")

# load into pipeline
pl = Pipeline()
pl.load_data(df)
pl.train_test_split("CASE_STATUS")  # TODO: redo to set_target
Example #19
from pipeline.add_timestamp import AddTimestamp
from pipeline.write_json import WriteJSON
from pipeline.write_sitemaps import WriteSitemaps

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('SourceDirectory',
                        type=dir_path,
                        help="location of the vcpkg folder")
    parser.add_argument('-o',
                        type=dir_path,
                        help="output of the JSON file generated",
                        default="./")

    args = parser.parse_args()

    ports_path = os.path.join(args.SourceDirectory, "ports")
    triplets_path = os.path.join(args.SourceDirectory, "triplets")
    baseline_path = os.path.join(args.SourceDirectory,
                                 "scripts/ci.baseline.txt")
    version_path = os.path.join(args.SourceDirectory, "versions")
    data_out_path = os.path.join(args.o, "data")

    pipeline = Pipeline(ReadPackages(ports_path), AddUsage(ports_path),
                        AddTriplets(triplets_path), AddStatus(baseline_path),
                        AddVersion(version_path), AddTimestamp(),
                        WriteJSON(data_out_path, "libs.json"),
                        WriteSitemaps(args.o, "sitemap.txt"))

    pipeline.run()
Example #20
 def test_generate_chromosome_cluster(self):
     image_file = data_dir + "/test/karyotype.bmp"
     image = image_utils.read_image(image_file)
     image_utils.show_image(image)
     chromosome_cluster = Pipeline.generate_chromosome_cluster(image)
     image_utils.show_image(chromosome_cluster, cmap=None)
Example #21
 def test_read_image(self):
     image_file = data_dir + "/test/karyotype.bmp"
     image = Pipeline.read_image(image_file)
     image_utils.show_image(image)
Example #22
 def test_extract_chromosomes(self):
     image_file = data_dir + "/test/karyotype.bmp"
     image = image_utils.read_image(image_file)
     chromosomes = Pipeline.extract_chromosomes(image)
     for chromosome in chromosomes:
         image_utils.show_image(chromosome, cmap=None)
Example #23
import io
from urllib import request
import csv
import psycopg2

from pipeline.pipeline import Pipeline

DATA_FILE_URL = 'https://dq-content.s3.amazonaws.com/251/storm_data.csv'

DB_HOST = 'localhost'
DB_NAME = ''  # set database name
DB_USER = ''  # set database user name
DB_PASSWORD = ''  # set database user password

pipeline = Pipeline()


@pipeline.task()
def create_db_connection():
    """Create database connection."""
    return psycopg2.connect(host=DB_HOST,
                            database=DB_NAME,
                            user=DB_USER,
                            password=DB_PASSWORD)


@pipeline.task(depends_on=create_db_connection)
def create_db_tables(db_conn):
    """Create database tables for staging and final data."""
    cursor = db_conn.cursor()
Example #24
import csv
import io
import json
import string
from collections import Counter
from datetime import datetime
from pprint import pprint
from pytz import timezone

from pipeline.pipeline import Pipeline, build_csv
from pipeline.stop_words import stop_words

pipeline = Pipeline()


def __get_start_end_dates(year):
    # Given a year, return the start and end timestamps in Unix epoch
    utc = timezone("UTC")
    start = utc.localize(datetime(year, 1, 1)).timestamp()
    end = utc.localize(datetime(year + 1, 1, 1)).timestamp()

    return start, end


# TODO currently we are only getting 1 page, need iterate through pages to get full dataset
@pipeline.task()
def get_data_from_hacker_news(year=2014):
    import requests
    url = "http://hn.algolia.com/api/v1/search_by_date"
    start, end = __get_start_end_dates(year)
    query = {