Exemplo n.º 1
0
    def execute(self, context):
        """
        Copy every matching source file into a per-run archive directory.

        Files in the source connection's path whose names match
        ``self.file_mask`` are copied to
        ``<dest root>/<dag_id>/<task_id>/<execution_date>/``.

        :param context: task context; ``context['execution_date']`` is
            formatted with ``DATE_FORMAT`` to name the leaf directory.
        """
        execution_date = context['execution_date'].strftime(DATE_FORMAT)
        src_hook = FSHook(conn_id=self.src_conn_id)
        source_dir = src_hook.get_path()

        dest_hook = FSHook(conn_id=self.dst_conn_id)
        dest_root_dir = dest_hook.get_path()

        dag_id = self.dag.dag_id
        task_id = self.task_id

        logging.info("Now searching for files like {0} in {1}".format(self.file_mask, source_dir))
        file_names = fnmatch.filter(os.listdir(source_dir), self.file_mask)
        if not file_names:
            # Nothing to copy: do not create an empty destination directory.
            return

        # The destination is the same for every file, so build it once.
        dest_dir = os.path.join(dest_root_dir, dag_id, task_id, execution_date)
        logging.info("Now creating path structure {0}".format(dest_dir))
        try:
            os.makedirs(dest_dir)
        except OSError:
            # makedirs raises when the directory already exists (e.g. on a
            # task retry); only re-raise when the directory is really missing.
            if not os.path.isdir(dest_dir):
                raise

        for file_name in file_names:
            full_path = os.path.join(source_dir, file_name)
            dest_file_name = os.path.join(dest_dir, os.path.basename(file_name))
            logging.info("Now moving {0} to {1}".format(full_path, dest_file_name))
            copyfile(full_path, dest_file_name)
def uncompress_files(ds, **kwargs):
    """
    Decompress the bzip2-packed hapmap files unless that was already done.

    The presence of ``hapmap.ped`` in the ``fs_bioinf`` path is treated as
    "already uncompressed". The exit status of ``bzip2`` is not checked.

    :return: always ``True``.
    """
    base_dir = FSHook('fs_bioinf').get_path()
    if not isfile(base_dir + '/hapmap.ped'):
        commands = (
            'bzip2 -d {fs_path}/hapmap.map.bz2',
            'bzip2 -d {fs_path}/hapmap.ped.bz2',
        )
        for command in commands:
            os.system(command.format(fs_path=base_dir))
    return True
def plot_pca(ds, **kwargs):
    """
    Draw a scatter plot of two columns of ``pca.eigenvec`` and save it.

    Reads the space-separated, header-less ``pca.eigenvec`` from the
    ``fs_bioinf`` path and writes ``pca.png`` next to it.
    """
    # Select the SVG backend before anything is plotted.
    import matplotlib
    matplotlib.use('svg')
    import pandas as pd

    base_dir = FSHook('fs_bioinf').get_path()
    eigenvectors = pd.read_csv(base_dir + '/pca.eigenvec', sep=' ', header=None)
    # With header=None the columns are integer positions.
    # NOTE(review): columns 2 and 3 are presumably the first two principal
    # components - confirm against the plink output format.
    scatter_axes = eigenvectors.plot.scatter(x=2, y=3)
    scatter_axes.figure.savefig(base_dir + '/pca.png')
def download_files(ds, **kwargs):
    """
    Fetch the files listed in ``ftp_files`` into the ``fs_bioinf`` path.

    A file is skipped when it, or the same path with its last four
    characters removed, already exists locally, unless
    ``kwargs['params']['force']`` is the string ``'true'``. An empty
    ``done.txt`` marker is written once the loop has finished.

    :return: always ``True``.
    """
    base_dir = FSHook('fs_bioinf').get_path()
    force = kwargs['params'].get('force', 'false') == 'true'
    with FTPHook('ftp_ncbi') as ftp:
        for ftp_name, local_name in ftp_files.items():
            local_path = base_dir + '/' + local_name
            # NOTE(review): presumably strips a ".bz2" suffix - confirm that
            # every entry of ftp_files has a four-character extension.
            uncompressed_local_path = local_path[:-4]
            already_present = isfile(local_path) or isfile(uncompressed_local_path)
            if already_present and not force:
                continue
            # NOTE(review): this tests the bare name relative to the current
            # working directory, not local_path; kept unchanged because
            # switching to local_path would stop force=true from re-downloading.
            if not isfile(local_name):
                ftp.retrieve_file(ftp_directory + ftp_name, local_path)
    # Create (or truncate) the marker and close the handle right away
    # instead of leaking it.
    with open(base_dir + '/done.txt', 'wb'):
        pass
    return True
Exemplo n.º 5
0
def print_file_content(**context):
    """
    Print every ``.txt`` file in the hook's directory, then archive it.

    Each printed file is moved to a ``processed`` directory that sits next
    to the hook's directory (it is created when missing).
    """
    foldername = "/processed"
    hook = FSHook('local_file_system')
    source_dir = hook.get_path()
    processed_dir = str(Path(source_dir).parent) + foldername
    if not os.path.exists(processed_dir):
        os.makedirs(processed_dir)

    for file_name in os.listdir(source_dir):
        if not file_name.endswith(".txt"):
            continue
        source_path = source_dir + "/" + file_name
        with open(source_path, 'r') as fp:
            print(fp.read())
        # Move only after the handle is closed: moving a file that is still
        # open fails on platforms that lock open files.
        shutil.move(source_path, processed_dir + "/" + file_name)
Exemplo n.º 6
0
def extract_t2m_wm(ds, *args, **kwargs):
    """
    Extract the 2 m air temperature at one point and merge it with stored data.

    :param ds: dataset providing the ``'air_temperature_2m'`` variable.
    :param kwargs: must contain ``run_date`` (used as the column label) and
        ``output_path`` (location of previously stored data).
    :return: the stored data updated with the new extraction, or only the
        new extraction when nothing is stored at ``output_path`` yet.
    """
    t2m = ds['air_temperature_2m']

    # Grid extracted from the AROME MetNo files.
    grid_path = os.path.join(FSHook('grid_data').get_path(), 'metno_grid')
    t2m.pp.grid = pymepps.GridBuilder(grid_path).build_grid()

    # Nearest grid point to the Wettermast.
    extracted = t2m.pp.to_pandas((10.105139, 53.519917))

    # Kelvin to degree Celsius.
    extracted -= 273.15
    # Make the index relative to its first entry.
    extracted.index -= extracted.index[0]
    extracted.columns = [kwargs['run_date']]
    logger.info('The extracted T2m is:\n{0}'.format(str(extracted)))

    # Merge into the stored data when it exists.
    try:
        stored = pd.DataFrame.pp.load(kwargs['output_path'])
        stored.index = pd.TimedeltaIndex(stored.index.values)
        return stored.pp.update(extracted)
    except FileNotFoundError:
        return extracted
Exemplo n.º 7
0
def hdfs_files_fs_copy(task_id, **kwargs):
    """
    Copy the local source directory of a task to its monthly HDFS path.

    The relative directory and the HDFS targets are read from the XCom
    values of ``init_task_<task_id>``; the file list pushed by
    ``fetch_files_fs_<task_id>`` is only printed.

    :param task_id: suffix identifying the upstream task instances.
    :param kwargs: task context; ``kwargs['ti']`` is the task instance.
    """
    ti = kwargs['ti']
    init_values = pullXcom(ti, 'init_task_{}'.format(task_id), xcom_keys)
    fs_filepath = init_values['fs_path']
    # Not used below; the lookup is kept so a missing key still raises here.
    hdfs_path = init_values['hdfs_path']
    hdfs_path_month = init_values['hdfs_path_month']

    fs_file = ti.xcom_pull(task_ids='fetch_files_fs_{}'.format(task_id),
                           key='fs_file')
    print("received message:{} {}".format(fs_file, fs_filepath))

    source_dir = "/".join([FSHook("fs_default").get_path(), fs_filepath])
    copyFileToHDFS(source_dir + "/", hdfs_path_month)
Exemplo n.º 8
0
def print_file_content(**context):
    """Print ``test.txt`` from the ``my_file_system2`` path, then delete it."""
    target = os.path.join(FSHook('my_file_system2').get_path(), 'test.txt')
    with open(target, 'r') as handle:
        contents = handle.read()
        print(contents)
    os.remove(target)
Exemplo n.º 9
0
def slice_wettermast(ds, *args, **kwargs):
    """
    Reduce ``ds`` to the single grid point nearest to a fixed coordinate.

    :param ds: dataset with ``y`` and ``x`` dimensions.
    :return: ``ds`` indexed at the nearest point of the ``metno_grid`` grid.
    """
    logger.info(ds)
    grid_path = os.path.join(FSHook('grid_data').get_path(), 'metno_grid')
    grid = pymepps.GridBuilder(grid_path).build_grid()
    nearest = grid.nearest_point((53.519917, 10.105139))
    logger.info('Select point: {0}'.format(nearest))
    # The first entry indexes y, the second indexes x.
    y_index, x_index = nearest[0], nearest[1]
    return ds.isel(y=y_index, x=x_index)
Exemplo n.º 10
0
 def poke(self, context):
     """
     Report whether ``self.filepath`` exists below the hook's base path.

     The previous implementation iterated ``walk(full_path)`` inside a bare
     ``except``. ``os.walk`` ignores listing errors by default, so nothing
     was ever raised and the sensor succeeded even for a missing path.
     NOTE(review): assumes ``walk`` was ``os.walk`` - confirm the import.

     :param context: task context (unused).
     :return: ``True`` when the path exists, ``False`` otherwise.
     """
     import os  # local import: only ``walk`` is known to be imported here

     hook = FSHook(self.fs_conn_id)
     basepath = hook.get_path()
     full_path = "/".join([basepath, self.filepath])
     logging.info('Poking for file {full_path} '.format(full_path=full_path))
     return os.path.exists(full_path)
Exemplo n.º 11
0
 def poke(self, context):
     """
     Report whether ``self.filepath`` exists below the hook's base path.

     The previous implementation relied on ``walk(full_path)`` raising
     ``OSError`` for a missing path. ``os.walk`` ignores listing errors by
     default, so the sensor succeeded even when nothing was there.
     NOTE(review): assumes ``walk`` was ``os.walk`` - confirm the import.

     :param context: task context (unused).
     :return: ``True`` when the path exists, ``False`` otherwise.
     """
     import os  # local import: only ``walk`` is known to be imported here

     hook = FSHook(self.fs_conn_id)
     basepath = hook.get_path()
     full_path = "/".join([basepath, self.filepath])
     self.log.info('Poking for file {full_path}'.format(full_path=full_path))
     return os.path.exists(full_path)
Exemplo n.º 12
0
    def execute(self, context):
        """
        Copy the files archived for this run into the destination path.

        The source directory is
        ``<src root>/<dag_id>/<src_task_id>/<execution_date>``; when it does
        not exist nothing is copied.

        :param context: task context; ``context['execution_date']`` is
            formatted with ``DATE_FORMAT`` to locate the source directory.
        """
        execution_date = context['execution_date'].strftime(DATE_FORMAT)
        src_hook = FSHook(conn_id=self.src_conn_id)
        dest_hook = FSHook(conn_id=self.dst_conn_id)
        final_dir = dest_hook.get_path()

        staging_dir = os.path.join(
            src_hook.get_path(), self.dag.dag_id, self.src_task_id, execution_date)
        if not os.path.exists(staging_dir):
            return

        for entry in os.listdir(staging_dir):
            origin = os.path.join(staging_dir, entry)
            target = os.path.join(final_dir, entry)
            logging.info("Now moving {0} to final destination {1}".format(origin, target))
            copyfile(origin, target)
Exemplo n.º 13
0
 def poke(self, context):
     """
     Succeed once some directory under ``self.dir_path`` holds 5+ files.

     The count is per directory visited by ``os.walk``, not a total over
     the whole tree.

     :param context: task context (unused).
     :return: ``True`` when such a directory is found, else ``False``.
     """
     full_path = os.path.join(FSHook(self.conn_id).get_path(), self.dir_path)
     self.log.info('poking location %s', full_path)
     try:
         return any(
             len(names) >= 5 for _, _, names in os.walk(full_path))
     except OSError:
         return False
Exemplo n.º 14
0
    def poke(self, context):
        """
        Succeed when the glob ``self.filepath`` matches a file or a
        directory tree that contains at least one file.

        :param context: task context (unused).
        :return: ``True`` on the first such match, ``False`` otherwise.
        """
        hook = FSHook(self.fs_conn_id)
        basepath = hook.get_path()
        full_path = os.path.join(basepath, self.filepath)
        self.log.info('Poking for file %s', full_path)

        for path in glob(full_path):
            if os.path.isfile(path):
                return True

            # Walk the matched entry, not the unexpanded pattern: with a
            # wildcard in full_path, os.walk(full_path) finds nothing.
            for _, _, files in os.walk(path):
                if files:
                    return True
        return False
Exemplo n.º 15
0
 def poke(self, context):
     """
     Succeed when ``self.filepath`` is a file, or a directory tree that
     contains at least one file.

     :param context: task context (unused).
     :return: ``False`` when the path cannot be stat'ed or holds no files.
     """
     full_path = os.path.join(FSHook(self.fs_conn_id).get_path(), self.filepath)
     self.log.info('Poking for file %s', full_path)
     try:
         if not stat.S_ISDIR(os.stat(full_path).st_mode):
             # The path points at a file directly.
             return True
         return any(
             len(names) for _, _, names in os.walk(full_path))
     except OSError:
         return False
Exemplo n.º 16
0
 def poke(self, context):
     """
     Succeed when ``self.filepath`` is a file, or a directory tree that
     contains at least one file.

     :param context: task context (unused).
     :return: ``False`` when the path cannot be stat'ed or holds no files.
     """
     base_dir = FSHook(self.fs_conn_id).get_path()
     full_path = "/".join([base_dir, self.filepath])
     self.log.info('Poking for file {full_path}'.format(full_path=full_path))
     try:
         is_directory = stat.S_ISDIR(os.stat(full_path).st_mode)
         if not is_directory:
             # The path points at a file directly.
             return True
         for _root, _dirs, names in os.walk(full_path):
             if names:
                 return True
         return False
     except OSError:
         return False
Exemplo n.º 17
0
def fetch_files_fs(task_id, **kwargs):
    """
    Collect the files below the task's directory and publish them via XCom.

    The directory and the pattern come from the XCom values of
    ``init_task_<task_id>``. Every file whose name does NOT contain the
    pattern is appended to a comma-terminated string (``"a,b,"``) that is
    pushed under the key ``fs_file``.

    :param task_id: suffix identifying the upstream init task.
    :param kwargs: task context; uses ``ti`` and ``task_instance``.
    :return: ``True`` when the path is a file or the directory was
        scanned, ``False`` when the path cannot be stat'ed.
    """
    task = 'init_task_{}'.format(task_id)
    ti = kwargs['ti']
    xcom_values = pullXcom(ti, task, xcom_keys)  # pull data from xcom object
    fs_filepath = xcom_values['fs_path']
    fs_pattern = xcom_values['fs_pattern']
    task_instance = kwargs['task_instance']
    print('file path ' + fs_filepath)
    print('file pattern ' + fs_pattern)

    basepath = FSHook("fs_default").get_path()
    full_path = "/".join([basepath, fs_filepath])
    print(full_path)
    try:
        if not stat.S_ISDIR(os.stat(full_path).st_mode):
            # full_path was a file directly
            return True

        selected = []
        for _root, _dirs, files in os.walk(full_path):
            for my_file in files:
                if fs_pattern in my_file:
                    print('files {}'.format(my_file))
                else:
                    print('files to be copied to hdfs {}'.format(my_file))
                    selected.append(my_file)

        # Same format as before: every name is followed by a comma.
        hdfs_file = "".join(name + "," for name in selected)
        print('files copied to hdfs {}'.format(hdfs_file))
        task_instance.xcom_push(key="fs_file", value=hdfs_file)
        return True
    except OSError:
        return False
Exemplo n.º 18
0
def process_source_data():
    """
    Load the daily production JSON, add derived columns, store and delete it.

    ``water_cut_calc`` and ``gor_calc`` are computed per row with
    ``utils.calc_watercut`` and ``utils.calc_gor``. The records are inserted
    into the ``DailyProduction`` collection of ``fusion_dev_db`` and the
    source file is removed afterwards.
    """
    file_hook = FSHook('fs_custom')
    mongo_hook = MongoHook()
    source_path = os.path.join(file_hook.get_path(), 'daily_production_data.json')

    frame = pd.read_json(source_path)

    # One (water cut, GOR) pair per row, computed in row order.
    derived = [
        (
            utils.calc_watercut(row['OIL_bopd'], row['WATER_bwpd']),
            utils.calc_gor(row['OIL_bopd'], row['GAS_mscfd']),
        )
        for _, row in frame.iterrows()
    ]
    frame = frame.assign(
        water_cut_calc=[water_cut for water_cut, _ in derived],
        gor_calc=[gor for _, gor in derived],
    )

    records = frame.to_dict("records")
    mongo_hook.insert_many('DailyProduction', records, 'fusion_dev_db')

    os.remove(source_path)
Exemplo n.º 19
0
    def poke(self, context):
        """
        Succeed when the glob ``self.filepath`` matches at least one file.

        The first match in sorted order is pushed to XCom (key
        ``return_value``) as a path relative to ``self.base_path``.

        :param context: task context; uses ``ti`` and ``execution_date``.
        :return: ``True`` when a file matched, ``False`` otherwise.
        """
        pattern = os.path.join(FSHook(self.fs_conn_id).get_path(), self.filepath)
        self.log.info('Poking for file %s', pattern)

        valid_files = sorted(
            candidate for candidate in glob(pattern) if os.path.isfile(candidate))
        if not valid_files:
            return False

        self.log.info(
            f'The full list of valid files is: ({", ".join(valid_files)})')
        # NOTE(review): relative to self.base_path, not the hook's path
        # fetched above - confirm this attribute is what is intended.
        relative_path = os.path.relpath(valid_files[0], start=self.base_path)
        self.log.info(
            f'Relative path of the earliest file is: {relative_path}')
        context['ti'].xcom_push(key='return_value',
                                value=relative_path,
                                execution_date=context['execution_date'])
        return True
Exemplo n.º 20
0
def hdfs_files_fs_cleanup(task_id, **kwargs):
    """
    Delete the HDFS path and mark the local files as processed.

    The file list, the local directory and the HDFS path are pulled from
    the XCom of ``hdfs_file_cleanup_task_<task_id>``. Every non-empty name
    in the comma-separated list is renamed to ``<name>.processed`` inside
    the local directory.

    :param task_id: suffix identifying the task whose XCom is read.
    :param kwargs: task context; ``kwargs['ti']`` is the task instance.
    """
    ti = kwargs['ti']
    upstream = 'hdfs_file_cleanup_task_{}'.format(task_id)
    fs_file, fs_filepath, hdfs_path = (
        ti.xcom_pull(task_ids=upstream, key=key)
        for key in ('fs_file', 'fs_path', 'hdfs_path')
    )

    deleteFileFromHDFS(hdfs_path)

    # NOTE(review): the hook's base path is fetched but not used; the local
    # directory is taken from XCom as-is. Call kept to preserve behavior.
    FSHook("fs_default").get_path()
    local_dir = fs_filepath

    print(local_dir)
    print(fs_filepath)

    for name in fs_file.split(","):
        print(name)
        if name == "":
            continue
        src = local_dir + "/" + name
        dest = src + ".processed"
        print("moving file {} --> {}".format(src, dest))
        shutil.move(src, dest)
    print('Clean up success')
Exemplo n.º 21
0
    server_static_path='http://thredds.met.no/thredds/dodsC/meps25files',
    server_template='meps_det_extracted_2_5km_%Y%m%dT%HZ.nc',
    dt_rounding=datetime.timedelta(hours=6),
    dt_offset=None,
    task_id='sensor_realtime',
    timeout=60 * 60 * 6,
    poke_interval=60,
    pool='sensor_pool',
    dag=dag)

# Task 'realtime_t2m_download': runs dataset_slice_data through an
# XarrayOperator on the MEPS files served by thredds.met.no and writes the
# result below METNO_DET_HOOK's path.
rt_dl_t2m = XarrayOperator(
    python_callable=dataset_slice_data,
    input_static_path='http://thredds.met.no/thredds/dodsC/meps25files',
    # NOTE(review): the templates look like strftime patterns that the
    # operator fills from the run time - confirm in XarrayOperator.
    input_template='meps_det_extracted_2_5km_%Y%m%dT%HZ.nc',
    rounding_td=datetime.timedelta(hours=6),
    output_static_path=METNO_DET_HOOK.get_path(),
    output_template='%Y%m%d_%H%M/t2m.nc',
    # Passed to dataset_slice_data: one variable, index window on y and x.
    op_kwargs=dict(variables='air_temperature_2m',
                   isel=dict(y=slice(0, 90), x=slice(210, 270))),
    provide_context=True,
    task_id='realtime_t2m_download',
    trigger_rule=TriggerRule.ALL_SUCCESS,
    dag=dag)
# The download only runs after the rt_sensor task.
rt_dl_t2m.set_upstream(rt_sensor)

rt_dl_extracted = XarrayOperator(
    python_callable=dataset_slice_data,
    input_static_path='http://thredds.met.no/thredds/dodsC/meps25files',
    input_template='meps_det_extracted_2_5km_%Y%m%dT%HZ.nc',
    rounding_td=datetime.timedelta(hours=6),
    output_static_path=METNO_DET_HOOK.get_path(),
Exemplo n.º 22
0
    'retries': 0,
    'retry_delay': datetime.timedelta(minutes=5),
}

# Filesystem hooks for the connections used by this DAG file.
DATA_HOOK = FSHook('wettermast_data')
DB_HOOK = FSHook('db_data')
TMP_HOOK = FSHook('temporary_data')

# DAG scheduled every 15 minutes.
dag = DAG('extract_wettermast_v0.2',
          default_args=default_args,
          schedule_interval=datetime.timedelta(minutes=15),
          orientation='TB')

# Sensor task for the weekly master file on the 'ftp_wettermast' connection;
# it pokes every 10 s and times out after 120 s.
# NOTE(review): filename_template looks like a strftime pattern (ISO year
# and week) - confirm how FTPSensor expands it.
wm_sensor_task = FTPSensor(filename_template='%G_W%V_MASTER_M10.txt',
                           ftp_conn_id='ftp_wettermast',
                           disk_path=DATA_HOOK.get_path(),
                           task_id='sensor_ftp',
                           timeout=120,
                           poke_interval=10,
                           pool='sensor_pool',
                           dag=dag)

# Download task using the same file template and local path as the sensor.
dl_task = FTPDownloader(filename_template='%G_W%V_MASTER_M10.txt',
                        ftp_conn_id='ftp_wettermast',
                        disk_path=DATA_HOOK.get_path(),
                        task_id='downloader_ftp',
                        trigger_rule=TriggerRule.ALL_SUCCESS,
                        dag=dag)

# The download runs only after the sensor task.
dl_task.set_upstream(wm_sensor_task)
Exemplo n.º 23
0
def print_file_content(**context):
    """Print the contents of ``test.txt`` from the ``my_tmp_file_path`` path."""
    target = os.path.join(FSHook('my_tmp_file_path').get_path(), 'test.txt')
    with open(target, 'r') as handle:
        contents = handle.read()
        print(contents)
def compute_pca(ds, **kwargs):
    """
    Run ``plink --pca`` on ``hapmap1`` in the ``fs_bioinf`` path.

    The output prefix is ``pca`` in the same path. The exit status of the
    command is not checked.

    :return: always ``True``.
    """
    data_dir = FSHook('fs_bioinf').get_path()
    command = '/home/tra/anaconda3/bin/plink --pca --file {fs_path}/hapmap1 -out {fs_path}/pca'.format(fs_path=data_dir)
    os.system(command)
    return True
def subsample_1p(ds, **kwargs):
    """
    Run ``plink --recode`` on ``hapmap`` to write the thinned ``hapmap1``.

    Uses ``--thin 0.01 --geno 0.1`` inside the ``fs_bioinf`` path. The exit
    status of the command is not checked.

    :return: always ``True``.
    """
    data_dir = FSHook('fs_bioinf').get_path()
    command = '/home/tra/anaconda3/bin/plink --recode --file {fs_path}/hapmap --noweb --out {fs_path}/hapmap1 --thin 0.01 --geno 0.1'.format(fs_path=data_dir)
    os.system(command)
    return True