def read_compute_write(store_id, file_date_format):
    # Read sales for the day
    sales_file_name = '{}/sales_store{}_{}.csv'.format(raw_data_folder_sales, store_id, file_date_format)
    if not adl.exists(sales_file_name):
        return
    sales_f = read_file(sales_file_name, 'TransactionDateTime')
    if sales_f.empty:
        return

    # Read configuration file to get the policy directory info
    multithread.ADLDownloader(adl, lpath='./Configurations.xlsx',
                              rpath=configuration_folder + '/Configurations.xlsx',
                              overwrite=True)
    conf = pd.read_excel('./Configurations.xlsx', sheet_name='InventoryPolicyConfig')

    # Read partial orders
    orders = read_partial_orders(store_id)

    # Compute metric
    metrics_df, partial_orders_master, sales_orders = compute_metric(sales_f, orders)

    # change orders_to_date to orders when in real time mode
    # write_file(sales_orders, '{}/sales_orders{}_{}.csv'.format(raw_data_folder_orders, store_id, file_date_format))

    # Write partial orders for the next day with unsold quantities for each order in history
    policies = partial_orders_master['PolicyID'].unique()
    for policy_id in policies:
        directory_name = conf.loc[conf['InventoryPolicyName'] == policy_id, 'DirectoryName'].iat[0]
        partial_orders_file_name = '{}/{}/partial_orders_{}.csv'.format(raw_data_folder_orders, directory_name, store_id)
        partial_orders_policy = partial_orders_master.loc[partial_orders_master['PolicyID'] == policy_id].copy()
        partial_orders_policy['Quantity'] = partial_orders_policy['Quantity'].astype(int)
        partial_orders_policy['ConfidenceInterval'] = partial_orders_policy['ConfidenceInterval'].astype(int)
        partial_orders_policy.sort_index(inplace=True)
        write_file(partial_orders_policy, partial_orders_file_name)

    return metrics_df, sales_orders
def AzCopy(adl, source, target):
    'Moving file from source-location to target-location'
    multithread.ADLDownloader(adl, rpath=source, lpath=target)
    log.debug("%sSource: %s", LEV2, source)
    log.debug("%sTarget: %s", LEV2, target)
    return
def add_usql_job(scripts_folder, directory_name, usql_file, adl_token, adl_name, simulation_datetime, au_per_usql_job):
    # may need to recreate adl_token every time in case it expires
    # adl_token = lib.auth(tenant_id=adl_tenant_id, client_id=adl_client_id, client_secret=adl_client_secret)
    adla_job_client = DataLakeAnalyticsJobManagementClient(adl_token, 'azuredatalakeanalytics.net')

    # download USQL file from ADLS
    usql_file_full_path = scripts_folder + '/' + directory_name + '/' + usql_file + '.usql'
    adls_file_system_client = core.AzureDLFileSystem(adl_token, store_name=adl_name)
    multithread.ADLDownloader(adls_file_system_client, lpath='.', rpath=usql_file_full_path, overwrite=True)
    usql_script = ''.join(open(usql_file + '.usql', 'r').readlines())

    if simulation_datetime:
        datetime_replace = "Convert.ToDateTime(\"" + simulation_datetime + "\")"
        usql_script = usql_script.replace('DateTime.Now', datetime_replace)

    jobId = str(uuid.uuid4())
    jobInfo = JobInformation(name=directory_name + '/' + usql_file,
                             type='USql',
                             degree_of_parallelism=au_per_usql_job,
                             properties=USqlJobProperties(script=usql_script))
    jobResult = adla_job_client.job.create(adl_name, jobId, jobInfo)
    return jobId
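# A hedged sketch of the token-refresh idea noted in the comments above: re-acquire the
# ADLS credential immediately before each job submission so an expired token is never
# reused. The parameter names mirror the commented-out lib.auth call in add_usql_job and
# are placeholders, not values from the original snippet.
from azure.datalake.store import lib

def fresh_adl_token(adl_tenant_id, adl_client_id, adl_client_secret):
    """Return a newly acquired ADLS token (see the commented-out lib.auth call above)."""
    return lib.auth(tenant_id=adl_tenant_id,
                    client_id=adl_client_id,
                    client_secret=adl_client_secret)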
def download():
    print("Downloading previous day's information from the Azure data lake")
    remote_file = remote_path + y1 + '.csv'
    local_file = y1 + '.csv'
    ## Download a file
    multithread.ADLDownloader(adlsFileSystemClient, lpath=local_file, rpath=remote_file,
                              nthreads=64, overwrite=True,
                              buffersize=4194304, blocksize=4194304)
    print("Download completed\n")
def download_folder(self, source_folder_path, dest_folder_path):
    if not os.path.exists(dest_folder_path):
        os.makedirs(dest_folder_path)
    multithread.ADLDownloader(self.adl_conn_obj, lpath=dest_folder_path, rpath=source_folder_path,
                              nthreads=64, overwrite=True,
                              buffersize=4194304, blocksize=4194304)
def downloadfile_ToADS(self, inpath: str, outpath: str):
    ## Download a file
    adlsFileSystemClient = self._create_filesytem_conn()
    multithread.ADLDownloader(adlsFileSystemClient, lpath=inpath, rpath=outpath,
                              nthreads=64, overwrite=True,
                              buffersize=4194304, blocksize=4194304)
def download(download_dir, data_dir):
    token = lib.auth()
    adl = core.AzureDLFileSystem(token, store_name='bigdatadevdatalake')
    download_dir = "december_2018"
    for f in adl.ls(data_dir):
        print(f[-38:])
        outfile = os.path.join(download_dir, f[-38:])
        downloader = multithread.ADLDownloader(adl, f, outfile)
        if downloader.successful():
            print("Finished Downloading!")
        else:
            print("error in downloading!")
def download_file(
    self,
    local_path: str,
    remote_path: str,
    nthreads: int = 64,
    overwrite: bool = True,
    buffersize: int = 4194304,
    blocksize: int = 4194304,
    **kwargs,
) -> Any:
    """
    Download a file from Azure Data Lake Storage.

    :param local_path: local path. If downloading a single file, will write to this
        specific file, unless it is an existing directory, in which case a file is
        created within it. If downloading multiple files, this is the root directory
        to write within. Will create directories as required.
    :type local_path: str
    :param remote_path: remote path/globstring to use to find remote files.
        Recursive glob patterns using `**` are not supported.
    :type remote_path: str
    :param nthreads: Number of threads to use. If None, uses the number of cores.
    :type nthreads: int
    :param overwrite: Whether to forcibly overwrite existing files/directories.
        If False and remote path is a directory, will quit regardless if any files
        would be overwritten or not. If True, only matching filenames are actually
        overwritten.
    :type overwrite: bool
    :param buffersize: int [2**22] Number of bytes for internal buffer. This block
        cannot be bigger than a chunk and cannot be smaller than a block.
    :type buffersize: int
    :param blocksize: int [2**22] Number of bytes for a block. Within each chunk,
        we write a smaller block for each API call. This block cannot be bigger
        than a chunk.
    :type blocksize: int
    """
    multithread.ADLDownloader(
        self.get_conn(),
        lpath=local_path,
        rpath=remote_path,
        nthreads=nthreads,
        overwrite=overwrite,
        buffersize=buffersize,
        blocksize=blocksize,
        **kwargs,
    )
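# A minimal standalone sketch of the same call pattern with an explicit success check,
# using only SDK calls that appear elsewhere in this section (ADLDownloader and its
# successful() method). The function name and parameters are illustrative and are not
# part of the hook wrapper above.
from azure.datalake.store import core, multithread

def download_and_verify(adl: core.AzureDLFileSystem, remote_path: str, local_path: str) -> bool:
    """Download remote_path to local_path and report whether the transfer completed."""
    downloader = multithread.ADLDownloader(adl, rpath=remote_path, lpath=local_path,
                                           nthreads=64, overwrite=True)
    return downloader.successful()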
def download_from_adls(adl, short_filename, filename):
    download_succeeded = False
    for i in arange(n_download_retries):
        try:
            multithread.ADLDownloader(adl, lpath=short_filename, rpath=filename, overwrite=True)
        except BaseException as e:
            logger.error('Failed to download the file ' + short_filename + ': ' + str(e), exc_info=True)
            time.sleep(30)
            continue
        download_succeeded = True
        break
    return download_succeeded
def upload_download(adl, diff_list):
    for element in sorted(
            [element for element in diff_list if "UPLOAD" in element["action"]],
            key=lambda x: str(x["type"] + x["name"])):
        print(element["name"])
        multithread.ADLUploader(adl, rpath=element["name"], lpath="./" + element["name"],
                                nthreads=64, overwrite=True,
                                buffersize=4194304, blocksize=4194304)
    for element in sorted(
            [element for element in diff_list if "DOWNLOAD" in element["action"]],
            key=lambda x: str(x["type"] + x["name"])):
        print(element["name"])
        multithread.ADLDownloader(adl, rpath=element["name"], lpath="./" + element["name"],
                                  nthreads=64, overwrite=True,
                                  buffersize=4194304, blocksize=4194304)
_CLIENT_ID = os.environ['CLIENT_ID']
_CLIENT_SECRET = os.environ['CLIENT_SECRET']

# Web App credentials
_WEB_APP_NAME = os.environ['FUNCTIONS_APP_NAME']
_WEB_APP_USER = os.environ['FUNCTIONS_APP_USER']
_WEB_APP_PASSWORD = os.environ['FUNCTIONS_APP_PASSWORD']

# Pull the last simulation datetime from ADLS and decide the current simulation datetime
token = lib.auth(tenant_id=_TENANT_ID, client_id=_CLIENT_ID, client_secret=_CLIENT_SECRET)
adl = core.AzureDLFileSystem(token=token, store_name=_ADL_NAME)
multithread.ADLDownloader(adl, lpath='LastSimulationDatetime.txt',
                          rpath='/webjob_log/LastSimulationDatetime.txt', overwrite=True)

f = open('LastSimulationDatetime.txt', 'r')
simulation_datetime_last_str = f.readlines()[0]
f.close()
print('Last simulation time:' + simulation_datetime_last_str)

simulation_datetime_last = datetime.datetime.strptime(
    simulation_datetime_last_str, '%m/%d/%Y %H:%M:%S')
simulation_datetime_cur = simulation_datetime_last + datetime.timedelta(days=1)
simulation_datetime_cur_str = datetime.datetime.strftime(
    simulation_datetime_cur, '%m/%d/%Y %H:%M:%S')
print('Current simulation time:' + simulation_datetime_cur_str)
scripts_adl_dir = args.scripts_adl_dir
policy_adl_subdir = args.policy_adl_subdir
policy_script = args.policy_script
adl_name = args.adl_name
tenant_id = args.adl_tenant_id
client_id = args.adl_client_id
client_secret = args.adl_client_secret

token = lib.auth(tenant_id=tenant_id, client_id=client_id, client_secret=client_secret)
adls_file_system_client = core.AzureDLFileSystem(token, store_name=adl_name)

task_script = scripts_adl_dir + '/inventory_optimization_task.py'
upload_script = scripts_adl_dir + '/upload_to_adls.py'
mipcl_script = scripts_adl_dir + '/mipcl_wrapper.py'
policy_script = scripts_adl_dir + '/' + policy_adl_subdir + '/' + policy_script
file_list = [task_script, upload_script, mipcl_script, policy_script]

scripts_local_path = '/taskscripts/'
for file in file_list:
    multithread.ADLDownloader(adls_file_system_client, lpath=scripts_local_path,
                              rpath=file, overwrite=True)
    return args


def client(args):
    """Create a filesystem client object

    Parameters:
        args (class): Arguments.
    """
    adls_client = core.AzureDLFileSystem(store_name=args.account_name)
    return adls_client


if __name__ == "__main__":
    args = parse()
    adls_client = client(args)
    print("Downloading content from ADLS account: {}".format(args.account_name))
    print("Downloading {0} into {1}...".format(args.adls_folder, args.local_folder))
    threads = multiprocessing.cpu_count()
    with Timer() as t:
        multithread.ADLDownloader(adls_client,
                                  lpath=args.local_folder,
                                  rpath=args.adls_folder,
                                  nthreads=threads,
                                  overwrite=True,
                                  buffersize=4194304,
                                  blocksize=4194304,
                                  verbose=True)
    print("Process time {}s".format(t.interval))
from azure.datalake.store import core, lib, multithread

token = lib.auth(tenant_id, username, password)
adl = core.AzureDLFileSystem(token, store_name=store_name)

# typical operations
adl.ls('')
adl.ls('tmp/', detail=True)
adl.ls('tmp/', detail=True, invalidate_cache=True)
adl.cat('samplefile')
adl.head('example.csv')

# file-like object
with adl.open('example.csv', blocksize=2**20) as f:
    print(f.readline())
    print(f.readline())
    print(f.readline())
    # could have passed f to any function requiring a file object:
    # pandas.read_csv(f)

with adl.open('far_and_beyond', 'wb') as f:
    # data is written on flush/close, or when buffer is bigger than
    # blocksize
    f.write(b'important data')

adl.du('far_and_beyond')

multithread.ADLDownloader(adl, "", 'tmp/', 5, 2**24)
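# For symmetry with the download call above, a minimal hedged sketch of the upload
# direction with the same client. ADLUploader takes rpath then lpath, mirroring
# ADLDownloader; 'tmp/upload/' and 'local_data/' are illustrative paths, not part of
# the original snippet.
multithread.ADLUploader(adl, 'tmp/upload/', 'local_data/', nthreads=5, overwrite=True)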
token = lib.auth()

# Create an ADLS File System Client. The store_name is the name of your ADLS account
adlsFileSystemClient = core.AzureDLFileSystem(token, store_name='wesaprod0adlstore')

# Create a directory in ADLS
adlsFileSystemClient.mkdir('/testDirectoryPython')

# Upload file to created directory
multithread.ADLUploader(
    adlsFileSystemClient,
    lpath='C:\\Users\\aznaik\\Desktop\\PythonADL\\data.csv',
    rpath='/testDirectoryPython/data.csv',
    nthreads=64, overwrite=True,
    buffersize=4194304, blocksize=4194304)

# Download file from created directory
multithread.ADLDownloader(
    adlsFileSystemClient,
    lpath='C:\\Users\\aznaik\\Desktop\\PythonADL\\data.csv',
    rpath='/testDirectoryPython/data.csv',
    nthreads=64, overwrite=True,
    buffersize=4194304, blocksize=4194304)

# Delete directory (removes sub-directories/files recursively)
adlsFileSystemClient.rm('/testDirectoryPython', recursive=True)
context = adal.AuthenticationContext(authority_uri, api_version=None)
mgmt_token = context.acquire_token_with_client_credentials(resource_uri, client_id, client_secret)
credentials = AADTokenCredentials(mgmt_token, client_id)

token = lib.auth(tenant_id='b5da5f35-6442-4f5a-9622-92ec6a535127',
                 client_secret='SKzEkkDO0uCs08A1MLIovNKFCKclR7f5xn86/+jU1zQ=',
                 client_id='7d960b27-c1a5-4424-93bf-e4565df8ac39')
adlsFileSystemClient = core.AzureDLFileSystem(token, store_name=adlsAccountName)

print("Downloading config file from Azure Data Lake")
multithread.ADLDownloader(adlsFileSystemClient, lpath=local_config, rpath=remote_config,
                          nthreads=64, overwrite=True,
                          buffersize=4194304, blocksize=4194304)
print("Download completed\n")

import auth_token
import org_gateway_info
import previous_day
import generate_file
import upload_files

if __name__ == "__main__":
    token = auth_token.authorization_token()
    org_gateway_info.get_org_children(token)
    org_gateway_info.clean()
    pool.join()
    e = time.time()
    total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
    print("Total process time: ", total_time)


if __name__ == '__main__':
    token = lib.auth()
    adl = core.AzureDLFileSystem(token, store_name='bigdatadevdatalake')
    downloaded = getDownloaded()
    print("Downloaded files: ", downloaded)
    for f in adl.ls(data_dir):
        if f in downloaded:
            continue
        s = time.time()
        print("Processing file {}".format(f[-38:]))
        outfile = os.path.join(download_dir, f[-38:])
        downloader = multithread.ADLDownloader(adl, f, outfile)
        if downloader.successful():
            e = time.time()
            total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
            print("Finished downloading: ", total_time)
            main(outfile, f)
            os.remove(outfile)
            e = time.time()
            total_time = time.strftime("%H:%M:%S", time.gmtime(e - s))
            print("Total processing time: ", total_time)
            with open("downloaded.txt", "a") as myfile:
                myfile.write(f)
        else:
            print("error in downloading!")
    # main('../downloaded/IWantTransactionFactTable-20181201.csv', 'ProdDataHub/TransactionFactTable/IWant/2018/12/IWantTransactionFactTable-20181201.csv')
    # for f in os.listdir("../10"):
def main():
    """
    Entry point
    :return:
    """
    rf = Config.LOCAL_WORKING_DIR
    adl_path = Config.ADLS_PATH
    adl_client = get_adl_client(Config.ADLS_ACCOUNT_NAME, Config.ADLS_TENANT_ID)
    adl_folders = adl_client.ls(adl_path)
    # For training_file in training_files:
    print(adl_folders)

    # Loop through vendors, download images and convert to pdf
    for adl_vendor_path in adl_folders:
        print(f"Processing vendor {adl_vendor_path}")
        if len(Config.RUN_FOR_SINGLE_ISSUER) > 0:
            if Config.RUN_FOR_SINGLE_ISSUER not in adl_vendor_path:
                continue

        vendor_folder = os.path.split(adl_vendor_path)[-1]
        vendor_folder_path = f"{rf}/{vendor_folder}"
        if not os.path.exists(vendor_folder_path):
            print(f"Creating folder {vendor_folder_path}")
            os.mkdir(vendor_folder_path)

        # Download all the files for a vendor
        # TODO we are using Azure Data Lake here change to appropriate
        multithread.ADLDownloader(adl_client, lpath=vendor_folder_path, rpath=adl_vendor_path,
                                  nthreads=64, overwrite=True,
                                  buffersize=4194304, blocksize=4194304)
        tif_files = [f for f in os.listdir(vendor_folder_path) if f.endswith('TIF')]

        # Create the BlockBlobService that the system uses to call the Blob service for the storage account.
        block_blob_service = BlockBlobService(account_name=Config.STORAGE_ACCOUNT_NAME,
                                              account_key=Config.STORAGE_KEY)
        temp_folder = vendor_folder_path
        for file_name in tif_files:
            print(f"Create temp folder {temp_folder}")
            if not os.path.exists(temp_folder):
                os.mkdir(temp_folder)
            try:
                print(f"Processing {vendor_folder}")
                convert_tif_to_pdf_fpdf(vendor_folder_path, temp_folder, file_name, vendor_folder)
            except Exception as e:
                print(e)
                continue

        container_name = vendor_folder + Config.CONTAINER_SUFFIX + Config.TRAIN_TEST
        print(f"Uploading to blob {container_name}")
        # Create container if it doesn't exist and get container sas url
        _, _ = create_container(block_blob_service, Config.STORAGE_ACCOUNT_NAME, container_name)
        # Upload to container
        upload_blobs_to_container(block_blob_service, vendor_folder_path, container_name, '.pdf')
        print(f"Removing folder {vendor_folder_path}")
        shutil.rmtree(vendor_folder_path)
def bench_download_50_1gb(adl, lpath, rpath, config):
    return multithread.ADLDownloader(
        adl,
        lpath=lpath,
        rpath=rpath,
        **config[bench_download_50_1gb.__name__])
def read_fn(file_references, mode, params=None):
    """A custom python read function for interfacing with nii image files.

    Args:
        file_references (list): A list of lists containing file references, such as
            [['id_0', 'image_filename_0', target_value_0], ...,
             ['id_N', 'image_filename_N', target_value_N]].
        mode (str): One of the tf.estimator.ModeKeys strings: TRAIN, EVAL or PREDICT.
        params (dict, optional): A dictionary to parametrise read_fn outputs
            (e.g. reader_params = {'n_examples': 10, 'example_size': [64, 64, 64],
            'extract_examples': True}, etc.).

    Yields:
        dict: A dictionary of reader outputs for dltk.io.abstract_reader.
    """
    print('Reading the dataset from Datalakestore (2mm NIfTI images)....')

    def _augment(img):
        """An image augmentation function"""
        return flip(img, axis=2)

    image_array = []
    label_array = []
    for f in file_references:
        subject_id = f[0]

        # Read the image nii with sitk
        ## t1_fn = os.path.join(data_path, '{}/T1_2mm.nii.gz'.format(subject_id))
        ## t1 = sitk.GetArrayFromImage(sitk.ReadImage(str(t1_fn)))
        t1_fn = os.path.join(data_path, '{}/T1_2mm.nii.gz'.format(subject_id))
        print(t1_fn)

        # with adlsFileSystemClient.open(t1_fn, 'rb') as f:
        #     img = sitk.ReadImage(str(f))
        # fails with: sitk::ERROR: The file "<ADL file: /clusters/DLTK_IXI_Dataset/2mm/IXI012/T1_2mm.nii.gz>" does not exist.
        # sitk seems to read only from a local path; it is unclear how to read from a remote path,
        # so as a short-term workaround download the file to a local path first.
        # rpath is the Data Lake Store path, lpath is the local file path; both share the same
        # root structure '/clusters/DLTK_IXI_Dataset/'.
        multithread.ADLDownloader(adlsFileSystemClient, rpath=t1_fn, lpath=t1_fn,
                                  nthreads=5, chunksize=2**24, overwrite=True)
        img = sitk.ReadImage(str(t1_fn))
        # You need the Fiji app (http://imagej.net/Fiji#Downloads) to show the img. More discussion and
        # instructions: https://stackoverflow.com/questions/45682319/simpleitk-show-generates-error-in-imagej-on-linux
        ## sitk.Show(img)
        t1 = sitk.GetArrayFromImage(img)

        # Normalise volume image
        t1 = whitening(t1)
        images = np.expand_dims(t1, axis=-1).astype(np.float32)

        if mode == tf.estimator.ModeKeys.PREDICT:
            yield {'features': {'x': images}, 'img_id': subject_id}
            print('read_fn Predict')

        # Parse the sex classes from the file_references [1,2] and shift them to [0,1]
        sex = np.int(f[1]) - 1
        y = np.expand_dims(sex, axis=-1).astype(np.int32)

        # Augment if used in training mode
        if mode == tf.estimator.ModeKeys.TRAIN:
            images = _augment(images)
            print('read_fn Train')

        # Check if the reader is supposed to return training examples or full images
        if params['extract_examples']:
            # print('read_fn params extract_examples')
            images = extract_random_example_array(
                image_list=images,
                example_size=params['example_size'],
                n_examples=params['n_examples'])
            for e in range(params['n_examples']):
                # print('e: ', e)
                ## yield {'features': {'x': images[e].astype(np.float32)},
                ##        'labels': {'y': y.astype(np.float32)},
                ##        'img_id': subject_id}
                image_array.append(images[e].astype(np.float32))
                label_array.append(y.astype(np.int32))
        else:
            print('read_fn params yield last')
            ## yield {'features': {'x': images},
            ##        'labels': {'y': y.astype(np.float32)},
            ##        'img_id': subject_id}
            image_array.append(images)
            label_array.append(y.astype(np.int32))

    print("read_fn yield output_array with image shape = ", images.shape, "label shape = ", y.shape)
    yield {'x': np.array(image_array), 'y': np.array(label_array)}
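# A hedged alternative to the ADLDownloader workaround above, addressing the "how to read
# from a remote path" question in the comments: stream the remote NIfTI file through
# adl.open() into a local temporary file and hand that to SimpleITK. This is a sketch only,
# assuming adlsFileSystemClient is the authenticated core.AzureDLFileSystem used above; it
# is not the approach the original snippet takes.
import shutil
import tempfile
import SimpleITK as sitk

def read_nii_via_stream(adl, remote_path):
    """Copy a remote NIfTI file into a named temporary file and read it with sitk."""
    with adl.open(remote_path, 'rb') as remote_f, \
            tempfile.NamedTemporaryFile(suffix='.nii.gz', delete=False) as local_f:
        shutil.copyfileobj(remote_f, local_f)
        local_name = local_f.name
    return sitk.ReadImage(local_name)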
f = open('datetimestring.txt', 'w')
f.writelines(current_datetime_string)
f.close()

# ADLS directories and file names
scripts_adl_folder = '/inventory_scripts'
configuration_adl_folder = '/configuration'
configuration_file_name = 'Configurations.xlsx'
configuration_adl_path = configuration_adl_folder + '/' + configuration_file_name

# Create the ADLS client
adl_token = lib.auth(tenant_id=_TENANT_ID, client_id=_CLIENT_ID, client_secret=_CLIENT_SECRET)
adls_file_system_client = core.AzureDLFileSystem(adl_token, store_name=_ADL_NAME)

# Download configuration file and scripts from Azure Data Lake Store to local
multithread.ADLDownloader(adls_file_system_client, lpath='.', rpath=configuration_adl_path, overwrite=True)

# Read downloaded configuration file
configuration_file_path = os.path.realpath(os.path.join('./', configuration_file_name))
policy_all = pd.read_excel(configuration_file_path, 'InventoryPolicyConfig')
solvers_all = pd.read_excel(configuration_file_path, 'SolverConfig')
schedule_all = pd.read_excel(configuration_file_path, 'ScheduleConfig')
policy_all = policy_all[policy_all['ActiveFlag'] == 1]

# Find policies to run in the current period
policy_schedules_all = pd.merge(policy_all, schedule_all,
                                left_on='ScheduleID_GenerateOrder', right_on='ScheduleID')
policy_schedules_all['TriggerFlag'] = policy_schedules_all.apply(
    utils.check_job_trigger, 1, args=(current_datetime_tuple, current_date))
active_policies = policy_schedules_all[policy_schedules_all['TriggerFlag'] == 1]
active_policies_solvers = pd.merge(active_policies, solvers_all, on='SolverName')
## Create filesystem client for ADLS
subscriptionId = 'dcf4a239-316e-416c-b36c-7d1e336fb0d7'
adlsAccountName = 'adlatest2017adls'

## Make ADLS credentials
adlCreds = lib.auth(tenant_id='72f988bf-86f1-41af-91ab-2d7cd011db47',
                    resource='https://datalake.azure.net/')

## Create a filesystem client object
adlsFileSystemClient = core.AzureDLFileSystem(adlCreds, store_name=adlsAccountName)

## Get the shp file from ADLS
multithread.ADLDownloader(adlsFileSystemClient, 'shpfiles', 'tempdir', 4, 4194304, overwrite=True)

## Get the shapefile from the ADL downloader
shpfile = 'tempdir/BICYCLE_PARKING_ON_STREET_WGS84.shp'

## Provide a name for the csv file
csvfilename = 'testingcsv.csv'

# Open files
csvfile = open(csvfilename, 'wb')
ds = ogr.Open(shpfile)
lyr = ds.GetLayer()

# Get field names