def __init__(self, subscription_id, rg_name, df_name, client_id, secret, tenant,
             mapping, sap_source, ir_reference, sap_con_string, sql_con_string):
    self.subscription_id = subscription_id
    self.rg_name = rg_name
    self.df_name = df_name
    self.client_id = client_id
    self.secret = secret
    self.tenant = tenant
    self.mapping = mapping
    self.schema, self.source = sap_source.split('.')
    self.rg_params = {'location': 'centralus'}
    self.df_params = {'location': 'centralus'}
    self.ds_name = 'RelationalTable1'
    self.dsOut_name = 'sap_hana_db'
    # Use the runtime passed by the caller, falling back to the previous hard-coded default.
    self.ir_reference = ir_reference or 'integrationRuntime3'
    # Expected shape: '{"host":"example.com","port":30015,"user":"******","password":"******"}'
    self.sap_con = json.loads(sap_con_string)
    self.sap_host = self.sap_con['host']
    self.sap_username = self.sap_con['user']
    self.sap_password = self.sap_con['password']
    self.sap_port = int(self.sap_con['port'])
    # 'Server=tcp:<<fqdbservername>>;Database=<<dbname>>;Uid=<<username>>@<<dbservername>>;Pwd=<<password>>;Encrypt=yes;Connection Timeout=30;'
    self.sql_con = sql_con_string
    credentials = ServicePrincipalCredentials(client_id=self.client_id,
                                              secret=self.secret,
                                              tenant=self.tenant)
    resource_client = ResourceManagementClient(credentials, subscription_id)
    self.adf_client = DataFactoryManagementClient(credentials, subscription_id)
def authToken(cId, cSecret, ten):
    try:
        credentials = ServicePrincipalCredentials(client_id=cId, secret=cSecret, tenant=ten)
    except Exception as e:
        print('Auth Token error: ', e)
        raise
    # The credentials object lazily fetches a management-plane token.
    return credentials.token['access_token']
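# A minimal sketch of the same token fetch with the newer azure-identity library, for comparison
# with authToken above. This is an assumption-labelled alternative, not the snippet's own method.
from azure.identity import ClientSecretCredential

def auth_token_v2(client_id, client_secret, tenant_id):
    credential = ClientSecretCredential(tenant_id=tenant_id,
                                        client_id=client_id,
                                        client_secret=client_secret)
    # Request a token for the Azure Resource Manager scope used by Data Factory management calls.
    return credential.get_token("https://management.azure.com/.default").token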
def __init__(
    self,
    specs_file: str,
    subscription_name: str,
    resource_group: str,
    factory_name: str,
    name: str,
    environment: str,
):
    self.resource_group = resource_group
    self.factory_name = factory_name
    self.name = name
    self.environment = environment
    self.credential = DefaultAzureCredential(
        exclude_visual_studio_code_credential=True)
    self.sub_client = SubscriptionClient(self.credential)
    self.subscription_id = self._get_subscription_id(subscription_name)
    self.adf_client = DataFactoryManagementClient(
        self.credential, subscription_id=self.subscription_id)
    self._specs = self._read_specs(specs_file, environment)
    self._config = self._get_config()
    self._verify_adf_setup()
def get_conn(self):
    """Return a cached DataFactoryManagementClient, creating it on first use."""
    if self._adf_client:
        return self._adf_client
    key_path = os.environ.get('AZURE_AUTH_LOCATION', False)
    if not key_path:
        conn = self.get_connection(self.conn_id)
        key_path = conn.extra_dejson.get('key_path', False)
    if key_path:
        self.log.info('Getting connection using a JSON key file.')
        self._adf_client = get_client_from_auth_file(
            DataFactoryManagementClient, key_path)
        return self._adf_client
    self.log.info('Getting connection using a service principal.')
    credentials = ServicePrincipalCredentials(
        client_id=conn.login,
        secret=conn.password,
        tenant=conn.extra_dejson['tenantId'])
    self._adf_client = DataFactoryManagementClient(
        credentials, conn.extra_dejson['subscriptionId'])
    return self._adf_client
def adf_client(adf_config):
    """Creates a DataFactoryManagementClient object."""
    credentials = ServicePrincipalCredentials(
        client_id=adf_config["AZ_SERVICE_PRINCIPAL_ID"],
        secret=adf_config["AZ_SERVICE_PRINCIPAL_SECRET"],
        tenant=adf_config["AZ_SERVICE_PRINCIPAL_TENANT_ID"])
    return DataFactoryManagementClient(credentials, adf_config["AZ_SUBSCRIPTION_ID"])
def adf_client(adf_config):
    """Creates a DataFactoryManagementClient object."""
    if adf_config["AZ_SERVICE_PRINCIPAL_ID"] is None:
        # Fall back to the Azure CLI login when no service principal is configured.
        credentials = AzureCliCredential()
    else:
        credentials = ClientSecretCredential(
            client_id=adf_config["AZ_SERVICE_PRINCIPAL_ID"],
            client_secret=adf_config["AZ_SERVICE_PRINCIPAL_SECRET"],
            tenant_id=adf_config["AZ_SERVICE_PRINCIPAL_TENANT_ID"])
    return DataFactoryManagementClient(credentials, adf_config["AZ_SUBSCRIPTION_ID"])
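# A possible usage sketch for the factory function above, assuming adf_config also carries the
# resource group, factory, and pipeline names under the (hypothetical) keys shown here.
def run_pipeline_once(adf_config, pipeline_name="my_pipeline"):
    client = adf_client(adf_config)
    run = client.pipelines.create_run(adf_config["AZ_RESOURCE_GROUP"],
                                      adf_config["AZ_DATA_FACTORY"],
                                      pipeline_name,
                                      parameters={})
    # Return the current status of the run we just started.
    return client.pipeline_runs.get(adf_config["AZ_RESOURCE_GROUP"],
                                    adf_config["AZ_DATA_FACTORY"],
                                    run.run_id).status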
def __init__(self): self.schema = '_SYS_BIC' self.source = 'Temp/VBAP_DEMO' self.source_type = 'VIEW' self.sap_con = '{"host" : "40.87.84.72","port" : 30215,"user" : "system","password" : "Metro#123"}' self.sql_con = 'Server=tcp:yashtesting.database.windows.net;Database=test;Uid=yash@yashtesting;Pwd=Myageis@20;Encrypt=yes;Connection Timeout=30;' self.sql_odbc_con = 'Driver={ODBC Driver 17 for SQL Server};' + self.sql_con self.subscription_id = '938ace66-9598-4029-b6bb-429929b03761' self.rg_name = 'celebal_rnd' self.df_name = 'celebaladf' self.client_id = 'b628371b-654f-4848-b214-c8553f2fc665' self.secret = '/JCA4now2LAn1/L4aa+ICfmTumPRryW.' self.tenant = 'e4e34038-ea1f-4882-b6e8-ccd776459ca0' self.rg_params = {'location': 'eastus'} self.df_params = {'location': 'eastus'} self.credentials = ServicePrincipalCredentials( client_id=self.client_id, secret=self.secret, tenant=self.tenant) self.resource_client = ResourceManagementClient( self.credentials, self.subscription_id) self.adf_client = DataFactoryManagementClient(self.credentials, self.subscription_id) self.blob_dataset = 'AzureBlob1' self.input_dataset = 'RelationalTable2' self.output_dataset = 'AzureSqlTable3' self.staging_ls = 'LS_Sap_Hana' self.staging_path = 'testazure' self.creation_date = "ERDAT" self.change_date = "AEDAT" self.timestamp_staging_table = '[dbo].[Sap_hana_db_timestamp_staging]' self.timestamp_table = '[dbo].[Sap_hana_db_timestamp]' self.mapping = { "VBELN": "SalesDocument", "POSNR": "SalesDocumentItem", "MATNR": "Material", "MATKL": "MaterialGroup", "PSTYV": "SalesDocumentItemCat", "FKREL": "RelevantforBilling", "NETWR": "NetValue", "WAERK": "Currency", "KWMENG": "OrderQuantity", "LSMENG": "RequiredDelQuantity", "KBMENG": "ConfirmedDelQuantiy", "WERKS": "Plant", "PRCTR": "ProfitCenter", "ABSTA": "RejectionStatus", "GBSTA": "OverallStatus", "LFSTA": "DeliveryStatus", "ERDAT": "CreatedDate", "WAVWR": "Cost", "AEDAT": "UpdateDate" } self.translator = self.make_translator()
def get_adfclient():
    subscription_id = 'a9645a3e-7a1d-4f88-9704-6e9f2e7b5d90'
    # Specify your Active Directory client ID, client secret, and tenant ID
    # appid: a03a061c-865b-40ad-b10b-b51996e1dc34
    # key: RYi1miuBnif/r73Xr+7AiD68sq/5PUE+azTp1N1uS00=
    # directory id: 5c2f5846-dd75-4228-be41-51a280645298
    credentials = ServicePrincipalCredentials(
        client_id='a03a061c-865b-40ad-b10b-b51996e1dc34',
        secret='RYi1miuBnif/r73Xr+7AiD68sq/5PUE+azTp1N1uS00=',
        tenant='5c2f5846-dd75-4228-be41-51a280645298')
    resource_client = ResourceManagementClient(credentials, subscription_id)
    adf_client = DataFactoryManagementClient(credentials, subscription_id)
    return adf_client
def createDataFactory(credentials):
    adf_client = DataFactoryManagementClient(credentials, SUBSCRIPTION_ID)
    # Create a data factory
    df_resource = Factory(location=DEPLOYMENT_REGION)
    df = adf_client.factories.create_or_update(RESOURCE_GROUP, DATA_FACTORY_NAME, df_resource)
    # Poll until provisioning finishes
    while df.provisioning_state != 'Succeeded':
        df = adf_client.factories.get(RESOURCE_GROUP, DATA_FACTORY_NAME)
        time.sleep(1)
    print("Created Data Factory")
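# A minimal driver for createDataFactory above, assuming the module-level constants
# (SUBSCRIPTION_ID, RESOURCE_GROUP, DATA_FACTORY_NAME, DEPLOYMENT_REGION) are defined elsewhere.
# The environment variable names used for the service principal are illustrative only.
import os

def build_credentials():
    return ServicePrincipalCredentials(client_id=os.environ["AZURE_CLIENT_ID"],
                                       secret=os.environ["AZURE_CLIENT_SECRET"],
                                       tenant=os.environ["AZURE_TENANT_ID"])

if __name__ == "__main__":
    createDataFactory(build_credentials())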
def get_conn(self) -> DataFactoryManagementClient:
    if self._conn is not None:
        return self._conn
    conn = self.get_connection(self.conn_id)
    self._conn = DataFactoryManagementClient(
        credential=ClientSecretCredential(
            client_id=conn.login,
            client_secret=conn.password,
            tenant_id=conn.extra_dejson.get("tenantId")),
        subscription_id=conn.extra_dejson.get("subscriptionId"),
    )
    return self._conn
def get_conn(self) -> DataFactoryManagementClient:
    if self._conn is not None:
        return self._conn
    conn = self.get_connection(self.conn_id)
    tenant = conn.extra_dejson.get('extra__azure_data_factory__tenantId')
    subscription_id = conn.extra_dejson.get('extra__azure_data_factory__subscriptionId')
    self._conn = DataFactoryManagementClient(
        credential=ClientSecretCredential(
            client_id=conn.login,
            client_secret=conn.password,
            tenant_id=tenant,
        ),
        subscription_id=subscription_id,
    )
    return self._conn
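# Illustrative only: how a hook exposing the get_conn() above might be used to trigger a run.
# The hook class name, connection id, and resource names here are hypothetical.
hook = AzureDataFactoryHook(conn_id="azure_data_factory_default")
client = hook.get_conn()
run = client.pipelines.create_run("my-resource-group", "my-data-factory",
                                  "my_pipeline", parameters={})
print(run.run_id)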
def main():
    # Azure subscription ID
    subscription_id = '<Specify your Azure Subscription ID>'
    # This program creates this resource group. If it's an existing resource group,
    # comment out the code that creates the resource group.
    rg_name = 'ADFTutorialResourceGroup'
    # The data factory name. It must be globally unique.
    df_name = '<Specify a name for the data factory. It must be globally unique>'
    # Specify your Active Directory client ID, client secret, and tenant ID
    credentials = ServicePrincipalCredentials(
        client_id='<Active Directory application/client ID>',
        secret='<client secret>',
        tenant='<Active Directory tenant ID>')
    resource_client = ResourceManagementClient(credentials, subscription_id)
    adf_client = DataFactoryManagementClient(credentials, subscription_id)
    rg_params = {'location': 'eastus'}
    df_params = {'location': 'eastus'}
def monitorAdf():
    try:
        # Get subscription and service principal data from the config file
        subscription_id = configMap['connections']['subscription_id']
        rg_name = configMap['connections']['adf']['rg_name']
        df_name = configMap['connections']['adf']['df_name']
        df_pipeline_name = configMap['connections']['adf']['pipeline_name']
        ad_client_id = configMap['connections']['service_principal']['ad_clientid']
        ad_client_secret = configMap['connections']['service_principal']['ad_client_secret']
        ad_tenantid = configMap['connections']['service_principal']['ad_tenantid']

        # Make the credential object
        credentials = ServicePrincipalCredentials(client_id=ad_client_id,
                                                  secret=ad_client_secret,
                                                  tenant=ad_tenantid)
        adf_client = DataFactoryManagementClient(credentials, subscription_id)
        print('adf access success!')

        # Create a pipeline run
        run_response = adf_client.pipelines.create_run(rg_name, df_name,
                                                       df_pipeline_name,
                                                       parameters={})

        # Monitor the pipeline run
        time.sleep(30)
        pipeline_run = adf_client.pipeline_runs.get(rg_name, df_name,
                                                    run_response.run_id)
        print("\n\tPipeline run status: {}".format(pipeline_run.status))
        filter_params = RunFilterParameters(
            last_updated_after=datetime.now() - timedelta(1),
            last_updated_before=datetime.now() + timedelta(1))
        query_response = adf_client.activity_runs.query_by_pipeline_run(
            rg_name, df_name, pipeline_run.run_id, filter_params)
        activity_output = query_response.value[0].output

        # Upload the activity output to Blob storage
        createBlob(activity_output)
    except Exception as e:
        print('Error occurred while accessing adf', e)
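# Sketch of a polling loop that could replace the fixed time.sleep(30) in monitorAdf above,
# waiting until the run reaches a terminal state. Interval and helper name are assumptions.
def wait_for_run(adf_client, rg_name, df_name, run_id, poll_seconds=15):
    terminal = {"Succeeded", "Failed", "Cancelled"}
    run = adf_client.pipeline_runs.get(rg_name, df_name, run_id)
    while run.status not in terminal:
        time.sleep(poll_seconds)
        run = adf_client.pipeline_runs.get(rg_name, df_name, run_id)
    return run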
def create_datafactory_and_run(files_and_tokens: Dict[str, str], connection_string: str, location: str, is_unittest: bool = False) -> None: """ Builds an Azure Data Factory to download the FastMRI dataset from AWS, and places them in Azure Blob Storage. :param location: The Azure location in which the Data Factory should be created (for example, "westeurope") :param files_and_tokens: A mapping from file name (like knee.tar.gz) to AWS access token. :param is_unittest: If True, download a small tar.gz file from github. If False, download the "real" fastMRI datafiles from AWS. :param connection_string: The connection string of the Azure storage where the downloaded data should be stored. """ azure_config = AzureConfig.from_yaml( yaml_file_path=fixed_paths.SETTINGS_YAML_FILE, project_root=fixed_paths.repository_root_directory()) # The data factory name. It must be globally unique. data_factory_name = "fastmri-copy-data-" + uuid.uuid4().hex[:8] # Get either the Service Principal authentication, if those are set already, or use interactive auth in the browser azureid_auth = get_azure_auth(azure_config) # Create a data factory adf_client = DataFactoryManagementClient(azureid_auth, azure_config.subscription_id) df_resource = Factory(location=location) print(f"Creating data factory {data_factory_name}") df = adf_client.factories.create_or_update(azure_config.resource_group, data_factory_name, df_resource) while df.provisioning_state != 'Succeeded': df = adf_client.factories.get(azure_config.resource_group, data_factory_name) time.sleep(1) print("Data factory created") # Create a linked service pointing to where the downloads come from if is_unittest: http_service = LinkedServiceResource(properties=HttpLinkedService( url="https://github.com", enable_server_certificate_validation=True, authentication_type="Anonymous")) else: http_service = LinkedServiceResource(properties=HttpLinkedService( url="https://fastmri-dataset.s3.amazonaws.com/", enable_server_certificate_validation=True, authentication_type="Anonymous")) http_name = "AwsHttp" adf_client.linked_services.create_or_update( resource_group_name=azure_config.resource_group, factory_name=data_factory_name, linked_service_name=http_name, linked_service=http_service) # Create a linked service that represents the sink (Azure blob storage) blob_storage_name = "AzureBlob" blob_storage = AzureBlobStorageLinkedService( connection_string=SecureString(value=connection_string)) blob_storage_service = LinkedServiceResource(properties=blob_storage) adf_client.linked_services.create_or_update( resource_group_name=azure_config.resource_group, factory_name=data_factory_name, linked_service_name=blob_storage_name, linked_service=blob_storage_service) linked_blob_storage = LinkedServiceReference( reference_name=blob_storage_name) linked_http = LinkedServiceReference(reference_name=http_name) def download_and_uncompress(source_file_or_tuple: Union[str, Tuple[str, str]], target_folder: str) -> List[str]: """ Downloads a file from AWS and stores them in blob storage in its compressed form. From the compressed file in blob storage, it is then uncompressed, and written to a new folder in blob storage. For example, if 'target_folder' is 'foo', the uncompressed file will be written to folder 'foo', and the compressed raw data will be written to 'foo_compressed'. :param source_file_or_tuple: The name of the .tar.gz or .tar file to download, without any access tokens. 
If the name is a Tuple[str, str], the second tuple element is the "real" extension, for files where the extension is misleading. :param target_folder: The folder prefix in the target storage account. :return: A list of pipelines that this method created. """ if isinstance(source_file_or_tuple, str): source_file = source_file_or_tuple file_extension = "".join(Path(source_file).suffixes) correct_extension = file_extension elif isinstance(source_file_or_tuple, tuple): source_file, correct_extension = source_file_or_tuple file_extension = "".join(Path(source_file).suffixes) else: raise ValueError( f"Type of source_file_or_tuple not recognized: {type(source_file_or_tuple)}" ) source_file_with_correct_extension = source_file[:source_file.rfind( file_extension)] + correct_extension target_folder_compressed = target_folder + COMPRESSED_DATASET_SUFFIX if is_unittest: http_source = HttpServerLocation( relative_url="gulpjs/gulp/archive/v3.9.1.tar.gz") else: http_source = HttpServerLocation( relative_url=f"{source_file}{files_and_tokens[source_file]}") source_file_cleaned = source_file.replace(".", "_") # A dataset that reads the files from AWS as-is, no decompression source_compressed = BinaryDataset(linked_service_name=linked_http, location=http_source) source_compressed_name = f"{source_file_cleaned} on AWS" adf_client.datasets.create_or_update( resource_group_name=azure_config.resource_group, factory_name=data_factory_name, dataset_name=source_compressed_name, dataset=DatasetResource(properties=source_compressed)) # The sink for downloading the datasets as-is (compressed) blob_storage_compressed = AzureBlobStorageLocation( file_name=source_file_with_correct_extension, container=TARGET_CONTAINER, folder_path=target_folder_compressed) dest_compressed = BinaryDataset( linked_service_name=linked_blob_storage, location=blob_storage_compressed) dest_compressed_name = f"{source_file_cleaned} on Azure" adf_client.datasets.create_or_update( resource_group_name=azure_config.resource_group, factory_name=data_factory_name, dataset_name=dest_compressed_name, dataset=DatasetResource(properties=dest_compressed)) # A dataset that reads the files from blob storage and uncompresses on-the-fly if correct_extension == ".tar.gz": compression = DatasetTarGZipCompression() # By default, a folder gets created for each .tar.gzip file that is read. Disable that. compression_properties = TarGZipReadSettings( preserve_compression_file_name_as_folder=False) elif correct_extension == ".tar": compression = DatasetTarCompression() # By default, a folder gets created for each .tar file that is read. Disable that. 
compression_properties = TarReadSettings( preserve_compression_file_name_as_folder=False) else: raise ValueError( f"Unable to determine compression for file {source_file}") source_uncompressed = BinaryDataset( linked_service_name=linked_blob_storage, location=blob_storage_compressed, compression=compression) source_uncompressed_name = f"read {source_file_cleaned} and uncompress" adf_client.datasets.create_or_update( resource_group_name=azure_config.resource_group, factory_name=data_factory_name, dataset_name=source_uncompressed_name, dataset=DatasetResource(properties=source_uncompressed)) # The sink for downloading the datasets uncompressed final_dataset = BinaryDataset(linked_service_name=linked_blob_storage, location=AzureBlobStorageLocation( container=TARGET_CONTAINER, folder_path=target_folder)) final_name = f"save {source_file_cleaned} uncompressed" adf_client.datasets.create_or_update( resource_group_name=azure_config.resource_group, factory_name=data_factory_name, dataset_name=final_name, dataset=DatasetResource(properties=final_dataset)) # Copying from compressed source to compressed destination on blob storage download = CopyActivity( name=f"download {source_file_cleaned}", inputs=[DatasetReference(reference_name=source_compressed_name)], outputs=[DatasetReference(reference_name=dest_compressed_name)], source=HttpSource(), sink=BlobSink()) # Read the compressed file from blob storage and create an uncompressed dataset. # This should not create extra folder structure beyond what is already in the tar file - this is specified # in compression_properties binary_source = BinarySource(format_settings=BinaryReadSettings( compression_properties=compression_properties)) uncompress = CopyActivity( name=f"uncompress {source_file_cleaned}", inputs=[DatasetReference(reference_name=source_uncompressed_name)], outputs=[DatasetReference(reference_name=final_name)], source=binary_source, sink=BlobSink(), # Add a dependent activity: We first need to download depends_on=[ ActivityDependency(activity=download.name, dependency_conditions=["Succeeded"]) ]) # Create a pipeline that first downloads from AWS to blob storage, and then decompresses from blob storage # to another blob storage location pipeline = f"{source_file_cleaned} to folder {target_folder}" adf_client.pipelines.create_or_update( resource_group_name=azure_config.resource_group, factory_name=data_factory_name, pipeline_name=pipeline, pipeline=PipelineResource(activities=[download, uncompress])) return [pipeline] file_list: FolderAndFileList = \ [("antonsctest", ["foo.tar.gz", "bar.tar"])] if is_unittest else files_to_download all_pipelines = [] print("Creating pipelines:") for target_folder, files in file_list: for file in files: pipelines = download_and_uncompress(file, target_folder=target_folder) for p in pipelines: print(f"Created pipeline {p}") all_pipelines.extend(pipelines) print("Starting all pipelines") run_ids_per_pipeline = {} for pipeline in all_pipelines: run_result = adf_client.pipelines.create_run( resource_group_name=azure_config.resource_group, factory_name=data_factory_name, pipeline_name=pipeline) print(f"Started pipeline: {pipeline}") run_ids_per_pipeline[run_result.run_id] = pipeline print("Waiting for pipelines to complete") status_per_run = { run_id: "running" for run_id in run_ids_per_pipeline.keys() } while True: for run_id in run_ids_per_pipeline.keys(): if status_per_run[run_id]: pipeline_run = adf_client.pipeline_runs.get( resource_group_name=azure_config.resource_group, factory_name=data_factory_name, 
run_id=run_id) status = pipeline_run.status if status == "Succeeded" or status == "Failed": print( f"Pipeline '{run_ids_per_pipeline[run_id]}' completed with status {status}" ) status_per_run[run_id] = "" else: status_per_run[run_id] = status remaining_runs = len([v for v in status_per_run.values() if v]) print(f"Remaining pipelines that are running: {remaining_runs}") if remaining_runs == 0: break time.sleep(30) utcnow = datetime.now(timezone.utc) filter_params = RunFilterParameters( last_updated_after=utcnow - timedelta(days=1), last_updated_before=utcnow + timedelta(days=1)) for run_id, pipeline in run_ids_per_pipeline.items(): query_response = adf_client.activity_runs.query_by_pipeline_run( resource_group_name=azure_config.resource_group, factory_name=data_factory_name, run_id=run_id, filter_parameters=filter_params) run_status = query_response.value[0] print(f"Status for pipeline {pipeline}: {run_status.status}") if run_status.status == 'Succeeded': print(f"\tNumber of bytes read: {run_status.output['dataRead']}") print( f"\tNumber of bytes written: {run_status.output['dataWritten']}" ) print(f"\tCopy duration: {run_status.output['copyDuration']}") else: print(f"\tErrors: {run_status.error['message']}") print("All pipelines completed. Deleting data factory.") adf_client.factories.delete(azure_config.resource_group, data_factory_name)
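# A hedged example of invoking create_datafactory_and_run above in unit-test mode, where no AWS
# tokens are needed; the storage connection string is a placeholder, not a real value.
if __name__ == "__main__":
    create_datafactory_and_run(files_and_tokens={},
                               connection_string="<storage account connection string>",
                               location="westeurope",
                               is_unittest=True)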
def main(self): try: # Create the BlockBlockService that is used to call the Blob service for the storage account block_blob_service = BlockBlobService(account_name=accountname, account_key=accountkey) # Create a container called 'quickstartblobs'. container_name = blob_path block_blob_service.create_container(container_name) # Set the permission so the blobs are public. block_blob_service.set_container_acl( container_name, public_access=PublicAccess.Container) # choose file from system path = filepath print(container_name) head, tail = os.path.split(path) print(tail) # Upload the created file, use local_file_name for the blob name block_blob_service.create_blob_from_path(container_name, tail, path) # List the blobs in the container print("\nList blobs in the container") generator = block_blob_service.list_blobs(container_name) for blob in generator: print("\nBlob name: " + blob.name) except Exception as e: print(e) credentials = ServicePrincipalCredentials(client_id=client_id, secret=secret, tenant=tenant) adf_client = DataFactoryManagementClient(credentials, subscription_id) # Create a data factory df_resource = Factory(location='eastus') df = adf_client.factories.create_or_update(rg_name, df_name, df_resource) BlobToBlob.print_item(df) while df.provisioning_state != 'Succeeded': df = adf_client.factories.get(rg_name, df_name) time.sleep(1) # Create an Azure Storage linked service storage_string = SecureString(storage_account_details) ls_azure_storage = AzureStorageLinkedService( connection_string=storage_string) ls = adf_client.linked_services.create_or_update( rg_name, df_name, ls_name, ls_azure_storage) BlobToBlob.print_item(ls) ds_ls = LinkedServiceReference(ls_name) ds_azure_blob = AzureBlobDataset(ds_ls, folder_path=blob_path, file_name=tail) ds = adf_client.datasets.create_or_update(rg_name, df_name, dsIn_name, ds_azure_blob) BlobToBlob.print_item(ds) dsOut_azure_blob = AzureBlobDataset(ds_ls, folder_path=output_blobpath) dsOut = adf_client.datasets.create_or_update(rg_name, df_name, dsOut_name, dsOut_azure_blob) BlobToBlob.print_item(dsOut) # Create a copy activity blob_source = BlobSource() blob_sink = BlobSink() dsin_ref = DatasetReference(dsIn_name) dsOut_ref = DatasetReference(dsOut_name) p = ActivityPolicy() p.timeout = '3.00:00:00' p.retry = 2 p.retry_interval_in_seconds = 50 copy_activity = CopyActivity(name=act_name, description=act_description, enable_staging='false', enable_skip_incompatible_row='false', inputs=[dsin_ref], outputs=[dsOut_ref], source=blob_source, sink=blob_sink, policy=p) params_for_pipeline = {} p_obj = PipelineResource(activities=[copy_activity], parameters=params_for_pipeline) p = adf_client.pipelines.create_or_update(rg_name, df_name, p_name, p_obj) BlobToBlob.print_item(p) # Create a pipeline run run_response = adf_client.pipelines.create_run(rg_name, df_name, p_name, {}) # Monitor the pipeilne run time.sleep(20) pipeline_run = adf_client.pipeline_runs.get(rg_name, df_name, run_response.run_id) print("\nPineLine Id:{}".format(pipeline_run.run_id)) print("\nPipeline run status: {}".format(pipeline_run.status)) activity_runs_paged = list( adf_client.activity_runs.list_by_pipeline_run( rg_name, df_name, pipeline_run.run_id, datetime.now() - timedelta(1), datetime.now() + timedelta(1))) BlobToBlob.print_activity_run_details(activity_runs_paged[0])
def main(): # Azure subscription ID subscription_id = '<Azure subscription ID>' # This program creates this resource group. If it's an existing resource group, comment out the code that creates the resource group rg_name = '<Azure resource group name>' # The data factory name. It must be globally unique. df_name = '<Data factory name>' # Specify your Active Directory client ID, client secret, and tenant ID credentials = ServicePrincipalCredentials( client_id='<AAD application ID>', secret='<AAD app authentication key>', tenant='<AAD tenant ID>') resource_client = ResourceManagementClient(credentials, subscription_id) adf_client = DataFactoryManagementClient(credentials, subscription_id) rg_params = {'location': 'eastus'} df_params = {'location': 'eastus'} # create the resource group # comment out if the resource group already exits resource_client.resource_groups.create_or_update(rg_name, rg_params) # Create a data factory df_resource = Factory(location='eastus') df = adf_client.factories.create_or_update(rg_name, df_name, df_resource) print_item(df) while df.provisioning_state != 'Succeeded': df = adf_client.factories.get(rg_name, df_name) time.sleep(1) # Create an Azure Storage linked service ls_name = 'storageLinkedService' # Specify the name and key of your Azure Storage account storage_string = SecureString( 'DefaultEndpointsProtocol=https;AccountName=<Azure storage account>;AccountKey=<Azure storage authentication key>' ) ls_azure_storage = AzureStorageLinkedService( connection_string=storage_string) ls = adf_client.linked_services.create_or_update(rg_name, df_name, ls_name, ls_azure_storage) print_item(ls) # Create an Azure blob dataset (input) ds_name = 'ds_in' ds_ls = LinkedServiceReference(ls_name) blob_path = 'adftutorial/inputpy' blob_filename = 'input.txt' ds_azure_blob = AzureBlobDataset(ds_ls, folder_path=blob_path, file_name=blob_filename) ds = adf_client.datasets.create_or_update(rg_name, df_name, ds_name, ds_azure_blob) print_item(ds) # Create an Azure blob dataset (output) dsOut_name = 'ds_out' output_blobpath = 'adftutorial/outputpy' dsOut_azure_blob = AzureBlobDataset(ds_ls, folder_path=output_blobpath) dsOut = adf_client.datasets.create_or_update(rg_name, df_name, dsOut_name, dsOut_azure_blob) print_item(dsOut) # Create a copy activity act_name = 'copyBlobtoBlob' blob_source = BlobSource() blob_sink = BlobSink() dsin_ref = DatasetReference(ds_name) dsOut_ref = DatasetReference(dsOut_name) copy_activity = CopyActivity(act_name, inputs=[dsin_ref], outputs=[dsOut_ref], source=blob_source, sink=blob_sink) # Create a pipeline with the copy activity p_name = 'copyPipeline' params_for_pipeline = {} p_obj = PipelineResource(activities=[copy_activity], parameters=params_for_pipeline) p = adf_client.pipelines.create_or_update(rg_name, df_name, p_name, p_obj) print_item(p) # Create a pipeline run run_response = adf_client.pipelines.create_run(rg_name, df_name, p_name, {}) # Monitor the pipeilne run time.sleep(30) pipeline_run = adf_client.pipeline_runs.get(rg_name, df_name, run_response.run_id) print("\n\tPipeline run status: {}".format(pipeline_run.status)) activity_runs_paged = list( adf_client.activity_runs.list_by_pipeline_run( rg_name, df_name, pipeline_run.run_id, datetime.now() - timedelta(1), datetime.now() + timedelta(1))) print_activity_run_details(activity_runs_paged[0]) # Create a trigger tr_name = 'mytrigger' scheduler_recurrence = ScheduleTriggerRecurrence(frequency='Minute', interval='15', start_time=datetime.now(), end_time=datetime.now() + timedelta(1), 
                                                  time_zone='UTC')
    pipeline_parameters = {
        'inputPath': 'adftutorial/inputpy',
        'outputPath': 'adftutorial/outputpy'
    }
    pipelines_to_run = []
    pipeline_reference = PipelineReference('copyPipeline')
    pipelines_to_run.append(
        TriggerPipelineReference(pipeline_reference, pipeline_parameters))
    tr_properties = ScheduleTrigger(description='My scheduler trigger',
                                    pipelines=pipelines_to_run,
                                    recurrence=scheduler_recurrence)
    adf_client.triggers.create_or_update(rg_name, df_name, tr_name, tr_properties)

    # Start the trigger
    adf_client.triggers.start(rg_name, df_name, tr_name)
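# Clean-up sketch (not part of the original tutorial): stop and remove the schedule trigger once
# it is no longer needed. Assumes the same SDK generation as above, where triggers.stop and
# triggers.delete take (resource_group, factory_name, trigger_name).
def remove_trigger(adf_client, rg_name, df_name, tr_name):
    adf_client.triggers.stop(rg_name, df_name, tr_name)
    adf_client.triggers.delete(rg_name, df_name, tr_name)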
source = source[0]
target = target[0]

# This program creates this resource group. If it's an existing resource group,
# comment out the code that creates the resource group.
#rg_name = 'poc-westeurope-gp-data-rg'
rg_name = 'poc-westeurope-gp-data-rg'
# The data factory name. It must be globally unique.
#df_name = 'poc-westeurope-gp-data-df-atradius'
df_name = 'poc-westeurope-gp-data-df'

#credentials = ServicePrincipalCredentials(client_id=client_id, secret=secret, tenant=tenant)
# get_azure_cli_credentials() returns a (credentials, subscription_id) tuple.
credentials, subscription_id = get_azure_cli_credentials()
resource_client = ResourceManagementClient(credentials, subscription_id)
adf_client = DataFactoryManagementClient(credentials, subscription_id)
rg_params = {'location': 'westeurope'}
df_params = {'location': 'westeurope'}

# Create database linked service
ls_tgt_name = 'tgtazuresqldb'
# Create an Azure Storage linked service
ls_src_name = 'srcgrpblob'

# Parameter file with the list of tables
with open('param_tables.json') as json_param_file:
    table_name = json.load(json_param_file)
def main(req: func.HttpRequest) -> func.HttpResponse: target_table = "PipelinePauseData" token = utilities.get_param(req, "token") table_service = utilities.setup_table_service( os.environ["AzureWebJobsStorage"], target_table, ) # Since we can't use authentication for the API we will check as # soon as possible if the token for the pipeline restart is valid. # if it is not we halt execution and return a 500 code. try: paused_pipeline = table_service.get_entity( table_name=target_table, partition_key="PauseData", row_key=token ) except AzureMissingResourceHttpError as e: raise exceptions.HttpError( str(e), func.HttpResponse(str(e), status_code=500) ) # acted_upon monitors if a token has already been used. We use it here to # block the second and further attempts at restarting. acted_upon = paused_pipeline["acted_upon"] has_expired = check_if_expired( paused_pipeline["Timestamp"], paused_pipeline["expiration_time"], ) if not acted_upon and not has_expired: logging.info(token) # DefaultAzureCredential does not work when manipulating ADF. It will # complain about a missing session method. # Remember to give the contributor role to the application. # Azure Portal -> Subscriptions -> IAM roles credentials = ServicePrincipalCredentials( client_id=os.environ["AZURE_CLIENT_ID"], secret=os.environ["AZURE_CLIENT_SECRET"], tenant=os.environ["AZURE_TENANT_ID"], ) subscription_id = os.environ["subscription_id"] adf_client = DataFactoryManagementClient(credentials, subscription_id) logging.info(adf_client) # The restart data is accessed via a lookup activity from within ADF run_response = restart_pipeline( adf_client=adf_client, resource_group=paused_pipeline["resource_group"], factory_name=paused_pipeline["factory_name"], pipeline_name=paused_pipeline["pipeline_name"], token=token, ) logging.info(run_response) # After running acted_upon is set to 1 paused_pipeline["acted_upon"] = 1 table_service.update_entity(target_table, paused_pipeline) # Retrieve and display success webpage. confirmation_site = ( ShareFileClient.from_connection_string( conn_str=os.environ["AzureWebJobsStorage"], share_name=paused_pipeline["share_name"], file_path=paused_pipeline["web_path"], ) .download_file() .readall() .decode("utf-8") ) return func.HttpResponse(confirmation_site, mimetype="text/html") else: # already acted_upon or expired return func.HttpResponse("Invalid token.", status_code=500,)
def datafactory(sourceconnectionstring, sinkconnectionstring): # Azure subscription ID subscription_id = options.subscription # This program creates this resource group. If it's an existing resource group, comment out the code that creates the resource group rg_name = options.resourcegroup # The data factory name. It must be globally unique. df_name = options.datafactory # Specify your Active Directory client ID, client secret, and tenant ID credentials = ServicePrincipalCredentials(client_id=options.clientid, secret=options.clientsecret, tenant=options.tenantid) resource_client = ResourceManagementClient(credentials, subscription_id) adf_client = DataFactoryManagementClient(credentials, subscription_id) rg_params = {'location': 'westeurope'} df_params = {'location': 'westeurope'} #Create a data factory df_resource = Factory(location='westeurope') df = adf_client.factories.create_or_update(rg_name, df_name, df_resource) print_item(df) while df.provisioning_state != 'Succeeded': df = adf_client.factories.get(rg_name, df_name) time.sleep(1) if options.integrationruntime is not None: integrationruntime = IntegrationRuntimeReference( reference_name=options.integrationruntime, parameters=None) source_ls_name = 'sourceLinkedService' if options.integrationruntime is not None: source_ls_azure_cosmos = CosmosDbMongoDbApiLinkedService( connection_string=sourceconnectionstring, database=options.sourcedatabasename, connect_via=integrationruntime) else: source_ls_azure_cosmos = CosmosDbMongoDbApiLinkedService( connection_string=sourceconnectionstring, database=options.sourcedatabasename) source_ls = adf_client.linked_services.create_or_update( rg_name, df_name, source_ls_name, source_ls_azure_cosmos) print_item(source_ls) # Create an Azure blob dataset (input) source_ds_name = 'sourceDS' source_ds_ls = LinkedServiceReference(reference_name=source_ls_name) collection = options.sourcecollectionname sourcedataset = CosmosDbMongoDbApiCollectionDataset( linked_service_name=source_ds_ls, collection=collection, schema=None) source_ds = adf_client.datasets.create_or_update(rg_name, df_name, source_ds_name, sourcedataset) print_item(source_ds) sink_ls_name = 'sinkLinkedService' if options.integrationruntime is not None: sink_ls_azure_blob = AzureStorageLinkedService( connection_string=sinkconnectionstring, connect_via=integrationruntime) else: sink_ls_azure_blob = CosmosDbMongoDbApiLinkedService( connection_string=sinkconnectionstring) sink_ls = adf_client.linked_services.create_or_update( rg_name, df_name, sink_ls_name, sink_ls_azure_blob) print_item(sink_ls) sink_ds_name = 'sinkDS' utc_datetime = datetime.utcnow() blob_filename = utc_datetime.strftime("%Y%m%d-%H%M%SZ") + '.json.gz' sink_ds_ls = LinkedServiceReference(reference_name=sink_ls_name) location = DatasetLocation(type='AzureBlobStorageLocation', folder_path=options.sinkcontainername, file_name=blob_filename) compression = DatasetGZipCompression(level='Optimal') sinkdataset = JsonDataset(linked_service_name=sink_ds_ls, compression=compression, location=location) sink_ds = adf_client.datasets.create_or_update(rg_name, df_name, sink_ds_name, sinkdataset) print_item(sink_ds) if options.incremental is not None: if options.incremental == "yes": # Create a copy activity currentime = datetime.now() currentobjectid = 'ObjectId' + '(' + '"' + format( int(time.mktime(currentime.timetuple())), 'x') + "0000000000000000" + '"' + ')' previoustime = (datetime.now() - timedelta(minutes=240)) previousobjectid = 'ObjectId' + '(' + '"' + format( 
                int(time.mktime(previoustime.timetuple())),
                'x') + "0000000000000000" + '"' + ')'
            filter = '{_id: {$gte:' + previousobjectid + ',$lt:' + currentobjectid + '}}'

    act_name = 'copyCosmostoblob'
    if options.incremental is not None:
        if options.incremental == "yes":
            cosmos_source = CosmosDbMongoDbApiSource(filter=filter)
        else:
            print("Set incremental to yes for this script to work properly")
            sys.exit(1)
    else:
        cosmos_source = CosmosDbMongoDbApiSource()
    blob_sink = BlobSink()
    dsin_ref = DatasetReference(reference_name=source_ds_name)
    dsOut_ref = DatasetReference(reference_name=sink_ds_name)
    copy_activity = CopyActivity(name=act_name,
                                 inputs=[dsin_ref],
                                 outputs=[dsOut_ref],
                                 source=cosmos_source,
                                 sink=blob_sink)

    # Create a pipeline with the copy activity
    p_name = 'copyPipeline'
    params_for_pipeline = {}
    p_obj = PipelineResource(activities=[copy_activity],
                             parameters=params_for_pipeline)
    p = adf_client.pipelines.create_or_update(rg_name, df_name, p_name, p_obj)
    print_item(p)

    run_response = adf_client.pipelines.create_run(rg_name, df_name, p_name,
                                                   parameters={})
    time.sleep(30)
    pipeline_run = adf_client.pipeline_runs.get(rg_name, df_name,
                                                run_response.run_id)
    print("\n\tPipeline run status: {}".format(pipeline_run.status))
    filter_params = RunFilterParameters(
        last_updated_after=datetime.now() - timedelta(1),
        last_updated_before=datetime.now() + timedelta(1))
    query_response = adf_client.activity_runs.query_by_pipeline_run(
        rg_name, df_name, pipeline_run.run_id, filter_params)
    print_activity_run_details(query_response.value[0])
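# Helper sketch for the incremental window used above: a MongoDB ObjectId embeds the creation
# time as an 8-hex-digit Unix timestamp, so padding it with 16 zeros gives the smallest ObjectId
# for that second. The 240-minute lookback mirrors the script; the function names are new.
def objectid_for(dt):
    return 'ObjectId("' + format(int(time.mktime(dt.timetuple())), 'x') + '0000000000000000")'

def incremental_filter(lookback_minutes=240):
    now = datetime.now()
    earlier = now - timedelta(minutes=lookback_minutes)
    return '{_id: {$gte:' + objectid_for(earlier) + ',$lt:' + objectid_for(now) + '}}'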
_SUBSCRIPTION_ID = os.getenv("SUBSCRIPTION_ID")
_CLIENT_ID = os.environ.get("CLIENT_ID")
_CLIENT_KEY = os.environ.get("CLIENT_KEY")
_TENANT_ID = os.environ.get("TENANT_ID")  # assumed: the tenant ID also comes from the environment
_RESOURCE_GROUP = os.environ.get("RESOURCE_GROUP")
_DATA_FACTORY_NAME = os.environ.get("DATA_FACTORY_NAME")
_TIME_ZONE = os.environ.get("TIME_ZONE")
ISO8601_FORMAT = "%Y%m%dT%H:%M:%S"

credentials = ServicePrincipalCredentials(
    client_id=_CLIENT_ID,
    secret=_CLIENT_KEY,
    tenant=_TENANT_ID
)
adf_client = DataFactoryManagementClient(credentials, _SUBSCRIPTION_ID)


def query_activities(run_id, runfilterparam):
    activities = adf_client.activity_runs.query_by_pipeline_run(
        _RESOURCE_GROUP,
        _DATA_FACTORY_NAME,
        run_id=run_id,
        filter_parameters=runfilterparam).value
    pipe_activities = []
    for activity in activities:
        output_activity = {
            "activity_name": activity.activity_name,
            "status": activity.status,
            "input": activity.input,
def main():
    parser = argparse.ArgumentParser(description="Library path in ADF")
    parser.add_argument("-r", "--resource_group", help="Resource group", required=True)
    parser.add_argument("-a", "--adf_name", help="ADF NAME", required=True)
    parser.add_argument("-p", "--adf_pipeline_name", help="ADF pipeline name", required=True)
    parser.add_argument("-o", "--output_file_path", help="Output file path", required=True)
    parser.add_argument("-pa", "--parameters", help="Parameters", required=False)
    args = parser.parse_args()

    resource_group = args.resource_group
    adf_name = args.adf_name
    adf_pipeline_name = args.adf_pipeline_name
    output_file_path = args.output_file_path
    parameters = args.parameters

    print(f"-resource_group is {resource_group}")
    print(f"-adf_name is {adf_name}")
    print(f"-adf_pipeline_name is {adf_pipeline_name}")
    print(f"-output_file_path is {output_file_path}")
    print(f"-parameters is {parameters}")

    # Authenticate with the Azure CLI login; this also returns the subscription ID.
    credentials, subscription_id = get_azure_cli_credentials()
    adf_client = DataFactoryManagementClient(credentials, subscription_id)

    # Create a pipeline run
    run_response = adf_client.pipelines.create_run(resource_group, adf_name,
                                                   adf_pipeline_name,
                                                   parameters=parameters)

    # Monitor the pipeline run
    time.sleep(5)
    pipeline_run = adf_client.pipeline_runs.get(resource_group, adf_name,
                                                run_response.run_id)
    print("\n\tPipeline run status: {}".format(pipeline_run.status))
    filter_params = RunFilterParameters(
        last_updated_after=datetime.utcnow() - timedelta(1),
        last_updated_before=datetime.utcnow() + timedelta(1))
    query_response = adf_client.activity_runs.query_by_pipeline_run(
        resource_group, adf_name, pipeline_run.run_id, filter_params)
    while query_response.value[0].status in ['InProgress']:
        print_activity_run_details(query_response.value[0])
        time.sleep(3)
        query_response = adf_client.activity_runs.query_by_pipeline_run(
            resource_group, adf_name, pipeline_run.run_id, filter_params)
    print_activity_run_details(query_response.value[0])
def _create_client(credential: Credentials, subscription_id: str):
    return DataFactoryManagementClient(
        credential=credential,
        subscription_id=subscription_id,
    )
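# Possible usage of _create_client above with DefaultAzureCredential; the Credentials type alias
# is assumed to cover azure-identity credential objects, and the subscription ID is a placeholder.
from azure.identity import DefaultAzureCredential

client = _create_client(DefaultAzureCredential(), "00000000-0000-0000-0000-000000000000")
# List the data factories visible under the subscription as a quick smoke test.
print([factory.name for factory in client.factories.list()])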
def getAdfClient():
    subscription_id = app.config["SUBSCRIPTION_ID"]
    credentials = getCredentials()
    return DataFactoryManagementClient(credentials, subscription_id)
def main(): # Load parameters params = load_config() # Azure subscription ID subscription_id = params['subscription_id'] # '<subscription ID>' # This program creates this resource group. If it's an existing resource group, comment out the code that creates the resource group rg_name = params['rg_name'] # '<resource group>' # The data factory name. It must be globally unique. df_name = params['df_name'] # '<factory name>' # Specify your Active Directory client ID, client secret, and tenant ID credentials = ClientSecretCredential(client_id=params['client_id'], client_secret=params['client_secret'], tenant_id=params['tenant_id']) resource_client = ResourceManagementClient(credentials, subscription_id) adf_client = DataFactoryManagementClient(credentials, subscription_id) rg_params = {'location': 'westeurope'} df_params = {'location': 'westeurope'} # create the resource group # comment out if the resource group already exits # resource_client.resource_groups.create_or_update(rg_name, rg_params) # Create a data factory df_resource = Factory(location='westeurope') df = adf_client.factories.create_or_update(rg_name, df_name, df_resource) print_item(df) while df.provisioning_state != 'Succeeded': df = adf_client.factories.get(rg_name, df_name) time.sleep(1) # Create an Azure Storage linked service ls_name = 'storageLinkedService001' # IMPORTANT: specify the name and key of your Azure Storage account. storage_string = SecureString(value=params['storage_string']) ls_azure_storage = LinkedServiceResource( properties=AzureStorageLinkedService(connection_string=storage_string)) ls = adf_client.linked_services.create_or_update(rg_name, df_name, ls_name, ls_azure_storage) print_item(ls) # Create an Azure blob dataset (input) ds_name = 'ds_in' ds_ls = LinkedServiceReference(reference_name=ls_name) blob_path = params['blob_path'] # '<container>/<folder path>' blob_filename = params['blob_filename'] # '<file name>' ds_azure_blob = DatasetResource( properties=AzureBlobDataset(linked_service_name=ds_ls, folder_path=blob_path, file_name=blob_filename)) ds = adf_client.datasets.create_or_update(rg_name, df_name, ds_name, ds_azure_blob) print_item(ds) # Create an Azure blob dataset (output) dsOut_name = 'ds_out' output_blobpath = params['blob_path_output'] # <container>/<folder path>' dsOut_azure_blob = DatasetResource(properties=AzureBlobDataset( linked_service_name=ds_ls, folder_path=output_blobpath)) dsOut = adf_client.datasets.create_or_update(rg_name, df_name, dsOut_name, dsOut_azure_blob) print_item(dsOut) # Create a copy activity act_name = 'copyBlobtoBlob' blob_source = BlobSource() blob_sink = BlobSink() dsin_ref = DatasetReference(reference_name=ds_name) dsOut_ref = DatasetReference(reference_name=dsOut_name) copy_activity = CopyActivity(name=act_name, inputs=[dsin_ref], outputs=[dsOut_ref], source=blob_source, sink=blob_sink) # Create a pipeline with the copy activity p_name = 'copyPipeline' params_for_pipeline = {} p_obj = PipelineResource(activities=[copy_activity], parameters=params_for_pipeline) p = adf_client.pipelines.create_or_update(rg_name, df_name, p_name, p_obj) print_item(p) # Create a pipeline run run_response = adf_client.pipelines.create_run(rg_name, df_name, p_name, parameters={}) # Monitor the pipeline run time.sleep(30) pipeline_run = adf_client.pipeline_runs.get(rg_name, df_name, run_response.run_id) print("\n\tPipeline run status: {}".format(pipeline_run.status)) filter_params = RunFilterParameters( last_updated_after=datetime.now() - timedelta(1), 
        last_updated_before=datetime.now() + timedelta(1))
    query_response = adf_client.activity_runs.query_by_pipeline_run(
        rg_name, df_name, pipeline_run.run_id, filter_params)
    print_activity_run_details(query_response.value[0])
def main(): # Specify your Active Directory client ID, client secret, and tenant ID credentials = ServicePrincipalCredentials(client_id=APP_CLIENT_ID, secret=APP_SECRET, tenant=TENANT_ID) adf_client = DataFactoryManagementClient(credentials, SUBSCRIPTION_ID) # Get the data factory df = adf_client.factories.get(RESOURCE_GROUP, DATA_FACTORY_NAME) print_item(df) # Create an Azure Storage linked service ls_name = 'storageLinkedService' # IMPORTANT: specify the name and key of your Azure Storage account. storage_string = SecureString( 'DefaultEndpointsProtocol=https;AccountName={};AccountKey={}'.format(STORAGE_NAME, STORAGE_KEY)) ls_azure_storage = AzureStorageLinkedService(connection_string=storage_string) ls = adf_client.linked_services.create_or_update(RESOURCE_GROUP, DATA_FACTORY_NAME, ls_name, ls_azure_storage) print_item(ls) # Create an Azure blob dataset (input) ds_name = 'ds_in' ds_ls = LinkedServiceReference(ls_name) blob_path= 'input' blob_filename = 'input.txt' ds_azure_blob= AzureBlobDataset(ds_ls, folder_path=blob_path, file_name = blob_filename) ds = adf_client.datasets.create_or_update(RESOURCE_GROUP, DATA_FACTORY_NAME, ds_name, ds_azure_blob) print_item(ds) # Create an Azure blob dataset (output) dsOut_name = 'ds_out' output_blobpath = 'output' dsOut_azure_blob = AzureBlobDataset(ds_ls, folder_path=output_blobpath) dsOut = adf_client.datasets.create_or_update(RESOURCE_GROUP, DATA_FACTORY_NAME, dsOut_name, dsOut_azure_blob) print_item(dsOut) # Create a copy activity act_name = 'copyBlobtoBlob' blob_source = BlobSource() blob_sink = BlobSink() dsin_ref = DatasetReference(ds_name) dsOut_ref = DatasetReference(dsOut_name) copy_activity = CopyActivity(act_name,inputs=[dsin_ref], outputs=[dsOut_ref], source=blob_source, sink=blob_sink) #Create a pipeline with the copy activity p_name = 'copyPipeline' params_for_pipeline = {} p_obj = PipelineResource(activities=[copy_activity], parameters=params_for_pipeline) p = adf_client.pipelines.create_or_update(RESOURCE_GROUP, DATA_FACTORY_NAME, p_name, p_obj) print_item(p) #Create a pipeline run. run_response = adf_client.pipelines.create_run(RESOURCE_GROUP, DATA_FACTORY_NAME, p_name, { } ) # Monitor the pipeline run time.sleep(30) pipeline_run = adf_client.pipeline_runs.get(RESOURCE_GROUP, DATA_FACTORY_NAME, run_response.run_id) print("\n\tPipeline run status: {}".format(pipeline_run.status)) activity_runs_paged = list(adf_client.activity_runs.list_by_pipeline_run(RESOURCE_GROUP, DATA_FACTORY_NAME, pipeline_run.run_id, datetime.now() - timedelta(1), datetime.now() + timedelta(1))) print_activity_run_details(activity_runs_paged[0])
def main(): # Azure subscription ID subscription_id = '86d62b86-1ed2-45c1-8f6c-164c9b3db93a' # This program creates this resource group. If it's an existing resource group, comment out the code that creates the resource group rg_name = 'sshResourceGroup' # The data factory name. It must be globally unique. df_name = 'sshDF' # Specify your Active Directory client ID, client secret, and tenant ID credentials = ServicePrincipalCredentials( client_id='276d4d10-d006-48e0-a360-572267e5d400', secret='aphJTM107vbXsrJLC9Ehsk9S2pLxysvGycWnxVE4pjc=', tenant='da67ef1b-ca59-4db2-9a8c-aa8d94617a16') resource_client = ResourceManagementClient(credentials, subscription_id) adf_client = DataFactoryManagementClient(credentials, subscription_id) rg_params = {'location': 'westus'} df_params = {'location': 'westus'} # Create the resource group # Comment out if the resource group already exits resource_client.resource_groups.create_or_update(rg_name, rg_params) # Create the resource group # Comment out if the resource group already exits resource_client.resource_groups.create_or_update(rg_name, rg_params) # Create a data factory df_resource = Factory(location='westus') df = adf_client.factories.create_or_update(rg_name, df_name, df_resource) print_item(df) while df.provisioning_state != 'Succeeded': df = adf_client.factories.get(rg_name, df_name) time.sleep(1) # Create an Azure Storage linked service ls_name = 'storageLinkedService' # IMPORTANT: specify the name and key of your Azure Storage account storage_string = SecureString( 'DefaultEndpointsProtocol=https;AccountName=sshstorageaccount03;AccountKey=et+CstqRneJos+tjRAZcjubCdc2kdpBhISErdyIG/t94iLcrYAKSdD0txJjvR3C7wCrSz+9mcdjJAl05jGlChw==' ) ls_azure_storage = AzureStorageLinkedService( connection_string=storage_string) ls = adf_client.linked_services.create_or_update(rg_name, df_name, ls_name, ls_azure_storage) print_item(ls) # Create an Azure blob dataset (input) ds_name = 'dset_in' ds_ls = LinkedServiceReference(ls_name) blob_path = 'playerscontainer/myteam' blob_filename = 'astroplayers.txt' ds_azure_blob = AzureBlobDataset(ds_ls, folder_path=blob_path, file_name=blob_filename) ds = adf_client.datasets.create_or_update(rg_name, df_name, ds_name, ds_azure_blob) print_item(ds) # Create an Azure blob dataset (output) dsOut_name = 'dset_out' output_blobpath = 'playerscontainer/output' dsOut_azure_blob = AzureBlobDataset(ds_ls, folder_path=output_blobpath) dsOut = adf_client.datasets.create_or_update(rg_name, df_name, dsOut_name, dsOut_azure_blob) print_item(dsOut) # Create a copy activity act_name = 'copyBlobPleayerstoBlob' blob_source = BlobSource() blob_sink = BlobSink() dsin_ref = DatasetReference(ds_name) dsOut_ref = DatasetReference(dsOut_name) copy_activity = CopyActivity(act_name, inputs=[dsin_ref], outputs=[dsOut_ref], source=blob_source, sink=blob_sink) # Create a pipeline with the copy activity p_name = 'copyPipeliness' params_for_pipeline = {} p_obj = PipelineResource(activities=[copy_activity], parameters=params_for_pipeline) p = adf_client.pipelines.create_or_update(rg_name, df_name, p_name, p_obj) print_item(p) # Create a pipeline run. 
    run_response = adf_client.pipelines.create_run(rg_name, df_name, p_name, {})

    # Monitor the pipeline run
    time.sleep(30)
    pipeline_run = adf_client.pipeline_runs.get(rg_name, df_name,
                                                run_response.run_id)
    print("\n\tPipeline run status: {}".format(pipeline_run.status))
    activity_runs_paged = list(
        adf_client.activity_runs.list_by_pipeline_run(
            rg_name, df_name, pipeline_run.run_id,
            datetime.now() - timedelta(1),
            datetime.now() + timedelta(1)))
    print_activity_run_details(activity_runs_paged[0])
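# The samples above call print_item and print_activity_run_details without defining them.
# This is a plausible reconstruction in the style of the ADF quickstart helpers, not the
# authors' own code.
def print_item(group):
    """Print an Azure resource (factory, linked service, dataset, or pipeline)."""
    print("\tName: {}".format(group.name))
    print("\tId: {}".format(group.id))
    if hasattr(group, 'location'):
        print("\tLocation: {}".format(group.location))
    print("\n")

def print_activity_run_details(activity_run):
    """Print a copy activity run's status, bytes read/written, and any error."""
    print("\n\tActivity run details\n")
    print("\tActivity run status: {}".format(activity_run.status))
    if activity_run.status == 'Succeeded':
        print("\tNumber of bytes read: {}".format(activity_run.output['dataRead']))
        print("\tNumber of bytes written: {}".format(activity_run.output['dataWritten']))
        print("\tCopy duration: {}".format(activity_run.output['copyDuration']))
    else:
        print("\tErrors: {}".format(activity_run.error['message']))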
def main(): #load values project_folder = os.path.expanduser( '/home/admin1/Desktop/AzurePythonScript') # adjust as appropriate load_dotenv(os.path.join(project_folder, '.env')) # Azure subscription ID subscription_id = os.environ.get('AZURE_SUBSCRIPTION_ID') # This program creates this resource group. If it's an existing resource group, comment out the code that creates the resource group rg_name = 'ArunScriptResource' # The data factory name. It must be globally unique. df_name = 'TwitterFactoryPyScript' clientid = '' secretkey = '' tenantid = '' print(type(os.environ.get('AZURE_CLIENT_ID'))) clientid += os.environ.get('AZURE_CLIENT_ID') secretkey += os.environ.get('AZURE_CLIENT_SECRET') tenantid += os.environ.get('AZURE_TENANT_ID') # Specify your Active Directory client ID, client secret, and tenant ID credentials = ServicePrincipalCredentials(client_id=clientid, secret=secretkey, tenant=tenantid) resource_client = ResourceManagementClient(credentials, subscription_id) adf_client = DataFactoryManagementClient(credentials, subscription_id) rg_params = {'location': 'eastus'} df_params = {'location': 'eastus'} # create the resource group # comment out if the resource group already exits resource_client.resource_groups.create_or_update(rg_name, rg_params) # Create a data factory df_resource = Factory(location='eastus') df = adf_client.factories.create_or_update(rg_name, df_name, df_resource) print_item(df) while df.provisioning_state != 'Succeeded': df = adf_client.factories.get(rg_name, df_name) time.sleep(1) # Create an Azure Storage linked service ls_name = 'AzurePyScriptLinkedService' # Specify the name and key of your Azure Storage account #storage_string = SecureString( value= # 'DefaultEndpointsProtocol=https;AccountName=arunkafkastorage;AccountKey=tV6Yx8ngd36I6eu8Ow9Lklq7DDLKeFJuslOLnGaX6jD33zCr7AghPso3lkjXKh0SMNMy83NWoklaGRHJTMk/4A==;EndpointSuffix=core.windows.net') storage_string = SecureString( value= 'DefaultEndpointsProtocol=https;AccountName=arunstorage12;AccountKey=iFCTVZveS/XvhhHfL/Phpf/r3UM3CPwSBkEwiQWePdALeW9hamYc6mAEXQMeSjQVrAdCY19hfFlUBLmKbwsbog==;EndpointSuffix=core.windows.net' ) ls_azure_storage = AzureStorageLinkedService( connection_string=storage_string) ls = adf_client.linked_services.create_or_update(rg_name, df_name, ls_name, ls_azure_storage) print_item(ls) # Create an Azure blob dataset (input) ds_name = 'ds_in' ds_ls = LinkedServiceReference(reference_name=ls_name) blob_path = 'adfv2tutorial/input' blob_filename = 'input.txt' ds_azure_blob = AzureBlobDataset(ds_ls, folder_path=blob_path, file_name=blob_filename) ds = adf_client.datasets.create_or_update(rg_name, df_name, ds_name, ds_azure_blob) print_item(ds) # Create an Azure blob dataset (output) dsOut_name = 'ds_out' output_blobpath = 'adfv2tutorial/output' dsOut_azure_blob = AzureBlobDataset(ds_ls, folder_path=output_blobpath) dsOut = adf_client.datasets.create_or_update(rg_name, df_name, dsOut_name, dsOut_azure_blob) print_item(dsOut) # Create a copy activity act_name = 'copyBlobtoBlob' blob_source = BlobSource() blob_sink = BlobSink() dsin_ref = DatasetReference(ds_name) dsOut_ref = DatasetReference(dsOut_name) copy_activity = CopyActivity(act_name, inputs=[dsin_ref], outputs=[dsOut_ref], source=blob_source, sink=blob_sink) # Create a pipeline with the copy activity p_name = 'copyPipeline' params_for_pipeline = {} p_obj = PipelineResource(activities=[copy_activity], parameters=params_for_pipeline) p = adf_client.pipelines.create_or_update(rg_name, df_name, p_name, p_obj) print_item(p) # Create a 
pipeline run
    run_response = adf_client.pipelines.create_run(rg_name, df_name, p_name, {})

    # Monitor the pipeline run
    time.sleep(30)
    pipeline_run = adf_client.pipeline_runs.get(rg_name, df_name,
                                                run_response.run_id)
    print("\n\tPipeline run status: {}".format(pipeline_run.status))
    # activity_runs_paged = list(adf_client.activity_runs.list_by_pipeline_run(rg_name, df_name, pipeline_run.run_id, datetime.now() - timedelta(1), datetime.now() + timedelta(1)))
    # query_by_pipeline_run takes a RunFilterParameters object and returns a response with a .value list.
    filter_params = RunFilterParameters(
        last_updated_after=datetime.now() - timedelta(1),
        last_updated_before=datetime.now() + timedelta(1))
    query_response = adf_client.activity_runs.query_by_pipeline_run(
        rg_name, df_name, pipeline_run.run_id, filter_params)
    print_activity_run_details(query_response.value[0])
def main():
    # Azure subscription ID
    subscription_id = 'a1b8793b-91d4-42e0-9e7a-f55af294f275'

    # This program creates this resource group.
    # If it's an existing resource group, comment out the code that creates it.
    rg_name = 'newcopyBlobToSqlRg'

    # The data factory name. It must be globally unique.
    df_name = 'newcopyBlobToSqlDf'

    # Specify your Active Directory client ID, client secret, and tenant ID
    credentials = ServicePrincipalCredentials(
        client_id='31fe72da-bb34-4243-a365-288a003d57e9',
        secret='702f129a-cc5e-4b03-9a5d-362ee0a6d4e3',
        tenant='c80b7188-f79b-48e5-8008-f9402f981907')
    resource_client = ResourceManagementClient(credentials, subscription_id)
    adf_client = DataFactoryManagementClient(credentials, subscription_id)

    rg_params = {'location': 'eastus'}
    df_params = {'location': 'eastus'}

    # # Create the resource group
    # # comment out if the resource group already exists
    # resource_client.resource_groups.create_or_update(rg_name, rg_params)

    # # Create a data factory
    # df_resource = Factory(location='eastus')
    # df = adf_client.factories.create_or_update(rg_name, df_name, df_resource)
    # print_item(df)
    # while df.provisioning_state != 'Succeeded':
    #     df = adf_client.factories.get(rg_name, df_name)
    #     time.sleep(1)

    # Create an Azure Storage linked service
    ls_name = 'storageLinkedService'

    # Specify the name and key of your Azure Storage account
    storage_string = SecureString(
        value='DefaultEndpointsProtocol=https;AccountName=copyblobtosqlstorage;AccountKey=WlOWgmkCT9a8FB2phDVEgZhCfsrP1p/ZT8pA9Rg63iHyXB2+cZcQmHb8h0g+d3c6WoLa1aDef4fCJ4szkj0ipg==')
    ls_azure_storage = AzureStorageLinkedService(connection_string=storage_string)
    ls = adf_client.linked_services.create_or_update(rg_name, df_name, ls_name,
                                                     ls_azure_storage)
    print_item(ls)

    # Create an Azure blob dataset (input)
    ds_name = 'salary_details_in'
    ds_ls = LinkedServiceReference(reference_name=ls_name)
    blob_path = 'data-streaming-sync/csv/salaryDetails/'
    # blob_filename = 'input.txt'
    ds_azure_blob = AzureBlobDataset(linked_service_name=ds_ls, folder_path=blob_path)
    ds = adf_client.datasets.create_or_update(rg_name, df_name, ds_name, ds_azure_blob)
    print_item(ds)

    # Create an Azure SQL Database linked service
    ls_sql_name = 'sqlDatabaseLinkedService'
    rg_sql_name = 'cloud-shell-storage-southeastasia'
    df_sql_name = 'datafactoryBlobToSql'
    sql_conn_string = ('Server=tcp:datafactorysync-kpi-server.database.windows.net,1433;'
                       'Database=datafactorysync_kpi')
    ls_azure_sql_storage = AzureSqlDatabaseLinkedService(
        connection_string=sql_conn_string,
        password=SecureString(value='******'),
        service_principal_id='31fe72da-bb34-4243-a365-288a003d57e9')
    ls = adf_client.linked_services.create_or_update(rg_sql_name, df_sql_name,
                                                     ls_sql_name, ls_azure_sql_storage)
    print_item(ls)

    # Create an Azure SQL table dataset (output)
    dsOut_name = 'salary_details_out'
    ds_sql_ls = LinkedServiceReference(reference_name=ls_sql_name)
    ds_sql_table_name = 'dbo.Salary_Details'
    ds_azure_sql = AzureSqlTableDataset(linked_service_name=ds_sql_ls,
                                        table_name=ds_sql_table_name)
    ds = adf_client.datasets.create_or_update(rg_sql_name, df_sql_name, dsOut_name,
                                              ds_azure_sql)
    print_item(ds)

    # Create a copy activity
    act_name = 'copyBlobtoSql'
    blob_source = BlobSource()
    sql_sink = SqlSink()
    dsin_ref = DatasetReference(reference_name=ds_name)
    dsOut_ref = DatasetReference(reference_name=dsOut_name)
    copy_activity = CopyActivity(name=act_name, inputs=[dsin_ref], outputs=[dsOut_ref],
                                 source=blob_source, sink=sql_sink)

    # Create a pipeline with the copy activity
    p_name = 'copyPipeline'
    params_for_pipeline = {}
    p_obj = PipelineResource(activities=[copy_activity], parameters=params_for_pipeline)
    p = adf_client.pipelines.create_or_update(rg_name, df_name, p_name, p_obj)
    print_item(p)

    # Create a pipeline run
    run_response = adf_client.pipelines.create_run(rg_name, df_name, p_name,
                                                   parameters={})

    # Monitor the pipeline run
    time.sleep(30)
    pipeline_run = adf_client.pipeline_runs.get(rg_name, df_name, run_response.run_id)
    print("\n\tPipeline run status: {}".format(pipeline_run.status))
    filter_params = RunFilterParameters(
        last_updated_after=datetime.now() - timedelta(1),
        last_updated_before=datetime.now() + timedelta(1))
    query_response = adf_client.activity_runs.query_by_pipeline_run(
        rg_name, df_name, pipeline_run.run_id, filter_params)
    print_activity_run_details(query_response.value[0])
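# The samples call print_item() and print_activity_run_details() without defining
# them anywhere in this section. A minimal sketch of what such helpers might look
# like: .name, .id, .status, .output and .error are real attributes of the SDK
# models, but the specific output keys printed below (dataRead, dataWritten,
# copyDuration) are assumptions about what a copy activity reports.
def print_item(group):
    """Print a few basic properties of an Azure resource or Data Factory entity."""
    print("\tName: {}".format(group.name))
    print("\tId: {}".format(group.id))
    if hasattr(group, 'location'):
        print("\tLocation: {}".format(group.location))
    print("")


def print_activity_run_details(activity_run):
    """Print the status and copy statistics of a single activity run."""
    print("\n\tActivity run details\n")
    print("\tActivity run status: {}".format(activity_run.status))
    if activity_run.status == 'Succeeded':
        output = activity_run.output or {}
        print("\tNumber of bytes read: {}".format(output.get('dataRead')))
        print("\tNumber of bytes written: {}".format(output.get('dataWritten')))
        print("\tCopy duration: {}".format(output.get('copyDuration')))
    else:
        print("\tErrors: {}".format(activity_run.error))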
def main():
    # Azure subscription ID
    subscription_id = '97427991-cf70-407e-92c8-19ce6406c848'

    # This program creates this resource group.
    # If it's an existing resource group, comment out the code that creates it.
    rg_name = 'ADFTutorialResourceGroup'

    # The data factory name. It must be globally unique.
    df_name = 'MidTermVet'

    # Specify your Active Directory client ID, client secret, and tenant ID
    credentials = ServicePrincipalCredentials(
        client_id='Default Directory/fee31cfc-b74a-4daf-89e9-af494c350705',
        secret='MicroRu$h2112',
        tenant='fee31cfc-b74a-4daf-89e9-af494c350705')
    resource_client = ResourceManagementClient(credentials, subscription_id)
    adf_client = DataFactoryManagementClient(credentials, subscription_id)

    rg_params = {'location': 'eastus'}
    df_params = {'location': 'eastus'}

    # Create the resource group
    # comment out if the resource group already exists
    resource_client.resource_groups.create_or_update(rg_name, rg_params)

    # Create a data factory
    df_resource = Factory(location='eastus')
    df = adf_client.factories.create_or_update(rg_name, df_name, df_resource)
    print_item(df)
    while df.provisioning_state != 'Succeeded':
        df = adf_client.factories.get(rg_name, df_name)
        time.sleep(1)

    # Create an Azure Storage linked service
    ls_name = 'storageLinkedService'

    # IMPORTANT: specify the name and key of your Azure Storage account.
    storage_string = SecureString(
        'DefaultEndpointsProtocol=https;AccountName=<storageaccountname>;AccountKey=<storageaccountkey>')
    ls_azure_storage = AzureStorageLinkedService(connection_string=storage_string)
    ls = adf_client.linked_services.create_or_update(rg_name, df_name, ls_name,
                                                     ls_azure_storage)
    print_item(ls)

    # Create an Azure blob dataset (input)
    ds_name = 'ds_in'
    ds_ls = LinkedServiceReference(ls_name)
    blob_path = 'adfv2tutorial/input'
    blob_filename = 'input.txt'
    ds_azure_blob = AzureBlobDataset(ds_ls, folder_path=blob_path,
                                     file_name=blob_filename)
    ds = adf_client.datasets.create_or_update(rg_name, df_name, ds_name, ds_azure_blob)
    print_item(ds)

    # Create an Azure blob dataset (output)
    dsOut_name = 'ds_out'
    output_blobpath = 'adfv2tutorial/output'
    dsOut_azure_blob = AzureBlobDataset(ds_ls, folder_path=output_blobpath)
    dsOut = adf_client.datasets.create_or_update(rg_name, df_name, dsOut_name,
                                                 dsOut_azure_blob)
    print_item(dsOut)

    # Create a copy activity
    act_name = 'copyBlobtoBlob'
    blob_source = BlobSource()
    blob_sink = BlobSink()
    dsin_ref = DatasetReference(ds_name)
    dsOut_ref = DatasetReference(dsOut_name)
    copy_activity = CopyActivity(act_name, inputs=[dsin_ref], outputs=[dsOut_ref],
                                 source=blob_source, sink=blob_sink)

    # Create a pipeline with the copy activity
    p_name = 'copyPipeline'
    params_for_pipeline = {}
    p_obj = PipelineResource(activities=[copy_activity], parameters=params_for_pipeline)
    p = adf_client.pipelines.create_or_update(rg_name, df_name, p_name, p_obj)
    print_item(p)

    # Create a pipeline run
    run_response = adf_client.pipelines.create_run(rg_name, df_name, p_name, {})

    # Monitor the pipeline run
    time.sleep(30)
    pipeline_run = adf_client.pipeline_runs.get(rg_name, df_name, run_response.run_id)
    print("\n\tPipeline run status: {}".format(pipeline_run.status))
    activity_runs_paged = list(adf_client.activity_runs.list_by_pipeline_run(
        rg_name, df_name, pipeline_run.run_id,
        datetime.now() - timedelta(1), datetime.now() + timedelta(1)))
    print_activity_run_details(activity_runs_paged[0])
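# The samples above wait a fixed time.sleep(30) before reading the run status, which
# can report an in-progress run or waste time on a fast one. A sketch of a polling
# helper instead; the terminal status strings are assumptions based on the values
# Data Factory reports, and adf_client/rg_name/df_name/run_response come from the
# surrounding sample.
def wait_for_pipeline_run(adf_client, rg_name, df_name, run_id, poll_seconds=15):
    """Poll a pipeline run until it reaches a terminal state; return the final run."""
    while True:
        pipeline_run = adf_client.pipeline_runs.get(rg_name, df_name, run_id)
        print("\tPipeline run status: {}".format(pipeline_run.status))
        if pipeline_run.status in ('Succeeded', 'Failed', 'Cancelled'):
            return pipeline_run
        time.sleep(poll_seconds)

# Usage (hypothetical, replacing the fixed sleep above):
# pipeline_run = wait_for_pipeline_run(adf_client, rg_name, df_name, run_response.run_id)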
def main():
    # Azure subscription ID
    subscription_id = 'alxfed'

    # This program creates this resource group.
    # If it's an existing resource group, comment out the code that creates it.
    rg_name = 'lakemichigan'  # <Azure resource group name>

    # The data factory name. It must be globally unique.
    df_name = 'lakemichigan_datafactory'

    # Specify your Active Directory client ID, client secret, and tenant ID
    credentials = ServicePrincipalCredentials(
        client_id='657b13a2-da4d-44e1-9c0e-5763fd2d67a4',   # <AD client ID>
        secret='I23s]f1:.VN*2u+RWmRJZ/xBxe2dwFys',          # <client secret>
        tenant='0263cb8d-97fc-41e7-b762-40e2ea8dc1da')      # <tenant ID>
    resource_client = ResourceManagementClient(credentials, subscription_id)
    adf_client = DataFactoryManagementClient(credentials, subscription_id)

    rg_params = {'location': 'eastus'}
    df_params = {'location': 'eastus'}

    # Create the resource group
    # comment out if the resource group already exists
    # resource_client.resource_groups.create_or_update(rg_name, rg_params)

    # Create a data factory
    df_resource = Factory(location='eastus')
    df = adf_client.factories.create_or_update(rg_name, df_name, df_resource)
    print_item(df)
    while df.provisioning_state != 'Succeeded':
        df = adf_client.factories.get(rg_name, df_name)
        time.sleep(1)

    # Create an Azure Storage linked service
    ls_name = 'storageLinkedService'

    # Specify the name and key of your Azure Storage account
    string = r'DefaultEndpointsProtocol=https;AccountName=lakemichigan;AccountKey={account_key}'
    storage_string = SecureString(string)
    ls_azure_storage = AzureStorageLinkedService(connection_string=storage_string)
    ls = adf_client.linked_services.create_or_update(rg_name, df_name, ls_name,
                                                     ls_azure_storage)
    print_item(ls)

    # Create an Azure blob dataset (input)
    ds_name = 'ds_in'
    ds_ls = LinkedServiceReference(ls_name)
    blob_path = 'factory/input'
    blob_filename = 'input.txt'
    ds_azure_blob = AzureBlobDataset(ds_ls, folder_path=blob_path,
                                     file_name=blob_filename)
    ds = adf_client.datasets.create_or_update(rg_name, df_name, ds_name, ds_azure_blob)
    print_item(ds)

    # Create an Azure blob dataset (output)
    dsOut_name = 'ds_out'
    output_blobpath = 'factory/output'
    dsOut_azure_blob = AzureBlobDataset(ds_ls, folder_path=output_blobpath)
    dsOut = adf_client.datasets.create_or_update(rg_name, df_name, dsOut_name,
                                                 dsOut_azure_blob)
    print_item(dsOut)

    # Create a copy activity
    act_name = 'copyBlobtoBlob'
    blob_source = BlobSource()
    blob_sink = BlobSink()
    dsin_ref = DatasetReference(ds_name)
    dsOut_ref = DatasetReference(dsOut_name)
    copy_activity = CopyActivity(act_name, inputs=[dsin_ref], outputs=[dsOut_ref],
                                 source=blob_source, sink=blob_sink)

    # Create a pipeline with the copy activity
    p_name = 'copyPipeline'
    params_for_pipeline = {}
    p_obj = PipelineResource(activities=[copy_activity], parameters=params_for_pipeline)
    p = adf_client.pipelines.create_or_update(rg_name, df_name, p_name, p_obj)
    print_item(p)

    # Create a pipeline run
    run_response = adf_client.pipelines.create_run(rg_name, df_name, p_name, {})

    # Monitor the pipeline run
    time.sleep(30)
    pipeline_run = adf_client.pipeline_runs.get(rg_name, df_name, run_response.run_id)
    print("\n\tPipeline run status: {}".format(pipeline_run.status))
    activity_runs_paged = list(adf_client.activity_runs.list_by_pipeline_run(
        rg_name, df_name, pipeline_run.run_id,
        datetime.now() - timedelta(1), datetime.now() + timedelta(1)))
    print_activity_run_details(activity_runs_paged[0])
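# The two blob-to-blob samples above query activity runs with the older
# activity_runs.list_by_pipeline_run(..., after, before) call. In newer releases of
# azure-mgmt-datafactory that operation is query_by_pipeline_run, which takes a
# RunFilterParameters object (the blob-to-SQL sample earlier in this section already
# uses this form). A sketch of the equivalent query, assuming adf_client, rg_name,
# df_name and pipeline_run come from the sample above:
filter_params = RunFilterParameters(
    last_updated_after=datetime.now() - timedelta(1),
    last_updated_before=datetime.now() + timedelta(1))
query_response = adf_client.activity_runs.query_by_pipeline_run(
    rg_name, df_name, pipeline_run.run_id, filter_params)
print_activity_run_details(query_response.value[0])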