Example #1
def ingest_to_ADX(filepath, telemetry_block_blob_service, container_name,
                  blob_account, file_size, tc, vm_uuid, deploy_uuid, config_uuid):
    ingest_source_id = str(uuid.uuid4())
    KCSB_INGEST = KustoConnectionStringBuilder.with_aad_device_authentication(DATA_INGESTION_URI)
    KCSB_INGEST.authority_id = APP_AAD_TENANT_ID
    INGESTION_CLIENT = KustoIngestClient(KCSB_INGEST)
    ing_map = [JsonColumnMapping("vm_uuid", "$.vm_uuid", "string"),
               JsonColumnMapping("deploy_uuid", "$.deployment_description[0].deploy_uuid", "string"),
               JsonColumnMapping("config_uuid", "$.vm_configuration[0].config_uuid", "string"),
               JsonColumnMapping("rawdata", "$", "dynamic")]

    INGESTION_PROPERTIES = IngestionProperties(
        database=DATABASE,
        table=DESTINATION_TABLE,
        dataFormat=DataFormat.JSON,
        ingestionMapping=ing_map,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        flushImmediately=IS_FLUSH_IMMEDIATELY)

    print("Database {} Tabele {}".format(DATABASE,DESTINATION_TABLE))
    
    BLOB_PATH = "https://" + blob_account + ".blob.core.windows.net/" + container_name + "/" + filepath + CLEAN_FILE_TOKEN

    print(BLOB_PATH, ' ', str(file_size), ingest_source_id)
    BLOB_DESCRIPTOR = BlobDescriptor(BLOB_PATH, file_size, ingest_source_id)  # file_size is the raw size of the data in bytes
    INGESTION_CLIENT.ingest_from_blob(BLOB_DESCRIPTOR, ingestion_properties=INGESTION_PROPERTIES)
    tc.context.properties["ingest_source_id"] = ingest_source_id

    min_datatime = 0
    max_datatime = 0
    total_records = 1

    doc_id = save_COSMOS_log(vm_uuid, deploy_uuid, config_uuid, filepath,
                             min_datatime, max_datatime, total_records,
                             ingest_source_id, blob_account, container_name, tc)

    tc.track_event(APP_INSIGHT_INGEST_EVENT_NAME,
                   {'FILE_PATH': filepath, 'DOC_ID': doc_id, 'SOURCE_ID': ingest_source_id},
                   {'TOTAL_RECORDS': total_records, 'FILE_SIZE': file_size,
                    'MIN_DATETIME': min_datatime, 'MAX_DATETIME': max_datatime})
    log_msg = "{} Done queuing up ingestion with Azure Data Explorer {}, Ingest SourceID {}".format(
        LOG_MESSAGE_HEADER, filepath, ingest_source_id)
    print(log_msg)
    tc.track_trace(log_msg)
    tc.flush()
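
Example #1 relies on names defined at module level. A minimal sketch of the imports it appears to assume is below; the exact module paths depend on which azure-kusto-data/azure-kusto-ingest release the sample targets, so treat this as an approximation rather than the original file's header.

# Sketch of the module-level imports Example #1 appears to assume (older SDK layout).
import uuid

from azure.kusto.data.request import KustoConnectionStringBuilder
from azure.kusto.ingest import (BlobDescriptor, DataFormat, IngestionProperties,
                                JsonColumnMapping, KustoIngestClient, ReportLevel)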
Example #2
    def test_blob_info_csv_mapping(self):
        """Tests serialization of csv ingestion blob info."""
        validation_policy = ValidationPolicy(
            ValidationOptions.ValidateCsvInputConstantColumns,
            ValidationImplications.BestEffort)
        columnMapping = ColumnMapping("ColumnName", "cslDataType", ordinal=1)

        properties = IngestionProperties(
            database="database",
            table="table",
            dataFormat=DataFormat.CSV,
            ingestionMapping=[columnMapping],
            additionalTags=["tag"],
            ingestIfNotExists=["ingestIfNotExistTags"],
            ingestByTags=["ingestByTags"],
            dropByTags=["dropByTags"],
            flushImmediately=True,
            reportLevel=ReportLevel.DoNotReport,
            reportMethod=ReportMethod.Queue,
            validationPolicy=validation_policy,
        )
        blob = BlobDescriptor("somepath", 10)
        blob_info = _IngestionBlobInfo(blob,
                                       properties,
                                       auth_context="authorizationContextText")
        self._verify_ingestion_blob_info_result(blob_info.to_json())
Example #3
def ingest_to_ADX(filepath, filesize):
    KCSB_INGEST = KustoConnectionStringBuilder.with_aad_device_authentication(
        DATA_INGESTION_URI)
    KCSB_INGEST.authority_id = AAD_TENANT_ID

    KCSB_ENGINE = KustoConnectionStringBuilder.with_aad_device_authentication(
        URI)
    KCSB_ENGINE.authority_id = AAD_TENANT_ID

    INGESTION_CLIENT = KustoIngestClient(KCSB_INGEST)
    INGESTION_PROPERTIES = IngestionProperties(
        database=DATABASE,
        table=DESTINATION_TABLE,
        dataFormat=DataFormat.CSV,
        mappingReference=DESTINATION_TABLE_COLUMN_MAPPING,
        additionalProperties={'ignoreFirstRecord': 'true'},
        reportLevel=ReportLevel.FailuresAndSuccesses)
    BLOB_PATH = "https://" + SOURCE_CSV_BLOB_ACCOUNT + ".blob.core.windows.net/" + SOURCE_CSV_CONTAINER + "/" + filepath + SOURCE_CSV_BLOB_TOKEN

    BLOB_DESCRIPTOR = BlobDescriptor(
        BLOB_PATH, filesize)  # filesize is the raw size of the data in bytes
    INGESTION_CLIENT.ingest_from_blob(
        BLOB_DESCRIPTOR, ingestion_properties=INGESTION_PROPERTIES)

    print('Done queuing up ingestion with Azure Data Explorer ' + filepath)
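
Example #3 references several module-level constants that the excerpt does not define. The sketch below shows plausible shapes for them; every value is a placeholder, not taken from the original.

# Placeholder configuration assumed by Example #3; all values are illustrative.
URI = "https://<clustername>.<region>.kusto.windows.net"
DATA_INGESTION_URI = "https://ingest-<clustername>.<region>.kusto.windows.net"
AAD_TENANT_ID = "<aad-tenant-id>"
DATABASE = "<database-name>"
DESTINATION_TABLE = "<table-name>"
DESTINATION_TABLE_COLUMN_MAPPING = "<csv-mapping-name>"
SOURCE_CSV_BLOB_ACCOUNT = "<storage-account>"
SOURCE_CSV_CONTAINER = "<container>"
SOURCE_CSV_BLOB_TOKEN = "?<sas-token>"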
Example #4
    def test_blob_ingestion(self, mock_uuid, mock_put_message_in_queue,
                            mock_aad):
        responses.add_callback(
            responses.POST,
            "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt",
            callback=queued_request_callback,
            content_type="application/json")

        ingest_client = ManagedStreamingIngestClient.from_dm_kcsb(
            "https://ingest-somecluster.kusto.windows.net")
        ingestion_properties = IngestionProperties(database="database",
                                                   table="table")

        blob_path = (
            "https://storageaccount.blob.core.windows.net/tempstorage/database__table__11111111-1111-1111-1111-111111111111__tmpbvk40leg?sp=rl&st=2020-05-20T13"
            "%3A38%3A37Z&se=2020-05-21T13%3A38%3A37Z&sv=2019-10-10&sr=c&sig=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
        )
        result = ingest_client.ingest_from_blob(
            BlobDescriptor(blob_path, 1),
            ingestion_properties=ingestion_properties)

        assert result.status == IngestionStatus.QUEUED

        assert_queued_upload(
            mock_put_message_in_queue,
            mock_upload_blob_from_stream=None,
            expected_url="https://storageaccount.blob.core.windows.net/tempstorage/database__table__11111111-1111-1111-1111-111111111111__tmpbvk40leg?",
        )
Example #5
    def test_blob_info_json_mapping(self):
        """Tests serialization of json ingestion blob info."""
        validation_policy = ValidationPolicy(
            ValidationOptions.ValidateCsvInputConstantColumns,
            ValidationImplications.BestEffort)
        properties = IngestionProperties(
            database="database",
            table="table",
            dataFormat=DataFormat.json,
            mapping=[JsonColumnMapping("ColumnName", "jsonpath", "datatype")],
            additionalTags=["tag"],
            ingestIfNotExists=["ingestIfNotExistTags"],
            ingestByTags=["ingestByTags"],
            dropByTags=["dropByTags"],
            flushImmediately=True,
            reportLevel=ReportLevel.DoNotReport,
            reportMethod=ReportMethod.QueueAndTable,
            validationPolicy=validation_policy,
        )
        blob = BlobDescriptor("somepath", 10)
        blob_info = _IngestionBlobInfo(blob,
                                       properties,
                                       deleteSourcesOnSuccess=True,
                                       authContext="authorizationContextText")
        self._verify_ingestion_blob_info_result(blob_info.to_json())
Example #6
    @classmethod
    def ingest_from_blob(cls,
                         ingest_client: QueuedIngestClient,
                         database_name: str,
                         table_name: str,
                         blob_url: str,
                         data_format: DataFormat,
                         mapping_name: str = None) -> None:
        """
        Ingest data from a blob.
        :param ingest_client: Client to ingest data
        :param database_name: DB name
        :param table_name: Table name
        :param blob_url: Blob URI
        :param data_format: Given data format
        :param mapping_name: Desired mapping name
        """
        ingestion_properties = cls.create_ingestion_properties(
            database_name, table_name, data_format, mapping_name)

        # Tip 1: For optimal ingestion batching and performance, specify the uncompressed data
        # size in the file descriptor instead of the default below of 0. Otherwise, the service
        # will determine the file size, requiring an additional s2s call, and it may not be
        # accurate for compressed files.
        # Tip 2: To correlate between ingestion operations in your applications and Kusto, set
        # the source ID and log it somewhere.
        blob_descriptor = BlobDescriptor(blob_url,
                                         size=0,
                                         source_id=str(uuid.uuid4()))
        ingest_client.ingest_from_blob(
            blob_descriptor, ingestion_properties=ingestion_properties)
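
The create_ingestion_properties helper called above is not part of the excerpt. A minimal sketch of what it might look like, assuming it simply forwards its arguments to IngestionProperties using the newer snake_case keyword API:

    # Hypothetical companion helper; not shown in the original excerpt.
    @classmethod
    def create_ingestion_properties(cls, database_name: str, table_name: str,
                                    data_format: DataFormat,
                                    mapping_name: str = None) -> IngestionProperties:
        return IngestionProperties(
            database=database_name,
            table=table_name,
            data_format=data_format,
            ingestion_mapping_reference=mapping_name)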
Example #7
    def test_blob_info_json_mapping(self):
        """Tests serialization of json ingestion blob info."""
        validation_policy = ValidationPolicy(
            ValidationOptions.ValidateCsvInputConstantColumns,
            ValidationImplications.BestEffort)
        properties = IngestionProperties(
            database="database",
            table="table",
            data_format=DataFormat.JSON,
            column_mappings=[
                ColumnMapping("ColumnName", "datatype", path="jsonpath")
            ],
            additional_tags=["tag"],
            ingest_if_not_exists=["ingestIfNotExistTags"],
            ingest_by_tags=["ingestByTags"],
            drop_by_tags=["dropByTags"],
            flush_immediately=True,
            report_level=ReportLevel.DoNotReport,
            report_method=ReportMethod.Queue,
            validation_policy=validation_policy,
        )
        blob = BlobDescriptor("somepath", 10)
        blob_info = IngestionBlobInfo(blob,
                                      properties,
                                      auth_context="authorizationContextText")
        self._verify_ingestion_blob_info_result(blob_info.to_json())
Example #8
def ingest_to_ADX(filepath, telemetry_block_blob_service, container_name,
                  blob_account, tc):
    ingest_source_id = str(uuid.uuid4())
    #file_size=BlockBlobService.get_blob_properties(telemetry_block_blob_service,container_name,filepath).properties.content_length
    #print (filepath+" File Size "+str(file_size))

    KCSB_INGEST = KustoConnectionStringBuilder.with_aad_device_authentication(
        DATA_INGESTION_URI)
    KCSB_INGEST.authority_id = APP_AAD_TENANT_ID

    vm_uuid, config_uuid, deploy_uuid, file_size, min_datatime, max_datatime, total_records = get_uuids_from_csv(
        telemetry_block_blob_service, container_name, filepath)
    dropByTag = vm_uuid + '_' + config_uuid + '_' + deploy_uuid

    INGESTION_CLIENT = KustoIngestClient(KCSB_INGEST)
    INGESTION_PROPERTIES = IngestionProperties(
        database=DATABASE,
        table=DESTINATION_TABLE,
        dataFormat=DataFormat.CSV,
        mappingReference=DESTINATION_TABLE_COLUMN_MAPPING,
        additionalProperties={
            'ignoreFirstRecord': 'true',
            'reportMethod': 'QueueAndTable'
        },
        reportLevel=ReportLevel.FailuresAndSuccesses,
        dropByTags=[dropByTag],
        flushImmediately=IS_FLUSH_IMMEDIATELY)

    BLOB_PATH = "https://" + SOURCE_OSMETRICS_BLOB_ACCOUNT + ".blob.core.windows.net/" + SOURCE_OSMETRICS_CONTAINER + "/" + filepath + SOURCE_OSMETRICS_FILE_TOKEN
    #print (BLOB_PATH,' ',str(file_size))
    BLOB_DESCRIPTOR = BlobDescriptor(
        BLOB_PATH, file_size,
        ingest_source_id)  # file_size is the raw size of the data in bytes

    INGESTION_CLIENT.ingest_from_blob(
        BLOB_DESCRIPTOR, ingestion_properties=INGESTION_PROPERTIES)

    tc.context.properties["ingest_source_id"] = str(ingest_source_id)

    doc_id = save_COSMOS_log(vm_uuid, deploy_uuid, config_uuid, filepath,
                             min_datatime, max_datatime, total_records,
                             ingest_source_id, blob_account, container_name,
                             tc)

    tc.track_event(APP_INSIGHT_INGEST_EVENT_NAME, {
        'FILE_PATH': filepath,
        'DOC_ID': doc_id,
        'SOURCE_ID': ingest_source_id
    }, {
        'TOTAL_RECORDS': total_records,
        'FILE_SIZE': file_size,
        'MIN_DATETIME': min_datatime,
        'MAX_DATETIME': max_datatime
    })
    log_msg = "{} Done queuing up ingestion with Azure Data Explorer {}, Ingest SourceID {}".format(
        LOG_MESSAGE_HEADER, filepath, ingest_source_id)
    print(log_msg)
    tc.track_trace(log_msg)
    tc.flush()
Example #9
def ingestBlob(client, db, blob, properties):
    INGESTION_PROPERTIES = IngestionProperties(
        database=db,
        table=blob['table'],
        dataFormat=DataFormat(blob['format']),
        mappingReference=blob['ingestionMapping'],
        additionalProperties=properties,
        reportLevel=ReportLevel.FailuresAndSuccesses)
    BLOB_DESCRIPTOR = BlobDescriptor(blob['path'], blob['size'])
    try:
        client.ingest_from_blob(BLOB_DESCRIPTOR, ingestion_properties=INGESTION_PROPERTIES)
        logging.info("Blob %s ingested successfully.", blob['name'])
    except Exception as e:
        logging.error("Error ingesting blob %s: %s", blob['name'], e)
Example #10
    def test_uuid_blob_descriptor(self):
        dummy_file = "dummy"

        descriptor = BlobDescriptor(dummy_file)
        assert descriptor.source_id
        assert descriptor.source_id != TestDescriptors.TEST_UUID
        assert uuid.UUID(str(descriptor.source_id), version=4)

        descriptor = BlobDescriptor(dummy_file,
                                    source_id=TestDescriptors.TEST_UUID_STR)
        assert descriptor.source_id == TestDescriptors.TEST_UUID

        descriptor = BlobDescriptor(dummy_file,
                                    source_id=TestDescriptors.TEST_UUID)
        assert descriptor.source_id == TestDescriptors.TEST_UUID

        with pytest.raises(ValueError):
            BlobDescriptor(dummy_file, source_id=TestDescriptors.INVALID_UUID)
Example #11
def ingest(file, size):
    props = IngestionProperties(
        database="GitHub",
        table="GithubEvent",
        dataFormat=DataFormat.json,
        mapping=mapping,
        ingestIfNotExists=[file],  # skip this blob if an extent already carries this tag
        ingestByTags=[file],
        dropByTags=[file[57:67]],  # tag taken from a fixed slice of the blob URL
    )

    client.ingest_from_blob(BlobDescriptor(file, size), props)

    print("ingested {}".format(file))
Example #12
def ingest_to_adx(file_path, file_size, target_database, target_table,
                  msg_time, modification_time):
    """
    Trigger ADX to ingest the specified file from Azure Data Lake,
    preparing the ADX ingestion metadata.
    :param file_path: The full path of the blob file
    :param file_size: The raw size of the blob file in bytes
    :param target_database: The target database
    :param target_table: The target table
    :param msg_time: The msg_time from Event Grid
    :param modification_time: The last modification time of the blob
    :return: The ingestion source id
    """
    logging.info(f'{LOG_MESSAGE_HEADER} start to ingest to adx')
    ingest_source_id = str(uuid.uuid4())
    if SOURCE_TELEMETRY_FILE_TOKEN.startswith('?'):
        blob_path = file_path + SOURCE_TELEMETRY_FILE_TOKEN
    else:
        blob_path = file_path + '?' + SOURCE_TELEMETRY_FILE_TOKEN
    logging.info(f"{LOG_MESSAGE_HEADER} blob_path:{blob_path}, ingest_source_id:{ingest_source_id}")
    logging.info('%s FILEURL : %s, INGESTION URL: %s, Database: %s, '
                 'Table: %s, FILESIZE: %s, msg_time: %s, modification_time: %s',
                 LOG_MESSAGE_HEADER, blob_path, INGESTION_SERVER_URI,
                 target_database, target_table, file_size, msg_time, modification_time)

    ingestion_properties = IngestionProperties(
        database=target_database,
        table=target_table,
        dataFormat=DataFormat.JSON,
        ingestion_mapping_reference=INGESTION_MAPPING,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        additionalProperties={
            'reportMethod': 'QueueAndTable',
            'creationTime': msg_time.strftime("%Y-%m-%d %H:%M"),
            'modificationTime': modification_time.strftime("%Y-%m-%d %H:%M")
        },
        flushImmediately=IS_FLUSH_IMMEDIATELY)

    blob_descriptor = BlobDescriptor(blob_path, file_size,
                                     ingest_source_id)  # file_size is the raw size of the data in bytes
    logging.info(f"{LOG_MESSAGE_HEADER} start to ingest to queue")
    start_time = time.time()
    KUSTO_INGESTION_CLIENT.ingest_from_blob(blob_descriptor, ingestion_properties=ingestion_properties)
    logging.info(f"{LOG_MESSAGE_HEADER} ingest process time {time.time()-start_time}")

    return ingest_source_id
Example #13
    def test_blob_json_mapping_reference(self):
        """Tests serialization of ingestion blob info with json mapping reference."""
        validation_policy = ValidationPolicy(
            ValidationOptions.ValidateCsvInputConstantColumns,
            ValidationImplications.BestEffort)
        properties = IngestionProperties(
            database="database",
            table="table",
            dataFormat=DataFormat.JSON,
            mappingReference="jsonMappingReference",
            additionalTags=["tag"],
            ingestIfNotExists=["ingestIfNotExistTags"],
            ingestByTags=["ingestByTags"],
            dropByTags=["dropByTags"],
            flushImmediately=True,
            reportLevel=ReportLevel.DoNotReport,
            reportMethod=ReportMethod.Queue,
            validationPolicy=validation_policy,
        )
        blob = BlobDescriptor("somepath", 10)
        blob_info = _IngestionBlobInfo(blob,
                                       properties,
                                       auth_context="authorizationContextText")
        self._verify_ingestion_blob_info_result(blob_info.to_json())
Example #14
                                             create_mapping_command)

    ingestion_client = KustoIngestClient(kcsb_ingest)

    # All ingestion properties: https://docs.microsoft.com/en-us/azure/kusto/management/data-ingestion/#ingestion-properties
    ingestion_props = IngestionProperties(
        reportLevel=reportLevel,
        database=kusto_database,
        table=destination_table,
        dataFormat=DataFormat.csv,
        mappingReference=column_mapping_name,
        additionalProperties={'ignoreFirstRecord': 'true'})
    blobProps = BlockBlobService.get_blob_properties(blob_service, container,
                                                     file_name).properties
    file_size = blobProps.content_length
    blob_descriptor = BlobDescriptor(
        blob_path, file_size)  # Raw size of the data in bytes

    ingestion_client.ingest_from_blob(blob_descriptor,
                                      ingestion_properties=ingestion_props)

    print(
        """Queued blob '{FILE_NAME}' ({FILE_SIZE} bytes) for ingestion into ADX table '{DESTINATION_TABLE}'"""
        .format(FILE_NAME=file_name,
                FILE_SIZE=file_size,
                DESTINATION_TABLE=destination_table))

    # query = """{} | count""".format(destination_table)

    # response = kusto_client.execute_query(kusto_database, query)

    # count_query_df = dataframe_from_result_table(response.primary_results[0])
Example #15
    # in case status update for success are also required
    # report_level=ReportLevel.FailuresAndSuccesses,
    # in case a mapping is required
    # ingestion_mapping_reference="{json_mapping_that_already_exists_on_table}"
    # ingestion_mapping_type=IngestionMappingType.JSON
)

# ingest from file
file_descriptor = FileDescriptor(
    "{filename}.csv", 3333)  # 3333 is the raw size of the data in bytes.
client.ingest_from_file(file_descriptor, ingestion_properties=ingestion_props)
client.ingest_from_file("{filename}.csv", ingestion_properties=ingestion_props)

# ingest from blob
blob_descriptor = BlobDescriptor(
    "https://{path_to_blob}.csv.gz?sp=rl&st=2020-05-20T13:38:37Z&se=2020-05-21T13:38:37Z&sv=2019-10-10&sr=c&sig=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
    10,
)  # 10 is the raw size of the data in bytes.
client.ingest_from_blob(blob_descriptor, ingestion_properties=ingestion_props)

# ingest from dataframe
import pandas

fields = ["id", "name", "value"]
rows = [[1, "abc", 15.3], [2, "cde", 99.9]]

df = pandas.DataFrame(data=rows, columns=fields)

client.ingest_from_dataframe(df, ingestion_properties=ingestion_props)

# ingest a whole folder.
import os
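
The sample is cut off immediately after this comment and import. One plausible continuation for ingesting every file in a folder, sketched here rather than recovered from the original:

# Sketch only; the original sample is truncated at this point.
folder_path = "folder/path"
for file_name in os.listdir(folder_path):
    client.ingest_from_file(os.path.join(folder_path, file_name),
                            ingestion_properties=ingestion_props)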
Example #16
    dataFormat=DataFormat.CSV,
    # in case status update for success are also required
    # reportLevel=ReportLevel.FailuresAndSuccesses,
    # in case a mapping is required
    # ingestionMappingReference="{json_mapping_that_already_exists_on_table}"
    # ingestionMappingType=IngestionMappingType.Json
)

# ingest from file
file_descriptor = FileDescriptor("{filename}.csv", 3333)  # 3333 is the raw size of the data in bytes.
client.ingest_from_file(file_descriptor, ingestion_properties=ingestion_props)
client.ingest_from_file("{filename}.csv", ingestion_properties=ingestion_props)


# ingest from blob
blob_descriptor = BlobDescriptor("https://{path_to_blob}.csv.gz?sas", 10)  # 10 is the raw size of the data in bytes.
client.ingest_from_blob(blob_descriptor, ingestion_properties=ingestion_props)

# ingest from dataframe
import pandas

fields = ["id", "name", "value"]
rows = [[1, "abc", 15.3], [2, "cde", 99.9]]

df = pandas.DataFrame(data=rows, columns=fields)

client.ingest_from_dataframe(df, ingestion_properties=ingestion_props)

# ingest a whole folder.
import os
Example #17
INGESTION_PROPERTIES = IngestionProperties(database="database name",
                                           table="table name",
                                           dataFormat=DataFormat.csv)

INGEST_CLIENT = KustoIngestClient(
    "https://ingest-<clustername>.kusto.windows.net")

KCSB = KustoConnectionStringBuilder.with_aad_application_key_authentication(
    "https://ingest-<clustername>.kusto.windows.net", "aad app id", "secret")
INGEST_CLIENT = KustoIngestClient(KCSB)

FILE_DESCRIPTOR = FileDescriptor(
    "E:\\filePath.csv", 3333)  # 3333 is the raw size of the data in bytes.
INGEST_CLIENT.ingest_from_multiple_files(
    [FILE_DESCRIPTOR],
    delete_sources_on_success=True,
    ingestion_properties=INGESTION_PROPERTIES)

INGEST_CLIENT.ingest_from_multiple_files(
    ["E:\\filePath.csv"],
    delete_sources_on_success=True,
    ingestion_properties=INGESTION_PROPERTIES)

BLOB_DESCRIPTOR = BlobDescriptor(
    "https://path-to-blob.csv.gz?sas",
    10)  # 10 is the raw size of the data in bytes.
INGEST_CLIENT.ingest_from_multiple_blobs(
    [BLOB_DESCRIPTOR],
    delete_sources_on_success=True,
    ingestion_properties=INGESTION_PROPERTIES)
Example #18
ingestion_properties = IngestionProperties(database="database name",
                                           table="table name",
                                           dataFormat=DataFormat.csv)

ingest_client = KustoIngestClient(
    "https://ingest-<clustername>.kusto.windows.net")
ingest_client = KustoIngestClient(
    "https://ingest-<clustername>.kusto.windows.net",
    client_id="aad app id",
    client_secret="secret")

file_descriptor = FileDescriptor(
    "E:\\filePath.csv", 3333)  # 3333 is the raw size of the data in bytes.
ingest_client.ingest_from_multiple_files(
    [file_descriptor],
    delete_sources_on_success=True,
    ingestion_properties=ingestion_properties)

ingest_client.ingest_from_multiple_files(
    ["E:\\filePath.csv"],
    delete_sources_on_success=True,
    ingestion_properties=ingestion_properties)

blob_descriptor = BlobDescriptor(
    "https://path-to-blob.csv.gz?sas",
    10)  # 10 is the raw size of the data in bytes.
ingest_client.ingest_from_multiple_blobs(
    [blob_descriptor],
    delete_sources_on_success=True,
    ingestion_properties=ingestion_properties)
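
Several examples above set reportLevel=ReportLevel.FailuresAndSuccesses with a queue-based report method. To read those reports back, recent versions of azure-kusto-ingest expose ingestion status queues; a minimal sketch, assuming that API is available in your SDK version:

# Sketch: polling the ingestion status queues after queuing ingestions.
from azure.kusto.ingest.status import KustoIngestStatusQueues

status_queues = KustoIngestStatusQueues(ingest_client)
if not status_queues.success.is_empty():
    for message in status_queues.success.pop(10):  # read up to 10 success reports
        print(message)
if not status_queues.failure.is_empty():
    for message in status_queues.failure.pop(10):  # read up to 10 failure reports
        print(message)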