Пример #1
0
Файл: net.py Проект: up1/rally
def download(url, local_path, expected_size_in_bytes=None):
    """
    Downloads a single file from a URL to the provided local path.

    :param url: The remote URL specifying one file that should be downloaded. May be either a HTTP or HTTPS URL.
    :param local_path: The local file name of the file that should be downloaded.
    :param expected_size_in_bytes: The expected file size in bytes if known. It will be used to verify that all data have been downloaded.
    :raises exceptions.DataError: If the downloaded size does not match ``expected_size_in_bytes``.
    """
    # Download to a temporary file first so a partial download can never
    # masquerade as a complete data set under ``local_path``.
    tmp_data_set_path = local_path + ".tmp"
    try:
        with HTTP.request("GET",
                          url,
                          preload_content=False,
                          retries=10,
                          timeout=urllib3.Timeout(connect=45, read=240)) as r, \
                open(tmp_data_set_path, "wb") as out_file:
            shutil.copyfileobj(r, out_file)
    # Fix: use explicit ``except BaseException`` instead of a bare ``except:``
    # (PEP 8); behavior is identical (cleanup, then re-raise the root cause).
    except BaseException:
        if os.path.isfile(tmp_data_set_path):
            os.remove(tmp_data_set_path)
        raise
    else:
        download_size = os.path.getsize(tmp_data_set_path)
        if expected_size_in_bytes is not None and download_size != expected_size_in_bytes:
            if os.path.isfile(tmp_data_set_path):
                os.remove(tmp_data_set_path)
            raise exceptions.DataError(
                "Download of [%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected. Please retry."
                % (local_path, download_size, expected_size_in_bytes))
        # Only publish the file under its final name once the size check passed.
        os.rename(tmp_data_set_path, local_path)
Пример #2
0
def mandatory(params, key, op):
    """
    Looks up a mandatory parameter in a parameter-source dict.

    :param params: The parameter dict provided by the parameter source.
    :param key: The key that must be present in ``params``.
    :param op: The operation on whose behalf the lookup happens (used in the error message).
    :return: The value stored under ``key``.
    :raises exceptions.DataError: If ``key`` is not present in ``params``.
    """
    try:
        value = params[key]
    except KeyError:
        raise exceptions.DataError(
            "Parameter source for operation '%s' did not provide the mandatory parameter '%s'. Please add it to your"
            " parameter source." % (str(op), key))
    return value
Пример #3
0
    def __call__(self, es, params):
        """
        Issues one bulk request and returns the number of indexed documents.

        Raises ``exceptions.DataError`` on the first bulk item whose index
        status is not 201.
        """
        bulk_params = {"pipeline": params["pipeline"]} if "pipeline" in params else {}

        if params["action_metadata_present"]:
            # Action-and-metadata lines are interleaved with the documents,
            # so only every second line of the body is an actual document.
            bulk_size = len(params["body"]) // 2
            response = es.bulk(body=params["body"], params=bulk_params)
        else:
            bulk_size = len(params["body"])
            response = es.bulk(body=params["body"],
                               index=params["index"],
                               type=params["type"],
                               params=bulk_params)

        if response["errors"]:
            for idx, item in enumerate(response["items"]):
                if item["index"]["status"] != 201:
                    msg = "Could not bulk index. "
                    msg += "Error in line [%d]\n" % (idx + 1)
                    msg += "Bulk item: [%s]\n" % item
                    msg += "Buffer size is [%d]\n" % idx
                    raise exceptions.DataError(msg)
        return bulk_size, "docs"
Пример #4
0
def download_via_http(url, local_path, expected_size_in_bytes=None):
    """
    Downloads a single file via HTTP(S) from a URL to the provided local path.

    :param url: The remote URL specifying one file that should be downloaded.
    :param local_path: The local file name of the file that should be downloaded.
    :param expected_size_in_bytes: The expected file size in bytes if known. It will be used to verify that all data have been downloaded.
    :raises exceptions.DataError: If the downloaded size does not match ``expected_size_in_bytes``.
    """
    # Download to a temporary file first so a partial download can never
    # masquerade as a complete file under ``local_path``.
    tmp_data_set_path = local_path + ".tmp"
    http = urllib3.PoolManager()
    try:
        with http.request("GET",
                          url,
                          preload_content=False,
                          retries=10,
                          timeout=urllib3.Timeout(connect=45, read=240)) as r, \
                open(tmp_data_set_path, "wb") as out_file:
            shutil.copyfileobj(r, out_file)
    # Fix: use explicit ``except BaseException`` instead of a bare ``except:``
    # (PEP 8); behavior is identical (cleanup, then re-raise the root cause).
    except BaseException:
        if os.path.isfile(tmp_data_set_path):
            os.remove(tmp_data_set_path)
        raise
    else:
        download_size = os.path.getsize(tmp_data_set_path)
        if expected_size_in_bytes is not None and download_size != expected_size_in_bytes:
            if os.path.isfile(tmp_data_set_path):
                os.remove(tmp_data_set_path)
            raise exceptions.DataError(
                "Download of [%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected. Please retry."
                % (local_path, download_size, expected_size_in_bytes))
        # Only publish the file under its final name once the size check passed.
        os.rename(tmp_data_set_path, local_path)
Пример #5
0
 def update(self, distribution_version):
     """
     Updates this repository so that it matches ``distribution_version``.

     Tries, in order: the best-matching remote branch (checkout + rebase),
     the best-matching local branch, and finally a matching tag. The
     resulting revision is stored in ``self.revision``.

     :param distribution_version: The distribution version to match against
         branch and tag names.
     :raises exceptions.DataError: If a ``SupplyError`` occurs while updating.
     """
     try:
         if self.remote:
             branch = versions.best_match(
                 git.branches(self.repo_dir, remote=self.remote),
                 distribution_version)
             if branch:
                 # Allow uncommitted changes iff we do not have to change the branch
                 self.logger.info(
                     "Checking out [%s] in [%s] for distribution version [%s].",
                     branch, self.repo_dir, distribution_version)
                 git.checkout(self.repo_dir, branch=branch)
                 self.logger.info(
                     "Rebasing on [%s] in [%s] for distribution version [%s].",
                     branch, self.repo_dir, distribution_version)
                 try:
                     git.rebase(self.repo_dir, branch=branch)
                     self.revision = git.head_revision(self.repo_dir)
                 except exceptions.SupplyError:
                     # Best effort: a rebase blocked by local changes is only
                     # reported, not treated as fatal.
                     self.logger.exception(
                         "Cannot rebase due to local changes in [%s]",
                         self.repo_dir)
                     console.warn(
                         "Local changes in [%s] prevent %s update from remote. Please commit your changes."
                         % (self.repo_dir, self.resource_name))
                 return
             else:
                 msg = "Could not find %s remotely for distribution version [%s]. Trying to find %s locally." % \
                       (self.resource_name, distribution_version, self.resource_name)
                 self.logger.warning(msg)
         # Fall back to local branches (no remote configured, or no remote match).
         branch = versions.best_match(
             git.branches(self.repo_dir, remote=False),
             distribution_version)
         if branch:
             # Skip the checkout when we are already on the matching branch.
             if git.current_branch(self.repo_dir) != branch:
                 self.logger.info(
                     "Checking out [%s] in [%s] for distribution version [%s].",
                     branch, self.repo_dir, distribution_version)
                 git.checkout(self.repo_dir, branch=branch)
                 self.revision = git.head_revision(self.repo_dir)
         else:
             # No branch matched at all - as a last resort, try tags.
             self.logger.info(
                 "No local branch found for distribution version [%s] in [%s]. Checking tags.",
                 distribution_version, self.repo_dir)
             tag = self._find_matching_tag(distribution_version)
             if tag:
                 self.logger.info(
                     "Checking out tag [%s] in [%s] for distribution version [%s].",
                     tag, self.repo_dir, distribution_version)
                 git.checkout(self.repo_dir, branch=tag)
                 self.revision = git.head_revision(self.repo_dir)
             else:
                 raise exceptions.SystemSetupError(
                     "Cannot find %s for distribution version %s" %
                     (self.resource_name, distribution_version))
     except exceptions.SupplyError as e:
         tb = sys.exc_info()[2]
         # NOTE(review): relies on the exception exposing a ``message``
         # attribute (Rally-style error classes) - confirm this holds for
         # every SupplyError raised above.
         raise exceptions.DataError("Cannot update %s in [%s] (%s)." %
                                    (self.resource_name, self.repo_dir,
                                     e.message)).with_traceback(tb)
Пример #6
0
def download(url,
             local_path,
             expected_size_in_bytes=None,
             progress_indicator=None):
    """
    Downloads a single file from a URL to the provided local path.

    :param url: The remote URL specifying one file that should be downloaded. May be either a HTTP or HTTPS URL.
    :param local_path: The local file name of the file that should be downloaded.
    :param expected_size_in_bytes: The expected file size in bytes if known. It will be used to verify that all data have been downloaded.
    :param progress_indicator: A callable that can be used to report progress to the user. It is expected to take two parameters
        ``bytes_read`` and ``total_bytes``. If not provided, no progress is shown. Note that ``total_bytes`` is derived from
        the ``Content-Length`` header and not from the parameter ``expected_size_in_bytes``.
    :raises exceptions.DataError: If the downloaded size does not match ``expected_size_in_bytes``.
    """
    # Download to a temporary file first so a partial download can never
    # masquerade as a complete data set under ``local_path``.
    tmp_data_set_path = local_path + ".tmp"
    try:
        with __http().request("GET",
                              url,
                              preload_content=False,
                              retries=10,
                              timeout=urllib3.Timeout(connect=45,
                                                      read=240)) as r, open(
                                                          tmp_data_set_path,
                                                          "wb") as out_file:
            if r.status > 299:
                raise urllib.error.HTTPError(url, r.status, "", None, None)
            # The server may not send a Content-Length header (e.g. chunked
            # transfer encoding): ``getheader`` then returns None and int()
            # raises TypeError; a malformed value raises ValueError. Fix:
            # catch exactly those instead of the former blanket BaseException.
            try:
                size_from_content_header = int(r.getheader("Content-Length"))
                if expected_size_in_bytes is None:
                    expected_size_in_bytes = size_from_content_header
            except (TypeError, ValueError):
                size_from_content_header = None

            chunk_size = 2**16
            bytes_read = 0

            for chunk in r.stream(chunk_size):
                out_file.write(chunk)
                bytes_read += len(chunk)
                if progress_indicator and size_from_content_header:
                    progress_indicator(bytes_read, size_from_content_header)
    except BaseException:
        # Remove the partial download but let the original error propagate.
        if os.path.isfile(tmp_data_set_path):
            os.remove(tmp_data_set_path)
        raise
    else:
        download_size = os.path.getsize(tmp_data_set_path)
        if expected_size_in_bytes is not None and download_size != expected_size_in_bytes:
            if os.path.isfile(tmp_data_set_path):
                os.remove(tmp_data_set_path)
            raise exceptions.DataError(
                "Download of [%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected. Please retry."
                % (local_path, download_size, expected_size_in_bytes))
        # Only publish the file under its final name once the size check passed.
        os.rename(tmp_data_set_path, local_path)
Пример #7
0
 def __call__(self, es, params):
     """
     Executes a single bulk request and returns the number of indexed documents.

     Raises ``exceptions.DataError`` on the first bulk item whose index
     status is not 201.
     """
     response = es.bulk(body=params["body"])
     if response["errors"]:
         for line_number, item in enumerate(response["items"]):
             if item["index"]["status"] != 201:
                 raise exceptions.DataError(
                     "Could not bulk index. "
                     "Error in line [%d]\n"
                     "Bulk item: [%s]\n"
                     "Buffer size is [%d]\n"
                     % (line_number + 1, item, line_number))
     # at this point, the bulk will always contain a separate meta data line
     return len(params["body"]) // 2
Пример #8
0
 def update(self, distribution_version):
     """
     Updates this repository so that it matches ``distribution_version``.

     Prefers the best-matching remote branch (checkout + rebase) and falls
     back to the best-matching local branch. Remote lookups are skipped in
     offline mode.

     :param distribution_version: The distribution version to match against
         branch names.
     :raises exceptions.DataError: If a ``SupplyError`` occurs while updating.
     """
     try:
         if self.remote and not self.offline:
             branch = versions.best_match(
                 git.branches(self.repo_dir, remote=self.remote),
                 distribution_version)
             if branch:
                 # Allow uncommitted changes iff we do not have to change the branch
                 logger.info(
                     "Checking out [%s] in [%s] for distribution version [%s]."
                     % (branch, self.repo_dir, distribution_version))
                 git.checkout(self.repo_dir, branch=branch)
                 logger.info(
                     "Rebasing on [%s] in [%s] for distribution version [%s]."
                     % (branch, self.repo_dir, distribution_version))
                 try:
                     git.rebase(self.repo_dir, branch=branch)
                 except exceptions.SupplyError:
                     # Best effort: a rebase blocked by local changes is only
                     # reported, not treated as fatal.
                     logger.exception(
                         "Cannot rebase due to local changes in [%s]" %
                         self.repo_dir)
                     console.warn(
                         "Local changes in [%s] prevent %s update from remote. Please commit your changes."
                         % (self.repo_dir, self.resource_name))
                 return
             else:
                 msg = "Could not find %s remotely for distribution version [%s]. Trying to find %s locally." % \
                       (self.resource_name, distribution_version, self.resource_name)
                 logger.warning(msg)
         # Fall back to local branches (offline mode, no remote, or no remote match).
         branch = versions.best_match(
             git.branches(self.repo_dir, remote=False),
             distribution_version)
         if branch:
             logger.info(
                 "Checking out [%s] in [%s] for distribution version [%s]."
                 % (branch, self.repo_dir, distribution_version))
             git.checkout(self.repo_dir, branch=branch)
         else:
             raise exceptions.SystemSetupError(
                 "Cannot find %s for distribution version %s" %
                 (self.resource_name, distribution_version))
     except exceptions.SupplyError:
         tb = sys.exc_info()[2]
         raise exceptions.DataError(
             "Cannot update [%s] in [%s]." %
             (self.resource_name, self.repo_dir)).with_traceback(tb)
Пример #9
0
def download(url,
             local_path,
             expected_size_in_bytes=None,
             progress_indicator=None):
    """
    Downloads a single file from a URL to the provided local path.

    Dispatches to a bucket download for ``s3://`` / ``gs://`` URLs and to an
    HTTP(S) download otherwise. Data are written to a temporary ``.tmp`` file
    first and only renamed to ``local_path`` after the size check has passed.

    :param url: The remote URL specifying one file that should be downloaded. May be either a HTTP, HTTPS or S3 URL.
    :param local_path: The local file name of the file that should be downloaded.
    :param expected_size_in_bytes: The expected file size in bytes if known. It will be used to verify that all data have been downloaded.
    :param progress_indicator: A callable that can be used to report progress to the user. It is expected to take two parameters
        ``bytes_read`` and ``total_bytes``. If not provided, no progress is shown. Note that ``total_bytes`` is derived from
        the ``Content-Length`` header and not from the parameter ``expected_size_in_bytes`` for downloads via HTTP(S).
    """
    temp_path = local_path + ".tmp"
    try:
        scheme = urllib3.util.parse_url(url).scheme
        if scheme in ["s3", "gs"]:
            expected_size_in_bytes = download_from_bucket(
                scheme, url, temp_path, expected_size_in_bytes,
                progress_indicator)
        else:
            expected_size_in_bytes = download_http(
                url, temp_path, expected_size_in_bytes, progress_indicator)
    except BaseException:
        # Remove the partial download but let the original error propagate.
        if os.path.isfile(temp_path):
            os.remove(temp_path)
        raise
    else:
        actual_size = os.path.getsize(temp_path)
        if expected_size_in_bytes is not None and actual_size != expected_size_in_bytes:
            if os.path.isfile(temp_path):
                os.remove(temp_path)
            raise exceptions.DataError(
                "Download of [%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected. Please retry."
                % (local_path, actual_size, expected_size_in_bytes))
        os.rename(temp_path, local_path)
Пример #10
0
    def __call__(self, es, params):
        """
        Runs one bulk indexing operation.

        :param es: The Elasticsearch client.
        :param params: A hash with all parameters. See below for details.
        :return: A hash with meta data for this bulk operation. See below for details.

        It expects a parameter dict with the following mandatory keys:

        * ``body``: containing all documents for the current bulk request.
        * ``bulk-size``: the number of documents in this bulk.
        * ``action_metadata_present``: if ``True``, assume that an action and metadata line is present (meaning only half of the lines
          contain actual documents to index)
        * ``index``: The name of the affected index in case ``action_metadata_present`` is ``False``.
        * ``type``: The name of the affected type in case ``action_metadata_present`` is ``False``.

        The following keys are optional:

        * ``pipeline``: If present, runs the specified ingest pipeline for this bulk.
        * ``detailed-results``: If ``True``, the runner will analyze the response and add detailed meta-data. Defaults to ``False``. Note
          that this has a very significant impact on performance and will very likely cause a bottleneck in the benchmark driver so please
          be very cautious enabling this feature. Our own measurements have shown a median overhead of several thousand times (execution
          time is in the single digit microsecond range when this feature is disabled and in the single digit millisecond range when this
          feature is enabled; numbers based on a bulk size of 500 elements and no errors). For details please refer to the respective
          benchmarks in ``benchmarks/driver``.


        Returned meta data:

        The following meta data are always returned:

        * ``index``: name of the affected index. May be `None` if it could not be derived.
        * ``bulk-size``: bulk size, e.g. 5000.
        * ``bulk-request-size-bytes``: size of the full bulk request in bytes
        * ``total-document-size-bytes``: size of all documents contained in the bulk request in bytes
        * ``weight``: operation-agnostic representation of the bulk size (used internally by Rally for throughput calculation).
        * ``unit``: The unit in which to interpret ``bulk-size`` and ``weight``. Always "docs".
        * ``success``: A boolean indicating whether the bulk request has succeeded.
        * ``success-count``: Number of successfully processed items for this request (denoted in ``unit``).
        * ``error-count``: Number of failed items for this request (denoted in ``unit``).

        If ``detailed-results`` is ``True`` the following meta data are returned in addition:

        * ``ops``: A hash with the operation name as key (e.g. index, update, delete) and various counts as values. ``item-count`` contains
          the total number of items for this key. Additionally, we return a separate counter each result (indicating e.g. the number of created
          items, the number of deleted items etc.).
        * ``shards_histogram``: An array of hashes where each hash has two keys: ``item-count`` contains the number of items to which a shard
          distribution applies and ``shards`` contains another hash with the actual distribution of ``total``, ``successful`` and ``failed``
          shards (see examples below).
        * ``bulk-request-size-bytes``: Total size of the bulk request body in bytes.
        * ``total-document-size-bytes``: Total size of all documents within the bulk request body in bytes.

        Here are a few examples:

        If ``detailed-results`` is ``False`` a typical return value is::

            {
                "index": "my_index",
                "weight": 5000,
                "unit": "docs",
                "bulk-size": 5000,
                "success": True,
                "success-count": 5000,
                "error-count": 0
            }

        Whereas the response will look as follow if there are bulk errors::

            {
                "index": "my_index",
                "weight": 5000,
                "unit": "docs",
                "bulk-size": 5000,
                "success": False,
                "success-count": 4000,
                "error-count": 1000
            }

        If ``detailed-results`` is ``True`` a typical return value is::


            {
                "index": "my_index",
                "weight": 5000,
                "unit": "docs",
                "bulk-size": 5000,
                "bulk-request-size-bytes": 2250000,
                "total-document-size-bytes": 2000000,
                "success": True,
                "success-count": 5000,
                "error-count": 0,
                "ops": {
                    "index": {
                        "item-count": 5000,
                        "created": 5000
                    }
                },
                "shards_histogram": [
                    {
                        "item-count": 5000,
                        "shards": {
                            "total": 2,
                            "successful": 2,
                            "failed": 0
                        }
                    }
                ]
            }

        An example error response may look like this::


            {
                "index": "my_index",
                "weight": 5000,
                "unit": "docs",
                "bulk-size": 5000,
                "bulk-request-size-bytes": 2250000,
                "total-document-size-bytes": 2000000,
                "success": False,
                "success-count": 4000,
                "error-count": 1000,
                "ops": {
                    "index": {
                        "item-count": 5000,
                        "created": 4000,
                        "noop": 1000
                    }
                },
                "shards_histogram": [
                    {
                        "item-count": 4000,
                        "shards": {
                            "total": 2,
                            "successful": 2,
                            "failed": 0
                        }
                    },
                    {
                        "item-count": 500,
                        "shards": {
                            "total": 2,
                            "successful": 1,
                            "failed": 1
                        }
                    },
                    {
                        "item-count": 500,
                        "shards": {
                            "total": 2,
                            "successful": 0,
                            "failed": 2
                        }
                    }
                ]
            }
        """
        detailed_results = params.get("detailed-results", False)
        index = params.get("index")

        bulk_params = {}
        if "pipeline" in params:
            bulk_params["pipeline"] = params["pipeline"]

        with_action_metadata = params["action_metadata_present"]
        # "bulk-size" is mandatory; fail fast with a descriptive error if the
        # parameter source did not provide it.
        try:
            bulk_size = params["bulk-size"]
        except KeyError:
            raise exceptions.DataError(
                "Bulk parameter source did not provide a 'bulk-size' parameter. Please add it to your parameter source."
            )

        if with_action_metadata:
            # only half of the lines are documents
            response = es.bulk(body=params["body"], params=bulk_params)
        else:
            # Without per-line action metadata, index and type must be passed
            # explicitly on the request.
            response = es.bulk(body=params["body"],
                               index=index,
                               doc_type=params["type"],
                               params=bulk_params)

        stats = self.detailed_stats(
            params, bulk_size,
            response) if detailed_results else self.simple_stats(
                bulk_size, response)

        meta_data = {
            "index": str(index) if index else None,
            "weight": bulk_size,
            "unit": "docs",
            "bulk-size": bulk_size
        }
        meta_data.update(stats)
        if not stats["success"]:
            meta_data["error-type"] = "bulk"
        return meta_data
    def __call__(self, es, params):
        """
        Runs one bulk indexing operation against InfluxDB.

        :param es: The Elasticsearch client. Not used here.
        :param params: A hash with all parameters. See below for details.
        :return: A hash with meta data for this bulk operation. See below for details.

        It expects a parameter dict with the following mandatory keys:

        * ``body``: containing all documents for the current bulk request.
        * ``bulk-size``: the number of documents in this bulk.
        * ``index``: the name of the database to insert this bulk request into. Defaults to 'rally'
        * ``time_precision``: the time precision used for points to be written. Defaults to 'ms'.
        """
        index = params.get('index', "rally")

        # Switch the client's database only when it actually changes.
        if not self._index or self._index != index:
            logger.info(
                "InfluxDBBulkRunner: Switching to database {}.".format(index))
            self._influxdb_client.switch_database(index)
            self._index = index

        try:
            bulk_size = params["bulk-size"]
        except KeyError:
            raise exceptions.DataError(
                "Bulk parameter source did not provide a 'bulk-size' parameter. Please add it to your parameter source."
            )

        time_precision = params.get('time_precision', 'ms')

        body_size = len(params['body'])
        if body_size != bulk_size:
            raise exceptions.DataError(
                "Bulk size ({}) does not correspond to the size of the body ({})."
                .format(bulk_size, body_size))

        succeeded = bool(
            self._influxdb_client.write_points(params['body'],
                                               time_precision=time_precision,
                                               database=index,
                                               retention_policy=None,
                                               tags=None,
                                               batch_size=bulk_size,
                                               protocol='line'))
        return {
            "weight": bulk_size,
            "unit": "docs",
            "bulk-size": bulk_size,
            "success": succeeded,
            "success-count": bulk_size if succeeded else 0,
            "error-count": 0 if succeeded else bulk_size
        }