Example #1
    def test_skip_rows_env(self):
        """test whether or not index skips rows per SKIP_ROWS_EXTS=LIST"""
        # because of module caching we can't just patch the environment variable
        # since index.SKIP_ROWS_EXTS will never change after import
        with patch.dict(os.environ, {'SKIP_ROWS_EXTS': '.txt,.csv'}):
            exts = separated_env_to_iter('SKIP_ROWS_EXTS')
            with patch('index.SKIP_ROWS_EXTS', exts):
                assert '.parquet' not in exts
                assert '.csv' in exts
                assert '.txt' in exts

        with patch.dict(os.environ, {'SKIP_ROWS_EXTS': '.parquet,.tsvl'}):
            exts = separated_env_to_iter('SKIP_ROWS_EXTS')
            with patch('index.SKIP_ROWS_EXTS', exts):
                assert '.parquet' in exts
                assert '.csv' not in exts
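
The comment at the top of this test carries the key point: index.SKIP_ROWS_EXTS is computed once at import time, so patching os.environ afterwards cannot change it, and the test therefore patches both the environment and the module constant. A minimal, self-contained sketch of that pitfall (the constant name below is hypothetical, standing in for index.SKIP_ROWS_EXTS):

import os
from unittest.mock import patch

# Hypothetical module-level constant, frozen when the module was first imported.
SKIP_EXTS = {'.parquet'}

with patch.dict(os.environ, {'SKIP_ROWS_EXTS': '.txt,.csv'}):
    # Patching the environment alone leaves the cached constant untouched ...
    assert SKIP_EXTS == {'.parquet'}
    # ... so the recomputed value must also be patched onto the module,
    # mirroring patch('index.SKIP_ROWS_EXTS', exts) in the test above.
    with patch(f'{__name__}.SKIP_EXTS', {'.txt', '.csv'}):
        assert SKIP_EXTS == {'.txt', '.csv'}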
Example #2
    def test_separated_env_to_iter(self):
        """ensure the function that infers overrides from the env works:
            always returns a valid set(), perhaps empty, lowercases extensions
        """
        with patch.dict(os.environ, {'CONTENT_INDEX_EXTS': '.txt'}):
            assert separated_env_to_iter('CONTENT_INDEX_EXTS') == {'.txt'}
        with patch.dict(os.environ, {'CONTENT_INDEX_EXTS': ' .tXt   '}):
            assert separated_env_to_iter('CONTENT_INDEX_EXTS') == {'.txt'}
        with patch.dict(os.environ, {'CONTENT_INDEX_EXTS': ' garbage  gar.bage  '}):
            assert separated_env_to_iter(
                'CONTENT_INDEX_EXTS',
                predicate=lambda x: x.startswith('.')
            ) == set()
        with patch.dict(os.environ, {'CONTENT_INDEX_EXTS': ' .Parquet, .csv, .tsv'}):
            assert separated_env_to_iter('CONTENT_INDEX_EXTS') == {'.parquet', '.csv', '.tsv'}
        with patch.dict(os.environ, {'CONTENT_INDEX_EXTS': ''}):
            assert separated_env_to_iter('CONTENT_INDEX_EXTS') == set(), \
                "Invalid sets should be empty and falsy"
Example #3
from t4_lambda_shared.utils import (
    get_available_memory,
    get_quilt_logger,
    MANIFEST_PREFIX_V1,
    POINTER_PREFIX_V1,
    query_manifest_content,
    separated_env_to_iter,
)

from document_queue import (DocTypes, DocumentQueue, CONTENT_INDEX_EXTS,
                            EVENT_PREFIX, MAX_RETRY)

# 10 MB, see https://amzn.to/2xJpngN
NB_VERSION = 4  # default notebook version for nbformat
# currently only affects .parquet, TODO: extend to other extensions
SKIP_ROWS_EXTS = separated_env_to_iter('SKIP_ROWS_EXTS')
SELECT_PACKAGE_META = "SELECT * from S3Object o WHERE o.version IS NOT MISSING LIMIT 1"
# No WHERE clause needed for aggregations since S3 Select skips missing fields for aggs
SELECT_PACKAGE_STATS = "SELECT SUM(obj['size']) as total_bytes, COUNT(obj['size']) as total_files from S3Object obj"
TEST_EVENT = "s3:TestEvent"
# we need to filter out GetObject and HeadObject calls generated by the present
#  lambda in order to display accurate analytics in the Quilt catalog
#  a custom user agent enables said filtration
USER_AGENT_EXTRA = " quilt3-lambdas-es-indexer"


def now_like_boto3():
    """ensure timezone UTC for consistency with boto3:
    Example of what boto3 returns on head_object:
        'LastModified': datetime.datetime(2019, 11, 6, 3, 1, 16, tzinfo=tzutc()),
    """
Example #4
"""sending to elastic search in memory-limited batches"""
from datetime import datetime
from enum import Enum
from math import floor
from typing import Dict, List
import os

from aws_requests_auth.aws_auth import AWSRequestsAuth
import boto3
from elasticsearch import Elasticsearch, RequestsHttpConnection
from elasticsearch.helpers import bulk

from t4_lambda_shared.utils import separated_env_to_iter
from t4_lambda_shared.preview import ELASTIC_LIMIT_BYTES

CONTENT_INDEX_EXTS = separated_env_to_iter("CONTENT_INDEX_EXTS") or {
    ".csv", ".ipynb", ".json", ".md", ".parquet", ".rmd", ".tsv", ".txt"
}

EVENT_PREFIX = {"Created": "ObjectCreated:", "Removed": "ObjectRemoved:"}

# See https://amzn.to/2xJpngN for chunk size as a function of container size
CHUNK_LIMIT_BYTES = int(os.getenv('CHUNK_LIMIT_BYTES') or 9_500_000)
ELASTIC_TIMEOUT = 30
MAX_BACKOFF = 360  # seconds
MAX_RETRY = 4  # prevent long-running lambdas due to malformed calls
# signifies that the object is truly deleted, not to be confused with
# s3:ObjectRemoved:DeleteMarkerCreated, which we may see in versioned buckets
# see https://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html
QUEUE_LIMIT_BYTES = 100_000_000  # 100MB
RETRY_429 = 5
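
The `or {...}` default on CONTENT_INDEX_EXTS near the top of this example works only because separated_env_to_iter returns an empty, falsy set when the variable is unset or yields nothing, exactly the behavior Example #2 asserts. A small sketch of that fallback pattern, with a hypothetical parser standing in for separated_env_to_iter:

DEFAULT_EXTS = {'.csv', '.json', '.md', '.parquet', '.txt'}  # trimmed default list


def parse_exts(env_value):
    """Hypothetical stand-in: comma-separated string to a stripped, lowercased set."""
    return {piece.strip().lower() for piece in env_value.split(',') if piece.strip()}


# Unset or empty variable -> empty set -> falsy -> the defaults win.
assert (parse_exts('') or DEFAULT_EXTS) == DEFAULT_EXTS
# A non-empty variable replaces the defaults entirely; it does not extend them.
assert (parse_exts('.md, .IPYNB') or DEFAULT_EXTS) == {'.md', '.ipynb'}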