Пример #1
0
def fetch_product_tags(dynamodb, store_product_urls, product_tag=None):
    """Yield every tag item stored for the given store product URLs.

    Each URL is normalised with ``clean_product_url`` and used as the
    ``store_product_url`` key condition of a query against the
    ``product_tag`` table. Result pages are followed through
    ``LastEvaluatedKey`` until the query is exhausted.

    Args:
        dynamodb: DynamoDB service resource (must provide ``Table``).
        store_product_urls: iterable of product URLs to look up.
        product_tag: optional tag; when given, the query is issued against
            the index named ``"<product_tag.name>_idx"`` instead of the
            base table.

    Yields:
        The raw items returned by DynamoDB, one at a time.
    """
    table = dynamodb.Table(get_table_name('product_tag'))

    # Keyword arguments shared by every query issued from this call.
    static_kwargs = {}
    if product_tag is not None:
        static_kwargs['IndexName'] = f'{product_tag.name}_idx'

    for raw_url in store_product_urls:
        condition = Key('store_product_url').eq(clean_product_url(raw_url))
        page_kwargs = dict(static_kwargs)

        while True:
            page = table.query(KeyConditionExpression=condition,
                               **page_kwargs)
            yield from page['Items']

            # A missing "LastEvaluatedKey" means this was the final page.
            resume_key = page.get('LastEvaluatedKey')
            if resume_key is None:
                break
            page_kwargs['ExclusiveStartKey'] = resume_key
Пример #2
0
def test_clean_product_url():
    """Check that ``clean_product_url`` normalises equivalent URL spellings.

    Scheme, a leading ``www.``, a protocol-relative ``//`` prefix, the query
    string and the fragment must all be dropped, while a different subdomain
    must be kept as a distinct URL.
    """
    canonical = 'store.com/products/fork'

    # Every spelling below must collapse to the canonical form.
    equivalent_spellings = (
        canonical,
        f'www.{canonical}',
        f'http://{canonical}',
        f'https://www.{canonical}',
        f'//www.{canonical}',
        f'http://{canonical}?arg=1',
        f'http://{canonical}#fragment',
    )
    for spelling in equivalent_spellings:
        assert clean_product_url(spelling) == canonical

    # A subdomain other than "www" is part of the product identity.
    assert clean_product_url(f'xyz.{canonical}') != canonical
Пример #3
0
def delete_product_tags(dynamodb, product_tag, store_product_urls):
    """Remove one tag from each of the given store products.

    Deletes are sent through the table's batch writer. Items are addressed
    by the cleaned product URL (``store_product_url``) together with the
    tag name (``tag``).

    Args:
        dynamodb: DynamoDB service resource (must provide ``Table``).
        product_tag: tag to remove; its ``name`` is used as the ``tag`` key.
        store_product_urls: iterable of product URLs to untag.
    """
    table = dynamodb.Table(get_table_name('product_tag'))

    with table.batch_writer() as writer:
        for raw_url in store_product_urls:
            item_key = {
                'store_product_url': clean_product_url(raw_url),
                'tag': product_tag.name,
            }
            writer.delete_item(Key=item_key)
Пример #4
0
def set_product_tag(dynamodb, store_product_url, product_tag, **attrs):
    """Write (or overwrite) a tag item for a store product.

    The stored item holds the cleaned product URL, the tag name under
    ``tag``, and an attribute named after the tag with the value ``1``.
    Extra keyword arguments are merged in last, so they take precedence
    over the generated attributes on a name collision.

    Args:
        dynamodb: DynamoDB service resource (must provide ``Table``).
        store_product_url: product URL; normalised with
            ``clean_product_url`` before being stored.
        product_tag: tag to set; only its ``name`` is used.
        **attrs: additional attributes to store on the tag item.
    """
    table = dynamodb.Table(get_table_name('product_tag'))
    cleaned_url = clean_product_url(store_product_url)
    tag_name = product_tag.name

    record = {
        'store_product_url': cleaned_url,
        'tag': tag_name,
        # NOTE(review): presumably the key of the per-tag "<name>_idx" index
        # queried by fetch_product_tags -- confirm against the table schema.
        tag_name: 1,
        **attrs,
    }
    table.put_item(Item=record)
Пример #5
0
def add_store_product(dynamodb,
                      product_url,
                      store_domain,
                      is_available=True,
                      **attrs):
    """Insert a new store product and tag it for follow-up processing.

    The product is keyed by the cleaned form of ``product_url`` and receives
    a freshly generated ``product_uuid``. The write is conditional on no item
    with the same ``store_product_url`` existing yet. After a successful
    insert the product is always tagged ``update_product_meta`` and, when it
    has at least one image URL, additionally tagged ``image_not_indexed``
    with the first image URL attached.

    Args:
        dynamodb: DynamoDB service resource (must provide ``Table``).
        product_url: full product URL as found in the store.
        store_domain: domain of the store the product belongs to.
        is_available: availability flag; stored as an integer.
        **attrs: further product attributes to store.

    Raises:
        ValueError: a product with the same cleaned URL already exists.
        botocore.exceptions.ClientError: any other DynamoDB client error.
    """
    table = dynamodb.Table(get_table_name('product'))

    store_product_url = clean_product_url(product_url)
    record = parse_store_product_data(dict(
        store_product_url=store_product_url,
        full_store_product_url=product_url,
        store_domain=store_domain,
        # "is_available" is kept as a DynamoDB "number" so that it can be
        # indexed.
        is_available=int(is_available),
        **attrs))

    # New products take their "brand domain" straight from the store data;
    # for existing products a bulk data processing job sets it according to
    # "product_uuid".
    brand_domain = attrs.get('store_product_brand_domain')
    if brand_domain:
        record['brand_domain'] = brand_domain

    # The product is not in the DB yet, so it gets a brand new product ID.
    record['product_uuid'] = uuid.uuid4().hex

    try:
        table.put_item(
            Item=record,
            ConditionExpression='attribute_not_exists(store_product_url)')
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == 'ConditionalCheckFailedException':
            raise ValueError(
                f'Product with url "{store_product_url}" already exists')
        raise

    image_urls = record.get('image_urls')
    primary_image_url = image_urls[0] if image_urls else None

    # Products with an image must go through image indexing.
    if primary_image_url:
        set_product_tag(dynamodb,
                        store_product_url,
                        ProductTag.image_not_indexed,
                        image_url=primary_image_url)

    # Every new product needs its metadata updated.
    set_product_tag(dynamodb, store_product_url,
                    ProductTag.update_product_meta)
Пример #6
0
def update_store_product(dynamodb, product_url, **attrs):
    """Update an existing store product and re-tag it when needed.

    The product is looked up by the cleaned form of ``product_url``. The
    given attributes (plus ``full_store_product_url``) are passed through
    ``parse_store_product_data`` and written with a single ``SET`` update.
    Afterwards the product is tagged:

    * ``image_not_indexed`` when the update supplies a primary (first) image
      whose cleaned URL differs from the stored one, or when no image was
      stored before;
    * ``update_product_meta`` when ``store_product_brand_domain`` in the
      update differs from the stored value.

    Args:
        dynamodb: DynamoDB service resource (must provide ``Table``).
        product_url: full product URL as found in the store.
        **attrs: product attributes to set.

    Raises:
        ValueError: no product with the cleaned URL exists yet.
    """
    product_table = dynamodb.Table(get_table_name('product'))

    store_product_url = clean_product_url(product_url)

    item_data = dict(full_store_product_url=product_url, **attrs)

    old_item_data = product_table.get_item(
        Key={'store_product_url': store_product_url}).get('Item')

    if not old_item_data:
        raise ValueError(
            f'Product with url "{store_product_url}" does not yet exist')

    old_image_urls = old_item_data.get('image_urls')
    old_primary_image_url = old_image_urls[0] if old_image_urls else None
    new_image_urls = item_data.get('image_urls')
    new_primary_image_url = new_image_urls[0] if new_image_urls else None

    # A new primary image needs indexing when there was no image before, or
    # when it differs from the stored one. The "no image before" case is
    # checked first so that clean_product_url() is never handed None.
    # Image URLs are compared in cleaned form, on the assumption that the
    # query string does not affect the image contents.
    image_not_indexed = (
        new_primary_image_url is not None and
        (old_primary_image_url is None or
         clean_product_url(new_primary_image_url) !=
         clean_product_url(old_primary_image_url)))

    # NOTE(review): an update that omits "store_product_brand_domain" while
    # the stored item has one also counts as a change here -- confirm that
    # this is intended.
    update_product_meta = (
        old_item_data.get('store_product_brand_domain') !=
        item_data.get('store_product_brand_domain'))

    item_data = parse_store_product_data(item_data, new_item=False)

    # Attribute names go through "#name" placeholders so that names which
    # collide with DynamoDB reserved words (e.g. "name", "url", "status")
    # do not make the update expression invalid.
    update_expression = 'SET {}'.format(', '.join(
        f'#{attr} = :{attr}' for attr in item_data))
    expression_attribute_names = {f'#{attr}': attr for attr in item_data}
    expression_attribute_values = {
        f':{attr}': value
        for attr, value in item_data.items()
    }
    product_table.update_item(
        Key={'store_product_url': store_product_url},
        UpdateExpression=update_expression,
        ExpressionAttributeNames=expression_attribute_names,
        ExpressionAttributeValues=expression_attribute_values)

    # flag image for feature extraction and indexing
    if image_not_indexed:
        set_product_tag(dynamodb,
                        store_product_url,
                        ProductTag.image_not_indexed,
                        image_url=new_primary_image_url)

    # flag product metadata to be updated (re-evaluate product "brand domain")
    if update_product_meta:
        set_product_tag(dynamodb, store_product_url,
                        ProductTag.update_product_meta)
Пример #7
0
def get_store_product(dynamodb, product_url):
    """Fetch a store product by URL.

    Args:
        dynamodb: DynamoDB service resource (must provide ``Table``).
        product_url: product URL; normalised with ``clean_product_url``
            before the lookup.

    Returns:
        The stored item, or ``None`` when the response has no ``Item``.
    """
    table = dynamodb.Table(get_table_name('product'))
    lookup_key = {'store_product_url': clean_product_url(product_url)}
    response = table.get_item(Key=lookup_key)
    return response.get('Item')