Example #1
# BatchedSearchGenerator.__init__ (see Example #9 for the full class).
def __init__(self, request, batch_field='@id', batch_size=5000):
    self.request = request
    self.batch_field = batch_field
    self.batch_size = batch_size
    self.query_string = QueryString(request)
    self.param_list = self.query_string.group_values_by_key()
    self.batch_param_values = self.param_list.get(batch_field, []).copy()
Example #2
def batch_download_factory(context, request):
    qs = QueryString(request)
    specified_type = qs.get_one_value(params=qs.get_type_filters())
    if specified_type == 'PublicationData':
        return _get_publication_data_batch_download(context, request)
    else:
        return _get_batch_download(context, request)
Example #3
def item_view_object_with_select_calculated_properties(context, request):
    properties = item_links(context, request)
    qs = QueryString(request)
    select_properties = qs.param_values_to_list(params=qs.get_field_filters())
    calculated = calculate_select_properties(
        context, request, ns=properties, select_properties=select_properties)
    properties.update(calculated)
    return properties
Example #4
class Cart:
    '''
    Pass either a request whose query string carries `?cart=foo&cart=bar`
    params or a list of uuids (@ids also work):
    * `cart = Cart(request)` or `cart = Cart(request, uuids=['xyz'])`
    * `cart.elements` yields all elements in the cart(s)
    * `cart.as_params()` returns [('@id', '/elements/xyz')] tuples for use in filters
    Use `max_cart_elements` to limit the total number of elements allowed
    in the carts. Default is no limit.
    '''
    def __init__(self, request, uuids=None, max_cart_elements=None):
        self.request = request
        self.query_string = QueryString(request)
        self.uuids = uuids or []
        self.max_cart_elements = max_cart_elements
        self._elements = []

    def _get_carts_from_params(self):
        return self.query_string.param_values_to_list(
            params=self.query_string.get_cart())

    def _get_cart_object_or_error(self, uuid):
        return self.request.embed(uuid, '@@object')

    def _try_to_get_cart_object(self, uuid):
        try:
            cart = self._get_cart_object_or_error(uuid)
        except KeyError:
            cart = {}
        return cart

    def _try_to_get_elements_from_cart(self, uuid):
        cart = self._try_to_get_cart_object(uuid)
        return cart.get('elements', [])

    def _get_elements_from_carts(self):
        carts = self.uuids or self._get_carts_from_params()
        for cart in carts:
            yield from self._try_to_get_elements_from_cart(cart)

    def _validate_cart_size(self):
        if (self.max_cart_elements is not None
                and len(self._elements) > self.max_cart_elements):
            raise HTTPBadRequest(explanation=(
                f'Too many elements in cart '
                f'(total {len(self._elements)} > max {self.max_cart_elements})'
            ))

    @property
    def elements(self):
        # Compute and cache the sorted, de-duplicated elements once.
        if not self._elements:
            self._elements = sorted(set(self._get_elements_from_carts()))
        self._validate_cart_size()
        yield from self._elements

    def as_params(self):
        return [('@id', at_id) for at_id in self.elements]
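
A minimal usage sketch for the Cart class above; the helper name is ours, and `request` is assumed to be a Pyramid request whose query string carries `cart=` params:

def get_cart_filter_params(request, max_cart_elements=None):
    # Hypothetical helper: collect every element across the requested
    # carts as ('@id', ...) tuples for use in search filters. Iterating
    # cart.elements raises HTTPBadRequest if the total exceeds
    # max_cart_elements.
    cart = Cart(request, max_cart_elements=max_cart_elements)
    return cart.as_params()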
Example #5
def metadata_report_factory(context, request):
    qs = QueryString(request)
    specified_type = qs.get_one_value(params=qs.get_type_filters())
    if specified_type == 'Annotation':
        return _get_annotation_metadata(context, request)
    elif specified_type == 'PublicationData':
        return _get_publication_data_metadata(context, request)
    else:
        return _get_metadata(context, request)
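
A hedged sketch of how this factory might be wired into a Pyramid view; the 'metadata' route name is an assumption, and the _get_* helpers are presumed to return a Response:

from pyramid.view import view_config

@view_config(route_name='metadata', request_method=('GET', 'POST'))
def metadata_view(context, request):
    # Delegate to whichever type-specific report builder the factory picks.
    return metadata_report_factory(context, request)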
Example #6
# Inner function of a type-validating decorator; `types` and `func` come
# from the enclosing scope (see the reconstruction below).
def wrapper(context, request):
    qs = QueryString(request)
    type_filters = qs.get_type_filters()
    if len(type_filters) != 1:
        raise HTTPBadRequest(
            explanation='URL requires one type parameter.'
        )
    if type_filters[0][1] not in types:
        raise HTTPBadRequest(
            explanation=f'{type_filters[0][1]} not a valid type for endpoint.'
        )
    return func(context, request)
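
A hedged reconstruction of the decorator the fragment above likely lives in; the factory name `allowed_types` is an assumption, but the wrapper body is verbatim from the example:

def allowed_types(types):
    # Hypothetical decorator factory: restrict an endpoint to a
    # whitelist of type= values.
    def decorator(func):
        def wrapper(context, request):
            qs = QueryString(request)
            type_filters = qs.get_type_filters()
            if len(type_filters) != 1:
                raise HTTPBadRequest(
                    explanation='URL requires one type parameter.')
            if type_filters[0][1] not in types:
                raise HTTPBadRequest(
                    explanation=f'{type_filters[0][1]} not a valid type for endpoint.')
            return func(context, request)
        return wrapper
    return decorator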
Example #7
# MetadataReport.__init__ (see Example #10 for the full class).
def __init__(self, request):
    self.request = request
    self.query_string = QueryString(request)
    self.param_list = self.query_string.group_values_by_key()
    self.positive_file_param_set = {}
    self.header = []
    self.experiment_column_to_fields_mapping = OrderedDict()
    self.file_column_to_fields_mapping = OrderedDict()
    self.visualizable_only = self.query_string.is_param(
        'option', 'visualizable')
    self.raw_only = self.query_string.is_param('option', 'raw')
    self.csv = CSVGenerator()
Example #8
def test_searches_parsers_query_string__repr__(dummy_request):
    from snovault.elasticsearch.searches.parsers import QueryString
    dummy_request.query_string = 'type=Snowflake&type!=Snowball'
    qs = QueryString(dummy_request)
    assert str(qs) == 'type=Snowflake&type%21=Snowball'
    dummy_request.query_string = 'type=Snowball&status!=revoked&files.file_type=bed+bed6%2B'
    qs = QueryString(dummy_request)
    assert str(qs) == 'type=Snowball&status%21=revoked&files.file_type=bed+bed6%2B'
    dummy_request.query_string = 'type=Snowball&status%21=revoked&files.file_type=bed+bed6%2B'
    qs = QueryString(dummy_request)
    assert str(qs) == 'type=Snowball&status%21=revoked&files.file_type=bed+bed6%2B'
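
A companion test sketch, assuming group_values_by_key() returns a dict that maps each raw key to the list of its values in query-string order (the shape implied by param_list.get('type', [''])[0] in the examples above):

def test_searches_parsers_query_string_group_values_by_key(dummy_request):
    from snovault.elasticsearch.searches.parsers import QueryString
    dummy_request.query_string = 'type=Experiment&field=status&field=award'
    qs = QueryString(dummy_request)
    # Assumed return shape; not verified against the source.
    assert qs.group_values_by_key() == {
        'type': ['Experiment'],
        'field': ['status', 'award'],
    }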
Example #9
class BatchedSearchGenerator:

    SEARCH_PATH = '/search/'
    DEFAULT_PARAMS = [('limit', 'all')]

    def __init__(self, request, batch_field='@id', batch_size=5000):
        self.request = request
        self.batch_field = batch_field
        self.batch_size = batch_size
        self.query_string = QueryString(request)
        self.param_list = self.query_string.group_values_by_key()
        self.batch_param_values = self.param_list.get(batch_field, []).copy()

    def _make_batched_values_from_batch_param_values(self):
        # Slicing clamps at the end of the list, so no explicit min() is needed.
        for start in range(0, len(self.batch_param_values), self.batch_size):
            yield self.batch_param_values[start:start + self.batch_size]

    def _make_batched_params_from_batched_values(self, batched_values):
        return [(self.batch_field, batched_value)
                for batched_value in batched_values]

    def _build_new_request(self, batched_params):
        self.query_string.drop('limit')
        self.query_string.drop(self.batch_field)
        self.query_string.extend(batched_params + self.DEFAULT_PARAMS)
        request = self.query_string.get_request_with_new_query_string()
        request.path_info = self.SEARCH_PATH
        request.registry = self.request.registry
        return request

    def results(self):
        if not self.batch_param_values:
            yield from search_generator(self._build_new_request([]))['@graph']
            return
        for batched_values in self._make_batched_values_from_batch_param_values():
            batched_params = self._make_batched_params_from_batched_values(batched_values)
            request = self._build_new_request(batched_params)
            yield from search_generator(request)['@graph']
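
A usage sketch: collect all results for a request, splitting any '@id' params into sub-searches of at most batch_size values each (the wrapper function is hypothetical):

def search_all(request):
    generator = BatchedSearchGenerator(request, batch_field='@id', batch_size=5000)
    # Each batch issues its own /search/ request; results stream lazily,
    # so list() here just forces them for illustration.
    return list(generator.results())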
Example #10
class MetadataReport:

    SEARCH_PATH = '/search/'
    EXCLUDED_COLUMNS = (
        'Restricted',
        'No File Available',
    )
    DEFAULT_PARAMS = [
        ('field', 'audit'),
        ('field', 'files.@id'),
        ('field', 'files.restricted'),
        ('field', 'files.no_file_available'),
        ('field', 'files.file_format'),
        ('field', 'files.file_format_type'),
        ('field', 'files.status'),
        ('field', 'files.assembly'),
        ('limit', 'all'),
    ]
    CONTENT_TYPE = 'text/tsv'
    CONTENT_DISPOSITION = 'attachment; filename="metadata.tsv"'

    def __init__(self, request):
        self.request = request
        self.query_string = QueryString(request)
        self.param_list = self.query_string.group_values_by_key()
        self.positive_file_param_set = {}
        self.header = []
        self.experiment_column_to_fields_mapping = OrderedDict()
        self.file_column_to_fields_mapping = OrderedDict()
        self.visualizable_only = self.query_string.is_param(
            'option', 'visualizable')
        self.raw_only = self.query_string.is_param('option', 'raw')
        self.csv = CSVGenerator()

    def _get_column_to_fields_mapping(self):
        return METADATA_COLUMN_TO_FIELDS_MAPPING

    def _build_header(self):
        for column in self._get_column_to_fields_mapping():
            if column not in self.EXCLUDED_COLUMNS:
                self.header.append(column)
        for _, column in METADATA_AUDIT_TO_AUDIT_COLUMN_MAPPING:
            self.header.append(column)

    def _split_column_and_fields_by_experiment_and_file(self):
        for column, fields in self._get_column_to_fields_mapping().items():
            if fields[0].startswith('files'):
                self.file_column_to_fields_mapping[column] = [
                    field.replace('files.', '') for field in fields
                ]
            else:
                self.experiment_column_to_fields_mapping[column] = fields

    def _set_positive_file_param_set(self):
        self.positive_file_param_set = {
            k.replace('files.', ''): set(map_strings_to_booleans_and_ints(v))
            for k, v in self.param_list.items()
            if k.startswith('files.') and '!' not in k
        }

    def _add_positive_file_filters_as_fields_to_param_list(self):
        self.param_list['field'] = self.param_list.get('field', [])
        self.param_list['field'].extend(
            (k for k, v in self.query_string._get_original_params()
             if k.startswith('files.') and '!' not in k))

    def _add_fields_to_param_list(self):
        self.param_list['field'] = self.param_list.get('field', [])
        for column, fields in self._get_column_to_fields_mapping().items():
            self.param_list['field'].extend(fields)
        self._add_positive_file_filters_as_fields_to_param_list()

    def _initialize_at_id_param(self):
        self.param_list['@id'] = self.param_list.get('@id', [])

    def _maybe_add_cart_elements_to_param_list(self):
        # Don't need to limit max_cart_elements here since
        # search is batched.
        cart = CartWithElements(self.request, max_cart_elements=None)
        self.param_list['@id'].extend(cart.elements)
        self.param_list.pop('cart', None)

    def _get_json_elements_or_empty_list(self):
        try:
            return self.request.json.get('elements', [])
        except ValueError:
            return []

    def _maybe_add_json_elements_to_param_list(self):
        self.param_list['@id'].extend(self._get_json_elements_or_empty_list())

    def _get_field_params(self):
        return [('field', p) for p in self.param_list.get('field', [])]

    def _get_at_id_params(self):
        return [('@id', p) for p in self.param_list.get('@id', [])]

    def _get_default_params(self):
        return self.DEFAULT_PARAMS

    def _build_query_string(self):
        self.query_string.drop('limit')
        self.query_string.drop('option')
        self.query_string.extend(self._get_default_params() +
                                 self._get_field_params() +
                                 self._get_at_id_params())

    def _get_search_path(self):
        return self.SEARCH_PATH

    def _build_new_request(self):
        self._build_query_string()
        request = self.query_string.get_request_with_new_query_string()
        request.path_info = self._get_search_path()
        return request

    def _get_search_results_generator(self):
        return BatchedSearchGenerator(self._build_new_request()).results()

    def _should_not_report_file(self, file_):
        conditions = [
            not file_matches_file_params(file_, self.positive_file_param_set),
            self.visualizable_only and not is_file_visualizable(file_),
            self.raw_only and file_.get('assembly'),
            file_.get('restricted'),
            file_.get('no_file_available'),
        ]
        return any(conditions)

    def _get_experiment_data(self, experiment):
        return {
            column: make_experiment_cell(fields, experiment)
            for column, fields in
            self.experiment_column_to_fields_mapping.items()
        }

    def _get_file_data(self, file_):
        file_['href'] = self.request.host_url + file_['href']
        return {
            column: make_file_cell(fields, file_)
            for column, fields in self.file_column_to_fields_mapping.items()
        }

    def _get_audit_data(self, grouped_audits_for_file, grouped_other_audits):
        return {
            audit_column: ', '.join(
                set(
                    grouped_audits_for_file.get(audit_type, []) +
                    grouped_other_audits.get(audit_type, [])))
            for audit_type, audit_column in
            METADATA_AUDIT_TO_AUDIT_COLUMN_MAPPING
        }

    def _output_sorted_row(self, experiment_data, file_data):
        row = []
        for column in self.header:
            row.append(file_data.get(column, experiment_data.get(column)))
        return row

    def _generate_rows(self):
        yield self.csv.writerow(self.header)
        for experiment in self._get_search_results_generator():
            if not experiment.get('files', []):
                continue
            grouped_file_audits, grouped_other_audits = group_audits_by_files_and_type(
                experiment.get('audit', {}))
            experiment_data = self._get_experiment_data(experiment)
            for file_ in experiment.get('files', []):
                if self._should_not_report_file(file_):
                    continue
                file_data = self._get_file_data(file_)
                audit_data = self._get_audit_data(
                    grouped_file_audits.get(file_.get('@id'), {}),
                    grouped_other_audits)
                file_data.update(audit_data)
                yield self.csv.writerow(
                    self._output_sorted_row(experiment_data, file_data))

    def _validate_request(self):
        type_params = self.param_list.get('type', [])
        if len(type_params) != 1:
            raise HTTPBadRequest(
                explanation='URL requires one "type" parameter.')
        return True

    def _initialize_report(self):
        self._build_header()
        self._split_column_and_fields_by_experiment_and_file()
        self._set_positive_file_param_set()

    def _build_params(self):
        self._add_fields_to_param_list()
        self._initialize_at_id_param()
        self._maybe_add_cart_elements_to_param_list()
        self._maybe_add_json_elements_to_param_list()

    def generate(self):
        self._validate_request()
        self._initialize_report()
        self._build_params()
        return Response(
            content_type=self.CONTENT_TYPE,
            app_iter=self._generate_rows(),
            content_disposition=self.CONTENT_DISPOSITION,
        )
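
A hedged sketch of the plain-report branch of the factory in Example #5; presumably _get_metadata just builds the report and returns its streaming Response:

def _get_metadata(context, request):
    # Presumed implementation: generate() validates the request, builds
    # the search params, and streams TSV rows via the Response app_iter.
    return MetadataReport(request).generate()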
Example #11
def batch_download(context, request):
    default_params = [
        ('limit', 'all'),
        ('field', 'files.href'),
        ('field', 'files.restricted'),
        ('field', 'files.file_format'),
        ('field', 'files.file_format_type'),
        ('field', 'files.status'),
        ('field', 'files.assembly'),
    ]
    qs = QueryString(request)
    param_list = qs.group_values_by_key()
    file_filters = qs.param_keys_to_list(params=qs.get_filters_by_condition(
        key_and_value_condition=lambda k, _: k.startswith('files.')))

    # Process PublicationData batch downloads separately.
    type_param = param_list.get('type', [''])[0]
    if type_param and type_param.lower() == 'publicationdata':
        return _batch_download_publicationdata(request)

    file_fields = [('field', k) for k in file_filters]
    qs.drop('limit')
    cart_uuids = param_list.get('cart', [])

    # Only allow specific type= query-string values, or cart=.
    if not type_param and not cart_uuids:
        raise HTTPBadRequest(
            explanation='URL must include a "type" or "cart" parameter.')
    if type_param and type_param.lower() not in _allowed_types:
        raise HTTPBadRequest(
            explanation='"{}" not a valid type for metadata'.format(type_param))

    # Check for the "visualizable" and/or "raw" options in the query string for file filtering.
    visualizable_only = qs.is_param('option', 'visualizable')
    raw_only = qs.is_param('option', 'raw')
    qs.drop('option')

    qs.extend(default_params + file_fields)
    experiments = []
    if request.method == 'POST':
        metadata_link = ''
        cart_uuid = qs.get_one_value(params=qs.get_key_filters(key='cart'))
        try:
            elements = request.json.get('elements', [])
        except ValueError:
            elements = []

        if cart_uuid:
            try:
                request.embed(cart_uuid, '@@object')
            except KeyError:
                raise HTTPBadRequest(
                    explanation='Specified cart does not exist.')

            # metadata.tsv link includes a cart UUID
            metadata_link = '{host_url}/metadata/?{search_params}'.format(
                host_url=request.host_url,
                search_params=qs._get_original_query_string())
        else:
            metadata_link = '{host_url}/metadata/?{search_params} -X GET -H "Accept: text/tsv" -H "Content-Type: application/json" --data \'{{"elements": [{elements_json}]}}\''.format(
                host_url=request.host_url,
                search_params=qs._get_original_query_string(),
                elements_json=','.join('"{0}"'.format(element)
                                       for element in elements))

        # Because of potential number of datasets in the cart, break search
        # into multiple searches of ELEMENT_CHUNK_SIZE datasets each.
        for i in range(0, len(elements), ELEMENT_CHUNK_SIZE):
            qs.drop('@id')
            qs.extend([('@id', e) for e in elements[i:i + ELEMENT_CHUNK_SIZE]])
            path = '/search/?{}'.format(str(qs))
            results = request.embed(quote(path), as_user=True)
            experiments.extend(results['@graph'])
    else:
        # Make sure regular batch download doesn't include a cart parameter; error if it does.
        if cart_uuids:
            raise HTTPBadRequest(
                explanation='You must download cart file manifests from the portal.')

        # Regular batch download has single simple call to request.embed
        metadata_link = '{host_url}/metadata/?{search_params}'.format(
            host_url=request.host_url,
            search_params=qs._get_original_query_string())
        path = '/search/?{}'.format(str(qs))
        results = request.embed(quote(path), as_user=True)
        experiments = results['@graph']

    exp_files = (exp_file for exp in experiments
                 for exp_file in exp.get('files', []))

    files = [metadata_link]
    param_list = qs.group_values_by_key()
    for exp_file in exp_files:
        if not files_prop_param_list(exp_file, param_list):
            continue
        elif visualizable_only and not is_file_visualizable(exp_file):
            continue
        elif raw_only and exp_file.get('assembly'):
            # "raw" option only allows files w/o assembly.
            continue
        elif restricted_files_present(exp_file):
            continue
        files.append('{host_url}{href}'.format(
            host_url=request.host_url,
            href=exp_file['href'],
        ))

    return Response(content_type='text/plain',
                    body='\n'.join(files),
                    content_disposition='attachment; filename="files.txt"')
Example #12
def metadata_tsv(context, request):
    qs = QueryString(request)
    param_list = qs.group_values_by_key()
    if 'referrer' in param_list:
        search_path = '/{}/'.format(param_list.pop('referrer')[0])
    else:
        search_path = '/search/'
    type_param = param_list.get('type', [''])[0]
    cart_uuids = param_list.get('cart', [])

    # Only allow specific type= query-string values, or cart=.
    if not type_param and not cart_uuids:
        raise HTTPBadRequest(
            explanation='URL must include a "type" or "cart" parameter.')
    if type_param and type_param.lower() not in _allowed_types:
        raise HTTPBadRequest(
            explanation='"{}" not a valid type for metadata'.format(type_param))

    # Handle special-case metadata.tsv generation.
    if type_param:
        if type_param.lower() == 'annotation':
            return _get_annotation_metadata(request, search_path, param_list)
        if type_param.lower() == 'publicationdata':
            return _get_publicationdata_metadata(request)

    param_list['field'] = []
    header = []
    file_attributes = []
    for prop in _tsv_mapping:
        if prop not in _excluded_columns:
            header.append(prop)
            if _tsv_mapping[prop][0].startswith('files'):
                file_attributes.append(_tsv_mapping[prop][0])
        param_list['field'].extend(_tsv_mapping[prop])

    # Handle metadata.tsv lines from cart-generated files.txt.
    if cart_uuids:
        # metadata.tsv line includes cart UUID, so load the specified cart and
        # get its "elements" property for a list of items to retrieve.
        cart_uuid = cart_uuids.pop()
        del param_list['cart']
        try:
            cart = request.embed(cart_uuid, '@@object')
        except KeyError:
            raise HTTPBadRequest(explanation='Specified cart does not exist.')
        else:
            if cart.get('elements'):
                param_list['@id'] = cart['elements']
    else:
        # If the metadata.tsv line includes a JSON payload, get its "elements"
        # property for a list of items to retrieve.
        try:
            elements = request.json.get('elements')
        except ValueError:
            pass
        else:
            param_list['@id'] = elements
    default_params = [
        ('field', 'audit'),
        ('limit', 'all'),
    ]
    field_params = [('field', p) for p in param_list.get('field', [])]
    at_id_params = [('@id', p) for p in param_list.get('@id', [])]
    qs.drop('limit')

    # Check for the "visualizable" and/or "raw" options in the query string for file filtering.
    visualizable_only = qs.is_param('option', 'visualizable')
    raw_only = qs.is_param('option', 'raw')
    qs.drop('option')

    qs.extend(default_params + field_params + at_id_params)
    path = '{}?{}'.format(search_path, str(qs))
    results = request.embed(quote(path), as_user=True)
    rows = []
    for experiment_json in results['@graph']:
        if experiment_json.get('files', []):
            exp_data_row = []
            for column in header:
                if not _tsv_mapping[column][0].startswith('files'):
                    make_cell(column, experiment_json, exp_data_row)

            f_attributes = [
                'files.title', 'files.file_type', 'files.file_format',
                'files.file_format_type', 'files.output_type', 'files.assembly'
            ]

            for f in experiment_json['files']:
                if not files_prop_param_list(f, param_list):
                    continue
                if visualizable_only and not is_file_visualizable(f):
                    continue
                if raw_only and f.get('assembly'):
                    # "raw" option only allows files w/o assembly.
                    continue
                if restricted_files_present(f):
                    continue
                if is_no_file_available(f):
                    continue
                f['href'] = request.host_url + f['href']
                f_row = []
                for attr in f_attributes:
                    # Strip the 'files.' prefix to read the attribute
                    # directly off the file object.
                    f_row.append(f.get(attr[6:], ''))
                data_row = f_row + exp_data_row
                for prop in file_attributes:
                    if prop in f_attributes:
                        continue
                    path = prop[6:]
                    temp = [str(value) for value in simple_path_ids(f, path)]
                    if prop == 'files.replicate.rbns_protein_concentration':
                        replicate = f.get('replicate', {})
                        if 'rbns_protein_concentration_units' in replicate:
                            temp[0] += ' ' + replicate['rbns_protein_concentration_units']
                    if prop in ['files.paired_with', 'files.derived_from']:
                        # Chop the @id path down to just the accession.
                        if temp:
                            temp = [t[7:-1] for t in temp]
                    data = sorted(set(temp))
                    data_row.append(', '.join(data))
                audit_info = [
                    make_audit_cell(audit_type, experiment_json, f)
                    for audit_type in _audit_mapping
                ]
                data_row.extend(audit_info)
                rows.append(data_row)
    fout = io.StringIO()
    writer = csv.writer(fout, delimiter='\t', lineterminator='\n')
    header.extend(_audit_mapping)
    writer.writerow(header)
    writer.writerows(rows)
    return Response(content_type='text/tsv',
                    body=fout.getvalue(),
                    content_disposition='attachment; filename="metadata.tsv"')
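
A hedged client-side sketch of fetching this endpoint, mirroring the curl-style metadata link built in Example #11; the host URL, params, and elements are placeholders:

import requests

def download_metadata_tsv(host_url, search_params, elements):
    # GET with a JSON body, matching the '-X GET --data' link the batch
    # download code emits; the view reads request.json.get('elements', []).
    response = requests.get(
        f'{host_url}/metadata/?{search_params}',
        headers={'Accept': 'text/tsv'},
        json={'elements': elements},
    )
    return response.text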
Example #13
def _batch_download_publicationdata(request):
    """
    Generate PublicationData files.txt.

        :param request: Pyramid request
    """

    # Parse the batch_download request query string.
    qs = QueryString(request)
    param_list = qs.group_values_by_key()

    # Get the required "dataset={path}" parameter.
    dataset_path = param_list.get('dataset', [''])[0]

    # Retrieve the files property of the requested PublicationData object.
    publication_data = request.embed(dataset_path, as_user=True)
    file_ids = publication_data.get('files', [])

    # Generate the metadata link that heads the file.
    metadata_link = '{host_url}/metadata/?{search_params}'.format(
        host_url=request.host_url,
        search_params=qs._get_original_query_string())

    # Generate the content of files.txt starting with the metadata.tsv download line and then each
    # file's download URL.
    files = [metadata_link]
    dataset_type = ''
    if file_ids:
        for file_id in file_ids:
            # Request individual file object from its path.
            file = request.embed(file_id, as_user=True)

            # All file datasets need to belong to the same type of dataset.
            if dataset_type:
                # See if subsequent dataset types match the first one we found.
                file_dataset_type_match = type_re.match(file.get('dataset', ''))
                if file_dataset_type_match and file_dataset_type_match.group(1) != dataset_type:
                    raise HTTPBadRequest(
                        explanation='File dataset types must be homogeneous')
            else:
                # Establish the first dataset type we find.
                dataset_type_match = type_re.match(file.get('dataset', ''))
                if dataset_type_match:
                    dataset_type = dataset_type_match.group(1)

            # Other disqualifying conditions.
            if restricted_files_present(file):
                continue
            if is_no_file_available(file):
                continue

            # Finally append file to files.txt.
            files.append('{host_url}{href}'.format(host_url=request.host_url,
                                                   href=file['href']))

    # Initiate the files.txt download.
    return Response(content_type='text/plain',
                    body='\n'.join(files),
                    content_disposition='attachment; filename="files.txt"')
Example #14
def _get_publicationdata_metadata(request):
    """
    Generate PublicationData metadata.tsv.

        :param request: Pyramid request
    """
    qs = QueryString(request)
    param_list = qs.group_values_by_key()

    # Get the required "dataset={path}" parameter.
    dataset_path = param_list.get('dataset', [''])[0]

    # Open the metadata.tsv file for writing.
    fout = io.StringIO()
    writer = csv.writer(fout, delimiter='\t')

    # Build the column-title header row and write it to the file.
    header = [
        header for header in _tsv_mapping_publicationdata
        if header not in _excluded_columns
    ]
    writer.writerow(header)

    # Load the specified PublicationData object and extract its files to build the rows.
    dataset = request.embed(dataset_path, as_user=True)
    file_ids = dataset.get('files', [])
    if file_ids:
        for file_id in file_ids:
            # Load the file object and disqualify those we don't handle.
            file = request.embed(file_id, as_user=True)

            # Pull the biosample ontology details used by several columns.
            biosample_ontology = file.get('biosample_ontology', {})
            if restricted_files_present(file):
                continue
            if is_no_file_available(file):
                continue

            # Extract the file's dataset accession from the @id; avoids loading the dataset object.
            dataset_accession = ''
            accession_match = accession_re.match(file.get('dataset', ''))
            if accession_match:
                dataset_accession = accession_match.group(1)

            # Extract the file's derived_from accessions from their @id.
            derived_from_accessions = []
            derived_from_file_ids = file.get('derived_from', [])
            for derived_from_file_id in derived_from_file_ids:
                accession_match = accession_re.match(derived_from_file_id)
                if accession_match:
                    derived_from_accessions.append(accession_match.group(1))

            # Build the row's data; must sync with _tsv_mapping_publicationdata.
            row = [
                file.get('title', ''),
                dataset_accession,
                file.get('file_format', ''),
                file.get('file_type', ''),
                file.get('output_type', ''),
                file.get('assay_term_name', ''),
                biosample_ontology.get('term_id'),
                biosample_ontology.get('term_name'),
                biosample_ontology.get('classification'),
                file.get('target', {}).get('label', ''),
                dataset.get('accession', ''),
                dataset.get('date_released', ''),
                dataset.get('award', {}).get('project', ''),
                file.get('lab', {}).get('title', ''),
                file.get('md5sum', ''),
                ', '.join(file.get('dbxrefs', '')),
                file.get('href', ''),
                file.get('assembly', ''),
                file.get('status', ''),
                ', '.join(derived_from_accessions),
                file.get('cloud_metadata', {}).get('url', ''),
                file.get('file_size', ''),
            ]
            writer.writerow(row)

    # All rows collected; return the TSV download. Datasets without files
    # get a header-only metadata.tsv.
    return Response(content_type='text/tsv',
                    body=fout.getvalue(),
                    content_disposition='attachment; filename="metadata.tsv"')
Example #15
# Cart.__init__ (see Example #4 for the full class).
def __init__(self, request, uuids=None, max_cart_elements=None):
    self.request = request
    self.query_string = QueryString(request)
    self.uuids = uuids or []
    self.max_cart_elements = max_cart_elements
    self._elements = []
Example #16
def test_searches_parsers_query_string_init(dummy_request):
    from snovault.elasticsearch.searches.parsers import QueryString
    qs = QueryString(dummy_request)
    assert isinstance(qs, QueryString)
Example #17
def batch_download(context, request):
    default_params = [('limit', 'all'), ('field', 'files.href'),
                      ('field', 'files.restricted')]
    qs = QueryString(request)
    file_filters = qs.param_keys_to_list(params=qs.get_filters_by_condition(
        key_and_value_condition=lambda k, _: k.startswith('files.')))
    file_fields = [('field', k) for k in file_filters]
    qs.drop('limit')
    qs.extend(default_params + file_fields)
    experiments = []
    if request.method == 'POST':
        metadata_link = ''
        cart_uuid = qs.get_one_value(params=qs.get_key_filters(key='cart'))
        try:
            elements = request.json.get('elements', [])
        except ValueError:
            elements = []
        if cart_uuid:
            # metadata.tsv link includes a cart UUID
            metadata_link = '{host_url}/metadata/?{search_params}'.format(
                host_url=request.host_url,
                search_params=qs._get_original_query_string())
        else:
            metadata_link = '{host_url}/metadata/?{search_params} -X GET -H "Accept: text/tsv" -H "Content-Type: application/json" --data \'{{"elements": [{elements_json}]}}\''.format(
                host_url=request.host_url,
                search_params=qs._get_original_query_string(),
                elements_json=','.join('"{0}"'.format(element)
                                       for element in elements))

        # Because of potential number of datasets in the cart, break search
        # into multiple searches of ELEMENT_CHUNK_SIZE datasets each.
        for i in range(0, len(elements), ELEMENT_CHUNK_SIZE):
            qs.drop('@id')
            qs.extend([('@id', e) for e in elements[i:i + ELEMENT_CHUNK_SIZE]])
            path = '/search/?{}'.format(str(qs))
            results = request.embed(quote(path), as_user=True)
            experiments.extend(results['@graph'])
    else:
        # Regular batch download has single simple call to request.embed
        metadata_link = '{host_url}/metadata/?{search_params}'.format(
            host_url=request.host_url,
            search_params=qs._get_original_query_string())
        path = '/search/?{}'.format(str(qs))
        results = request.embed(quote(path), as_user=True)
        experiments = results['@graph']

    exp_files = (exp_file for exp in experiments
                 for exp_file in exp.get('files', []))

    files = [metadata_link]
    param_list = qs.group_values_by_key()
    for exp_file in exp_files:
        if not files_prop_param_list(exp_file, param_list):
            continue
        elif restricted_files_present(exp_file):
            continue
        files.append('{host_url}{href}'.format(
            host_url=request.host_url,
            href=exp_file['href'],
        ))

    return Response(content_type='text/plain',
                    body='\n'.join(files),
                    content_disposition='attachment; filename="files.txt"')
Example #18
# PublicationDataMetadataReport.__init__ (see Example #19 for the full class).
def __init__(self, request):
    super().__init__(request)
    self.file_query_string = QueryString(request)
    self.file_params = []
    self.file_at_ids = []
Example #19
class PublicationDataMetadataReport(MetadataReport):
    '''
    PublicationData objects don't embed file attributes, so we have to
    get file metadata with a separate search request. We try to get all
    of the file metadata together in a batched request instead of making
    a request for every file. This requires some extra machinery compared
    to the normal MetadataReport.
    '''

    DEFAULT_PARAMS = [('limit', 'all'), ('field', 'files')]
    DEFAULT_FILE_PARAMS = [
        ('type', 'File'),
        ('limit', 'all'),
        ('field', '@id'),
        ('field', 'href'),
        ('field', 'restricted'),
        ('field', 'no_file_available'),
        ('field', 'file_format'),
        ('field', 'file_format_type'),
        ('field', 'status'),
        ('field', 'assembly'),
    ]

    def __init__(self, request):
        super().__init__(request)
        self.file_query_string = QueryString(request)
        self.file_params = []
        self.file_at_ids = []

    # Overrides parent.
    def _get_column_to_fields_mapping(self):
        return PUBLICATION_DATA_METADATA_COLUMN_TO_FIELDS_MAPPING

    # Overrides parent.
    def _build_header(self):
        for column in self._get_column_to_fields_mapping():
            if column not in self.EXCLUDED_COLUMNS:
                self.header.append(column)

    # Overrides parent.
    def _add_fields_to_param_list(self):
        self.param_list['field'] = []
        for column, fields in self.experiment_column_to_fields_mapping.items():
            self.param_list['field'].extend(fields)

    def _add_default_file_params_to_file_params(self):
        self.file_params.extend(self.DEFAULT_FILE_PARAMS)

    def _add_report_file_fields_to_file_params(self):
        for column, fields in self.file_column_to_fields_mapping.items():
            self.file_params.extend([('field', field) for field in fields])

    def _convert_experiment_params_to_file_params(self):
        return [(k.replace('files.', ''), v)
                for k, v in self.query_string._get_original_params()
                if k.startswith('files.')]

    def _add_experiment_file_filters_as_fields_to_file_params(self):
        self.file_params.extend(
            ('field', k)
            for k, _ in self._convert_experiment_params_to_file_params())

    def _add_experiment_file_filters_to_file_params(self):
        self.file_params.extend(
            self._convert_experiment_params_to_file_params())

    def _build_file_params(self):
        self._add_default_file_params_to_file_params()
        self._add_report_file_fields_to_file_params()
        self._add_experiment_file_filters_as_fields_to_file_params()
        self._add_experiment_file_filters_to_file_params()

    def _filter_file_params_from_query_string(self):
        self.query_string.params = [(k, v) for k, v in self.query_string.params
                                    if not k.startswith('files.')]

    # Overrides parent.
    def _build_params(self):
        super()._build_params()
        self._build_file_params()
        self._filter_file_params_from_query_string()

    def _get_at_id_file_params(self):
        return [('@id', file_at_id) for file_at_id in self.file_at_ids]

    def _build_new_file_request(self):
        self.file_query_string.params = (self.file_params +
                                         self._get_at_id_file_params())
        request = self.file_query_string.get_request_with_new_query_string()
        request.path_info = self._get_search_path()
        return request

    def _get_file_search_results_generator(self):
        request = self._build_new_file_request()
        bsg = BatchedSearchGenerator(request)
        return bsg.results()

    # Overrides parent.
    def _generate_rows(self):
        yield self.csv.writerow(self.header)
        for experiment in self._get_search_results_generator():
            self.file_at_ids = experiment.get('files', [])
            if not self.file_at_ids:
                continue
            experiment_data = self._get_experiment_data(experiment)
            for file_ in self._get_file_search_results_generator():
                if self._should_not_report_file(file_):
                    continue
                file_data = self._get_file_data(file_)
                yield self.csv.writerow(
                    self._output_sorted_row(experiment_data, file_data))
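
Usage mirrors the parent class since generate() is inherited; a hedged sketch of the factory branch named in Example #5:

def _get_publication_data_metadata(context, request):
    # Presumed implementation of the branch metadata_report_factory
    # dispatches to for type=PublicationData.
    return PublicationDataMetadataReport(request).generate()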