Example #1
class BatchedSearchGenerator:

    SEARCH_PATH = '/search/'
    DEFAULT_PARAMS = [('limit', 'all')]

    def __init__(self, request, batch_field='@id', batch_size=5000):
        self.request = request
        self.batch_field = batch_field
        self.batch_size = batch_size
        self.query_string = QueryString(request)
        self.param_list = self.query_string.group_values_by_key()
        self.batch_param_values = self.param_list.get(batch_field, []).copy()

    def _make_batched_values_from_batch_param_values(self):
        end = len(self.batch_param_values)
        for start in range(0, end, self.batch_size):
            yield self.batch_param_values[start:min(start + self.batch_size, end)]

    def _make_batched_params_from_batched_values(self, batched_values):
        return [(self.batch_field, batched_value)
                for batched_value in batched_values]

    def _build_new_request(self, batched_params):
        self.query_string.drop('limit')
        self.query_string.drop(self.batch_field)
        self.query_string.extend(batched_params + self.DEFAULT_PARAMS)
        request = self.query_string.get_request_with_new_query_string()
        request.path_info = self.SEARCH_PATH
        request.registry = self.request.registry
        return request

    def results(self):
        # With no values to batch on, fall back to a single search that
        # just applies the default params.
        if not self.batch_param_values:
            yield from search_generator(self._build_new_request([]))['@graph']
        for batched_values in self._make_batched_values_from_batch_param_values():
            batched_params = self._make_batched_params_from_batched_values(
                batched_values)
            request = self._build_new_request(batched_params)
            yield from search_generator(request)['@graph']
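
A minimal usage sketch for the class above (hedged: request stands for a
Pyramid-style request object, and process is a placeholder for caller code;
QueryString and search_generator come from the surrounding search package):

    # Thousands of '@id' filters get split into batches of 5000, so each
    # underlying search request stays a manageable size.
    generator = BatchedSearchGenerator(request, batch_field='@id', batch_size=5000)
    for item in generator.results():
        process(item)  # each item is one object from a search '@graph' page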
Example #2
def batch_download(context, request):
    default_params = [
        ('limit', 'all'),
        ('field', 'files.href'),
        ('field', 'files.restricted'),
        ('field', 'files.file_format'),
        ('field', 'files.file_format_type'),
        ('field', 'files.status'),
        ('field', 'files.assembly'),
    ]
    qs = QueryString(request)
    param_list = qs.group_values_by_key()
    file_filters = qs.param_keys_to_list(params=qs.get_filters_by_condition(
        key_and_value_condition=lambda k, _: k.startswith('files.')))

    # Process PublicationData batch downloads separately.
    type_param = param_list.get('type', [''])[0]
    if type_param and type_param.lower() == 'publicationdata':
        return _batch_download_publicationdata(request)

    file_fields = [('field', k) for k in file_filters]
    qs.drop('limit')
    cart_uuids = param_list.get('cart', [])

    # Only allow specific type= query-string values, or cart=.
    if not type_param and not cart_uuids:
        raise HTTPBadRequest(
            explanation='URL must include a "type" or "cart" parameter.')
    if type_param.lower() not in _allowed_types:
        raise HTTPBadRequest(
            explanation='"{}" is not a valid type for metadata'.format(type_param))

    # Check for the "visualizable" and/or "raw" options in the query string for file filtering.
    visualizable_only = qs.is_param('option', 'visualizable')
    raw_only = qs.is_param('option', 'raw')
    qs.drop('option')

    qs.extend(default_params + file_fields)
    experiments = []
    if request.method == 'POST':
        metadata_link = ''
        cart_uuid = qs.get_one_value(params=qs.get_key_filters(key='cart'))
        try:
            elements = request.json.get('elements', [])
        except ValueError:
            elements = []

        if cart_uuid:
            try:
                request.embed(cart_uuid, '@@object')
            except KeyError:
                raise HTTPBadRequest(
                    explanation='Specified cart does not exist.')

            # metadata.tsv link includes a cart UUID
            metadata_link = '{host_url}/metadata/?{search_params}'.format(
                host_url=request.host_url,
                search_params=qs._get_original_query_string())
        else:
            # Render the metadata.tsv link as a curl command that carries the
            # selected elements in a JSON request body.
            metadata_link = (
                '{host_url}/metadata/?{search_params} '
                '-X GET -H "Accept: text/tsv" -H "Content-Type: application/json" '
                '--data \'{{"elements": [{elements_json}]}}\''
            ).format(
                host_url=request.host_url,
                search_params=qs._get_original_query_string(),
                elements_json=','.join('"{}"'.format(element) for element in elements),
            )

        # Because of potential number of datasets in the cart, break search
        # into multiple searches of ELEMENT_CHUNK_SIZE datasets each.
        for i in range(0, len(elements), ELEMENT_CHUNK_SIZE):
            qs.drop('@id')
            qs.extend([('@id', e) for e in elements[i:i + ELEMENT_CHUNK_SIZE]])
            path = '/search/?{}'.format(str(qs))
            results = request.embed(quote(path), as_user=True)
            experiments.extend(results['@graph'])
    else:
        # Make sure regular batch download doesn't include a cart parameter; error if it does.
        if cart_uuids:
            raise HTTPBadRequest(
                explanation='You must download cart file manifests from the portal.')

        # Regular batch download has single simple call to request.embed
        metadata_link = '{host_url}/metadata/?{search_params}'.format(
            host_url=request.host_url,
            search_params=qs._get_original_query_string())
        path = '/search/?{}'.format(str(qs))
        results = request.embed(quote(path), as_user=True)
        experiments = results['@graph']

    exp_files = (exp_file for exp in experiments
                 for exp_file in exp.get('files', []))

    files = [metadata_link]
    param_list = qs.group_values_by_key()
    for exp_file in exp_files:
        if not files_prop_param_list(exp_file, param_list):
            continue
        elif visualizable_only and not is_file_visualizable(exp_file):
            continue
        elif raw_only and exp_file.get('assembly'):
            # "raw" option only allows files w/o assembly.
            continue
        elif restricted_files_present(exp_file):
            continue
        files.append('{host_url}{href}'.format(
            host_url=request.host_url,
            href=exp_file['href'],
        ))

    return Response(content_type='text/plain',
                    body='\n'.join(files),
                    content_disposition='attachment; filename="files.txt"')
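
The POST branch above breaks the cart elements into searches of
ELEMENT_CHUNK_SIZE datasets each. A self-contained sketch of the same slicing
pattern (the chunk size here is illustrative, not the module's constant):

    ELEMENT_CHUNK_SIZE = 1000  # illustrative value

    def chunk_elements(elements, size=ELEMENT_CHUNK_SIZE):
        # Yield consecutive slices of at most `size` elements.
        for i in range(0, len(elements), size):
            yield elements[i:i + size]

    assert list(chunk_elements(list(range(5)), size=2)) == [[0, 1], [2, 3], [4]]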
Example #3
def metadata_tsv(context, request):
    qs = QueryString(request)
    param_list = qs.group_values_by_key()
    if 'referrer' in param_list:
        search_path = '/{}/'.format(param_list.pop('referrer')[0])
    else:
        search_path = '/search/'
    type_param = param_list.get('type', [''])[0]
    cart_uuids = param_list.get('cart', [])

    # Only allow specific type= query-string values, or cart=.
    if not type_param and not cart_uuids:
        raise HTTPBadRequest(
            explanation='URL must include a "type" or "cart" parameter.')
    if type_param.lower() not in _allowed_types:
        raise HTTPBadRequest(
            explanation='"{}" is not a valid type for metadata'.format(type_param))

    # Handle special-case metadata.tsv generation.
    if type_param:
        if type_param.lower() == 'annotation':
            return _get_annotation_metadata(request, search_path, param_list)
        if type_param.lower() == 'publicationdata':
            return _get_publicationdata_metadata(request)

    param_list['field'] = []
    header = []
    file_attributes = []
    for prop in _tsv_mapping:
        if prop not in _excluded_columns:
            header.append(prop)
            if _tsv_mapping[prop][0].startswith('files'):
                file_attributes.append(_tsv_mapping[prop][0])
        param_list['field'].extend(_tsv_mapping[prop])

    # Handle metadata.tsv lines from cart-generated files.txt.
    if cart_uuids:
        # metadata.tsv line includes cart UUID, so load the specified cart and
        # get its "elements" property for a list of items to retrieve.
        cart_uuid = cart_uuids.pop()
        del param_list['cart']
        try:
            cart = request.embed(cart_uuid, '@@object')
        except KeyError:
            raise HTTPBadRequest(explanation='Specified cart does not exist.')
        else:
            if cart.get('elements'):
                param_list['@id'] = cart['elements']
    else:
        # If the metadata.tsv line includes a JSON payload, get its "elements"
        # property for a list of items to retrieve.
        try:
            elements = request.json.get('elements')
        except ValueError:
            pass
        else:
            param_list['@id'] = elements
    default_params = [
        ('field', 'audit'),
        ('limit', 'all'),
    ]
    field_params = [('field', p) for p in param_list.get('field', [])]
    at_id_params = [('@id', p) for p in param_list.get('@id', [])]
    qs.drop('limit')

    # Check for the "visualizable" and/or "raw" options in the query string for file filtering.
    visualizable_only = qs.is_param('option', 'visualizable')
    raw_only = qs.is_param('option', 'raw')
    qs.drop('option')

    qs.extend(default_params + field_params + at_id_params)
    path = '{}?{}'.format(search_path, str(qs))
    results = request.embed(quote(path), as_user=True)
    rows = []
    for experiment_json in results['@graph']:
        if experiment_json.get('files', []):
            exp_data_row = []
            for column in header:
                if not _tsv_mapping[column][0].startswith('files'):
                    make_cell(column, experiment_json, exp_data_row)

            f_attributes = [
                'files.title', 'files.file_type', 'files.file_format',
                'files.file_format_type', 'files.output_type', 'files.assembly'
            ]

            for f in experiment_json['files']:
                if not files_prop_param_list(f, param_list):
                    continue
                if visualizable_only and not is_file_visualizable(f):
                    continue
                if raw_only and f.get('assembly'):
                    # "raw" option only allows files w/o assembly.
                    continue
                if restricted_files_present(f):
                    continue
                if is_no_file_available(f):
                    continue
                f['href'] = request.host_url + f['href']
                f_row = []
                for attr in f_attributes:
                    # attr[6:] strips the 'files.' prefix to get the file's
                    # own property name.
                    f_row.append(f.get(attr[6:], ''))
                data_row = f_row + exp_data_row
                for prop in file_attributes:
                    if prop in f_attributes:
                        continue
                    path = prop[6:]  # strip the 'files.' prefix
                    temp = [str(value) for value in simple_path_ids(f, path)]
                    if prop == 'files.replicate.rbns_protein_concentration':
                        units = f.get('replicate', {}).get(
                            'rbns_protein_concentration_units')
                        if units and temp:
                            temp[0] = temp[0] + ' ' + units
                    if prop in ['files.paired_with', 'files.derived_from']:
                        # Chop each @id path (e.g. '/files/ENCFF000ABC/') down
                        # to just the accession.
                        temp = [t[7:-1] for t in temp]
                    data = sorted(set(temp))
                    data_row.append(', '.join(data))
                audit_info = [
                    make_audit_cell(audit_type, experiment_json, f)
                    for audit_type in _audit_mapping
                ]
                data_row.extend(audit_info)
                rows.append(data_row)
    fout = io.StringIO()
    writer = csv.writer(fout, delimiter='\t', lineterminator='\n')
    header.extend(_audit_mapping)
    writer.writerow(header)
    writer.writerows(rows)
    return Response(content_type='text/tsv',
                    body=fout.getvalue(),
                    content_disposition='attachment; filename="metadata.tsv"')
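
The tab-separated body above is produced with csv.writer configured with a tab
delimiter. A self-contained sketch of that pattern (column names and values
are illustrative):

    import csv
    import io

    fout = io.StringIO()
    writer = csv.writer(fout, delimiter='\t', lineterminator='\n')
    writer.writerow(['File accession', 'File format'])  # header row
    writer.writerows([['ENCFF000ABC', 'fastq']])        # data rows
    body = fout.getvalue()  # becomes the Response body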
Example #4
class MetadataReport:

    SEARCH_PATH = '/search/'
    EXCLUDED_COLUMNS = (
        'Restricted',
        'No File Available',
    )
    DEFAULT_PARAMS = [
        ('field', 'audit'),
        ('field', 'files.@id'),
        ('field', 'files.restricted'),
        ('field', 'files.no_file_available'),
        ('field', 'files.file_format'),
        ('field', 'files.file_format_type'),
        ('field', 'files.status'),
        ('field', 'files.assembly'),
        ('limit', 'all'),
    ]
    CONTENT_TYPE = 'text/tsv'
    CONTENT_DISPOSITION = 'attachment; filename="metadata.tsv"'

    def __init__(self, request):
        self.request = request
        self.query_string = QueryString(request)
        self.param_list = self.query_string.group_values_by_key()
        self.positive_file_param_set = {}
        self.header = []
        self.experiment_column_to_fields_mapping = OrderedDict()
        self.file_column_to_fields_mapping = OrderedDict()
        self.visualizable_only = self.query_string.is_param(
            'option', 'visualizable')
        self.raw_only = self.query_string.is_param('option', 'raw')
        self.csv = CSVGenerator()

    def _get_column_to_fields_mapping(self):
        return METADATA_COLUMN_TO_FIELDS_MAPPING

    def _build_header(self):
        for column in self._get_column_to_fields_mapping():
            if column not in self.EXCLUDED_COLUMNS:
                self.header.append(column)
        for _, column in METADATA_AUDIT_TO_AUDIT_COLUMN_MAPPING:
            self.header.append(column)

    def _split_column_and_fields_by_experiment_and_file(self):
        for column, fields in self._get_column_to_fields_mapping().items():
            if fields[0].startswith('files'):
                self.file_column_to_fields_mapping[column] = [
                    field.replace('files.', '') for field in fields
                ]
            else:
                self.experiment_column_to_fields_mapping[column] = fields

    def _set_positive_file_param_set(self):
        self.positive_file_param_set = {
            k.replace('files.', ''): set(map_strings_to_booleans_and_ints(v))
            for k, v in self.param_list.items()
            if k.startswith('files.') and '!' not in k
        }

    def _add_positive_file_filters_as_fields_to_param_list(self):
        self.param_list['field'] = self.param_list.get('field', [])
        self.param_list['field'].extend(
            (k for k, v in self.query_string._get_original_params()
             if k.startswith('files.') and '!' not in k))

    def _add_fields_to_param_list(self):
        self.param_list['field'] = self.param_list.get('field', [])
        for column, fields in self._get_column_to_fields_mapping().items():
            self.param_list['field'].extend(fields)
        self._add_positive_file_filters_as_fields_to_param_list()

    def _initialize_at_id_param(self):
        self.param_list['@id'] = self.param_list.get('@id', [])

    def _maybe_add_cart_elements_to_param_list(self):
        # Don't need to limit max_cart_elements here since
        # search is batched.
        cart = CartWithElements(self.request, max_cart_elements=None)
        self.param_list['@id'].extend(cart.elements)
        self.param_list.pop('cart', None)

    def _get_json_elements_or_empty_list(self):
        try:
            return self.request.json.get('elements', [])
        except ValueError:
            return []

    def _maybe_add_json_elements_to_param_list(self):
        self.param_list['@id'].extend(self._get_json_elements_or_empty_list())

    def _get_field_params(self):
        return [('field', p) for p in self.param_list.get('field', [])]

    def _get_at_id_params(self):
        return [('@id', p) for p in self.param_list.get('@id', [])]

    def _get_default_params(self):
        return self.DEFAULT_PARAMS

    def _build_query_string(self):
        self.query_string.drop('limit')
        self.query_string.drop('option')
        self.query_string.extend(self._get_default_params() +
                                 self._get_field_params() +
                                 self._get_at_id_params())

    def _get_search_path(self):
        return self.SEARCH_PATH

    def _build_new_request(self):
        self._build_query_string()
        request = self.query_string.get_request_with_new_query_string()
        request.path_info = self._get_search_path()
        return request

    def _get_search_results_generator(self):
        return BatchedSearchGenerator(self._build_new_request()).results()

    def _should_not_report_file(self, file_):
        conditions = [
            not file_matches_file_params(file_, self.positive_file_param_set),
            self.visualizable_only and not is_file_visualizable(file_),
            self.raw_only and file_.get('assembly'),
            file_.get('restricted'),
            file_.get('no_file_available'),
        ]
        return any(conditions)

    def _get_experiment_data(self, experiment):
        return {
            column: make_experiment_cell(fields, experiment)
            for column, fields in
            self.experiment_column_to_fields_mapping.items()
        }

    def _get_file_data(self, file_):
        file_['href'] = self.request.host_url + file_['href']
        return {
            column: make_file_cell(fields, file_)
            for column, fields in self.file_column_to_fields_mapping.items()
        }

    def _get_audit_data(self, grouped_audits_for_file, grouped_other_audits):
        return {
            audit_column: ', '.join(
                set(
                    grouped_audits_for_file.get(audit_type, []) +
                    grouped_other_audits.get(audit_type, [])))
            for audit_type, audit_column in
            METADATA_AUDIT_TO_AUDIT_COLUMN_MAPPING
        }

    def _output_sorted_row(self, experiment_data, file_data):
        row = []
        for column in self.header:
            row.append(file_data.get(column, experiment_data.get(column)))
        return row

    def _generate_rows(self):
        yield self.csv.writerow(self.header)
        for experiment in self._get_search_results_generator():
            if not experiment.get('files', []):
                continue
            grouped_file_audits, grouped_other_audits = group_audits_by_files_and_type(
                experiment.get('audit', {}))
            experiment_data = self._get_experiment_data(experiment)
            for file_ in experiment.get('files', []):
                if self._should_not_report_file(file_):
                    continue
                file_data = self._get_file_data(file_)
                audit_data = self._get_audit_data(
                    grouped_file_audits.get(file_.get('@id'), {}),
                    grouped_other_audits)
                file_data.update(audit_data)
                yield self.csv.writerow(
                    self._output_sorted_row(experiment_data, file_data))

    def _validate_request(self):
        type_params = self.param_list.get('type', [])
        if len(type_params) != 1:
            raise HTTPBadRequest(
                explanation='URL requires one "type" parameter.')
        return True

    def _initialize_report(self):
        self._build_header()
        self._split_column_and_fields_by_experiment_and_file()
        self._set_positive_file_param_set()

    def _build_params(self):
        self._add_fields_to_param_list()
        self._initialize_at_id_param()
        self._maybe_add_cart_elements_to_param_list()
        self._maybe_add_json_elements_to_param_list()

    def generate(self):
        self._validate_request()
        self._initialize_report()
        self._build_params()
        return Response(
            content_type=self.CONTENT_TYPE,
            app_iter=self._generate_rows(),
            content_disposition=self.CONTENT_DISPOSITION,
        )
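
A usage sketch for the class above (hedged: assuming a Pyramid-style view in
the surrounding package wires it to a route, roughly like this):

    def metadata_tsv(context, request):
        # MetadataReport validates the query string, builds the batched
        # search params, and streams TSV rows through the Response app_iter.
        return MetadataReport(request).generate()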
Example #5
def batch_download(context, request):
    default_params = [('limit', 'all'), ('field', 'files.href'),
                      ('field', 'files.restricted')]
    qs = QueryString(request)
    file_filters = qs.param_keys_to_list(params=qs.get_filters_by_condition(
        key_and_value_condition=lambda k, _: k.startswith('files.')))
    file_fields = [('field', k) for k in file_filters]
    qs.drop('limit')
    qs.extend(default_params + file_fields)
    experiments = []
    if request.method == 'POST':
        metadata_link = ''
        cart_uuid = qs.get_one_value(params=qs.get_key_filters(key='cart'))
        try:
            elements = request.json.get('elements', [])
        except ValueError:
            elements = []
        if cart_uuid:
            # metadata.tsv link includes a cart UUID
            metadata_link = '{host_url}/metadata/?{search_params}'.format(
                host_url=request.host_url,
                search_params=qs._get_original_query_string())
        else:
            # Render the metadata.tsv link as a curl command that carries the
            # selected elements in a JSON request body.
            metadata_link = (
                '{host_url}/metadata/?{search_params} '
                '-X GET -H "Accept: text/tsv" -H "Content-Type: application/json" '
                '--data \'{{"elements": [{elements_json}]}}\''
            ).format(
                host_url=request.host_url,
                search_params=qs._get_original_query_string(),
                elements_json=','.join('"{}"'.format(element) for element in elements),
            )

        # Because of potential number of datasets in the cart, break search
        # into multiple searches of ELEMENT_CHUNK_SIZE datasets each.
        for i in range(0, len(elements), ELEMENT_CHUNK_SIZE):
            qs.drop('@id')
            qs.extend([('@id', e) for e in elements[i:i + ELEMENT_CHUNK_SIZE]])
            path = '/search/?{}'.format(str(qs))
            results = request.embed(quote(path), as_user=True)
            experiments.extend(results['@graph'])
    else:
        # Regular batch download has single simple call to request.embed
        metadata_link = '{host_url}/metadata/?{search_params}'.format(
            host_url=request.host_url,
            search_params=qs._get_original_query_string())
        path = '/search/?{}'.format(str(qs))
        results = request.embed(quote(path), as_user=True)
        experiments = results['@graph']

    exp_files = (exp_file for exp in experiments
                 for exp_file in exp.get('files', []))

    files = [metadata_link]
    param_list = qs.group_values_by_key()
    for exp_file in exp_files:
        if not files_prop_param_list(exp_file, param_list):
            continue
        elif restricted_files_present(exp_file):
            continue
        files.append('{host_url}{href}'.format(
            host_url=request.host_url,
            href=exp_file['href'],
        ))

    return Response(content_type='text/plain',
                    body='\n'.join(files),
                    content_disposition='attachment; filename="files.txt"')
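
The exp_files generator expression above flattens the files of every
experiment lazily, without building an intermediate list. A self-contained
sketch of the pattern (data is illustrative):

    experiments = [
        {'files': [{'href': '/a'}, {'href': '/b'}]},
        {},  # experiments without files are skipped via .get('files', [])
    ]
    exp_files = (f for exp in experiments for f in exp.get('files', []))
    print([f['href'] for f in exp_files])  # ['/a', '/b']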