def __init__(self, request, batch_field='@id', batch_size=5000):
    '''
    Store the request plus batching configuration, and pull the values
    for `batch_field` out of the parsed query string for later batching.
    '''
    self.request = request
    self.batch_field = batch_field
    self.batch_size = batch_size
    query_string = QueryString(request)
    self.query_string = query_string
    self.param_list = query_string.group_values_by_key()
    # Copy so later mutation of the query string doesn't affect batching.
    self.batch_param_values = list(self.param_list.get(batch_field, []))
def batch_download_factory(context, request):
    '''Route a batch-download request to the handler for its `type=` value.'''
    query_string = QueryString(request)
    requested_type = query_string.get_one_value(
        params=query_string.get_type_filters()
    )
    handler = (
        _get_publication_data_batch_download
        if requested_type == 'PublicationData'
        else _get_batch_download
    )
    return handler(context, request)
def item_view_object_with_select_calculated_properties(context, request):
    '''
    Return the item's link properties merged with any calculated
    properties selected via `field=` query-string parameters.
    '''
    query_string = QueryString(request)
    requested_fields = query_string.param_values_to_list(
        params=query_string.get_field_filters()
    )
    properties = item_links(context, request)
    properties.update(
        calculate_select_properties(
            context,
            request,
            ns=properties,
            select_properties=requested_fields,
        )
    )
    return properties
class Cart:
    '''
    Pass either a request with a query string with `?cart=foo&cart=bar` params
    or a list of uuids (@ids also work):
    * `cart = Cart(request)` or `cart = Cart(request, uuids=['xyz'])`
    * `cart.elements` return all elements in the cart(s)
    * `cart.as_params()` return [('@id', '/elements/xyz')] tuples for use in filters
    Can use max_cart_elements to limit total number of elements allowed in carts.
    Default is no limit.
    '''

    def __init__(self, request, uuids=None, max_cart_elements=None):
        self.request = request
        self.query_string = QueryString(request)
        self.uuids = uuids or []
        self.max_cart_elements = max_cart_elements
        # Cached, deduplicated, sorted cart elements; filled lazily.
        self._elements = []

    def _get_carts_from_params(self):
        # Cart identifiers supplied as `cart=` query-string parameters.
        return self.query_string.param_values_to_list(
            params=self.query_string.get_cart()
        )

    def _get_cart_object_or_error(self, uuid):
        # Raises KeyError when the cart does not exist.
        return self.request.embed(uuid, '@@object')

    def _try_to_get_cart_object(self, uuid):
        # A missing cart is treated as empty rather than an error.
        try:
            return self._get_cart_object_or_error(uuid)
        except KeyError:
            return {}

    def _try_to_get_elements_from_cart(self, uuid):
        return self._try_to_get_cart_object(uuid).get('elements', [])

    def _get_elements_from_carts(self):
        # Explicitly passed uuids take precedence over query-string carts.
        for cart_identifier in (self.uuids or self._get_carts_from_params()):
            yield from self._try_to_get_elements_from_cart(cart_identifier)

    def _validate_cart_size(self):
        if self.max_cart_elements is None:
            return
        if len(self._elements) <= self.max_cart_elements:
            return
        raise HTTPBadRequest(
            explanation=(
                f'Too many elements in cart '
                f'(total {len(self._elements)} > max {self.max_cart_elements})'
            )
        )

    @property
    def elements(self):
        # Gather, dedupe, and sort once; subsequent accesses reuse the cache.
        if not self._elements:
            self._elements = sorted(set(self._get_elements_from_carts()))
            self._validate_cart_size()
        yield from self._elements

    def as_params(self):
        return [('@id', at_id) for at_id in self.elements]
def metadata_report_factory(context, request):
    '''Dispatch metadata generation to the handler for the requested type.'''
    query_string = QueryString(request)
    requested_type = query_string.get_one_value(
        params=query_string.get_type_filters()
    )
    if requested_type == 'Annotation':
        return _get_annotation_metadata(context, request)
    if requested_type == 'PublicationData':
        return _get_publication_data_metadata(context, request)
    return _get_metadata(context, request)
def wrapper(context, request):
    '''
    Validate that exactly one `type=` parameter is present and allowed
    before delegating to the wrapped view `func`.
    '''
    type_filters = QueryString(request).get_type_filters()
    if len(type_filters) != 1:
        raise HTTPBadRequest(
            explanation='URL requires one type parameter.'
        )
    requested_type = type_filters[0][1]
    if requested_type not in types:
        raise HTTPBadRequest(
            explanation=f'{requested_type} not a valid type for endpoint.'
        )
    return func(context, request)
def __init__(self, request):
    '''Parse the request's query string and initialize report state.'''
    self.request = request
    query_string = QueryString(request)
    self.query_string = query_string
    self.param_list = query_string.group_values_by_key()
    # File-filtering switches pulled from `option=` query-string params.
    self.visualizable_only = query_string.is_param('option', 'visualizable')
    self.raw_only = query_string.is_param('option', 'raw')
    # Report-building state populated by later initialization steps.
    self.positive_file_param_set = {}
    self.header = []
    self.experiment_column_to_fields_mapping = OrderedDict()
    self.file_column_to_fields_mapping = OrderedDict()
    self.csv = CSVGenerator()
def test_searches_parsers_query_string__repr__(dummy_request):
    from snovault.elasticsearch.searches.parsers import QueryString
    # Each case maps a raw query string to its expected str() rendering.
    cases = [
        (
            'type=Snowflake&type!=Snowball',
            'type=Snowflake&type%21=Snowball',
        ),
        (
            'type=Snowball&status!=revoked&files.file_type=bed+bed6%2B',
            'type=Snowball&status%21=revoked&files.file_type=bed+bed6%2B',
        ),
        (
            'type=Snowball&status%21=revoked&files.file_type=bed+bed6%2B',
            'type=Snowball&status%21=revoked&files.file_type=bed+bed6%2B',
        ),
    ]
    for raw_query, expected in cases:
        dummy_request.query_string = raw_query
        assert str(QueryString(dummy_request)) == expected
class BatchedSearchGenerator:
    '''
    Streams search results for a request, splitting large numbers of
    `batch_field` values (e.g. `@id`) into multiple searches of at most
    `batch_size` values each so a single URL never grows too large.
    '''

    SEARCH_PATH = '/search/'
    DEFAULT_PARAMS = [('limit', 'all')]

    def __init__(self, request, batch_field='@id', batch_size=5000):
        self.request = request
        self.batch_field = batch_field
        self.batch_size = batch_size
        self.query_string = QueryString(request)
        self.param_list = self.query_string.group_values_by_key()
        # Copy so batching is unaffected by later query-string mutation.
        self.batch_param_values = self.param_list.get(batch_field, []).copy()

    def _make_batched_values_from_batch_param_values(self):
        # Yield successive slices of at most batch_size values; slicing
        # clamps at the end of the list, so no explicit min() is needed.
        for start in range(0, len(self.batch_param_values), self.batch_size):
            yield self.batch_param_values[start:start + self.batch_size]

    def _make_batched_params_from_batched_values(self, batched_values):
        # Pair each value with the batch field to form query params.
        return [
            (self.batch_field, batched_value)
            for batched_value in batched_values
        ]

    def _build_new_request(self, batched_params):
        # NOTE: mutates self.query_string; the previous batch's params are
        # dropped before the next batch's params are added.
        self.query_string.drop('limit')
        self.query_string.drop(self.batch_field)
        self.query_string.extend(batched_params + self.DEFAULT_PARAMS)
        request = self.query_string.get_request_with_new_query_string()
        request.path_info = self.SEARCH_PATH
        request.registry = self.request.registry
        return request

    def results(self):
        '''Yield search results, batching `batch_field` values if present.'''
        if not self.batch_param_values:
            # No values to batch: run a single unbatched search.
            yield from search_generator(self._build_new_request([]))['@graph']
            # Bug fix: return explicitly instead of falling through to the
            # (vacuously empty) loop below.
            return
        for batched_values in self._make_batched_values_from_batch_param_values():
            batched_params = self._make_batched_params_from_batched_values(
                batched_values
            )
            request = self._build_new_request(batched_params)
            yield from search_generator(request)['@graph']
class MetadataReport:
    '''
    Generates a metadata.tsv report for search results, streaming one row
    per qualifying file of each experiment in the search results.
    '''

    SEARCH_PATH = '/search/'
    # Columns gathered for filtering but not written to the report.
    EXCLUDED_COLUMNS = (
        'Restricted',
        'No File Available',
    )
    DEFAULT_PARAMS = [
        ('field', 'audit'),
        ('field', 'files.@id'),
        ('field', 'files.restricted'),
        ('field', 'files.no_file_available'),
        ('field', 'files.file_format'),
        ('field', 'files.file_format_type'),
        ('field', 'files.status'),
        ('field', 'files.assembly'),
        ('limit', 'all'),
    ]
    CONTENT_TYPE = 'text/tsv'
    CONTENT_DISPOSITION = 'attachment; filename="metadata.tsv"'

    def __init__(self, request):
        self.request = request
        self.query_string = QueryString(request)
        self.param_list = self.query_string.group_values_by_key()
        self.positive_file_param_set = {}
        self.header = []
        self.experiment_column_to_fields_mapping = OrderedDict()
        self.file_column_to_fields_mapping = OrderedDict()
        # `option=` query-string switches controlling file filtering.
        self.visualizable_only = self.query_string.is_param(
            'option', 'visualizable')
        self.raw_only = self.query_string.is_param('option', 'raw')
        self.csv = CSVGenerator()

    def _get_column_to_fields_mapping(self):
        # Overridden in subclasses to swap the column/field mapping.
        return METADATA_COLUMN_TO_FIELDS_MAPPING

    def _build_header(self):
        # Report columns in mapping order, minus excluded ones, followed
        # by one column per audit type.
        for column in self._get_column_to_fields_mapping():
            if column not in self.EXCLUDED_COLUMNS:
                self.header.append(column)
        for audit, column in METADATA_AUDIT_TO_AUDIT_COLUMN_MAPPING:
            self.header.append(column)

    def _split_column_and_fields_by_experiment_and_file(self):
        # Columns whose first field lives under `files.` come from file
        # objects (with the prefix stripped); everything else comes from
        # the experiment object.
        for column, fields in self._get_column_to_fields_mapping().items():
            if fields[0].startswith('files'):
                self.file_column_to_fields_mapping[column] = [
                    field.replace('files.', '')
                    for field in fields
                ]
            else:
                self.experiment_column_to_fields_mapping[column] = fields

    def _set_positive_file_param_set(self):
        # Positive (non-negated) `files.*` filters, keyed by the bare file
        # field name, with values coerced to booleans/ints where possible.
        self.positive_file_param_set = {
            k.replace('files.', ''): set(map_strings_to_booleans_and_ints(v))
            for k, v in self.param_list.items()
            if k.startswith('files.') and '!' not in k
        }

    def _add_positive_file_filters_as_fields_to_param_list(self):
        # Ensure fields used in positive file filters are also returned
        # by the search so filtering can be applied to results.
        self.param_list['field'] = self.param_list.get('field', [])
        self.param_list['field'].extend(
            (
                k
                for k, v in self.query_string._get_original_params()
                if k.startswith('files.') and '!' not in k
            )
        )

    def _add_fields_to_param_list(self):
        self.param_list['field'] = self.param_list.get('field', [])
        for column, fields in self._get_column_to_fields_mapping().items():
            self.param_list['field'].extend(fields)
        self._add_positive_file_filters_as_fields_to_param_list()

    def _initialize_at_id_param(self):
        self.param_list['@id'] = self.param_list.get('@id', [])

    def _maybe_add_cart_elements_to_param_list(self):
        # Don't need to limit max_cart_elements here since
        # search is batched.
        cart = CartWithElements(self.request, max_cart_elements=None)
        self.param_list['@id'].extend(cart.elements)
        self.param_list.pop('cart', None)

    def _get_json_elements_or_empty_list(self):
        # POST bodies may carry an `elements` list; invalid/absent JSON
        # raises ValueError and is treated as no elements.
        try:
            return self.request.json.get('elements', [])
        except ValueError:
            return []

    def _maybe_add_json_elements_to_param_list(self):
        self.param_list['@id'].extend(self._get_json_elements_or_empty_list())

    def _get_field_params(self):
        return [('field', p) for p in self.param_list.get('field', [])]

    def _get_at_id_params(self):
        return [('@id', p) for p in self.param_list.get('@id', [])]

    def _get_default_params(self):
        return self.DEFAULT_PARAMS

    def _build_query_string(self):
        # Replace pagination/option params with the computed search params.
        self.query_string.drop('limit')
        self.query_string.drop('option')
        self.query_string.extend(
            self._get_default_params()
            + self._get_field_params()
            + self._get_at_id_params()
        )

    def _get_search_path(self):
        return self.SEARCH_PATH

    def _build_new_request(self):
        self._build_query_string()
        request = self.query_string.get_request_with_new_query_string()
        request.path_info = self._get_search_path()
        return request

    def _get_search_results_generator(self):
        # Batched so large @id lists don't overflow a single search URL.
        return BatchedSearchGenerator(self._build_new_request()).results()

    def _should_not_report_file(self, file_):
        # A file is skipped if any disqualifying condition holds.
        conditions = [
            not file_matches_file_params(file_, self.positive_file_param_set),
            self.visualizable_only and not is_file_visualizable(file_),
            # "raw" option only allows files without an assembly.
            self.raw_only and file_.get('assembly'),
            file_.get('restricted'),
            file_.get('no_file_available'),
        ]
        return any(conditions)

    def _get_experiment_data(self, experiment):
        return {
            column: make_experiment_cell(fields, experiment)
            for column, fields in self.experiment_column_to_fields_mapping.items()
        }

    def _get_file_data(self, file_):
        # NOTE: mutates file_ in place to absolutize its href.
        file_['href'] = self.request.host_url + file_['href']
        return {
            column: make_file_cell(fields, file_)
            for column, fields in self.file_column_to_fields_mapping.items()
        }

    def _get_audit_data(self, grouped_audits_for_file, grouped_other_audits):
        # One cell per audit type, merging file-specific and other audits.
        return {
            audit_column: ', '.join(
                set(
                    grouped_audits_for_file.get(audit_type, [])
                    + grouped_other_audits.get(audit_type, [])
                )
            )
            for audit_type, audit_column in METADATA_AUDIT_TO_AUDIT_COLUMN_MAPPING
        }

    def _output_sorted_row(self, experiment_data, file_data):
        # File values win over experiment values for a shared column name.
        row = []
        for column in self.header:
            row.append(file_data.get(column, experiment_data.get(column)))
        return row

    def _generate_rows(self):
        # Streams TSV rows: header first, then one row per reportable file.
        yield self.csv.writerow(self.header)
        for experiment in self._get_search_results_generator():
            if not experiment.get('files', []):
                continue
            grouped_file_audits, grouped_other_audits = group_audits_by_files_and_type(
                experiment.get('audit', {})
            )
            experiment_data = self._get_experiment_data(experiment)
            for file_ in experiment.get('files', []):
                if self._should_not_report_file(file_):
                    continue
                file_data = self._get_file_data(file_)
                audit_data = self._get_audit_data(
                    grouped_file_audits.get(file_.get('@id'), {}),
                    grouped_other_audits
                )
                file_data.update(audit_data)
                yield self.csv.writerow(
                    self._output_sorted_row(experiment_data, file_data)
                )

    def _validate_request(self):
        # Exactly one `type=` parameter is required.
        type_params = self.param_list.get('type', [])
        if len(type_params) != 1:
            raise HTTPBadRequest(
                explanation='URL requires one "type" parameter.')
        return True

    def _initialize_report(self):
        self._build_header()
        self._split_column_and_fields_by_experiment_and_file()
        self._set_positive_file_param_set()

    def _build_params(self):
        self._add_fields_to_param_list()
        self._initialize_at_id_param()
        self._maybe_add_cart_elements_to_param_list()
        self._maybe_add_json_elements_to_param_list()

    def generate(self):
        '''Validate the request and stream the metadata.tsv response.'''
        self._validate_request()
        self._initialize_report()
        self._build_params()
        return Response(
            content_type=self.CONTENT_TYPE,
            app_iter=self._generate_rows(),
            content_disposition=self.CONTENT_DISPOSITION,
        )
def batch_download(context, request):
    '''
    Generate files.txt for batch download: a metadata.tsv link followed by
    one download URL per qualifying file of each matching experiment.
    '''
    default_params = [
        ('limit', 'all'),
        ('field', 'files.href'),
        ('field', 'files.restricted'),
        ('field', 'files.file_format'),
        ('field', 'files.file_format_type'),
        ('field', 'files.status'),
        ('field', 'files.assembly'),
    ]
    qs = QueryString(request)
    param_list = qs.group_values_by_key()
    # Keys of all `files.*` filters in the query string.
    file_filters = qs.param_keys_to_list(
        params=qs.get_filters_by_condition(
            key_and_value_condition=lambda k, _: k.startswith('files.')
        )
    )
    # Process PublicationData batch downloads separately.
    type_param = param_list.get('type', [''])[0]
    if type_param and type_param.lower() == 'publicationdata':
        return _batch_download_publicationdata(request)
    # Request filtered file fields back from the search for later checks.
    file_fields = [('field', k) for k in file_filters]
    qs.drop('limit')
    type_param = param_list.get('type', [''])[0]
    cart_uuids = param_list.get('cart', [])
    # Only allow specific type= query-string values, or cart=.
    if not type_param and not cart_uuids:
        raise HTTPBadRequest(
            explanation='URL must include a "type" or "cart" parameter.')
    # NOTE(review): when only cart= is given, type_param is '' — this check
    # passes only if '' is in _allowed_types; verify that is intended.
    if not type_param.lower() in _allowed_types:
        raise HTTPBadRequest(explanation='"{}" not a valid type for metadata'.
                             format(type_param))
    # Check for the "visualizable" and/or "raw" options in the query string for file filtering.
    visualizable_only = qs.is_param('option', 'visualizable')
    raw_only = qs.is_param('option', 'raw')
    qs.drop('option')
    qs.extend(default_params + file_fields)
    experiments = []
    if request.method == 'POST':
        metadata_link = ''
        cart_uuid = qs.get_one_value(params=qs.get_key_filters(key='cart'))
        # Elements posted as JSON; invalid/absent JSON means no elements.
        try:
            elements = request.json.get('elements', [])
        except ValueError:
            elements = []
        if cart_uuid:
            # Validate the cart exists before linking to it.
            try:
                request.embed(cart_uuid, '@@object')
            except KeyError:
                raise HTTPBadRequest(
                    explanation='Specified cart does not exist.')
            # metadata.tsv link includes a cart UUID
            metadata_link = '{host_url}/metadata/?{search_params}'.format(
                host_url=request.host_url,
                search_params=qs._get_original_query_string()
            )
        else:
            # Embed the posted elements as a curl-style JSON payload in the
            # metadata.tsv line.
            metadata_link = '{host_url}/metadata/?{search_params} -X GET -H "Accept: text/tsv" -H "Content-Type: application/json" --data \'{{"elements": [{elements_json}]}}\''.format(
                host_url=request.host_url,
                search_params=qs._get_original_query_string(),
                elements_json=','.join(
                    '"{0}"'.format(element) for element in elements)
            )
        # Because of potential number of datasets in the cart, break search
        # into multiple searches of ELEMENT_CHUNK_SIZE datasets each.
        for i in range(0, len(elements), ELEMENT_CHUNK_SIZE):
            qs.drop('@id')
            qs.extend([('@id', e) for e in elements[i:i + ELEMENT_CHUNK_SIZE]])
            path = '/search/?{}'.format(str(qs))
            results = request.embed(quote(path), as_user=True)
            experiments.extend(results['@graph'])
    else:
        # Make sure regular batch download doesn't include a cart parameter; error if it does.
        if cart_uuids:
            raise HTTPBadRequest(
                explanation=
                'You must download cart file manifests from the portal.')
        # Regular batch download has single simple call to request.embed
        metadata_link = '{host_url}/metadata/?{search_params}'.format(
            host_url=request.host_url,
            search_params=qs._get_original_query_string()
        )
        path = '/search/?{}'.format(str(qs))
        results = request.embed(quote(path), as_user=True)
        experiments = results['@graph']
    # Flatten all files across all matched experiments.
    exp_files = (
        exp_file
        for exp in experiments
        for exp_file in exp.get('files', [])
    )
    files = [metadata_link]
    param_list = qs.group_values_by_key()
    for exp_file in exp_files:
        if not files_prop_param_list(exp_file, param_list):
            continue
        elif visualizable_only and not is_file_visualizable(exp_file):
            continue
        elif raw_only and exp_file.get('assembly'):
            # "raw" option only allows files w/o assembly.
            continue
        elif restricted_files_present(exp_file):
            continue
        files.append('{host_url}{href}'.format(
            host_url=request.host_url,
            href=exp_file['href'],
        ))
    return Response(
        content_type='text/plain',
        body='\n'.join(files),
        content_disposition='attachment; filename="%s"' % 'files.txt'
    )
def metadata_tsv(context, request):
    '''
    Generate metadata.tsv: one row per qualifying file of each matching
    experiment, with columns defined by _tsv_mapping plus audit columns.
    '''
    qs = QueryString(request)
    param_list = qs.group_values_by_key()
    # An optional `referrer` param redirects the search to another endpoint.
    if 'referrer' in param_list:
        search_path = '/{}/'.format(param_list.pop('referrer')[0])
    else:
        search_path = '/search/'
    type_param = param_list.get('type', [''])[0]
    cart_uuids = param_list.get('cart', [])
    # Only allow specific type= query-string values, or cart=.
    if not type_param and not cart_uuids:
        raise HTTPBadRequest(
            explanation='URL must include a "type" or "cart" parameter.')
    # NOTE(review): with cart= only, type_param is '' — this passes only if
    # '' is in _allowed_types; confirm intended.
    if not type_param.lower() in _allowed_types:
        raise HTTPBadRequest(explanation='"{}" not a valid type for metadata'.
                             format(type_param))
    # Handle special-case metadata.tsv generation.
    if type_param:
        if type_param.lower() == 'annotation':
            return _get_annotation_metadata(request, search_path, param_list)
        if type_param.lower() == 'publicationdata':
            return _get_publicationdata_metadata(request)
    param_list['field'] = []
    header = []
    file_attributes = []
    # Build header columns and collect file-level attributes; request all
    # mapped fields from the search.
    for prop in _tsv_mapping:
        if prop not in _excluded_columns:
            header.append(prop)
            if _tsv_mapping[prop][0].startswith('files'):
                file_attributes = file_attributes + [_tsv_mapping[prop][0]]
        param_list['field'] = param_list['field'] + _tsv_mapping[prop]
    # Handle metadata.tsv lines from cart-generated files.txt.
    if cart_uuids:
        # metadata.tsv line includes cart UUID, so load the specified cart and
        # get its "elements" property for a list of items to retrieve.
        cart_uuid = cart_uuids.pop()
        del param_list['cart']
        try:
            cart = request.embed(cart_uuid, '@@object')
        except KeyError:
            raise HTTPBadRequest(explanation='Specified cart does not exist.')
        else:
            if cart.get('elements'):
                param_list['@id'] = cart['elements']
    else:
        # If the metadata.tsv line includes a JSON payload, get its "elements"
        # property for a list of items to retrieve.
        try:
            elements = request.json.get('elements')
        except ValueError:
            pass
        else:
            param_list['@id'] = elements
    default_params = [
        ('field', 'audit'),
        ('limit', 'all'),
    ]
    field_params = [('field', p) for p in param_list.get('field', [])]
    at_id_params = [('@id', p) for p in param_list.get('@id', [])]
    qs.drop('limit')
    # Check for the "visualizable" and/or "raw" options in the query string for file filtering.
    visualizable_only = qs.is_param('option', 'visualizable')
    raw_only = qs.is_param('option', 'raw')
    qs.drop('option')
    qs.extend(default_params + field_params + at_id_params)
    path = '{}?{}'.format(search_path, str(qs))
    results = request.embed(quote(path), as_user=True)
    rows = []
    for experiment_json in results['@graph']:
        if experiment_json.get('files', []):
            # Experiment-level cells shared by every file row.
            exp_data_row = []
            for column in header:
                if not _tsv_mapping[column][0].startswith('files'):
                    make_cell(column, experiment_json, exp_data_row)
            # File attributes emitted first, in this fixed order.
            f_attributes = [
                'files.title', 'files.file_type', 'files.file_format',
                'files.file_format_type', 'files.output_type',
                'files.assembly'
            ]
            for f in experiment_json['files']:
                if not files_prop_param_list(f, param_list):
                    continue
                if visualizable_only and not is_file_visualizable(f):
                    continue
                if raw_only and f.get('assembly'):
                    # "raw" option only allows files w/o assembly.
                    continue
                if restricted_files_present(f):
                    continue
                if is_no_file_available(f):
                    continue
                f['href'] = request.host_url + f['href']
                f_row = []
                for attr in f_attributes:
                    # attr[6:] strips the 'files.' prefix.
                    f_row.append(f.get(attr[6:], ''))
                data_row = f_row + exp_data_row
                for prop in file_attributes:
                    if prop in f_attributes:
                        continue
                    # NOTE(review): `path` shadows the search path above —
                    # harmless here because the search already ran.
                    path = prop[6:]
                    temp = []
                    for value in simple_path_ids(f, path):
                        temp.append(str(value))
                    if prop == 'files.replicate.rbns_protein_concentration':
                        # NOTE(review): assumes temp is non-empty here;
                        # would IndexError otherwise — confirm upstream.
                        if 'replicate' in f and 'rbns_protein_concentration_units' in f[
                                'replicate']:
                            temp[0] = temp[0] + ' ' + f['replicate'][
                                'rbns_protein_concentration_units']
                    if prop in ['files.paired_with', 'files.derived_from']:
                        # chopping of path to just accession
                        if len(temp):
                            new_values = [t[7:-1] for t in temp]
                            temp = new_values
                    data = list(set(temp))
                    data.sort()
                    data_row.append(', '.join(data))
                audit_info = [
                    make_audit_cell(audit_type, experiment_json, f)
                    for audit_type in _audit_mapping
                ]
                data_row.extend(audit_info)
                rows.append(data_row)
    fout = io.StringIO()
    writer = csv.writer(fout, delimiter='\t', lineterminator='\n')
    header.extend([prop for prop in _audit_mapping])
    writer.writerow(header)
    writer.writerows(rows)
    return Response(content_type='text/tsv', body=fout.getvalue(),
                    content_disposition='attachment;filename="%s"' %
                    'metadata.tsv')
def _batch_download_publicationdata(request):
    """
    Generate PublicationData files.txt: a metadata.tsv link followed by
    one download URL per qualifying file of the requested dataset.

    :param request: Pyramid request
    :raises HTTPBadRequest: if file dataset types are not homogeneous
    """
    # Parse the batch_download request query string.
    qs = QueryString(request)
    param_list = qs.group_values_by_key()
    # Get the required "dataset={path}" parameter.
    dataset_path = param_list.get('dataset', [''])[0]
    # Retrieve the files property of the requested PublicationData object.
    # Renamed local from `object` to avoid shadowing the builtin.
    dataset = request.embed(dataset_path, as_user=True)
    file_ids = dataset.get('files', [])
    # Generate the metadata link that heads the file.
    metadata_link = '{host_url}/metadata/?{search_params}'.format(
        host_url=request.host_url,
        search_params=qs._get_original_query_string())
    # Generate the content of files.txt starting with the metadata.tsv
    # download line and then each file's download URL.
    files = [metadata_link]
    dataset_type = ''
    # Iterating directly handles the empty case; no `if file_ids:` needed.
    for file_id in file_ids:
        # Request individual file object from its path. Renamed local from
        # `file` to avoid shadowing the builtin.
        file_object = request.embed(file_id, as_user=True)
        # All file datasets need to belong to the same type of dataset.
        if dataset_type:
            # See if subsequent dataset types match the first one we found.
            file_dataset_type_match = type_re.match(
                file_object.get('dataset', ''))
            if file_dataset_type_match and file_dataset_type_match.group(
                    1) != dataset_type:
                raise HTTPBadRequest(
                    explanation='File dataset types must be homogeneous')
        else:
            # Establish the first dataset type we find.
            dataset_type_match = type_re.match(file_object.get('dataset', ''))
            if dataset_type_match:
                dataset_type = dataset_type_match.group(1)
        # Other disqualifying conditions.
        if restricted_files_present(file_object):
            continue
        if is_no_file_available(file_object):
            continue
        # Finally append file to files.txt.
        files.append('{host_url}{href}'.format(host_url=request.host_url,
                                               href=file_object['href']))
    # Initiate the files.txt download.
    return Response(content_type='text/plain', body='\n'.join(files),
                    content_disposition='attachment; filename="%s"' %
                    'files.txt')
def _get_publicationdata_metadata(request):
    """
    Generate PublicationData metadata.tsv with one row per qualifying file.

    :param request: Pyramid request
    """
    qs = QueryString(request)
    param_list = qs.group_values_by_key()
    # Get the required "dataset={path}" parameter.
    dataset_path = param_list.get('dataset', [''])[0]
    # Open the metadata.tsv file for writing.
    fout = io.StringIO()
    writer = csv.writer(fout, delimiter='\t')
    # Build the column-title header row and write it to the file.
    header = [
        header
        for header in _tsv_mapping_publicationdata
        if header not in _excluded_columns
    ]
    writer.writerow(header)
    # Load the specified PublicationData object and extract its files to
    # build the rows. Iterating directly handles the empty-list case.
    dataset = request.embed(dataset_path, as_user=True)
    file_ids = dataset.get('files', [])
    for file_id in file_ids:
        # Load the file object and disqualify those we don't handle.
        # Renamed local from `file` to avoid shadowing the builtin.
        file_object = request.embed(file_id, as_user=True)
        biosample_ontology = file_object.get('biosample_ontology', {})
        if restricted_files_present(file_object):
            continue
        if is_no_file_available(file_object):
            continue
        # Extract the file's dataset accession from the @id; avoids loading
        # the dataset object.
        dataset_accession = ''
        accession_match = accession_re.match(file_object.get('dataset', ''))
        if accession_match:
            dataset_accession = accession_match.group(1)
        # Extract the file's derived_from accessions from their @id.
        derived_from_accessions = []
        # Bug fix: default was '' (a string); use a list for "no value".
        derived_from_file_ids = file_object.get('derived_from', [])
        for derived_from_file_id in derived_from_file_ids:
            accession_match = accession_re.match(derived_from_file_id)
            if accession_match:
                derived_from_accessions.append(accession_match.group(1))
        # Build the row's data; must sync with _tsv_mapping_publicationdata.
        # csv.writer renders None as an empty cell, so defaulting the
        # biosample_ontology lookups to '' is output-identical but keeps
        # the row types consistent.
        row = [
            file_object.get('title', ''),
            dataset_accession,
            file_object.get('file_format', ''),
            file_object.get('file_type', ''),
            file_object.get('output_type', ''),
            file_object.get('assay_term_name', ''),
            biosample_ontology.get('term_id', ''),
            biosample_ontology.get('term_name', ''),
            biosample_ontology.get('classification', ''),
            file_object.get('target', {}).get('label', ''),
            dataset.get('accession', ''),
            dataset.get('date_released', ''),
            dataset.get('award', {}).get('project', ''),
            file_object.get('lab', {}).get('title', ''),
            file_object.get('md5sum', ''),
            ', '.join(file_object.get('dbxrefs', [])),
            file_object.get('href', ''),
            file_object.get('assembly', ''),
            file_object.get('status', ''),
            ', '.join(derived_from_accessions),
            file_object.get('cloud_metadata', {}).get('url', ''),
            file_object.get('file_size', ''),
        ]
        writer.writerow(row)
    # All rows collected; write to the metadata.tsv file and download.
    return Response(content_type='text/tsv', body=fout.getvalue(),
                    content_disposition='attachment;filename="%s"' %
                    'metadata.tsv')
def __init__(self, request, uuids=None, max_cart_elements=None):
    '''
    Capture the request, parse its query string, and set up cart state.
    '''
    self.request = request
    self.query_string = QueryString(request)
    self.max_cart_elements = max_cart_elements
    self.uuids = uuids or []
    # Cached cart elements; computed lazily on first access.
    self._elements = []
def test_searches_parsers_query_string_init(dummy_request):
    from snovault.elasticsearch.searches.parsers import QueryString
    query_string = QueryString(dummy_request)
    assert isinstance(query_string, QueryString)
def batch_download(context, request):
    '''
    Generate files.txt for batch download (older variant): a metadata.tsv
    link followed by one download URL per qualifying experiment file.
    '''
    default_params = [
        ('limit', 'all'),
        ('field', 'files.href'),
        ('field', 'files.restricted')
    ]
    qs = QueryString(request)
    # Keys of all `files.*` filters; requested back as fields so the
    # filter values are present in the search results.
    file_filters = qs.param_keys_to_list(
        params=qs.get_filters_by_condition(
            key_and_value_condition=lambda k, _: k.startswith('files.')
        )
    )
    file_fields = [('field', k) for k in file_filters]
    qs.drop('limit')
    qs.extend(default_params + file_fields)
    experiments = []
    # NOTE(review): `error_message` is assigned but never used.
    error_message = None
    if request.method == 'POST':
        metadata_link = ''
        cart_uuid = qs.get_one_value(params=qs.get_key_filters(key='cart'))
        # Elements posted as JSON; invalid/absent JSON means no elements.
        try:
            elements = request.json.get('elements', [])
        except ValueError:
            elements = []
        if cart_uuid:
            # metadata.tsv link includes a cart UUID
            metadata_link = '{host_url}/metadata/?{search_params}'.format(
                host_url=request.host_url,
                search_params=qs._get_original_query_string()
            )
        else:
            # Embed the posted elements as a curl-style JSON payload in
            # the metadata.tsv line.
            metadata_link = '{host_url}/metadata/?{search_params} -X GET -H "Accept: text/tsv" -H "Content-Type: application/json" --data \'{{"elements": [{elements_json}]}}\''.format(
                host_url=request.host_url,
                search_params=qs._get_original_query_string(),
                elements_json=','.join(
                    '"{0}"'.format(element) for element in elements)
            )
        # Because of potential number of datasets in the cart, break search
        # into multiple searches of ELEMENT_CHUNK_SIZE datasets each.
        for i in range(0, len(elements), ELEMENT_CHUNK_SIZE):
            qs.drop('@id')
            qs.extend([('@id', e) for e in elements[i:i + ELEMENT_CHUNK_SIZE]])
            path = '/search/?{}'.format(str(qs))
            results = request.embed(quote(path), as_user=True)
            experiments.extend(results['@graph'])
    else:
        # Regular batch download has single simple call to request.embed
        metadata_link = '{host_url}/metadata/?{search_params}'.format(
            host_url=request.host_url,
            search_params=qs._get_original_query_string()
        )
        path = '/search/?{}'.format(str(qs))
        results = request.embed(quote(path), as_user=True)
        experiments = results['@graph']
    # Flatten all files across all matched experiments.
    exp_files = (
        exp_file
        for exp in experiments
        for exp_file in exp.get('files', [])
    )
    files = [metadata_link]
    param_list = qs.group_values_by_key()
    for exp_file in exp_files:
        if not files_prop_param_list(exp_file, param_list):
            continue
        elif restricted_files_present(exp_file):
            continue
        files.append('{host_url}{href}'.format(
            host_url=request.host_url,
            href=exp_file['href'],
        ))
    return Response(
        content_type='text/plain',
        body='\n'.join(files),
        content_disposition='attachment; filename="%s"' % 'files.txt'
    )
def __init__(self, request):
    '''Extend parent report state with file-search bookkeeping.'''
    super().__init__(request)
    # Params and @ids accumulated for the separate batched file search.
    self.file_params = []
    self.file_at_ids = []
    self.file_query_string = QueryString(request)
class PublicationDataMetadataReport(MetadataReport):
    '''
    PublicationData objects don't embed file attributes so we
    have to get file metadata with separate search request.
    We try to get all the file metadata together in a batched request
    instead of making a request for every file. This requires some
    extra machinery compared to normal MetdataReport.
    '''

    DEFAULT_PARAMS = [
        ('limit', 'all'),
        ('field', 'files')
    ]

    # Params for the separate file search; fields cover filtering needs.
    DEFAULT_FILE_PARAMS = [
        ('type', 'File'),
        ('limit', 'all'),
        ('field', '@id'),
        ('field', 'href'),
        ('field', 'restricted'),
        ('field', 'no_file_available'),
        ('field', 'file_format'),
        ('field', 'file_format_type'),
        ('field', 'status'),
        ('field', 'assembly'),
    ]

    def __init__(self, request):
        super().__init__(request)
        # Separate query string and accumulators for the file search.
        self.file_query_string = QueryString(request)
        self.file_params = []
        self.file_at_ids = []

    # Overrides parent.
    def _get_column_to_fields_mapping(self):
        return PUBLICATION_DATA_METADATA_COLUMN_TO_FIELDS_MAPPING

    # Overrides parent.
    def _build_header(self):
        # Unlike the parent, no audit columns are appended here.
        for column in self._get_column_to_fields_mapping():
            if column not in self.EXCLUDED_COLUMNS:
                self.header.append(column)

    # Overrides parent.
    def _add_fields_to_param_list(self):
        # Only experiment-level fields go to the dataset search; file
        # fields are fetched by the separate file search instead.
        self.param_list['field'] = []
        for column, fields in self.experiment_column_to_fields_mapping.items():
            self.param_list['field'].extend(fields)

    def _add_default_file_params_to_file_params(self):
        self.file_params.extend(self.DEFAULT_FILE_PARAMS)

    def _add_report_file_fields_to_file_params(self):
        # Request every field needed to fill the report's file columns.
        for column, fields in self.file_column_to_fields_mapping.items():
            self.file_params.extend([('field', field) for field in fields])

    def _convert_experiment_params_to_file_params(self):
        # Rewrite `files.x=y` filters as `x=y` filters for the file search.
        return [
            (k.replace('files.', ''), v)
            for k, v in self.query_string._get_original_params()
            if k.startswith('files.')
        ]

    def _add_experiment_file_filters_as_fields_to_file_params(self):
        # Also request the filtered fields so values appear in results.
        self.file_params.extend(
            ('field', k)
            for k, v in self._convert_experiment_params_to_file_params()
        )

    def _add_experiment_file_filters_to_file_params(self):
        self.file_params.extend(
            self._convert_experiment_params_to_file_params())

    def _build_file_params(self):
        self._add_default_file_params_to_file_params()
        self._add_report_file_fields_to_file_params()
        self._add_experiment_file_filters_as_fields_to_file_params()
        self._add_experiment_file_filters_to_file_params()

    def _filter_file_params_from_query_string(self):
        # Strip `files.*` filters from the dataset search; they only
        # apply to the separate file search.
        self.query_string.params = [
            (k, v)
            for k, v in self.query_string.params
            if not k.startswith('files.')
        ]

    # Overrides parent.
    def _build_params(self):
        super()._build_params()
        self._build_file_params()
        self._filter_file_params_from_query_string()

    def _get_at_id_file_params(self):
        return [('@id', file_at_id) for file_at_id in self.file_at_ids]

    def _build_new_file_request(self):
        self.file_query_string.params = (
            self.file_params + self._get_at_id_file_params()
        )
        request = self.file_query_string.get_request_with_new_query_string()
        request.path_info = self._get_search_path()
        return request

    def _get_file_search_results_generator(self):
        # Batched so long @id lists don't overflow a single search URL.
        request = self._build_new_file_request()
        bsg = BatchedSearchGenerator(request)
        return bsg.results()

    # Overrides parent.
    def _generate_rows(self):
        # For each dataset, run a file search over its files and emit one
        # row per file that passes the report filters.
        yield self.csv.writerow(self.header)
        for experiment in self._get_search_results_generator():
            self.file_at_ids = experiment.get('files', [])
            if not self.file_at_ids:
                continue
            experiment_data = self._get_experiment_data(experiment)
            for file_ in self._get_file_search_results_generator():
                if self._should_not_report_file(file_):
                    continue
                file_data = self._get_file_data(file_)
                yield self.csv.writerow(
                    self._output_sorted_row(experiment_data, file_data)
                )