def test_sniff_compressed_dynamic_datatypes_default_on():
    # With auto sniffing on, verify the sniffers work and the files match what is expected
    # when coming from guess_ext.
    datatypes_registry = example_datatype_registry_for_sample(sniff_compressed_dynamic_datatypes_default=True)
    fastqsangergz_datatype = datatypes_registry.get_datatype_by_extension('fastqsanger.gz')
    fname = sniff.get_test_fname('1.fastqsanger.gz')
    assert fastqsangergz_datatype.sniff(fname)

    sniff_order = datatypes_registry.sniff_order
    fname = sniff.get_test_fname('1.fastqsanger.gz')
    assert sniff.guess_ext(fname, sniff_order) == 'fastqsanger.gz'
    fname = sniff.get_test_fname('1.fastqsanger.bz2')
    assert sniff.guess_ext(fname, sniff_order) == 'fastqsanger.bz2'
def test_sniff_compressed_dynamic_datatypes_default_off():
    # Redo the last tests with auto compressed sniffing disabled; these types should no longer result from guess_ext.
    datatypes_registry = example_datatype_registry_for_sample(sniff_compressed_dynamic_datatypes_default=False)
    # The sniffer itself still returns True for these files...
    fastqsangergz_datatype = datatypes_registry.get_datatype_by_extension('fastqsanger.gz')
    fname = sniff.get_test_fname('1.fastqsanger.gz')
    assert fastqsangergz_datatype.sniff(fname)

    # ...but they don't report as matching in the specified sniff_order.
    sniff_order = datatypes_registry.sniff_order
    fname = sniff.get_test_fname('1.fastqsanger.gz')
    assert 'fastq' not in sniff.guess_ext(fname, sniff_order)
    fname = sniff.get_test_fname('1.fastqsanger.bz2')
    assert 'fastq' not in sniff.guess_ext(fname, sniff_order)
def exec_after_process(app, inp_data, out_data, param_dict, tool=None, stdout=None, stderr=None):
    """Verifies the data after the run"""
    name, data = out_data.items()[0]
    if data.state == data.states.OK:
        data.info = data.name
    if data.extension == 'txt':
        data_type = sniff.guess_ext(data.file_name, sniff_order=app.datatypes_registry.sniff_order)
        data = app.datatypes_registry.change_datatype(data, data_type)
    data.set_peek()
    data.set_size()
    data.flush()
def sniff_and_handle_data_type(file_path, datatypes_registry):
    """
    The sniff.handle_uploaded_dataset_file() method in Galaxy performs dual
    functions: it sniffs the filetype, and if it's a compressed archive for a
    non-compressed datatype such as fasta, it will be unpacked.
    """
    ext = sniff.handle_uploaded_dataset_file(file_path, datatypes_registry)
    if not ext or ext == "data":
        is_binary = check_binary(file_path)
        ext = sniff.guess_ext(file_path, datatypes_registry.sniff_order, is_binary=is_binary)
    return ext
def handle_sniffable_binary_check(data_type, ext, path, registry):
    """Return modified values of data_type and ext if a sniffable binary is encountered.

    Precondition: check_binary was called and returned True.
    """
    # Sniff the data type
    guessed_ext = sniff.guess_ext(path, registry.sniff_order)
    # Set data_type only if guessed_ext is a binary datatype
    datatype = registry.get_datatype_by_extension(guessed_ext)
    if isinstance(datatype, Binary):
        data_type = guessed_ext
        ext = guessed_ext
    return data_type, ext
def detect_datatype(self, trans, dataset_assoc):
    """Sniff and assign the datatype to a given dataset association (ldda or hda)"""
    data = trans.sa_session.query(self.model_class).get(dataset_assoc.id)
    if data.datatype.is_datatype_change_allowed():
        if not data.ok_to_edit_metadata():
            raise exceptions.ItemAccessibilityException('This dataset is currently being used as input or output. You cannot change datatype until the jobs have completed or you have canceled them.')
        else:
            path = data.dataset.file_name
            is_binary = check_binary(path)
            datatype = sniff.guess_ext(path, trans.app.datatypes_registry.sniff_order, is_binary=is_binary)
            trans.app.datatypes_registry.change_datatype(data, datatype)
            trans.sa_session.flush()
            self.set_metadata(trans, dataset_assoc)
    else:
        raise exceptions.InsufficientPermissionsException(f'Changing datatype "{data.extension}" is not allowed.')
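# Most of the examples in this file reduce to the same two-step pattern:
# check whether the file looks binary, then let sniff.guess_ext() walk the
# registry's sniff order. A minimal sketch of that pattern follows; the
# helper name guess_extension() is hypothetical, and the check_binary import
# location is an assumption to verify against your Galaxy version.
from galaxy.datatypes import sniff
from galaxy.util.checkers import check_binary  # assumed import location


def guess_extension(path, datatypes_registry):
    """Guess a Galaxy extension for `path` using the registry's sniff order."""
    is_binary = check_binary(path)  # does the first 1K contain a null byte?
    return sniff.guess_ext(path, datatypes_registry.sniff_order, is_binary=is_binary)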
def add_file(self, trans, file_obj, file_name, file_type, dbkey, info):
    temp_name = sniff.stream_to_file(file_obj)
    sniff.convert_newlines(temp_name)
    if file_type == 'auto':
        ext = sniff.guess_ext(temp_name)
    else:
        ext = file_type
    data = trans.app.model.Dataset()
    data.name = file_name
    data.extension = ext
    data.dbkey = dbkey
    data.info = info
    data.flush()
    shutil.move(temp_name, data.file_name)
    data.state = data.states.OK
    data.init_meta()
    data.set_peek()
    if isinstance(data.datatype, datatypes.interval.Interval):
        if data.missing_meta():
            data.extension = 'tabular'
    trans.history.add_dataset(data)
    trans.app.model.flush()
    return data
def add_file(dataset, registry, output_path):
    ext = None
    compression_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only_str = dataset.get('link_data_only', 'copy_files')
    if link_data_only_str not in ['link_to_files', 'copy_files']:
        raise UploadProblemException("Invalid setting '%s' for option link_data_only - upload request misconfigured" % link_data_only_str)
    link_data_only = link_data_only_str == 'link_to_files'

    # run_as_real_user is estimated from galaxy config (external chmod indicated of inputs executed)
    # If this is True we always purge supplied upload inputs so they are cleaned up and we reuse their
    # paths during data conversions since this user already owns that path.
    # Older in_place check for upload jobs created before 18.01, TODO remove in 19.XX. xref #5206
    run_as_real_user = dataset.get('run_as_real_user', False) or dataset.get("in_place", False)

    # purge_source defaults to True unless this is an FTP import and
    # ftp_upload_purge has been overridden to False in Galaxy's config.
    # We set purge_source to False if:
    # - the job does not have write access to the file, e.g. when running as the
    #   real user
    # - the files are uploaded from external paths.
    purge_source = dataset.get('purge_source', True) and not run_as_real_user and dataset.type not in ('server_dir', 'path_paste')

    # in_place is True unless we are running as a real user or importing external paths (i.e.
    # this is a real upload and not a path paste or ftp import).
    # in_place should always be False if running as real user because the uploaded file will
    # be owned by Galaxy and not the user and it should be False for external paths so Galaxy doesn't
    # modify files not controlled by Galaxy.
    in_place = not run_as_real_user and dataset.type not in ('server_dir', 'path_paste', 'ftp_import')

    # Based on the check_upload_content Galaxy config option, and on by default; this enables some
    # security related checks on the uploaded content, but can prevent uploads from working in some cases.
    check_content = dataset.get('check_content', True)

    # auto_decompress is a request flag that can be swapped off to prevent Galaxy from automatically
    # decompressing archive files before sniffing.
    auto_decompress = dataset.get('auto_decompress', True)
    try:
        dataset.file_type
    except AttributeError:
        raise UploadProblemException('Unable to process uploaded file, missing file_type parameter.')

    if dataset.type == 'url':
        try:
            dataset.path = sniff.stream_url_to_file(dataset.path)
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' % (dataset.path, str(e)))

    # See if we have an empty file
    if not os.path.exists(dataset.path):
        raise UploadProblemException('Uploaded temporary file (%s) does not exist.' % dataset.path)

    if not os.path.getsize(dataset.path) > 0:
        raise UploadProblemException('The uploaded file is empty')

    # Does the first 1K contain a null?
    is_binary = check_binary(dataset.path)

    # Decompress if needed/desired and determine/validate filetype. If a keep-compressed datatype is explicitly selected
    # or if autodetection is selected and the file sniffs as a keep-compressed datatype, it will not be decompressed.
    if not link_data_only:
        if is_zip(dataset.path) and not is_single_file_zip(dataset.path):
            stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
        try:
            ext, converted_path, compression_type = sniff.handle_uploaded_dataset_file(
                dataset.path,
                registry,
                ext=dataset.file_type,
                tmp_prefix='data_id_%s_upload_' % dataset.dataset_id,
                tmp_dir=output_adjacent_tmpdir(output_path),
                in_place=in_place,
                check_content=check_content,
                is_binary=is_binary,
                auto_decompress=auto_decompress,
                uploaded_file_ext=os.path.splitext(dataset.name)[1].lower().lstrip('.'),
                convert_to_posix_lines=dataset.to_posix_lines,
                convert_spaces_to_tabs=dataset.space_to_tab,
            )
        except sniff.InappropriateDatasetContentError as exc:
            raise UploadProblemException(str(exc))
    elif dataset.file_type == 'auto':
        # Link mode can't decompress anyway, so enable sniffing for keep-compressed datatypes even when auto_decompress
        # is enabled
        os.environ['GALAXY_SNIFFER_VALIDATE_MODE'] = '1'
        ext = sniff.guess_ext(dataset.path, registry.sniff_order, is_binary=is_binary)
        os.environ.pop('GALAXY_SNIFFER_VALIDATE_MODE')

    # The converted path will be the same as the input path if no conversion was done (or in-place conversion is used)
    converted_path = None if converted_path == dataset.path else converted_path

    # Validate datasets where the filetype was explicitly set using the filetype's sniffer (if any)
    if dataset.file_type != 'auto':
        datatype = registry.get_datatype_by_extension(dataset.file_type)
        # Enable sniffer "validate mode" (prevents certain sniffers from disabling themselves)
        os.environ['GALAXY_SNIFFER_VALIDATE_MODE'] = '1'
        if hasattr(datatype, 'sniff') and not datatype.sniff(dataset.path):
            stdout = ("Warning: The file 'Type' was set to '{ext}' but the file does not appear to be of that"
                      " type".format(ext=dataset.file_type))
        os.environ.pop('GALAXY_SNIFFER_VALIDATE_MODE')

    # Handle unsniffable binaries
    if is_binary and ext == 'binary':
        upload_ext = os.path.splitext(dataset.name)[1].lower().lstrip('.')
        if registry.is_extension_unsniffable_binary(upload_ext):
            stdout = ("Warning: The file's datatype cannot be determined from its contents and was guessed based on"
                      " its extension, to avoid this warning, manually set the file 'Type' to '{ext}' when uploading"
                      " this type of file".format(ext=upload_ext))
            ext = upload_ext
        else:
            stdout = ("The uploaded binary file format cannot be determined automatically, please set the file 'Type'"
                      " manually")

    datatype = registry.get_datatype_by_extension(ext)

    # Strip compression extension from name
    if compression_type and not getattr(datatype, 'compressed', False) and dataset.name.endswith('.' + compression_type):
        dataset.name = dataset.name[:-len('.' + compression_type)]

    # Move dataset
    if link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)
    if not link_data_only:
        # Move the dataset to its "real" path. converted_path is a tempfile so we move it even if purge_source is False.
        if purge_source or converted_path:
            try:
                shutil.move(converted_path or dataset.path, output_path)
            except OSError as e:
                # We may not have permission to remove the input
                if e.errno != errno.EACCES:
                    raise
        else:
            shutil.copy(dataset.path, output_path)

    # Write the job info
    stdout = stdout or 'uploaded %s file' % ext
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    # FIXME: does this belong here? also not output-adjacent-tmpdir aware =/
    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
    return info
def run_job(self, job):
    if job.params['type'] == 'init_transfer':
        # TODO: don't create new downloads on restart.
        if job.params['protocol'] in ['http', 'https']:
            results = []
            for result in job.params['results'].values():
                result['transfer_job'] = self.app.transfer_manager.new(protocol=job.params['protocol'],
                                                                       name=result['name'],
                                                                       datatype=result['datatype'],
                                                                       url=result['url'])
                results.append(result)
        elif job.params['protocol'] == 'scp':
            results = []
            result = {}
            sample_datasets_dict = job.params['sample_datasets_dict']
            # sample_datasets_dict looks something like the following. The outer dictionary keys are SampleDataset ids.
            # {'7': {'status': 'Not started', 'name': '3.bed', 'file_path': '/tmp/library/3.bed', 'sample_id': 7,
            #        'external_service_id': 2, 'error_msg': '', 'size': '8.0K'}}
            for sample_dataset_id, sample_dataset_info_dict in sample_datasets_dict.items():
                result = {}
                result['transfer_job'] = self.app.transfer_manager.new(protocol=job.params['protocol'],
                                                                       host=job.params['host'],
                                                                       user_name=job.params['user_name'],
                                                                       password=job.params['password'],
                                                                       sample_dataset_id=sample_dataset_id,
                                                                       status=sample_dataset_info_dict['status'],
                                                                       name=sample_dataset_info_dict['name'],
                                                                       file_path=sample_dataset_info_dict['file_path'],
                                                                       sample_id=sample_dataset_info_dict['sample_id'],
                                                                       external_service_id=sample_dataset_info_dict['external_service_id'],
                                                                       error_msg=sample_dataset_info_dict['error_msg'],
                                                                       size=sample_dataset_info_dict['size'])
                results.append(result)
        self.app.transfer_manager.run([r['transfer_job'] for r in results])
        for result in results:
            transfer_job = result.pop('transfer_job')
            self.create_job(None,
                            transfer_job_id=transfer_job.id,
                            result=transfer_job.params,
                            sample_id=job.params['sample_id'])
            # Update the state of the relevant SampleDataset
            new_status = self.app.model.SampleDataset.transfer_status.IN_QUEUE
            self._update_sample_dataset_status(protocol=job.params['protocol'],
                                               sample_id=job.params['sample_id'],
                                               result_dict=transfer_job.params,
                                               new_status=new_status,
                                               error_msg='')
        job.state = self.app.model.DeferredJob.states.OK
        self.sa_session.add(job)
        self.sa_session.flush()
        # TODO: Error handling: failure executing, or errors returned from the manager
    if job.params['type'] == 'finish_transfer':
        protocol = job.params['protocol']
        # Update the state of the relevant SampleDataset
        new_status = self.app.model.SampleDataset.transfer_status.ADD_TO_LIBRARY
        if protocol in ['http', 'https']:
            result_dict = job.params['result']
            library_dataset_name = result_dict['name']
            extension = result_dict['datatype']
        elif protocol in ['scp']:
            # In this case, job.params will be a dictionary that contains a key named 'result'. The value
            # of the result key is a dictionary that looks something like:
            # {'sample_dataset_id': '8', 'status': 'Not started', 'protocol': 'scp', 'name': '3.bed',
            #  'file_path': '/data/library/3.bed', 'host': '127.0.0.1', 'sample_id': 8, 'external_service_id': 2,
            #  'local_path': '/tmp/kjl2Ss4', 'password': '******', 'user_name': 'gvk', 'error_msg': '', 'size': '8.0K'}
            try:
                tj = self.sa_session.query(self.app.model.TransferJob).get(int(job.params['transfer_job_id']))
                result_dict = tj.params
                result_dict['local_path'] = tj.path
            except Exception as e:
                log.error("Updated transfer result unavailable, using old result. Error was: %s" % str(e))
                result_dict = job.params['result']
            library_dataset_name = result_dict['name']
            # Determine the data format (see the relevant TODO item in the manual_data_transfer plugin)..
            extension = sniff.guess_ext(result_dict['local_path'], sniff_order=self.app.datatypes_registry.sniff_order)
        self._update_sample_dataset_status(protocol=job.params['protocol'],
                                           sample_id=int(job.params['sample_id']),
                                           result_dict=result_dict,
                                           new_status=new_status,
                                           error_msg='')
        sample = self.sa_session.query(self.app.model.Sample).get(int(job.params['sample_id']))
        ld = self.app.model.LibraryDataset(folder=sample.folder, name=library_dataset_name)
        self.sa_session.add(ld)
        self.sa_session.flush()
        self.app.security_agent.copy_library_permissions(FakeTrans(self.app), sample.folder, ld)
        ldda = self.app.model.LibraryDatasetDatasetAssociation(name=library_dataset_name,
                                                               extension=extension,
                                                               dbkey='?',
                                                               library_dataset=ld,
                                                               create_dataset=True,
                                                               sa_session=self.sa_session)
        ldda.message = 'Transferred by the Data Transfer Plugin'
        self.sa_session.add(ldda)
        self.sa_session.flush()
        ldda.state = ldda.states.QUEUED  # flushed in the set property
        ld.library_dataset_dataset_association_id = ldda.id
        self.sa_session.add(ld)
        self.sa_session.flush()
        try:
            # Move the dataset from its temporary location
            shutil.move(job.transfer_job.path, ldda.file_name)
            ldda.init_meta()
            for name, spec in ldda.metadata.spec.items():
                if name not in ['name', 'info', 'dbkey', 'base_name']:
                    if spec.get('default'):
                        setattr(ldda.metadata, name, spec.unwrap(spec.get('default')))
            self.app.datatypes_registry.set_external_metadata_tool.tool_action.execute(self.app.datatypes_registry.set_external_metadata_tool,
                                                                                       FakeTrans(self.app, history=sample.history, user=sample.request.user),
                                                                                       incoming={'input1': ldda})
            ldda.state = ldda.states.OK
            # TODO: not sure if this flush is necessary
            self.sa_session.add(ldda)
            self.sa_session.flush()
        except Exception as e:
            log.exception('Failure preparing library dataset for finished transfer job (id: %s) via deferred job (id: %s):'
                          % (str(job.transfer_job.id), str(job.id)))
            ldda.state = ldda.states.ERROR
        if sample.workflow:
            log.debug("\n\nLogging sample mappings as: %s" % sample.workflow['mappings'])
            log.debug("job.params: %s" % job.params)
            # We have a workflow. Update all mappings to ldda's, and when the final one is done
            # execute_workflow with either the provided history, or a new one.
            sub_done = True
            rep_done = False
            for k, v in sample.workflow['mappings'].items():
                if 'hda' not in v and v['ds_tag'].startswith('hi|'):
                    sample.workflow['mappings'][k]['hda'] = self.app.security.decode_id(v['ds_tag'][3:])
            for key, value in sample.workflow['mappings'].items():
                if 'url' in value and value['url'] == job.params['result']['url']:
                    # DBTODO Make sure all ds| mappings get the URL of the dataset, for linking to later.
                    # If this dataset maps to what we just finished, update the ldda id in the sample.
                    sample.workflow['mappings'][key]['ldda'] = ldda.id
                    rep_done = True
                # DBTODO replace the hi| mappings with the hda here. Just rip off the first three chars.
                elif 'ldda' not in value and 'hda' not in value:
                    # We're not done if some mappings still don't have ldda or hda mappings.
                    sub_done = False
            if sub_done and rep_done:
                if not sample.history:
                    new_history = self.app.model.History(name="New History From %s" % sample.name, user=sample.request.user)
                    self.sa_session.add(new_history)
                    sample.history = new_history
                    self.sa_session.flush()
                self._execute_workflow(sample)
            # Check the workflow for substitution done-ness
            self.sa_session.add(sample)
            self.sa_session.flush()
        elif sample.history:
            # We don't have a workflow, but a history was provided.
            # No processing, go ahead and chunk everything in the history.
            if ldda.dataset.state in ['new', 'upload', 'queued', 'running', 'empty', 'discarded']:
                log.error("Cannot import dataset '%s' to user history since its state is '%s'. " % (ldda.name, ldda.dataset.state))
            elif ldda.dataset.state in ['ok', 'error']:
                ldda.to_history_dataset_association(target_history=sample.history, add_to_history=True)
        # Finished
        job.state = self.app.model.DeferredJob.states.OK
        self.sa_session.add(job)
        self.sa_session.flush()
        # Update the state of the relevant SampleDataset
        new_status = self.app.model.SampleDataset.transfer_status.COMPLETE
        self._update_sample_dataset_status(protocol=job.params['protocol'],
                                           sample_id=int(job.params['sample_id']),
                                           result_dict=job.params['result'],
                                           new_status=new_status,
                                           error_msg='')
        if sample.datasets and not sample.untransferred_dataset_files:
            # Update the state of the sample to the sample's request type's final state.
            new_state = sample.request.type.final_sample_state
            self._update_sample_state(sample.id, new_state)
            # Update the state of the request, if possible
            self._update_request_state(sample.request.id)
            file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file )
            return
        dataset.path = temp_name
        dataset.is_multi_byte = is_multi_byte
    # See if we have an empty file
    if not os.path.exists( dataset.path ):
        file_err( 'Uploaded temporary file (%s) does not exist. Please' % dataset.path, dataset, json_file )
        return
    if not os.path.getsize( dataset.path ) > 0:
        file_err( 'The uploaded file is empty', dataset, json_file )
        return
    if 'is_multi_byte' not in dir( dataset ):
        dataset.is_multi_byte = util.is_multi_byte( open( dataset.path, 'r' ).read( 1024 ) )
    if dataset.is_multi_byte:
        ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
        data_type = ext
    else:
        # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
        is_gzipped, is_valid = check_gzip( dataset.path )
        if is_gzipped and not is_valid:
            file_err( 'The uploaded file contains inappropriate content', dataset, json_file )
            return
        elif is_gzipped and is_valid:
            # We need to uncompress the temp_name file
            CHUNK_SIZE = 2**20  # 1Mb
            fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( dataset.path ) )
            gzipped_file = gzip.GzipFile( dataset.path )
            while 1:
                try:
                    chunk = gzipped_file.read( CHUNK_SIZE )
def handle_upload(
    registry,
    path,            # dataset.path
    requested_ext,   # dataset.file_type
    name,            # dataset.name,
    tmp_prefix,
    tmp_dir,
    check_content,
    link_data_only,
    in_place,
    auto_decompress,
    convert_to_posix_lines,
    convert_spaces_to_tabs,
):
    stdout = None
    converted_path = None
    multi_file_zip = False

    # Does the first 1K contain a null?
    is_binary = check_binary(path)

    # Decompress if needed/desired and determine/validate filetype. If a keep-compressed datatype is explicitly selected
    # or if autodetection is selected and the file sniffs as a keep-compressed datatype, it will not be decompressed.
    if not link_data_only:
        if auto_decompress and is_zip(path) and not is_single_file_zip(path):
            multi_file_zip = True
        try:
            ext, converted_path, compression_type = sniff.handle_uploaded_dataset_file_internal(
                path,
                registry,
                ext=requested_ext,
                tmp_prefix=tmp_prefix,
                tmp_dir=tmp_dir,
                in_place=in_place,
                check_content=check_content,
                is_binary=is_binary,
                auto_decompress=auto_decompress,
                uploaded_file_ext=os.path.splitext(name)[1].lower().lstrip('.'),
                convert_to_posix_lines=convert_to_posix_lines,
                convert_spaces_to_tabs=convert_spaces_to_tabs,
            )
        except sniff.InappropriateDatasetContentError as exc:
            raise UploadProblemException(str(exc))
    elif requested_ext == 'auto':
        ext = sniff.guess_ext(path, registry.sniff_order, is_binary=is_binary)
    else:
        ext = requested_ext

    # The converted path will be the same as the input path if no conversion was done (or in-place conversion is used)
    converted_path = None if converted_path == path else converted_path

    # Validate datasets where the filetype was explicitly set using the filetype's sniffer (if any)
    if requested_ext != 'auto':
        datatype = registry.get_datatype_by_extension(requested_ext)
        # Enable sniffer "validate mode" (prevents certain sniffers from disabling themselves)
        if check_content and hasattr(datatype, 'sniff') and not datatype.sniff(path):
            stdout = ("Warning: The file 'Type' was set to '{ext}' but the file does not appear to be of that"
                      " type".format(ext=requested_ext))

    # Handle unsniffable binaries
    if is_binary and ext == 'binary':
        upload_ext = os.path.splitext(name)[1].lower().lstrip('.')
        if registry.is_extension_unsniffable_binary(upload_ext):
            stdout = ("Warning: The file's datatype cannot be determined from its contents and was guessed based on"
                      " its extension, to avoid this warning, manually set the file 'Type' to '{ext}' when uploading"
                      " this type of file".format(ext=upload_ext))
            ext = upload_ext
        else:
            stdout = ("The uploaded binary file format cannot be determined automatically, please set the file 'Type'"
                      " manually")

    datatype = registry.get_datatype_by_extension(ext)
    if multi_file_zip and not getattr(datatype, 'compressed', False):
        stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'

    return stdout, ext, datatype, is_binary, converted_path
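# A hypothetical invocation of handle_upload() as defined above; `registry`
# stands in for a loaded Galaxy datatypes registry and the paths are
# placeholders. The unpacked values mirror the function's return statement.
stdout, ext, datatype, is_binary, converted_path = handle_upload(
    registry=registry,
    path='/tmp/staging/upload_1.dat',   # dataset.path
    requested_ext='auto',               # let the sniffers decide
    name='upload_1.dat',                # dataset.name
    tmp_prefix='data_id_1_upload_',
    tmp_dir='/tmp/staging',
    check_content=True,
    link_data_only=False,
    in_place=True,
    auto_decompress=True,
    convert_to_posix_lines=True,
    convert_spaces_to_tabs=False,
)
if stdout:
    print(stdout)  # e.g. a warning that the declared type did not sniff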
def add_file(dataset, registry, json_file, output_path): data_type = None line_count = None converted_path = None stdout = None link_data_only = dataset.get('link_data_only', 'copy_files') in_place = dataset.get('in_place', True) purge_source = dataset.get('purge_source', True) try: ext = dataset.file_type except AttributeError: file_err( 'Unable to process uploaded file, missing file_type parameter.', dataset, json_file) return if dataset.type == 'url': try: page = urlopen( dataset.path) # page will be .close()ed by sniff methods temp_name, dataset.is_multi_byte = sniff.stream_to_file( page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers( page.headers)) except Exception as e: file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)), dataset, json_file) return dataset.path = temp_name # See if we have an empty file if not os.path.exists(dataset.path): file_err('Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file) return if not os.path.getsize(dataset.path) > 0: file_err('The uploaded file is empty', dataset, json_file) return if not dataset.type == 'url': # Already set is_multi_byte above if type == 'url' try: dataset.is_multi_byte = multi_byte.is_multi_byte( codecs.open(dataset.path, 'r', 'utf-8').read(100)) except UnicodeDecodeError as e: dataset.is_multi_byte = False # Is dataset an image? i_ext = get_image_ext(dataset.path) if i_ext: ext = i_ext data_type = ext # Is dataset content multi-byte? elif dataset.is_multi_byte: data_type = 'multi-byte char' ext = sniff.guess_ext(dataset.path, registry.sniff_order, is_multi_byte=True) # Is dataset content supported sniffable binary? else: # FIXME: This ignores the declared sniff order in datatype_conf.xml # resulting in improper behavior type_info = Binary.is_sniffable_binary(dataset.path) if type_info: data_type = type_info[0] ext = type_info[1] if not data_type: root_datatype = registry.get_datatype_by_extension(dataset.file_type) if getattr(root_datatype, 'compressed', False): data_type = 'compressed archive' ext = dataset.file_type else: # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress is_gzipped, is_valid = check_gzip(dataset.path) if is_gzipped and not is_valid: file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file) return elif is_gzipped and is_valid: if link_data_only == 'copy_files': # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format CHUNK_SIZE = 2**20 # 1Mb fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) gzipped_file = gzip.GzipFile(dataset.path, 'rb') while 1: try: chunk = gzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err('Problem decompressing gzipped data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) gzipped_file.close() # Replace the gzipped file with the decompressed file if it's safe to do so if dataset.type in ('server_dir', 'path_paste') or not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.gz') data_type = 'gzip' if not data_type and bz2 is not None: # See if we have a bz2 file, much like gzip is_bzipped, is_valid = check_bz2(dataset.path) if is_bzipped and not is_valid: file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file) return elif is_bzipped 
and is_valid: if link_data_only == 'copy_files': # We need to uncompress the temp_name file CHUNK_SIZE = 2**20 # 1Mb fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) bzipped_file = bz2.BZ2File(dataset.path, 'rb') while 1: try: chunk = bzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err( 'Problem decompressing bz2 compressed data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) bzipped_file.close() # Replace the bzipped file with the decompressed file if it's safe to do so if dataset.type in ('server_dir', 'path_paste') or not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.bz2') data_type = 'bz2' if not data_type: # See if we have a zip archive is_zipped = check_zip(dataset.path) if is_zipped: if link_data_only == 'copy_files': CHUNK_SIZE = 2**20 # 1Mb uncompressed = None uncompressed_name = None unzipped = False z = zipfile.ZipFile(dataset.path) for name in z.namelist(): if name.endswith('/'): continue if unzipped: stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' break fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) if sys.version_info[:2] >= (2, 6): zipped_file = z.open(name) while 1: try: chunk = zipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err( 'Problem decompressing zipped data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) zipped_file.close() uncompressed_name = name unzipped = True else: # python < 2.5 doesn't have a way to read members in chunks(!) try: outfile = open(uncompressed, 'wb') outfile.write(z.read(name)) outfile.close() uncompressed_name = name unzipped = True except IOError: os.close(fd) os.remove(uncompressed) file_err( 'Problem decompressing zipped data', dataset, json_file) return z.close() # Replace the zipped file with the decompressed file if it's safe to do so if uncompressed is not None: if dataset.type in ('server_dir', 'path_paste') or not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = uncompressed_name data_type = 'zip' if not data_type: # TODO refactor this logic. check_binary isn't guaranteed to be # correct since it only looks at whether the first 100 chars are # printable or not. If someone specifies a known unsniffable # binary datatype and check_binary fails, the file gets mangled. if check_binary(dataset.path) or Binary.is_ext_unsniffable( dataset.file_type): # We have a binary dataset, but it is not Bam, Sff or Pdf data_type = 'binary' # binary_ok = False parts = dataset.name.split(".") if len(parts) > 1: ext = parts[-1].strip().lower() if not Binary.is_ext_unsniffable(ext): file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file) return elif Binary.is_ext_unsniffable( ext) and dataset.file_type != ext: err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." 
% ( ext.capitalize(), ext) file_err(err_msg, dataset, json_file) return if not data_type: # We must have a text file if check_html(dataset.path): file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file) return if data_type != 'binary': if link_data_only == 'copy_files': if dataset.type in ('server_dir', 'path_paste') and data_type not in [ 'gzip', 'bz2', 'zip' ]: in_place = False # Convert universal line endings to Posix line endings, but allow the user to turn it off, # so that is becomes possible to upload gzip, bz2 or zip files with binary data without # corrupting the content of those files. if dataset.to_posix_lines: tmpdir = output_adjacent_tmpdir(output_path) tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id if dataset.space_to_tab: line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix) else: line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix) if dataset.file_type == 'auto': ext = sniff.guess_ext(dataset.path, registry.sniff_order) else: ext = dataset.file_type data_type = ext # Save job info for the framework if ext == 'auto' and dataset.ext: ext = dataset.ext if ext == 'auto': ext = 'data' datatype = registry.get_datatype_by_extension(ext) if dataset.type in ('server_dir', 'path_paste') and link_data_only == 'link_to_files': # Never alter a file that will not be copied to Galaxy's local file store. if datatype.dataset_content_needs_grooming(dataset.path): err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \ '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.' file_err(err_msg, dataset, json_file) return if link_data_only == 'copy_files' and dataset.type in ( 'server_dir', 'path_paste') and data_type not in ['gzip', 'bz2', 'zip']: # Move the dataset to its "real" path if converted_path is not None: shutil.copy(converted_path, output_path) try: os.remove(converted_path) except: pass else: # This should not happen, but it's here just in case shutil.copy(dataset.path, output_path) elif link_data_only == 'copy_files': if purge_source: shutil.move(dataset.path, output_path) else: shutil.copy(dataset.path, output_path) # Write the job info stdout = stdout or 'uploaded %s file' % data_type info = dict(type='dataset', dataset_id=dataset.dataset_id, ext=ext, stdout=stdout, name=dataset.name, line_count=line_count) if dataset.get('uuid', None) is not None: info['uuid'] = dataset.get('uuid') json_file.write(dumps(info) + "\n") if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path): # Groom the dataset content if necessary datatype.groom_dataset_content(output_path)
def _resolve_src(item):
    converted_path = None

    name, path = _has_src_to_path(item)
    dbkey = item.get("dbkey", "?")
    requested_ext = item.get("ext", "auto")
    info = item.get("info", None)
    object_id = item.get("object_id", None)
    link_data_only = upload_config.link_data_only
    if "link_data_only" in item:
        # Allow overriding this on a per file basis.
        link_data_only = _link_data_only(item)
    to_posix_lines = upload_config.get_option(item, "to_posix_lines")
    space_to_tab = upload_config.get_option(item, "space_to_tab")
    in_place = item.get("in_place", False)
    purge_source = item.get("purge_source", True)

    # Follow upload.py logic but without the auto-decompress logic.
    registry = upload_config.registry
    check_content = upload_config.check_content
    data_type, ext = None, requested_ext

    is_binary = check_binary(path)
    if is_binary:
        data_type, ext = handle_sniffable_binary_check(data_type, ext, path, registry)
    if data_type is None:
        root_datatype = registry.get_datatype_by_extension(ext)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            ext = ext
        elif is_binary:
            data_type, ext = handle_unsniffable_binary_check(
                data_type, ext, path, name, is_binary, requested_ext, check_content, registry
            )
    if not data_type and check_content and check_html(path):
        raise UploadProblemException('The uploaded file contains inappropriate HTML content')

    if data_type != 'binary':
        if not link_data_only:
            if to_posix_lines:
                if space_to_tab:
                    line_count, converted_path = sniff.convert_newlines_sep2tabs(path, in_place=in_place, tmp_dir=".")
                else:
                    line_count, converted_path = sniff.convert_newlines(path, in_place=in_place, tmp_dir=".")
            else:
                if space_to_tab:
                    line_count, converted_path = sniff.sep2tabs(path, in_place=in_place, tmp_dir=".")
        if requested_ext == 'auto':
            ext = sniff.guess_ext(converted_path or path, registry.sniff_order)
        else:
            ext = requested_ext

        data_type = ext

    if ext == 'auto' and data_type == 'binary':
        ext = 'data'
    if ext == 'auto' and requested_ext:
        ext = requested_ext
    if ext == 'auto':
        ext = 'data'

    datatype = registry.get_datatype_by_extension(ext)

    if link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)

    # If this file is not in the workdir make sure it gets there.
    if not link_data_only and converted_path:
        path = upload_config.ensure_in_working_directory(converted_path, purge_source, in_place)
    elif not link_data_only:
        path = upload_config.ensure_in_working_directory(path, purge_source, in_place)

    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(path)

    rval = {"name": name, "filename": path, "dbkey": dbkey, "ext": ext, "link_data_only": link_data_only}
    if info is not None:
        rval["info"] = info
    if object_id is not None:
        rval["object_id"] = object_id
    return rval
def add_file( self, trans, temp_name, file_name, file_type, is_multi_byte, dbkey, info=None, space_to_tab=False, precreated_dataset=None ): def dataset_no_data_error( data, message = 'there was an error uploading your file' ): data.info = "No data: %s." % message data.state = data.states.ERROR if data.extension is None: data.extension = 'data' return data data_type = None if precreated_dataset is not None: data = precreated_dataset else: data = trans.app.model.HistoryDatasetAssociation( history = trans.history, create_dataset = True ) trans.app.security_agent.set_all_dataset_permissions( data.dataset, trans.app.security_agent.history_get_default_permissions( trans.history ) ) # See if we have an empty file if not os.path.getsize( temp_name ) > 0: return dataset_no_data_error( data, message = 'you attempted to upload an empty file' ) #raise BadFileException( "you attempted to upload an empty file." ) if is_multi_byte: ext = sniff.guess_ext( temp_name, is_multi_byte=True ) else: if not data_type: # See if we have a gzipped file, which, if it passes our restrictions, # we'll decompress on the fly. is_gzipped, is_valid = self.check_gzip( temp_name ) if is_gzipped and not is_valid: return dataset_no_data_error( data, message = 'you attempted to upload an inappropriate file' ) #raise BadFileException( "you attempted to upload an inappropriate file." ) elif is_gzipped and is_valid: # We need to uncompress the temp_name file CHUNK_SIZE = 2**20 # 1Mb fd, uncompressed = tempfile.mkstemp() gzipped_file = gzip.GzipFile( temp_name ) while 1: try: chunk = gzipped_file.read( CHUNK_SIZE ) except IOError: os.close( fd ) os.remove( uncompressed ) return dataset_no_data_error( data, message = 'problem decompressing gzipped data' ) #raise BadFileException( 'problem decompressing gzipped data.' ) if not chunk: break os.write( fd, chunk ) os.close( fd ) gzipped_file.close() # Replace the gzipped file with the decompressed file shutil.move( uncompressed, temp_name ) file_name = file_name.rstrip( '.gz' ) data_type = 'gzip' ext = '' if not data_type: # See if we have a zip archive is_zipped, is_valid, test_ext = self.check_zip( temp_name ) if is_zipped and not is_valid: return dataset_no_data_error( data, message = 'you attempted to upload an inappropriate file' ) #raise BadFileException( "you attempted to upload an inappropriate file." ) elif is_zipped and is_valid: # Currently, we force specific tools to handle this case. We also require the user # to manually set the incoming file_type if ( test_ext == 'ab1' or test_ext == 'scf' ) and file_type != 'binseq.zip': return dataset_no_data_error( data, message = "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'" ) #raise BadFileException( "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'." ) elif test_ext == 'txt' and file_type != 'txtseq.zip': return dataset_no_data_error( data, message = "Invalid 'File Format' for archive consisting of text files - use 'Txtseq.zip'" ) #raise BadFileException( "Invalid 'File Format' for archive consisting of text files - use 'Txtseq.zip'." ) if not ( file_type == 'binseq.zip' or file_type == 'txtseq.zip' ): return dataset_no_data_error( data, message = "you must manually set the 'File Format' to either 'Binseq.zip' or 'Txtseq.zip' when uploading zip files" ) #raise BadFileException( "you must manually set the 'File Format' to either 'Binseq.zip' or 'Txtseq.zip' when uploading zip files." 
) data_type = 'zip' ext = file_type if not data_type: if self.check_binary( temp_name ): parts = file_name.split( "." ) if len( parts ) > 1: ext = parts[1].strip().lower() if not( ext == 'ab1' or ext == 'scf' ): return dataset_no_data_error( data, message = "you attempted to upload an inappropriate file" ) #raise BadFileException( "you attempted to upload an inappropriate file." ) if ext == 'ab1' and file_type != 'ab1': return dataset_no_data_error( data, message = "you must manually set the 'File Format' to 'Ab1' when uploading ab1 files" ) #raise BadFileException( "you must manually set the 'File Format' to 'Ab1' when uploading ab1 files." ) elif ext == 'scf' and file_type != 'scf': return dataset_no_data_error( data, message = "you must manually set the 'File Format' to 'Scf' when uploading scf files" ) #raise BadFileException( "you must manually set the 'File Format' to 'Scf' when uploading scf files." ) data_type = 'binary' if not data_type: # We must have a text file if trans.app.datatypes_registry.get_datatype_by_extension( file_type ).composite_type != 'auto_primary_file' and self.check_html( temp_name ): return dataset_no_data_error( data, message = "you attempted to upload an inappropriate file" ) #raise BadFileException( "you attempted to upload an inappropriate file." ) if data_type != 'binary' and data_type != 'zip': if space_to_tab: self.line_count = sniff.convert_newlines_sep2tabs( temp_name ) else: self.line_count = sniff.convert_newlines( temp_name ) if file_type == 'auto': ext = sniff.guess_ext( temp_name, sniff_order=trans.app.datatypes_registry.sniff_order ) else: ext = file_type data_type = ext if info is None: info = 'uploaded %s file' %data_type data.extension = ext data.name = file_name data.dbkey = dbkey data.info = info data.flush() shutil.move( temp_name, data.file_name ) data.state = data.states.OK data.set_size() data.init_meta() if self.line_count is not None: try: if is_multi_byte: data.set_multi_byte_peek( line_count=self.line_count ) else: data.set_peek( line_count=self.line_count ) except: if is_multi_byte: data.set_multi_byte_peek() else: data.set_peek() else: if is_multi_byte: data.set_multi_byte_peek() else: data.set_peek() # validate incomming data # Commented by greg on 3/14/07 # for error in data.datatype.validate( data ): # data.add_validation_error( # model.ValidationError( message=str( error ), err_type=error.__class__.__name__, attributes=util.object_to_string( error.__dict__ ) ) ) if data.missing_meta(): data.datatype.set_meta( data ) dbkey_to_store = dbkey if type( dbkey_to_store ) == type( [] ): dbkey_to_store = dbkey[0] if precreated_dataset is not None: trans.history.genome_build = dbkey_to_store else: trans.history.add_dataset( data, genome_build=dbkey_to_store ) trans.app.model.flush() trans.log_event( "Added dataset %d to history %d" %( data.id, trans.history.id ), tool_id="upload" ) return data
    try:
        dataset.is_multi_byte = multi_byte.is_multi_byte( codecs.open( dataset.path, 'r', 'utf-8' ).read( 100 ) )
    except UnicodeDecodeError as e:
        dataset.is_multi_byte = False
    # Is dataset an image?
    image = check_image( dataset.path )
    if image:
        if not PIL:
            image = None
        # get_image_ext() returns None if not a supported Image type
        ext = get_image_ext( dataset.path, image )
        data_type = ext
    # Is dataset content multi-byte?
    elif dataset.is_multi_byte:
        data_type = 'multi-byte char'
        ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
    # Is dataset content supported sniffable binary?
    else:
        # FIXME: This ignores the declared sniff order in datatype_conf.xml
        # resulting in improper behavior
        type_info = Binary.is_sniffable_binary( dataset.path )
        if type_info:
            data_type = type_info[0]
            ext = type_info[1]
    if not data_type:
        root_datatype = registry.get_datatype_by_extension( dataset.file_type )
        if getattr( root_datatype, 'compressed', False ):
            data_type = 'compressed archive'
            ext = dataset.file_type
        else:
            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
                 dataset, json_file)
        return
    if not os.path.getsize(dataset.path) > 0:
        file_err('The uploaded file is empty', dataset, json_file)
        return
    if not dataset.type == 'url':
        # Already set is_multi_byte above if type == 'url'
        try:
            dataset.is_multi_byte = util.is_multi_byte(codecs.open(dataset.path, 'r', 'utf-8').read(100))
        except UnicodeDecodeError as e:
            dataset.is_multi_byte = False
    # Is dataset content multi-byte?
    if dataset.is_multi_byte:
        data_type = 'multi-byte char'
        ext = sniff.guess_ext(dataset.path, is_multi_byte=True)
    # Is dataset content supported sniffable binary?
    elif check_bam(dataset.path):
        ext = 'bam'
        data_type = 'bam'
    elif check_sff(dataset.path):
        ext = 'sff'
        data_type = 'sff'
    else:
        # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
        is_gzipped, is_valid = check_gzip(dataset.path)
        if is_gzipped and not is_valid:
            file_err('The gzipped uploaded file contains inappropriate content', dataset, json_file)
            return
def add_file(dataset, registry, json_file, output_path): data_type = None line_count = None converted_path = None stdout = None link_data_only = dataset.get('link_data_only', 'copy_files') != 'copy_files' # run_as_real_user is estimated from galaxy config (external chmod indicated of inputs executed) # If this is True we always purge supplied upload inputs so they are cleaned up and we reuse their # paths during data conversions since this user already owns that path. # Older in_place check for upload jobs created before 18.01, TODO remove in 19.XX. xref #5206 run_as_real_user = dataset.get('run_as_real_user', False) or dataset.get("in_place", False) # purge_source defaults to True unless this is an FTP import and # ftp_upload_purge has been overridden to False in Galaxy's config. # We set purge_source to False if: # - the job does not have write access to the file, e.g. when running as the # real user # - the files are uploaded from external paths. purge_source = dataset.get('purge_source', True) and not run_as_real_user and dataset.type not in ('server_dir', 'path_paste') # in_place is True unless we are running as a real user or importing external paths (i.e. # this is a real upload and not a path paste or ftp import). # in_place should always be False if running as real user because the uploaded file will # be owned by Galaxy and not the user and it should be False for external paths so Galaxy doesn't # modify files not controlled by Galaxy. in_place = not run_as_real_user and dataset.type not in ('server_dir', 'path_paste', 'ftp_import') # Base on the check_upload_content Galaxy config option and on by default, this enables some # security related checks on the uploaded content, but can prevent uploads from working in some cases. check_content = dataset.get('check_content' , True) # auto_decompress is a request flag that can be swapped off to prevent Galaxy from automatically # decompressing archive files before sniffing. auto_decompress = dataset.get('auto_decompress', True) try: ext = dataset.file_type except AttributeError: raise UploadProblemException('Unable to process uploaded file, missing file_type parameter.') if dataset.type == 'url': try: page = urlopen(dataset.path) # page will be .close()ed by sniff methods temp_name = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers)) except Exception as e: raise UploadProblemException('Unable to fetch %s\n%s' % (dataset.path, str(e))) dataset.path = temp_name # See if we have an empty file if not os.path.exists(dataset.path): raise UploadProblemException('Uploaded temporary file (%s) does not exist.' % dataset.path) if not os.path.getsize(dataset.path) > 0: raise UploadProblemException('The uploaded file is empty') # Is dataset content supported sniffable binary? 
is_binary = check_binary(dataset.path) if is_binary: # Sniff the data type guessed_ext = sniff.guess_ext(dataset.path, registry.sniff_order) # Set data_type only if guessed_ext is a binary datatype datatype = registry.get_datatype_by_extension(guessed_ext) if isinstance(datatype, Binary): data_type = guessed_ext ext = guessed_ext if not data_type: root_datatype = registry.get_datatype_by_extension(dataset.file_type) if getattr(root_datatype, 'compressed', False): data_type = 'compressed archive' ext = dataset.file_type else: # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress is_gzipped, is_valid = check_gzip(dataset.path, check_content=check_content) if is_gzipped and not is_valid: raise UploadProblemException('The gzipped uploaded file contains inappropriate content') elif is_gzipped and is_valid and auto_decompress: if not link_data_only: # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format CHUNK_SIZE = 2 ** 20 # 1Mb fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) gzipped_file = gzip.GzipFile(dataset.path, 'rb') while 1: try: chunk = gzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) raise UploadProblemException('Problem decompressing gzipped data') if not chunk: break os.write(fd, chunk) os.close(fd) gzipped_file.close() # Replace the gzipped file with the decompressed file if it's safe to do so if not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.gz') data_type = 'gzip' if not data_type: # See if we have a bz2 file, much like gzip is_bzipped, is_valid = check_bz2(dataset.path, check_content) if is_bzipped and not is_valid: raise UploadProblemException('The gzipped uploaded file contains inappropriate content') elif is_bzipped and is_valid and auto_decompress: if not link_data_only: # We need to uncompress the temp_name file CHUNK_SIZE = 2 ** 20 # 1Mb fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) bzipped_file = bz2.BZ2File(dataset.path, 'rb') while 1: try: chunk = bzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) raise UploadProblemException('Problem decompressing bz2 compressed data') if not chunk: break os.write(fd, chunk) os.close(fd) bzipped_file.close() # Replace the bzipped file with the decompressed file if it's safe to do so if not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.bz2') data_type = 'bz2' if not data_type: # See if we have a zip archive is_zipped = check_zip(dataset.path) if is_zipped and auto_decompress: if not link_data_only: CHUNK_SIZE = 2 ** 20 # 1Mb uncompressed = None uncompressed_name = None unzipped = False z = zipfile.ZipFile(dataset.path) for name in z.namelist(): if name.endswith('/'): continue if unzipped: stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' 
break fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) if sys.version_info[:2] >= (2, 6): zipped_file = z.open(name) while 1: try: chunk = zipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) raise UploadProblemException('Problem decompressing zipped data') if not chunk: break os.write(fd, chunk) os.close(fd) zipped_file.close() uncompressed_name = name unzipped = True else: # python < 2.5 doesn't have a way to read members in chunks(!) try: with open(uncompressed, 'wb') as outfile: outfile.write(z.read(name)) uncompressed_name = name unzipped = True except IOError: os.close(fd) os.remove(uncompressed) raise UploadProblemException('Problem decompressing zipped data') z.close() # Replace the zipped file with the decompressed file if it's safe to do so if uncompressed is not None: if not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = uncompressed_name data_type = 'zip' if not data_type: if is_binary or registry.is_extension_unsniffable_binary(dataset.file_type): # We have a binary dataset, but it is not Bam, Sff or Pdf data_type = 'binary' parts = dataset.name.split(".") if len(parts) > 1: ext = parts[-1].strip().lower() is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext) if check_content and not is_ext_unsniffable_binary: raise UploadProblemException('The uploaded binary file contains inappropriate content') elif is_ext_unsniffable_binary and dataset.file_type != ext: err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext) raise UploadProblemException(err_msg) if not data_type: # We must have a text file if check_content and check_html(dataset.path): raise UploadProblemException('The uploaded file contains inappropriate HTML content') if data_type != 'binary': if not link_data_only and data_type not in ('gzip', 'bz2', 'zip'): # Convert universal line endings to Posix line endings if to_posix_lines is True # and the data is not binary or gzip-, bz2- or zip-compressed. if dataset.to_posix_lines: tmpdir = output_adjacent_tmpdir(output_path) tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id if dataset.space_to_tab: line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix) else: line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix) if dataset.file_type == 'auto': ext = sniff.guess_ext(converted_path or dataset.path, registry.sniff_order) else: ext = dataset.file_type data_type = ext # Save job info for the framework if ext == 'auto' and data_type == 'binary': ext = 'data' if ext == 'auto' and dataset.ext: ext = dataset.ext if ext == 'auto': ext = 'data' datatype = registry.get_datatype_by_extension(ext) if dataset.type in ('server_dir', 'path_paste') and link_data_only: # Never alter a file that will not be copied to Galaxy's local file store. if datatype.dataset_content_needs_grooming(dataset.path): err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \ '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.' 
raise UploadProblemException(err_msg) if not link_data_only and converted_path: # Move the dataset to its "real" path try: shutil.move(converted_path, output_path) except OSError as e: # We may not have permission to remove converted_path if e.errno != errno.EACCES: raise elif not link_data_only: if purge_source: shutil.move(dataset.path, output_path) else: shutil.copy(dataset.path, output_path) # Write the job info stdout = stdout or 'uploaded %s file' % data_type info = dict(type='dataset', dataset_id=dataset.dataset_id, ext=ext, stdout=stdout, name=dataset.name, line_count=line_count) if dataset.get('uuid', None) is not None: info['uuid'] = dataset.get('uuid') json_file.write(dumps(info) + "\n") if not link_data_only and datatype and datatype.dataset_content_needs_grooming(output_path): # Groom the dataset content if necessary datatype.groom_dataset_content(output_path)
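The gzip and bz2 branches in add_file() share one pattern: stream the archive into a temporary file next to the output in fixed-size chunks, clean up the partial file on error, and then either hand back the decompressed copy or move it over the original depending on in_place. A self-contained sketch of that pattern for gzip follows; the function name and temp-file prefix are illustrative.

# Sketch of the chunked gzip-decompression pattern used above.
import gzip
import os
import shutil
import tempfile

CHUNK_SIZE = 2 ** 20  # 1 MB


def gunzip_to_sibling_tempfile(path, work_dir, in_place=False):
    fd, uncompressed = tempfile.mkstemp(prefix='upload_gunzip_', dir=work_dir, text=False)
    try:
        with gzip.GzipFile(path, 'rb') as gz:
            while True:
                chunk = gz.read(CHUNK_SIZE)
                if not chunk:
                    break
                os.write(fd, chunk)
    except IOError:
        # Remove the partial decompressed file before propagating the error.
        os.close(fd)
        os.remove(uncompressed)
        raise
    os.close(fd)
    if in_place:
        # Overwrite the original upload with the decompressed content.
        shutil.move(uncompressed, path)
        os.chmod(path, 0o644)
        return path
    # Otherwise leave the original alone and hand back the decompressed copy.
    return uncompressed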
dataset.is_multi_byte = util.is_multi_byte( codecs.open(dataset.path, 'r', 'utf-8').read(100)) except UnicodeDecodeError: dataset.is_multi_byte = False # Is dataset an image? image = check_image(dataset.path) if image: if not PIL: image = None # get_image_ext() returns None if not a supported Image type ext = get_image_ext(dataset.path, image) data_type = ext # Is dataset content multi-byte? elif dataset.is_multi_byte: data_type = 'multi-byte char' ext = sniff.guess_ext(dataset.path, is_multi_byte=True) # Is dataset content supported sniffable binary? else: type_info = Binary.is_sniffable_binary(dataset.path) if type_info: data_type = type_info[0] ext = type_info[1] if not data_type: # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress is_gzipped, is_valid = check_gzip(dataset.path) if is_gzipped and not is_valid: file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file) return elif is_gzipped and is_valid:
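This older variant consults check_image() and get_image_ext() to map recognized images to an extension before any text or binary sniffing. Those helpers are Galaxy-internal; the sketch below shows a comparable probe written directly against Pillow, with an extension table that is an assumption rather than Galaxy's actual mapping.

# Hedged sketch of an image probe in the spirit of check_image() / get_image_ext().
try:
    from PIL import Image
except ImportError:
    Image = None

# Assumed format-to-extension table, not Galaxy's real one.
FORMAT_TO_EXT = {'JPEG': 'jpg', 'PNG': 'png', 'GIF': 'gif', 'BMP': 'bmp', 'TIFF': 'tiff'}


def guess_image_ext(path):
    if Image is None:
        # Pillow not installed; behave like the "if not PIL" branch above.
        return None
    try:
        with Image.open(path) as im:
            return FORMAT_TO_EXT.get(im.format)
    except (IOError, OSError):
        # Not an image Pillow can identify.
        return None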
def run_job(self, job): if job.params['type'] == 'init_transfer': # TODO: don't create new downloads on restart. if job.params['protocol'] in ['http', 'https']: results = [] for result in job.params['results'].values(): result['transfer_job'] = self.app.transfer_manager.new( protocol=job.params['protocol'], name=result['name'], datatype=result['datatype'], url=result['url']) results.append(result) elif job.params['protocol'] == 'scp': results = [] result = {} sample_datasets_dict = job.params['sample_datasets_dict'] # sample_datasets_dict looks something like the following. The outer dictionary keys are SampleDataset ids. # {'7': {'status': 'Not started', 'name': '3.bed', 'file_path': '/tmp/library/3.bed', 'sample_id': 7, # 'external_service_id': 2, 'error_msg': '', 'size': '8.0K'}} for sample_dataset_id, sample_dataset_info_dict in sample_datasets_dict.items( ): result = {} result['transfer_job'] = self.app.transfer_manager.new( protocol=job.params['protocol'], host=job.params['host'], user_name=job.params['user_name'], password=job.params['password'], sample_dataset_id=sample_dataset_id, status=sample_dataset_info_dict['status'], name=sample_dataset_info_dict['name'], file_path=sample_dataset_info_dict['file_path'], sample_id=sample_dataset_info_dict['sample_id'], external_service_id=sample_dataset_info_dict[ 'external_service_id'], error_msg=sample_dataset_info_dict['error_msg'], size=sample_dataset_info_dict['size']) results.append(result) self.app.transfer_manager.run([r['transfer_job'] for r in results]) for result in results: transfer_job = result.pop('transfer_job') self.create_job(None, transfer_job_id=transfer_job.id, result=transfer_job.params, sample_id=job.params['sample_id']) # Update the state of the relevant SampleDataset new_status = self.app.model.SampleDataset.transfer_status.IN_QUEUE self._update_sample_dataset_status( protocol=job.params['protocol'], sample_id=job.params['sample_id'], result_dict=transfer_job.params, new_status=new_status, error_msg='') job.state = self.app.model.DeferredJob.states.OK self.sa_session.add(job) self.sa_session.flush() # TODO: Error handling: failure executing, or errors returned from the manager if job.params['type'] == 'finish_transfer': protocol = job.params['protocol'] # Update the state of the relevant SampleDataset new_status = self.app.model.SampleDataset.transfer_status.ADD_TO_LIBRARY if protocol in ['http', 'https']: result_dict = job.params['result'] library_dataset_name = result_dict['name'] extension = result_dict['datatype'] elif protocol in ['scp']: # In this case, job.params will be a dictionary that contains a key named 'result'. The value # of the result key is a dictionary that looks something like: # {'sample_dataset_id': '8', 'status': 'Not started', 'protocol': 'scp', 'name': '3.bed', # 'file_path': '/data/library/3.bed', 'host': '127.0.0.1', 'sample_id': 8, 'external_service_id': 2, # 'local_path': '/tmp/kjl2Ss4', 'password': '******', 'user_name': 'gvk', 'error_msg': '', 'size': '8.0K'} try: tj = self.sa_session.query(self.app.model.TransferJob).get( int(job.params['transfer_job_id'])) result_dict = tj.params result_dict['local_path'] = tj.path except Exception, e: log.error( "Updated transfer result unavailable, using old result. Error was: %s" % str(e)) result_dict = job.params['result'] library_dataset_name = result_dict['name'] # Determine the data format (see the relevant TODO item in the manual_data_transfer plugin).. 
extension = sniff.guess_ext( result_dict['local_path'], sniff_order=self.app.datatypes_registry.sniff_order) self._update_sample_dataset_status(protocol=job.params['protocol'], sample_id=int( job.params['sample_id']), result_dict=result_dict, new_status=new_status, error_msg='') sample = self.sa_session.query(self.app.model.Sample).get( int(job.params['sample_id'])) ld = self.app.model.LibraryDataset(folder=sample.folder, name=library_dataset_name) self.sa_session.add(ld) self.sa_session.flush() self.app.security_agent.copy_library_permissions( FakeTrans(self.app), sample.folder, ld) ldda = self.app.model.LibraryDatasetDatasetAssociation( name=library_dataset_name, extension=extension, dbkey='?', library_dataset=ld, create_dataset=True, sa_session=self.sa_session) ldda.message = 'Transferred by the Data Transfer Plugin' self.sa_session.add(ldda) self.sa_session.flush() ldda.state = ldda.states.QUEUED # flushed in the set property ld.library_dataset_dataset_association_id = ldda.id self.sa_session.add(ld) self.sa_session.flush() try: # Move the dataset from its temporary location shutil.move(job.transfer_job.path, ldda.file_name) ldda.init_meta() for name, spec in ldda.metadata.spec.items(): if name not in ['name', 'info', 'dbkey', 'base_name']: if spec.get('default'): setattr(ldda.metadata, name, spec.unwrap(spec.get('default'))) self.app.datatypes_registry.set_external_metadata_tool.tool_action.execute( self.app.datatypes_registry.set_external_metadata_tool, FakeTrans(self.app, history=sample.history, user=sample.request.user), incoming={'input1': ldda}) ldda.state = ldda.states.OK # TODO: not sure if this flush is necessary self.sa_session.add(ldda) self.sa_session.flush() except Exception, e: log.exception( 'Failure preparing library dataset for finished transfer job (id: %s) via deferred job (id: %s):' % \ ( str( job.transfer_job.id ), str( job.id ) ) ) ldda.state = ldda.states.ERROR
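For http/https transfers the datatype arrives in the result dictionary, while the scp branch falls back to sniff.guess_ext() on the downloaded file. The hedged sketch below isolates that fallback; it assumes a Galaxy environment where galaxy.datatypes.sniff is importable and a populated datatypes registry is already at hand.

# Sketch only: how a transferred file's extension might be resolved,
# preferring the declared datatype and falling back to content sniffing.
from galaxy.datatypes import sniff  # assumes a Galaxy environment


def extension_for_transferred_file(local_path, result_dict, datatypes_registry):
    extension = result_dict.get('datatype')
    if not extension:
        # Same fallback as the scp branch above: guess from file content
        # using the registry's configured sniff order.
        extension = sniff.guess_ext(local_path, sniff_order=datatypes_registry.sniff_order)
    return extension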
def handle_upload( registry, path, # dataset.path requested_ext, # dataset.file_type name, # dataset.name, tmp_prefix, tmp_dir, check_content, link_data_only, in_place, auto_decompress, convert_to_posix_lines, convert_spaces_to_tabs, ): stdout = None converted_path = None # Does the first 1K contain a null? is_binary = check_binary(path) # Decompress if needed/desired and determine/validate filetype. If a keep-compressed datatype is explicitly selected # or if autodetection is selected and the file sniffs as a keep-compressed datatype, it will not be decompressed. if not link_data_only: if auto_decompress and is_zip(path) and not is_single_file_zip(path): stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' try: ext, converted_path, compression_type = sniff.handle_uploaded_dataset_file_internal( path, registry, ext=requested_ext, tmp_prefix=tmp_prefix, tmp_dir=tmp_dir, in_place=in_place, check_content=check_content, is_binary=is_binary, auto_decompress=auto_decompress, uploaded_file_ext=os.path.splitext(name)[1].lower().lstrip( '.'), convert_to_posix_lines=convert_to_posix_lines, convert_spaces_to_tabs=convert_spaces_to_tabs, ) except sniff.InappropriateDatasetContentError as exc: raise UploadProblemException(str(exc)) elif requested_ext == 'auto': ext = sniff.guess_ext(path, registry.sniff_order, is_binary=is_binary) else: ext = requested_ext # The converted path will be the same as the input path if no conversion was done (or in-place conversion is used) converted_path = None if converted_path == path else converted_path # Validate datasets where the filetype was explicitly set using the filetype's sniffer (if any) if requested_ext != 'auto': datatype = registry.get_datatype_by_extension(requested_ext) # Enable sniffer "validate mode" (prevents certain sniffers from disabling themselves) if check_content and hasattr(datatype, 'sniff') and not datatype.sniff(path): stdout = ( "Warning: The file 'Type' was set to '{ext}' but the file does not appear to be of that" " type".format(ext=requested_ext)) # Handle unsniffable binaries if is_binary and ext == 'binary': upload_ext = os.path.splitext(name)[1].lower().lstrip('.') if registry.is_extension_unsniffable_binary(upload_ext): stdout = ( "Warning: The file's datatype cannot be determined from its contents and was guessed based on" " its extension, to avoid this warning, manually set the file 'Type' to '{ext}' when uploading" " this type of file".format(ext=upload_ext)) ext = upload_ext else: stdout = ( "The uploaded binary file format cannot be determined automatically, please set the file 'Type'" " manually") datatype = registry.get_datatype_by_extension(ext) return stdout, ext, datatype, is_binary, converted_path
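handle_upload() only warns about multi-file archives by testing is_zip(path) and not is_single_file_zip(path) before handing the file to the sniffer. A standard-library sketch of what such a single-file check can look like is below; the real Galaxy helpers may differ in detail.

# Stdlib sketch of a "single file zip" test in the spirit of is_single_file_zip().
import zipfile


def is_zip(path):
    return zipfile.is_zipfile(path)


def is_single_file_zip(path):
    with zipfile.ZipFile(path) as z:
        # Ignore directory entries, mirroring the name.endswith('/') checks
        # in the older add_file() implementations.
        members = [name for name in z.namelist() if not name.endswith('/')]
    return len(members) == 1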
def add_file( self, trans, folder_id, file_obj, name, file_format, dbkey, roles, info='no info', space_to_tab=False, replace_dataset=None, library_item_info_template=None, template_elements={}, message=None ): folder = trans.app.model.LibraryFolder.get( folder_id ) data_type = None line_count = 0 temp_name, is_multi_byte = sniff.stream_to_file( file_obj ) # See if we have an empty file if not os.path.getsize( temp_name ) > 0: raise BadFileException( "you attempted to upload an empty file." ) if is_multi_byte: ext = sniff.guess_ext( temp_name, is_multi_byte=True ) else: if not data_type: # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress on the fly. is_gzipped, is_valid = self.check_gzip( temp_name ) if is_gzipped and not is_valid: raise BadFileException( "you attempted to upload an inappropriate file." ) elif is_gzipped and is_valid: # We need to uncompress the temp_name file CHUNK_SIZE = 2**20 # 1Mb fd, uncompressed = tempfile.mkstemp() gzipped_file = gzip.GzipFile( temp_name ) while 1: try: chunk = gzipped_file.read( CHUNK_SIZE ) except IOError: os.close( fd ) os.remove( uncompressed ) raise BadFileException( 'problem uncompressing gzipped data.' ) if not chunk: break os.write( fd, chunk ) os.close( fd ) gzipped_file.close() # Replace the gzipped file with the decompressed file shutil.move( uncompressed, temp_name ) name = name.rstrip( '.gz' ) data_type = 'gzip' ext = '' if not data_type: # See if we have a zip archive is_zipped, is_valid, test_ext = self.check_zip( temp_name ) if is_zipped and not is_valid: raise BadFileException( "you attempted to upload an inappropriate file." ) elif is_zipped and is_valid: # Currently, we force specific tools to handle this case. We also require the user # to manually set the incoming file_format if ( test_ext == 'ab1' or test_ext == 'scf' ) and file_format != 'binseq.zip': raise BadFileException( "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'." ) elif test_ext == 'txt' and file_format != 'txtseq.zip': raise BadFileException( "Invalid 'File Format' for archive consisting of text files - use 'Txtseq.zip'." ) if not ( file_format == 'binseq.zip' or file_format == 'txtseq.zip' ): raise BadFileException( "you must manually set the 'File Format' to either 'Binseq.zip' or 'Txtseq.zip' when uploading zip files." ) data_type = 'zip' ext = file_format if not data_type: if self.check_binary( temp_name ): try: ext = name.split( "." )[1].strip().lower() except: ext = '' if not( ext == 'ab1' or ext == 'scf' ): raise BadFileException( "you attempted to upload an inappropriate file." ) if ext == 'ab1' and file_format != 'ab1': raise BadFileException( "you must manually set the 'File Format' to 'Ab1' when uploading ab1 files." ) elif ext == 'scf' and file_format != 'scf': raise BadFileException( "you must manually set the 'File Format' to 'Scf' when uploading scf files." ) data_type = 'binary' if not data_type: # We must have a text file if self.check_html( temp_name ): raise BadFileException( "you attempted to upload an inappropriate file." 
) if data_type != 'binary' and data_type != 'zip': if space_to_tab: line_count = sniff.convert_newlines_sep2tabs( temp_name ) elif os.stat( temp_name ).st_size < 262144000: # 250MB line_count = sniff.convert_newlines( temp_name ) else: if sniff.check_newlines( temp_name ): line_count = sniff.convert_newlines( temp_name ) else: line_count = None if file_format == 'auto': ext = sniff.guess_ext( temp_name, sniff_order=trans.app.datatypes_registry.sniff_order ) else: ext = file_format data_type = ext if info is None: info = 'uploaded %s file' % data_type if file_format == 'auto': data_type = sniff.guess_ext( temp_name, sniff_order=trans.app.datatypes_registry.sniff_order ) else: data_type = file_format if replace_dataset: # The replace_dataset param ( when not None ) refers to a LibraryDataset that is being replaced with a new version. library_dataset = replace_dataset else: # If replace_dataset is None, the Library level permissions will be taken from the folder and applied to the new # LibraryDataset, and the current user's DefaultUserPermissions will be applied to the associated Dataset. library_dataset = trans.app.model.LibraryDataset( folder=folder, name=name, info=info ) library_dataset.flush() trans.app.security_agent.copy_library_permissions( folder, library_dataset ) ldda = trans.app.model.LibraryDatasetDatasetAssociation( name=name, info=info, extension=data_type, dbkey=dbkey, library_dataset=library_dataset, user=trans.get_user(), create_dataset=True ) ldda.message = message ldda.flush() # Permissions must be the same on the LibraryDatasetDatasetAssociation and the associated LibraryDataset trans.app.security_agent.copy_library_permissions( library_dataset, ldda ) if replace_dataset: # Copy the Dataset level permissions from replace_dataset to the new LibraryDatasetDatasetAssociation.dataset trans.app.security_agent.copy_dataset_permissions( replace_dataset.library_dataset_dataset_association.dataset, ldda.dataset ) else: # Copy the current user's DefaultUserPermissions to the new LibraryDatasetDatasetAssociation.dataset trans.app.security_agent.set_all_dataset_permissions( ldda.dataset, trans.app.security_agent.user_get_default_permissions( trans.get_user() ) ) folder.add_library_dataset( library_dataset, genome_build=dbkey ) folder.flush() library_dataset.library_dataset_dataset_association_id = ldda.id library_dataset.flush() # Handle any templates included in the upload form if library_item_info_template: user = trans.get_user() library_item_info = trans.app.model.LibraryItemInfo( user=user ) library_item_info.library_item_info_template = library_item_info_template library_item_info.flush() trans.app.security_agent.copy_library_permissions( library_item_info_template, library_item_info ) for template_element in library_item_info_template.elements: info_element_value = template_elements.get( "info_element_%s_%s" % ( library_item_info_template.id, template_element.id ), None ) info_element = trans.app.model.LibraryItemInfoElement() info_element.contents = info_element_value info_element.library_item_info_template_element = template_element info_element.library_item_info = library_item_info info_element.flush() library_item_info_association = trans.app.model.LibraryDatasetDatasetInfoAssociation( user=user ) library_item_info_association.set_library_item( ldda ) library_item_info_association.library_item_info = library_item_info library_item_info_association.flush() # If roles were selected upon upload, restrict access to the Dataset to those roles if roles: for role in roles: dp = 
trans.app.model.DatasetPermissions( RBACAgent.permitted_actions.DATASET_ACCESS.action, ldda.dataset, role ) dp.flush() shutil.move( temp_name, ldda.dataset.file_name ) ldda.state = ldda.states.OK ldda.init_meta() if line_count: try: if is_multi_byte: ldda.set_multi_byte_peek( line_count=line_count ) else: ldda.set_peek( line_count=line_count ) except: if is_multi_byte: ldda.set_multi_byte_peek() else: ldda.set_peek() else: if is_multi_byte: ldda.set_multi_byte_peek() else: ldda.set_peek() ldda.set_size() if ldda.missing_meta(): ldda.datatype.set_meta( ldda ) ldda.flush() return ldda
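Text uploads in this legacy path are normalised with sniff.convert_newlines() or sniff.convert_newlines_sep2tabs(): universal line endings become POSIX newlines, optionally with runs of whitespace collapsed to tabs, and a line count is kept for the peek. A standard-library sketch of roughly that transformation follows; the helper name is illustrative.

# Rough stdlib equivalent of the newline / space-to-tab normalisation.
import re


def normalise_text_file(src_path, dest_path, space_to_tab=False):
    line_count = 0
    # newline='' keeps \r, \n and \r\n recognisable so all three get rewritten.
    with open(src_path, newline='') as src, open(dest_path, 'w', newline='\n') as dest:
        for line in src:
            line = line.rstrip('\r\n')
            if space_to_tab:
                # Collapse runs of spaces/tabs into single tabs, roughly what
                # the sep2tabs variant does.
                line = re.sub(r'[ \t]+', '\t', line)
            dest.write(line + '\n')
            line_count += 1
    return line_count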
def add_file(dataset, registry, json_file, output_path): data_type = None line_count = None converted_path = None stdout = None link_data_only = dataset.get('link_data_only', 'copy_files') run_as_real_user = in_place = dataset.get('in_place', True) purge_source = dataset.get('purge_source', True) # in_place is True if there is no external chmod in place, # however there are other instances where modifications should not occur in_place: # when a file is added from a directory on the local file system (ftp import folder or any other path). if dataset.type in ('server_dir', 'path_paste', 'ftp_import'): in_place = False check_content = dataset.get('check_content' , True) auto_decompress = dataset.get('auto_decompress', True) try: ext = dataset.file_type except AttributeError: file_err('Unable to process uploaded file, missing file_type parameter.', dataset, json_file) return if dataset.type == 'url': try: page = urlopen(dataset.path) # page will be .close()ed by sniff methods temp_name, dataset.is_multi_byte = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers)) except Exception as e: file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)), dataset, json_file) return dataset.path = temp_name # See if we have an empty file if not os.path.exists(dataset.path): file_err('Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file) return if not os.path.getsize(dataset.path) > 0: file_err('The uploaded file is empty', dataset, json_file) return if not dataset.type == 'url': # Already set is_multi_byte above if type == 'url' try: dataset.is_multi_byte = multi_byte.is_multi_byte(codecs.open(dataset.path, 'r', 'utf-8').read(100)) except UnicodeDecodeError as e: dataset.is_multi_byte = False # Is dataset an image? i_ext = get_image_ext(dataset.path) if i_ext: ext = i_ext data_type = ext # Is dataset content multi-byte? elif dataset.is_multi_byte: data_type = 'multi-byte char' ext = sniff.guess_ext(dataset.path, registry.sniff_order, is_multi_byte=True) # Is dataset content supported sniffable binary? 
else: # FIXME: This ignores the declared sniff order in datatype_conf.xml # resulting in improper behavior type_info = Binary.is_sniffable_binary(dataset.path) if type_info: data_type = type_info[0] ext = type_info[1] if not data_type: root_datatype = registry.get_datatype_by_extension(dataset.file_type) if getattr(root_datatype, 'compressed', False): data_type = 'compressed archive' ext = dataset.file_type else: # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress is_gzipped, is_valid = check_gzip(dataset.path, check_content=check_content) if is_gzipped and not is_valid: file_err('The gzipped uploaded file contains inappropriate content', dataset, json_file) return elif is_gzipped and is_valid and auto_decompress: if link_data_only == 'copy_files': # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format CHUNK_SIZE = 2 ** 20 # 1Mb fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) gzipped_file = gzip.GzipFile(dataset.path, 'rb') while 1: try: chunk = gzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err('Problem decompressing gzipped data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) gzipped_file.close() # Replace the gzipped file with the decompressed file if it's safe to do so if not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.gz') data_type = 'gzip' if not data_type: # See if we have a bz2 file, much like gzip is_bzipped, is_valid = check_bz2(dataset.path, check_content) if is_bzipped and not is_valid: file_err('The gzipped uploaded file contains inappropriate content', dataset, json_file) return elif is_bzipped and is_valid and auto_decompress: if link_data_only == 'copy_files': # We need to uncompress the temp_name file CHUNK_SIZE = 2 ** 20 # 1Mb fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) bzipped_file = bz2.BZ2File(dataset.path, 'rb') while 1: try: chunk = bzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err('Problem decompressing bz2 compressed data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) bzipped_file.close() # Replace the bzipped file with the decompressed file if it's safe to do so if not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.bz2') data_type = 'bz2' if not data_type: # See if we have a zip archive is_zipped = check_zip(dataset.path) if is_zipped and auto_decompress: if link_data_only == 'copy_files': CHUNK_SIZE = 2 ** 20 # 1Mb uncompressed = None uncompressed_name = None unzipped = False z = zipfile.ZipFile(dataset.path) for name in z.namelist(): if name.endswith('/'): continue if unzipped: stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' 
break fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) if sys.version_info[:2] >= (2, 6): zipped_file = z.open(name) while 1: try: chunk = zipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err('Problem decompressing zipped data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) zipped_file.close() uncompressed_name = name unzipped = True else: # python < 2.5 doesn't have a way to read members in chunks(!) try: outfile = open(uncompressed, 'wb') outfile.write(z.read(name)) outfile.close() uncompressed_name = name unzipped = True except IOError: os.close(fd) os.remove(uncompressed) file_err('Problem decompressing zipped data', dataset, json_file) return z.close() # Replace the zipped file with the decompressed file if it's safe to do so if uncompressed is not None: if not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = uncompressed_name data_type = 'zip' if not data_type: # TODO refactor this logic. check_binary isn't guaranteed to be # correct since it only looks at whether the first 100 chars are # printable or not. If someone specifies a known unsniffable # binary datatype and check_binary fails, the file gets mangled. if check_binary(dataset.path) or Binary.is_ext_unsniffable(dataset.file_type): # We have a binary dataset, but it is not Bam, Sff or Pdf data_type = 'binary' # binary_ok = False parts = dataset.name.split(".") if len(parts) > 1: ext = parts[-1].strip().lower() if check_content and not Binary.is_ext_unsniffable(ext): file_err('The uploaded binary file contains inappropriate content', dataset, json_file) return elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext: err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext.capitalize(), ext) file_err(err_msg, dataset, json_file) return if not data_type: # We must have a text file if check_content and check_html(dataset.path): file_err('The uploaded file contains inappropriate HTML content', dataset, json_file) return if data_type != 'binary': if link_data_only == 'copy_files' and data_type not in ('gzip', 'bz2', 'zip'): # Convert universal line endings to Posix line endings if to_posix_lines is True # and the data is not binary or gzip-, bz2- or zip-compressed. if dataset.to_posix_lines: tmpdir = output_adjacent_tmpdir(output_path) tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id if dataset.space_to_tab: line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix) else: line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix) if dataset.file_type == 'auto': ext = sniff.guess_ext(dataset.path, registry.sniff_order) else: ext = dataset.file_type data_type = ext # Save job info for the framework if ext == 'auto' and data_type == 'binary': ext = 'data' if ext == 'auto' and dataset.ext: ext = dataset.ext if ext == 'auto': ext = 'data' datatype = registry.get_datatype_by_extension(ext) if dataset.type in ('server_dir', 'path_paste') and link_data_only == 'link_to_files': # Never alter a file that will not be copied to Galaxy's local file store. 
if datatype.dataset_content_needs_grooming(dataset.path): err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \ '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.' file_err(err_msg, dataset, json_file) return if link_data_only == 'copy_files' and converted_path: # Move the dataset to its "real" path try: shutil.move(converted_path, output_path) except OSError as e: # We may not have permission to remove converted_path if e.errno != errno.EACCES: raise elif link_data_only == 'copy_files': if purge_source and not run_as_real_user: # if the upload tool runs as a real user the real user # can't move dataset.path as this path is owned by galaxy. shutil.move(dataset.path, output_path) else: shutil.copy(dataset.path, output_path) # Write the job info stdout = stdout or 'uploaded %s file' % data_type info = dict(type='dataset', dataset_id=dataset.dataset_id, ext=ext, stdout=stdout, name=dataset.name, line_count=line_count) if dataset.get('uuid', None) is not None: info['uuid'] = dataset.get('uuid') json_file.write(dumps(info) + "\n") if link_data_only == 'copy_files' and datatype and datatype.dataset_content_needs_grooming(output_path): # Groom the dataset content if necessary datatype.groom_dataset_content(output_path)
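Once sniffing and any conversion are done, the dataset is written to its final output path: moved when Galaxy owns the source and purge_source allows consuming it, copied otherwise (for example when the upload tool runs as the real user). A short sketch of that placement decision, using the same flag names as above:

# Sketch of the final placement step; flag names mirror add_file().
import shutil


def place_output(source_path, output_path, purge_source=True, run_as_real_user=False):
    if purge_source and not run_as_real_user:
        # Galaxy owns source_path and may consume it, so a move is cheapest.
        shutil.move(source_path, output_path)
    else:
        # The source is owned by the real user or must be preserved, so copy.
        shutil.copy(source_path, output_path)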
def run_job( self, job ): if job.params[ 'type' ] == 'init_transfer': # TODO: don't create new downloads on restart. if job.params[ 'protocol' ] in [ 'http', 'https' ]: results = [] for result in job.params[ 'results' ].values(): result[ 'transfer_job' ] = self.app.transfer_manager.new( protocol=job.params[ 'protocol' ], name=result[ 'name' ], datatype=result[ 'datatype' ], url=result[ 'url' ] ) results.append( result ) elif job.params[ 'protocol' ] == 'scp': results = [] result = {} sample_datasets_dict = job.params[ 'sample_datasets_dict' ] # sample_datasets_dict looks something like the following. The outer dictionary keys are SampleDataset ids. # {'7': {'status': 'Not started', 'name': '3.bed', 'file_path': '/tmp/library/3.bed', 'sample_id': 7, # 'external_service_id': 2, 'error_msg': '', 'size': '8.0K'}} for sample_dataset_id, sample_dataset_info_dict in sample_datasets_dict.items(): result = {} result[ 'transfer_job' ] = self.app.transfer_manager.new( protocol=job.params[ 'protocol' ], host=job.params[ 'host' ], user_name=job.params[ 'user_name' ], password=job.params[ 'password' ], sample_dataset_id=sample_dataset_id, status=sample_dataset_info_dict[ 'status' ], name=sample_dataset_info_dict[ 'name' ], file_path=sample_dataset_info_dict[ 'file_path' ], sample_id=sample_dataset_info_dict[ 'sample_id' ], external_service_id=sample_dataset_info_dict[ 'external_service_id' ], error_msg=sample_dataset_info_dict[ 'error_msg' ], size=sample_dataset_info_dict[ 'size' ] ) results.append( result ) self.app.transfer_manager.run( [ r[ 'transfer_job' ] for r in results ] ) for result in results: transfer_job = result.pop( 'transfer_job' ) self.create_job( None, transfer_job_id=transfer_job.id, result=transfer_job.params, sample_id=job.params[ 'sample_id' ] ) # Update the state of the relevant SampleDataset new_status = self.app.model.SampleDataset.transfer_status.IN_QUEUE self._update_sample_dataset_status( protocol=job.params[ 'protocol' ], sample_id=job.params[ 'sample_id' ], result_dict=transfer_job.params, new_status=new_status, error_msg='' ) job.state = self.app.model.DeferredJob.states.OK self.sa_session.add( job ) self.sa_session.flush() # TODO: Error handling: failure executing, or errors returned from the manager if job.params[ 'type' ] == 'finish_transfer': protocol = job.params[ 'protocol' ] # Update the state of the relevant SampleDataset new_status = self.app.model.SampleDataset.transfer_status.ADD_TO_LIBRARY if protocol in [ 'http', 'https' ]: result_dict = job.params[ 'result' ] library_dataset_name = result_dict[ 'name' ] extension = result_dict[ 'datatype' ] elif protocol in [ 'scp' ]: # In this case, job.params will be a dictionary that contains a key named 'result'. The value # of the result key is a dictionary that looks something like: # {'sample_dataset_id': '8', 'status': 'Not started', 'protocol': 'scp', 'name': '3.bed', # 'file_path': '/data/library/3.bed', 'host': '127.0.0.1', 'sample_id': 8, 'external_service_id': 2, # 'local_path': '/tmp/kjl2Ss4', 'password': '******', 'user_name': 'gvk', 'error_msg': '', 'size': '8.0K'} try: tj = self.sa_session.query( self.app.model.TransferJob ).get( int( job.params['transfer_job_id'] ) ) result_dict = tj.params result_dict['local_path'] = tj.path except Exception, e: log.error( "Updated transfer result unavailable, using old result. 
Error was: %s" % str( e ) ) result_dict = job.params[ 'result' ] library_dataset_name = result_dict[ 'name' ] # Determine the data format (see the relevant TODO item in the manual_data_transfer plugin).. extension = sniff.guess_ext( result_dict[ 'local_path' ], sniff_order=self.app.datatypes_registry.sniff_order ) self._update_sample_dataset_status( protocol=job.params[ 'protocol' ], sample_id=int( job.params[ 'sample_id' ] ), result_dict=result_dict, new_status=new_status, error_msg='' ) sample = self.sa_session.query( self.app.model.Sample ).get( int( job.params[ 'sample_id' ] ) ) ld = self.app.model.LibraryDataset( folder=sample.folder, name=library_dataset_name ) self.sa_session.add( ld ) self.sa_session.flush() self.app.security_agent.copy_library_permissions( FakeTrans( self.app ), sample.folder, ld ) ldda = self.app.model.LibraryDatasetDatasetAssociation( name = library_dataset_name, extension = extension, dbkey = '?', library_dataset = ld, create_dataset = True, sa_session = self.sa_session ) ldda.message = 'Transferred by the Data Transfer Plugin' self.sa_session.add( ldda ) self.sa_session.flush() ldda.state = ldda.states.QUEUED # flushed in the set property ld.library_dataset_dataset_association_id = ldda.id self.sa_session.add( ld ) self.sa_session.flush() try: # Move the dataset from its temporary location shutil.move( job.transfer_job.path, ldda.file_name ) ldda.init_meta() for name, spec in ldda.metadata.spec.items(): if name not in [ 'name', 'info', 'dbkey', 'base_name' ]: if spec.get( 'default' ): setattr( ldda.metadata, name, spec.unwrap( spec.get( 'default' ) ) ) self.app.datatypes_registry.set_external_metadata_tool.tool_action.execute( self.app.datatypes_registry.set_external_metadata_tool, FakeTrans( self.app, history=sample.history, user=sample.request.user ), incoming = { 'input1':ldda } ) ldda.state = ldda.states.OK # TODO: not sure if this flush is necessary self.sa_session.add( ldda ) self.sa_session.flush() except Exception, e: log.exception( 'Failure preparing library dataset for finished transfer job (id: %s) via deferred job (id: %s):' % \ ( str( job.transfer_job.id ), str( job.id ) ) ) ldda.state = ldda.states.ERROR
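The finish_transfer branch wraps the post-transfer steps in a try/except that logs the full traceback and flips the library dataset to the ERROR state instead of letting the deferred job crash. A compact, illustrative sketch of that pattern is below; the dataset object is assumed to expose a Galaxy-style states enum.

# Illustrative only: attempt the post-transfer steps, log failures, record state.
import logging
import shutil

log = logging.getLogger(__name__)


def finalize_transfer(transfer_path, dataset_path, dataset):
    # `dataset` is assumed to expose Galaxy-style `state` / `states` attributes.
    try:
        shutil.move(transfer_path, dataset_path)
        dataset.state = dataset.states.OK
    except Exception:
        log.exception('Failure preparing library dataset for finished transfer job')
        dataset.state = dataset.states.ERROR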