def filenames_from_url_paste( url_paste, group_incoming, override_name = None, override_info = None ):
    filenames = []
    if url_paste not in [ None, "" ]:
        if url_paste.lstrip().lower().startswith( 'http://' ) or url_paste.lstrip().lower().startswith( 'ftp://' ):
            url_paste = url_paste.replace( '\r', '' ).split( '\n' )
            for line in url_paste:
                line = line.strip()
                if line:
                    if not line.lower().startswith( 'http://' ) and not line.lower().startswith( 'ftp://' ):
                        continue  # non-url line, ignore
                    precreated_name = line
                    dataset_name = override_name
                    if not dataset_name:
                        dataset_name = line
                    dataset_info = override_info
                    if not dataset_info:
                        dataset_info = 'uploaded url'
                    try:
                        temp_name, is_multi_byte = sniff.stream_to_file( urllib.urlopen( line ), prefix='url_paste' )
                    except Exception, e:
                        temp_name = None
                        precreated_name = str( e )
                        log.exception( 'exception in sniff.stream_to_file using url_paste %s: %s' % ( url_paste, str( e ) ) )
                        try:
                            self.remove_temp_file( temp_name )
                        except:
                            pass
                    yield ( temp_name, precreated_name, is_multi_byte, dataset_name, dataset_info )
                    #yield ( None, str( e ), False, dataset_name, dataset_info )
        else:
            dataset_name = dataset_info = precreated_name = 'Pasted Entry'  #we need to differentiate between various url pastes here
            if override_name:
                dataset_name = override_name
            if override_info:
                dataset_info = override_info
            is_valid = False
            for line in url_paste:
                #Trim off empty lines from begining
                line = line.rstrip( '\r\n' )
                if line:
                    is_valid = True
                    break
            if is_valid:
                try:
                    temp_name, is_multi_byte = sniff.stream_to_file( StringIO.StringIO( url_paste ), prefix='strio_url_paste' )
                except Exception, e:
                    log.exception( 'exception in sniff.stream_to_file using StringIO.StringIO( url_paste ) %s: %s' % ( url_paste, str( e ) ) )
                    temp_name = None
                    precreated_name = str( e )
                    try:
                        self.remove_temp_file( temp_name )
                    except:
                        pass
                yield ( temp_name, precreated_name, is_multi_byte, dataset_name, dataset_info )
def _has_src_to_path(upload_config, item, is_dataset=False):
    assert "src" in item, item
    src = item.get("src")
    name = item.get("name")
    if src == "url":
        url = item.get("url")
        try:
            path = sniff.stream_url_to_file(url, file_sources=get_file_sources(upload_config.working_directory))
        except Exception as e:
            raise Exception("Failed to fetch url {}. {}".format(url, str(e)))
        if not is_dataset:
            # Actual target dataset will validate and put results in dict
            # that gets passed back to Galaxy.
            for hash_function in HASH_NAMES:
                hash_value = item.get(hash_function)
                if hash_value:
                    _handle_hash_validation(upload_config, hash_function, hash_value, path)
        if name is None:
            name = url.split("/")[-1]
    elif src == "pasted":
        path = sniff.stream_to_file(StringIO(item["paste_content"]))
        if name is None:
            name = "Pasted Entry"
    else:
        assert src == "path"
        path = item["path"]
        if name is None:
            name = os.path.basename(path)
    return name, path
def _has_src_to_path(upload_config, item, is_dataset=False):
    assert "src" in item, item
    src = item.get("src")
    name = item.get("name")
    if src == "url":
        url = item.get("url")
        path = sniff.stream_url_to_file(url)
        if not is_dataset:
            # Actual target dataset will validate and put results in dict
            # that gets passed back to Galaxy.
            for hash_function in HASH_NAMES:
                hash_value = item.get(hash_function)
                if hash_value:
                    _handle_hash_validation(upload_config, hash_function, hash_value, path)
        if name is None:
            name = url.split("/")[-1]
    elif src == "pasted":
        path = sniff.stream_to_file(StringIO(item["paste_content"]))
        if name is None:
            name = "Pasted Entry"
    else:
        assert src == "path"
        path = item["path"]
        if name is None:
            name = os.path.basename(path)
    return name, path
def add_composite_file( dataset, registry, json_file, output_path, files_path ):
    if dataset.composite_files:
        os.mkdir( files_path )
        for name, value in dataset.composite_files.iteritems():
            value = util.bunch.Bunch( **value )
            if dataset.composite_file_paths[ value.name ] is None and not value.optional:
                file_err( 'A required composite data file was not provided (%s)' % name, dataset, json_file )
                break
            elif dataset.composite_file_paths[value.name] is not None:
                dp = dataset.composite_file_paths[value.name][ 'path' ]
                isurl = dp.find('://') != -1  # todo fixme
                if isurl:
                    try:
                        temp_name, dataset.is_multi_byte = sniff.stream_to_file( urllib.urlopen( dp ), prefix='url_paste' )
                    except Exception, e:
                        file_err( 'Unable to fetch %s\n%s' % ( dp, str( e ) ), dataset, json_file )
                        return
                    dataset.path = temp_name
                    dp = temp_name
                if not value.is_binary:
                    tmpdir = output_adjacent_tmpdir( output_path )
                    tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                    if dataset.composite_file_paths[ value.name ].get( 'space_to_tab', value.space_to_tab ):
                        sniff.convert_newlines_sep2tabs( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
                    else:
                        sniff.convert_newlines( dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix )
                move_copy(dp, os.path.join( files_path, name ) )
def persist_uploads(params, trans):
    """
    Turn any uploads in the submitted form to persisted files.
    """
    if 'files' in params:
        new_files = []
        for upload_dataset in params['files']:
            f = upload_dataset['file_data']
            if isinstance(f, cgi_FieldStorage):
                assert not isinstance(f.file, StringIO)
                assert f.file.name != '<fdopen>'
                local_filename = util.mkstemp_ln(f.file.name, 'upload_file_data_')
                f.file.close()
                upload_dataset['file_data'] = dict(filename=f.filename, local_filename=local_filename)
            elif type(f) == dict and 'local_filename' not in f:
                raise Exception('Uploaded file was encoded in a way not understood by Galaxy.')
            if 'url_paste' in upload_dataset and upload_dataset['url_paste'] and upload_dataset['url_paste'].strip() != '':
                upload_dataset['url_paste'] = stream_to_file(
                    StringIO(validate_url(upload_dataset['url_paste'], trans.app.config.fetch_url_allowlist_ips)),
                    prefix="strio_url_paste_")
            else:
                upload_dataset['url_paste'] = None
            new_files.append(upload_dataset)
        params['files'] = new_files
    return params
def add_composite_file(dataset, registry, json_file, output_path, files_path):
    if dataset.composite_files:
        os.mkdir(files_path)
        for name, value in dataset.composite_files.iteritems():
            value = util.bunch.Bunch(**value)
            if dataset.composite_file_paths[value.name] is None and not value.optional:
                file_err('A required composite data file was not provided (%s)' % name, dataset, json_file)
                break
            elif dataset.composite_file_paths[value.name] is not None:
                dp = dataset.composite_file_paths[value.name]['path']
                isurl = dp.find('://') != -1  # todo fixme
                if isurl:
                    try:
                        temp_name, dataset.is_multi_byte = sniff.stream_to_file(urllib.urlopen(dp), prefix='url_paste')
                    except Exception, e:
                        file_err('Unable to fetch %s\n%s' % (dp, str(e)), dataset, json_file)
                        return
                    dataset.path = temp_name
                    dp = temp_name
                if not value.is_binary:
                    if dataset.composite_file_paths[value.name].get('space_to_tab', value.space_to_tab):
                        sniff.convert_newlines_sep2tabs(dp)
                    else:
                        sniff.convert_newlines(dp)
                shutil.move(dp, os.path.join(files_path, name))
def add_file(dataset, registry, json_file, output_path):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get('link_data_only', 'copy_files')
    in_place = dataset.get('in_place', True)
    try:
        ext = dataset.file_type
    except AttributeError:
        file_err('Unable to process uploaded file, missing file_type parameter.', dataset, json_file)
        return
    if dataset.type == 'url':
        try:
            page = urllib.urlopen(dataset.path)  #page will be .close()ed by sniff methods
            temp_name, dataset.is_multi_byte = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers))
        except Exception, e:
            file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)), dataset, json_file)
            return
        dataset.path = temp_name
def add_composite_file(dataset, json_file, output_path, files_path):
    if dataset.composite_files:
        os.mkdir(files_path)
        for name, value in dataset.composite_files.items():
            value = util.bunch.Bunch(**value)
            if dataset.composite_file_paths[value.name] is None and not value.optional:
                file_err('A required composite data file was not provided (%s)' % name, dataset, json_file)
                break
            elif dataset.composite_file_paths[value.name] is not None:
                dp = dataset.composite_file_paths[value.name]['path']
                isurl = dp.find('://') != -1  # todo fixme
                if isurl:
                    try:
                        temp_name, dataset.is_multi_byte = sniff.stream_to_file(urlopen(dp), prefix='url_paste')
                    except Exception as e:
                        file_err('Unable to fetch %s\n%s' % (dp, str(e)), dataset, json_file)
                        return
                    dataset.path = temp_name
                    dp = temp_name
                if not value.is_binary:
                    tmpdir = output_adjacent_tmpdir(output_path)
                    tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                    if dataset.composite_file_paths[value.name].get('space_to_tab', value.space_to_tab):
                        sniff.convert_newlines_sep2tabs(dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                    else:
                        sniff.convert_newlines(dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                shutil.move(dp, os.path.join(files_path, name))
    # Move the dataset to its "real" path
    shutil.move(dataset.primary_file, output_path)
    # Write the job info
    info = dict(type='dataset', dataset_id=dataset.dataset_id, stdout='uploaded %s file' % dataset.file_type)
    json_file.write(dumps(info) + "\n")
def add_composite_file(dataset, json_file, output_path, files_path):
    if dataset.composite_files:
        os.mkdir(files_path)
        for name, value in dataset.composite_files.items():
            value = util.bunch.Bunch(**value)
            if dataset.composite_file_paths[value.name] is None and not value.optional:
                raise UploadProblemException('A required composite data file was not provided (%s)' % name)
            elif dataset.composite_file_paths[value.name] is not None:
                dp = dataset.composite_file_paths[value.name]['path']
                isurl = dp.find('://') != -1  # todo fixme
                if isurl:
                    try:
                        temp_name = sniff.stream_to_file(urlopen(dp), prefix='url_paste')
                    except Exception as e:
                        raise UploadProblemException('Unable to fetch %s\n%s' % (dp, str(e)))
                    dataset.path = temp_name
                    dp = temp_name
                if not value.is_binary:
                    tmpdir = output_adjacent_tmpdir(output_path)
                    tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                    if dataset.composite_file_paths[value.name].get('space_to_tab', value.space_to_tab):
                        sniff.convert_newlines_sep2tabs(dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                    else:
                        sniff.convert_newlines(dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                shutil.move(dp, os.path.join(files_path, name))
    # Move the dataset to its "real" path
    shutil.move(dataset.primary_file, output_path)
    # Write the job info
    info = dict(type='dataset', dataset_id=dataset.dataset_id, stdout='uploaded %s file' % dataset.file_type)
    json_file.write(dumps(info) + "\n")
def url_upload(dataset):
    """
    @summary: Upload an URL file and update dataset information.
    @param dataset: [Bunch] The dataset for the uploaded file.
    """
    url = dataset.path
    page = urllib.urlopen( url )
    temp_name, dataset.is_multi_byte = sniff.stream_to_file( page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers( page.headers ) )
    dataset.path = temp_name
    dataset.name = url.split('/')[-1]
def to_path(path_or_url):
    is_url = path_or_url.find('://') != -1  # todo fixme
    if is_url:
        try:
            temp_name = sniff.stream_to_file(urlopen(path_or_url), prefix='url_paste')
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' % (path_or_url, unicodify(e)))
        return temp_name, is_url
    return path_or_url, is_url
def to_path(path_or_url):
    is_url = path_or_url.find('://') != -1  # todo fixme
    if is_url:
        try:
            temp_name = sniff.stream_to_file(urlopen(path_or_url), prefix='url_paste')
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' % (path_or_url, str(e)))
        return temp_name, is_url
    return path_or_url, is_url
def add_file(dataset, json_file, output_path):
    data_type = None
    line_count = None
    if dataset.type == "url":
        try:
            temp_name, dataset.is_multi_byte = sniff.stream_to_file(urllib.urlopen(dataset.path), prefix="url_paste")
        except Exception, e:
            file_err("Unable to fetch %s\n%s" % (dataset.path, str(e)), dataset, json_file)
            return
        dataset.path = temp_name
def add_file( dataset, json_file ):
    data_type = None
    line_count = None
    if dataset.type == 'url':
        try:
            temp_name, is_multi_byte = sniff.stream_to_file( urllib.urlopen( dataset.path ), prefix='url_paste' )
        except Exception, e:
            file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file )
            return
        dataset.path = temp_name
        dataset.is_multi_byte = is_multi_byte
def url_upload(dataset):
    """
    @summary: Upload an URL file and update dataset information.
    @param dataset: [Bunch] The dataset for the uploaded file.
    """
    url = dataset.path
    page = urllib.urlopen(url)
    temp_name, dataset.is_multi_byte = sniff.stream_to_file(
        page, prefix='url_paste',
        source_encoding=util.get_charset_from_http_headers(page.headers))
    dataset.path = temp_name
    dataset.name = url.split('/')[-1]
def add_file(dataset, json_file, output_path):
    data_type = None
    line_count = None
    converted_path = None
    if dataset.type == 'url':
        try:
            temp_name, dataset.is_multi_byte = sniff.stream_to_file(urllib.urlopen(dataset.path), prefix='url_paste')
        except Exception, e:
            file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)), dataset, json_file)
            return
        dataset.path = temp_name
def _has_src_to_path(item):
    assert "src" in item, item
    src = item.get("src")
    name = item.get("name")
    if src == "url":
        url = item.get("url")
        path = sniff.stream_url_to_file(url)
        if name is None:
            name = url.split("/")[-1]
    elif src == "pasted":
        path = sniff.stream_to_file(StringIO(item["paste_content"]))
        if name is None:
            name = "Pasted Entry"
    else:
        assert src == "path"
        path = item["path"]
        if name is None:
            name = os.path.basename(path)
    return name, path
def add_file( dataset, registry, json_file, output_path ):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get( 'link_data_only', 'copy_files' )
    try:
        ext = dataset.file_type
    except AttributeError:
        file_err( 'Unable to process uploaded file, missing file_type parameter.', dataset, json_file )
        return
    if dataset.type == 'url':
        try:
            temp_name, dataset.is_multi_byte = sniff.stream_to_file( urllib.urlopen( dataset.path ), prefix='url_paste' )
        except Exception, e:
            file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file )
            return
        dataset.path = temp_name
def add_file( dataset, registry, json_file, output_path ):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get( 'link_data_only', 'copy_files' )
    in_place = dataset.get( 'in_place', True )
    try:
        ext = dataset.file_type
    except AttributeError:
        file_err( 'Unable to process uploaded file, missing file_type parameter.', dataset, json_file )
        return
    if dataset.type == 'url':
        try:
            page = urllib.urlopen( dataset.path )  # page will be .close()ed by sniff methods
            temp_name, dataset.is_multi_byte = sniff.stream_to_file( page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers( page.headers ) )
        except Exception, e:
            file_err( 'Unable to fetch %s\n%s' % ( dataset.path, str( e ) ), dataset, json_file )
            return
        dataset.path = temp_name
def get_data_file_filename( data_file, is_multi_byte = False, override_name = None, override_info = None ):
    dataset_name = override_name
    dataset_info = override_info
    def get_file_name( file_name ):
        file_name = file_name.split( '\\' )[-1]
        file_name = file_name.split( '/' )[-1]
        return file_name
    if 'local_filename' in dir( data_file ):
        # Use the existing file
        return data_file.local_filename, get_file_name( data_file.filename ), is_multi_byte
    elif 'filename' in dir( data_file ):
        #create a new tempfile
        try:
            temp_name, is_multi_byte = sniff.stream_to_file( data_file.file, prefix='upload' )
            precreated_name = get_file_name( data_file.filename )
            if not dataset_name:
                dataset_name = precreated_name
            if not dataset_info:
                dataset_info = 'uploaded file'
            return temp_name, get_file_name( data_file.filename ), is_multi_byte, dataset_name, dataset_info
        except Exception, e:
            log.exception( 'exception in sniff.stream_to_file using file %s: %s' % ( data_file.filename, str( e ) ) )
            self.remove_temp_file( temp_name )
def add_file(self, trans, file_obj, file_name, file_type, dbkey, info ):
    temp_name = sniff.stream_to_file(file_obj)
    sniff.convert_newlines(temp_name)
    if file_type == 'auto':
        ext = sniff.guess_ext(temp_name)
    else:
        ext = file_type
    data = trans.app.model.Dataset()
    data.name = file_name
    data.extension = ext
    data.dbkey = dbkey
    data.info = info
    data.flush()
    shutil.move(temp_name, data.file_name)
    data.state = data.states.OK
    data.init_meta()
    data.set_peek()
    if isinstance( data.datatype, datatypes.interval.Interval ):
        if data.missing_meta():
            data.extension = 'tabular'
    trans.history.add_dataset( data )
    trans.app.model.flush()
    return data
def add_file(self, trans, file_obj, file_name, file_type, dbkey, info):
    temp_name = sniff.stream_to_file(file_obj)
    sniff.convert_newlines(temp_name)
    if file_type == 'auto':
        ext = sniff.guess_ext(temp_name)
    else:
        ext = file_type
    data = trans.app.model.Dataset()
    data.name = file_name
    data.extension = ext
    data.dbkey = dbkey
    data.info = info
    data.flush()
    shutil.move(temp_name, data.file_name)
    data.state = data.states.OK
    data.init_meta()
    data.set_peek()
    if isinstance(data.datatype, datatypes.interval.Interval):
        if data.missing_meta():
            data.extension = 'tabular'
    trans.history.add_dataset(data)
    trans.app.model.flush()
    return data
def add_file(dataset, registry, json_file, output_path): data_type = None line_count = None converted_path = None stdout = None link_data_only = dataset.get('link_data_only', 'copy_files') run_as_real_user = in_place = dataset.get('in_place', True) purge_source = dataset.get('purge_source', True) # in_place is True if there is no external chmod in place, # however there are other instances where modifications should not occur in_place: # when a file is added from a directory on the local file system (ftp import folder or any other path). if dataset.type in ('server_dir', 'path_paste', 'ftp_import'): in_place = False check_content = dataset.get('check_content' , True) auto_decompress = dataset.get('auto_decompress', True) try: ext = dataset.file_type except AttributeError: file_err('Unable to process uploaded file, missing file_type parameter.', dataset, json_file) return if dataset.type == 'url': try: page = urlopen(dataset.path) # page will be .close()ed by sniff methods temp_name, dataset.is_multi_byte = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers)) except Exception as e: file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)), dataset, json_file) return dataset.path = temp_name # See if we have an empty file if not os.path.exists(dataset.path): file_err('Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file) return if not os.path.getsize(dataset.path) > 0: file_err('The uploaded file is empty', dataset, json_file) return if not dataset.type == 'url': # Already set is_multi_byte above if type == 'url' try: dataset.is_multi_byte = multi_byte.is_multi_byte(codecs.open(dataset.path, 'r', 'utf-8').read(100)) except UnicodeDecodeError as e: dataset.is_multi_byte = False # Is dataset an image? i_ext = get_image_ext(dataset.path) if i_ext: ext = i_ext data_type = ext # Is dataset content multi-byte? elif dataset.is_multi_byte: data_type = 'multi-byte char' ext = sniff.guess_ext(dataset.path, registry.sniff_order, is_multi_byte=True) # Is dataset content supported sniffable binary? 
else: # FIXME: This ignores the declared sniff order in datatype_conf.xml # resulting in improper behavior type_info = Binary.is_sniffable_binary(dataset.path) if type_info: data_type = type_info[0] ext = type_info[1] if not data_type: root_datatype = registry.get_datatype_by_extension(dataset.file_type) if getattr(root_datatype, 'compressed', False): data_type = 'compressed archive' ext = dataset.file_type else: # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress is_gzipped, is_valid = check_gzip(dataset.path, check_content=check_content) if is_gzipped and not is_valid: file_err('The gzipped uploaded file contains inappropriate content', dataset, json_file) return elif is_gzipped and is_valid and auto_decompress: if link_data_only == 'copy_files': # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format CHUNK_SIZE = 2 ** 20 # 1Mb fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) gzipped_file = gzip.GzipFile(dataset.path, 'rb') while 1: try: chunk = gzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err('Problem decompressing gzipped data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) gzipped_file.close() # Replace the gzipped file with the decompressed file if it's safe to do so if not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.gz') data_type = 'gzip' if not data_type: # See if we have a bz2 file, much like gzip is_bzipped, is_valid = check_bz2(dataset.path, check_content) if is_bzipped and not is_valid: file_err('The gzipped uploaded file contains inappropriate content', dataset, json_file) return elif is_bzipped and is_valid and auto_decompress: if link_data_only == 'copy_files': # We need to uncompress the temp_name file CHUNK_SIZE = 2 ** 20 # 1Mb fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) bzipped_file = bz2.BZ2File(dataset.path, 'rb') while 1: try: chunk = bzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err('Problem decompressing bz2 compressed data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) bzipped_file.close() # Replace the bzipped file with the decompressed file if it's safe to do so if not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.bz2') data_type = 'bz2' if not data_type: # See if we have a zip archive is_zipped = check_zip(dataset.path) if is_zipped and auto_decompress: if link_data_only == 'copy_files': CHUNK_SIZE = 2 ** 20 # 1Mb uncompressed = None uncompressed_name = None unzipped = False z = zipfile.ZipFile(dataset.path) for name in z.namelist(): if name.endswith('/'): continue if unzipped: stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' 
break fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) if sys.version_info[:2] >= (2, 6): zipped_file = z.open(name) while 1: try: chunk = zipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err('Problem decompressing zipped data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) zipped_file.close() uncompressed_name = name unzipped = True else: # python < 2.5 doesn't have a way to read members in chunks(!) try: outfile = open(uncompressed, 'wb') outfile.write(z.read(name)) outfile.close() uncompressed_name = name unzipped = True except IOError: os.close(fd) os.remove(uncompressed) file_err('Problem decompressing zipped data', dataset, json_file) return z.close() # Replace the zipped file with the decompressed file if it's safe to do so if uncompressed is not None: if not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = uncompressed_name data_type = 'zip' if not data_type: # TODO refactor this logic. check_binary isn't guaranteed to be # correct since it only looks at whether the first 100 chars are # printable or not. If someone specifies a known unsniffable # binary datatype and check_binary fails, the file gets mangled. if check_binary(dataset.path) or Binary.is_ext_unsniffable(dataset.file_type): # We have a binary dataset, but it is not Bam, Sff or Pdf data_type = 'binary' # binary_ok = False parts = dataset.name.split(".") if len(parts) > 1: ext = parts[-1].strip().lower() if check_content and not Binary.is_ext_unsniffable(ext): file_err('The uploaded binary file contains inappropriate content', dataset, json_file) return elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext: err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext.capitalize(), ext) file_err(err_msg, dataset, json_file) return if not data_type: # We must have a text file if check_content and check_html(dataset.path): file_err('The uploaded file contains inappropriate HTML content', dataset, json_file) return if data_type != 'binary': if link_data_only == 'copy_files' and data_type not in ('gzip', 'bz2', 'zip'): # Convert universal line endings to Posix line endings if to_posix_lines is True # and the data is not binary or gzip-, bz2- or zip-compressed. if dataset.to_posix_lines: tmpdir = output_adjacent_tmpdir(output_path) tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id if dataset.space_to_tab: line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix) else: line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix) if dataset.file_type == 'auto': ext = sniff.guess_ext(dataset.path, registry.sniff_order) else: ext = dataset.file_type data_type = ext # Save job info for the framework if ext == 'auto' and data_type == 'binary': ext = 'data' if ext == 'auto' and dataset.ext: ext = dataset.ext if ext == 'auto': ext = 'data' datatype = registry.get_datatype_by_extension(ext) if dataset.type in ('server_dir', 'path_paste') and link_data_only == 'link_to_files': # Never alter a file that will not be copied to Galaxy's local file store. 
if datatype.dataset_content_needs_grooming(dataset.path): err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \ '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.' file_err(err_msg, dataset, json_file) return if link_data_only == 'copy_files' and converted_path: # Move the dataset to its "real" path try: shutil.move(converted_path, output_path) except OSError as e: # We may not have permission to remove converted_path if e.errno != errno.EACCES: raise elif link_data_only == 'copy_files': if purge_source and not run_as_real_user: # if the upload tool runs as a real user the real user # can't move dataset.path as this path is owned by galaxy. shutil.move(dataset.path, output_path) else: shutil.copy(dataset.path, output_path) # Write the job info stdout = stdout or 'uploaded %s file' % data_type info = dict(type='dataset', dataset_id=dataset.dataset_id, ext=ext, stdout=stdout, name=dataset.name, line_count=line_count) if dataset.get('uuid', None) is not None: info['uuid'] = dataset.get('uuid') json_file.write(dumps(info) + "\n") if link_data_only == 'copy_files' and datatype and datatype.dataset_content_needs_grooming(output_path): # Groom the dataset content if necessary datatype.groom_dataset_content(output_path)
def _resolve_item(item):
    # Might be a dataset or a composite upload.
    requested_ext = item.get("ext", None)
    registry = upload_config.registry
    datatype = registry.get_datatype_by_extension(requested_ext)
    composite = item.pop("composite", None)
    if datatype and datatype.composite_type:
        composite_type = datatype.composite_type
        assert composite_type == "auto_primary_file", "basic composite uploads not yet implemented"
        # get_composite_dataset_name finds dataset name from basename of contents
        # and such but we're not implementing that here yet. yagni?
        # also need name...
        metadata = {
            composite_file.substitute_name_with_metadata: datatype.metadata_spec[composite_file.substitute_name_with_metadata].default
            for composite_file in datatype.composite_files.values()
            if composite_file.substitute_name_with_metadata
        }
        name = item.get("name") or "Composite Dataset"
        metadata["base_name"] = name
        dataset = Bunch(
            name=name,
            metadata=metadata,
        )
        writable_files = datatype.get_writable_files_for_dataset(dataset)
        primary_file = sniff.stream_to_file(
            StringIO(datatype.generate_primary_file(dataset)),
            prefix="upload_auto_primary_file",
            dir=".",
        )
        extra_files_path = f"{primary_file}_extra"
        os.mkdir(extra_files_path)
        rval: Dict[str, Any] = {
            "name": name,
            "filename": primary_file,
            "ext": requested_ext,
            "link_data_only": False,
            "sources": [],
            "hashes": [],
            "extra_files": extra_files_path,
        }
        _copy_and_validate_simple_attributes(item, rval)
        composite_items = composite.get("elements", [])
        keys = list(writable_files.keys())
        composite_item_idx = 0
        for composite_item in composite_items:
            if composite_item_idx >= len(keys):
                # raise exception - too many files?
                pass
            key = keys[composite_item_idx]
            writable_file = writable_files[key]
            _, src_target = _has_src_to_path(upload_config, composite_item)
            # do the writing
            sniff.handle_composite_file(
                datatype,
                src_target,
                extra_files_path,
                key,
                writable_file.is_binary,
                ".",
                f"{os.path.basename(extra_files_path)}_",
                composite_item,
            )
            composite_item_idx += 1
        writable_files_idx = composite_item_idx
        while writable_files_idx < len(keys):
            key = keys[writable_files_idx]
            writable_file = writable_files[key]
            if not writable_file.optional:
                # raise Exception, non-optional file missing
                pass
            writable_files_idx += 1
        return rval
    else:
        if composite:
            raise Exception(f"Non-composite datatype [{datatype}] attempting to be created with composite data.")
        return _resolve_item_with_primary(item)
def add_file(dataset, registry, json_file, output_path): data_type = None line_count = None converted_path = None stdout = None link_data_only = dataset.get('link_data_only', 'copy_files') in_place = dataset.get('in_place', True) purge_source = dataset.get('purge_source', True) try: ext = dataset.file_type except AttributeError: file_err( 'Unable to process uploaded file, missing file_type parameter.', dataset, json_file) return if dataset.type == 'url': try: page = urlopen( dataset.path) # page will be .close()ed by sniff methods temp_name, dataset.is_multi_byte = sniff.stream_to_file( page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers( page.headers)) except Exception as e: file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)), dataset, json_file) return dataset.path = temp_name # See if we have an empty file if not os.path.exists(dataset.path): file_err('Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file) return if not os.path.getsize(dataset.path) > 0: file_err('The uploaded file is empty', dataset, json_file) return if not dataset.type == 'url': # Already set is_multi_byte above if type == 'url' try: dataset.is_multi_byte = multi_byte.is_multi_byte( codecs.open(dataset.path, 'r', 'utf-8').read(100)) except UnicodeDecodeError as e: dataset.is_multi_byte = False # Is dataset an image? i_ext = get_image_ext(dataset.path) if i_ext: ext = i_ext data_type = ext # Is dataset content multi-byte? elif dataset.is_multi_byte: data_type = 'multi-byte char' ext = sniff.guess_ext(dataset.path, registry.sniff_order, is_multi_byte=True) # Is dataset content supported sniffable binary? else: # FIXME: This ignores the declared sniff order in datatype_conf.xml # resulting in improper behavior type_info = Binary.is_sniffable_binary(dataset.path) if type_info: data_type = type_info[0] ext = type_info[1] if not data_type: root_datatype = registry.get_datatype_by_extension(dataset.file_type) if getattr(root_datatype, 'compressed', False): data_type = 'compressed archive' ext = dataset.file_type else: # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress is_gzipped, is_valid = check_gzip(dataset.path) if is_gzipped and not is_valid: file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file) return elif is_gzipped and is_valid: if link_data_only == 'copy_files': # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format CHUNK_SIZE = 2**20 # 1Mb fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) gzipped_file = gzip.GzipFile(dataset.path, 'rb') while 1: try: chunk = gzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err('Problem decompressing gzipped data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) gzipped_file.close() # Replace the gzipped file with the decompressed file if it's safe to do so if dataset.type in ('server_dir', 'path_paste') or not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.gz') data_type = 'gzip' if not data_type and bz2 is not None: # See if we have a bz2 file, much like gzip is_bzipped, is_valid = check_bz2(dataset.path) if is_bzipped and not is_valid: file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file) return elif is_bzipped 
and is_valid: if link_data_only == 'copy_files': # We need to uncompress the temp_name file CHUNK_SIZE = 2**20 # 1Mb fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) bzipped_file = bz2.BZ2File(dataset.path, 'rb') while 1: try: chunk = bzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err( 'Problem decompressing bz2 compressed data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) bzipped_file.close() # Replace the bzipped file with the decompressed file if it's safe to do so if dataset.type in ('server_dir', 'path_paste') or not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.bz2') data_type = 'bz2' if not data_type: # See if we have a zip archive is_zipped = check_zip(dataset.path) if is_zipped: if link_data_only == 'copy_files': CHUNK_SIZE = 2**20 # 1Mb uncompressed = None uncompressed_name = None unzipped = False z = zipfile.ZipFile(dataset.path) for name in z.namelist(): if name.endswith('/'): continue if unzipped: stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' break fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) if sys.version_info[:2] >= (2, 6): zipped_file = z.open(name) while 1: try: chunk = zipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err( 'Problem decompressing zipped data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) zipped_file.close() uncompressed_name = name unzipped = True else: # python < 2.5 doesn't have a way to read members in chunks(!) try: outfile = open(uncompressed, 'wb') outfile.write(z.read(name)) outfile.close() uncompressed_name = name unzipped = True except IOError: os.close(fd) os.remove(uncompressed) file_err( 'Problem decompressing zipped data', dataset, json_file) return z.close() # Replace the zipped file with the decompressed file if it's safe to do so if uncompressed is not None: if dataset.type in ('server_dir', 'path_paste') or not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = uncompressed_name data_type = 'zip' if not data_type: # TODO refactor this logic. check_binary isn't guaranteed to be # correct since it only looks at whether the first 100 chars are # printable or not. If someone specifies a known unsniffable # binary datatype and check_binary fails, the file gets mangled. if check_binary(dataset.path) or Binary.is_ext_unsniffable( dataset.file_type): # We have a binary dataset, but it is not Bam, Sff or Pdf data_type = 'binary' # binary_ok = False parts = dataset.name.split(".") if len(parts) > 1: ext = parts[-1].strip().lower() if not Binary.is_ext_unsniffable(ext): file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file) return elif Binary.is_ext_unsniffable( ext) and dataset.file_type != ext: err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." 
% ( ext.capitalize(), ext) file_err(err_msg, dataset, json_file) return if not data_type: # We must have a text file if check_html(dataset.path): file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file) return if data_type != 'binary': if link_data_only == 'copy_files': if dataset.type in ('server_dir', 'path_paste') and data_type not in [ 'gzip', 'bz2', 'zip' ]: in_place = False # Convert universal line endings to Posix line endings, but allow the user to turn it off, # so that is becomes possible to upload gzip, bz2 or zip files with binary data without # corrupting the content of those files. if dataset.to_posix_lines: tmpdir = output_adjacent_tmpdir(output_path) tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id if dataset.space_to_tab: line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix) else: line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix) if dataset.file_type == 'auto': ext = sniff.guess_ext(dataset.path, registry.sniff_order) else: ext = dataset.file_type data_type = ext # Save job info for the framework if ext == 'auto' and dataset.ext: ext = dataset.ext if ext == 'auto': ext = 'data' datatype = registry.get_datatype_by_extension(ext) if dataset.type in ('server_dir', 'path_paste') and link_data_only == 'link_to_files': # Never alter a file that will not be copied to Galaxy's local file store. if datatype.dataset_content_needs_grooming(dataset.path): err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \ '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.' file_err(err_msg, dataset, json_file) return if link_data_only == 'copy_files' and dataset.type in ( 'server_dir', 'path_paste') and data_type not in ['gzip', 'bz2', 'zip']: # Move the dataset to its "real" path if converted_path is not None: shutil.copy(converted_path, output_path) try: os.remove(converted_path) except: pass else: # This should not happen, but it's here just in case shutil.copy(dataset.path, output_path) elif link_data_only == 'copy_files': if purge_source: shutil.move(dataset.path, output_path) else: shutil.copy(dataset.path, output_path) # Write the job info stdout = stdout or 'uploaded %s file' % data_type info = dict(type='dataset', dataset_id=dataset.dataset_id, ext=ext, stdout=stdout, name=dataset.name, line_count=line_count) if dataset.get('uuid', None) is not None: info['uuid'] = dataset.get('uuid') json_file.write(dumps(info) + "\n") if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path): # Groom the dataset content if necessary datatype.groom_dataset_content(output_path)
def get_uploaded_datasets(self, trans, context, override_name=None, override_info=None): def get_data_file_filename(data_file, override_name=None, override_info=None, purge=True): dataset_name = override_name def get_file_name(file_name): file_name = file_name.split('\\')[-1] file_name = file_name.split('/')[-1] return file_name try: # Use the existing file if not dataset_name and 'filename' in data_file: dataset_name = get_file_name(data_file['filename']) return Bunch(type='file', path=data_file['local_filename'], name=dataset_name, purge_source=purge) except Exception: # The uploaded file should've been persisted by the upload tool action return Bunch(type=None, path=None, name=None) def get_url_paste_urls_or_filename(group_incoming, override_name=None, override_info=None): url_paste_file = group_incoming.get('url_paste', None) if url_paste_file is not None: url_paste = open(url_paste_file).read() def start_of_url(content): start_of_url_paste = content.lstrip()[0:10].lower() looks_like_url = False for url_prefix in URI_PREFIXES: if start_of_url_paste.startswith(url_prefix): looks_like_url = True break return looks_like_url if start_of_url(url_paste): for line in url_paste.replace("\r", "").split("\n"): line = line.strip() if line: if not start_of_url(line): continue # non-url line, ignore if "file://" in line: if not trans.user_is_admin: raise AdminRequiredException() elif not trans.app.config.allow_path_paste: raise ConfigDoesNotAllowException() upload_path = line[len("file://"):] dataset_name = os.path.basename(upload_path) else: dataset_name = line if override_name: dataset_name = override_name yield Bunch(type='url', path=line, name=dataset_name) else: dataset_name = 'Pasted Entry' # we need to differentiate between various url pastes here if override_name: dataset_name = override_name yield Bunch(type='file', path=url_paste_file, name=dataset_name) def get_one_filename(context): data_file = context['file_data'] url_paste = context['url_paste'] ftp_files = context['ftp_files'] name = context.get('NAME', None) info = context.get('INFO', None) uuid = context.get('uuid', None) or None # Turn '' to None file_type = context.get('file_type', None) dbkey = self.get_dbkey(context) warnings = [] to_posix_lines = False if context.get('to_posix_lines', None) not in ["None", None, False]: to_posix_lines = True auto_decompress = False if context.get('auto_decompress', None) not in ["None", None, False]: auto_decompress = True space_to_tab = False if context.get('space_to_tab', None) not in ["None", None, False]: space_to_tab = True file_bunch = get_data_file_filename(data_file, override_name=name, override_info=info) if file_bunch.path: if url_paste is not None and url_paste.strip(): warnings.append( "All file contents specified in the paste box were ignored." 
) if ftp_files: warnings.append( "All FTP uploaded file selections were ignored.") elif url_paste is not None and url_paste.strip( ): # we need to use url_paste for file_bunch in get_url_paste_urls_or_filename( context, override_name=name, override_info=info): if file_bunch.path: break if file_bunch.path and ftp_files is not None: warnings.append( "All FTP uploaded file selections were ignored.") elif ftp_files is not None and trans.user is not None: # look for files uploaded via FTP user_ftp_dir = trans.user_ftp_dir assert not os.path.islink( user_ftp_dir ), "User FTP directory cannot be a symbolic link" for dirpath, _dirnames, filenames in os.walk(user_ftp_dir): for filename in filenames: for ftp_filename in ftp_files: if ftp_filename == filename: path = relpath(os.path.join(dirpath, filename), user_ftp_dir) if not os.path.islink( os.path.join(dirpath, filename)): ftp_data_file = { 'local_filename': os.path.abspath( os.path.join(user_ftp_dir, path)), 'filename': os.path.basename(path) } purge = getattr(trans.app.config, 'ftp_upload_purge', True) file_bunch = get_data_file_filename( ftp_data_file, override_name=name, override_info=info, purge=purge, ) if file_bunch.path: break if file_bunch.path: break if file_bunch.path: break file_bunch.to_posix_lines = to_posix_lines file_bunch.auto_decompress = auto_decompress file_bunch.space_to_tab = space_to_tab file_bunch.uuid = uuid if file_type is not None: file_bunch.file_type = file_type if dbkey is not None: file_bunch.dbkey = dbkey return file_bunch, warnings def get_filenames(context): rval = [] data_file = context['file_data'] ftp_files = context['ftp_files'] uuid = context.get('uuid', None) or None # Turn '' to None name = context.get('NAME', None) info = context.get('INFO', None) file_type = context.get('file_type', None) dbkey = self.get_dbkey(context) to_posix_lines = False if context.get('to_posix_lines', None) not in ["None", None, False]: to_posix_lines = True auto_decompress = False if context.get('auto_decompress', None) not in ["None", None, False]: auto_decompress = True space_to_tab = False if context.get('space_to_tab', None) not in ["None", None, False]: space_to_tab = True file_bunch = get_data_file_filename(data_file, override_name=name, override_info=info) file_bunch.uuid = uuid if file_bunch.path: file_bunch.to_posix_lines = to_posix_lines file_bunch.auto_decompress = auto_decompress file_bunch.space_to_tab = space_to_tab if file_type is not None: file_bunch.file_type = file_type if dbkey is not None: file_bunch.dbkey = dbkey rval.append(file_bunch) for file_bunch in get_url_paste_urls_or_filename( context, override_name=name, override_info=info): if file_bunch.path: file_bunch.uuid = uuid file_bunch.to_posix_lines = to_posix_lines file_bunch.auto_decompress = auto_decompress file_bunch.space_to_tab = space_to_tab if file_type is not None: file_bunch.file_type = file_type if dbkey is not None: file_bunch.dbkey = dbkey rval.append(file_bunch) # look for files uploaded via FTP valid_files = [] if ftp_files is not None: # Normalize input paths to ensure utf-8 encoding is normal form c. # This allows for comparison when the filesystem uses a different encoding than the browser. 
ftp_files = [ unicodedata.normalize('NFC', f) for f in ftp_files if isinstance(f, str) ] if trans.user is None: log.warning( f'Anonymous user passed values in ftp_files: {ftp_files}' ) ftp_files = [] # TODO: warning to the user (could happen if session has become invalid) else: user_ftp_dir = trans.user_ftp_dir assert not os.path.islink( user_ftp_dir ), "User FTP directory cannot be a symbolic link" for dirpath, _dirnames, filenames in os.walk(user_ftp_dir): for filename in filenames: path = relpath(os.path.join(dirpath, filename), user_ftp_dir) if not os.path.islink( os.path.join(dirpath, filename)): # Normalize filesystem paths if isinstance(path, str): valid_files.append( unicodedata.normalize('NFC', path)) else: valid_files.append(path) else: ftp_files = [] for ftp_file in ftp_files: if ftp_file not in valid_files: log.warning( f'User passed an invalid file path in ftp_files: {ftp_file}' ) continue # TODO: warning to the user (could happen if file is already imported) ftp_data_file = { 'local_filename': os.path.abspath(os.path.join(user_ftp_dir, ftp_file)), 'filename': os.path.basename(ftp_file) } purge = getattr(trans.app.config, 'ftp_upload_purge', True) file_bunch = get_data_file_filename(ftp_data_file, override_name=name, override_info=info, purge=purge) if file_bunch.path: file_bunch.to_posix_lines = to_posix_lines file_bunch.auto_decompress = auto_decompress file_bunch.space_to_tab = space_to_tab if file_type is not None: file_bunch.file_type = file_type if dbkey is not None: file_bunch.dbkey = dbkey rval.append(file_bunch) return rval file_type = self.get_file_type(context) file_count = self.get_file_count(trans, context) d_type = self.get_datatype(trans, context) dbkey = self.get_dbkey(context) tag_using_filenames = context.get('tag_using_filenames', False) tags = context.get('tags', False) force_composite = asbool(context.get('force_composite', 'False')) writable_files = d_type.writable_files writable_files_offset = 0 groups_incoming = [None for _ in range(file_count)] for group_incoming in context.get(self.name, []): i = int(group_incoming['__index__']) groups_incoming[i] = group_incoming if d_type.composite_type is not None or force_composite: # handle uploading of composite datatypes # Only one Dataset can be created dataset = Dataset() dataset.type = 'composite' dataset.file_type = file_type dataset.dbkey = dbkey dataset.datatype = d_type dataset.warnings = [] dataset.metadata = {} dataset.composite_files = {} dataset.uuid = None dataset.tag_using_filenames = None dataset.tags = None # load metadata files_metadata = context.get(self.metadata_ref, {}) metadata_name_substition_default_dict = { composite_file.substitute_name_with_metadata: d_type.metadata_spec[ composite_file.substitute_name_with_metadata].default for composite_file in d_type.composite_files.values() if composite_file.substitute_name_with_metadata } for meta_name, meta_spec in d_type.metadata_spec.items(): if meta_spec.set_in_upload: if meta_name in files_metadata: meta_value = files_metadata[meta_name] if meta_name in metadata_name_substition_default_dict: meta_value = sanitize_for_filename( meta_value, default=metadata_name_substition_default_dict[ meta_name]) dataset.metadata[meta_name] = meta_value dataset.name = self.get_composite_dataset_name(context) if dataset.datatype.composite_type == 'auto_primary_file': # replace sniff here with just creating an empty file temp_name = sniff.stream_to_file( io.StringIO(d_type.generate_primary_file(dataset)), prefix='upload_auto_primary_file') dataset.primary_file = 
temp_name dataset.to_posix_lines = True dataset.auto_decompress = True dataset.space_to_tab = False else: file_bunch, warnings = get_one_filename(groups_incoming[0]) writable_files_offset = 1 dataset.primary_file = file_bunch.path dataset.to_posix_lines = file_bunch.to_posix_lines dataset.auto_decompress = file_bunch.auto_decompress dataset.space_to_tab = file_bunch.space_to_tab if file_bunch.file_type: dataset.file_type = file_type if file_bunch.dbkey: dataset.dbkey = dbkey dataset.warnings.extend(warnings) if dataset.primary_file is None: # remove this before finish, this should create an empty dataset raise Exception( 'No primary dataset file was available for composite upload' ) if not force_composite: keys = [value.name for value in writable_files.values()] else: keys = [str(index) for index in range(file_count)] for i, group_incoming in enumerate( groups_incoming[writable_files_offset:]): key = keys[i + writable_files_offset] if not force_composite and group_incoming is None and not writable_files[ list(writable_files.keys())[keys.index(key)]].optional: dataset.warnings.append( f"A required composite file ({key}) was not specified." ) dataset.composite_files[key] = None else: file_bunch, warnings = get_one_filename(group_incoming) dataset.warnings.extend(warnings) if file_bunch.path: if force_composite: key = group_incoming.get("NAME") or i dataset.composite_files[key] = file_bunch.__dict__ elif not force_composite: dataset.composite_files[key] = None if not writable_files[list(writable_files.keys())[ keys.index(key)]].optional: dataset.warnings.append( f"A required composite file ({key}) was not specified." ) return [dataset] else: rval = [] for i, file_contexts in enumerate(context[self.name]): datasets = get_filenames(file_contexts) for dataset in datasets: override_file_type = self.get_file_type( context[self.name][i], parent_context=context) d_type = self.get_datatype(trans, context[self.name][i], parent_context=context) dataset.file_type = override_file_type dataset.datatype = d_type dataset.ext = self.get_datatype_ext(trans, context[self.name][i], parent_context=context) dataset.dbkey = self.get_dbkey(context[self.name][i], parent_context=context) dataset.tag_using_filenames = tag_using_filenames dataset.tags = tags rval.append(dataset) return rval
def get_uploaded_datasets(self, trans, context, override_name=None, override_info=None): def get_data_file_filename(data_file, override_name=None, override_info=None): dataset_name = override_name dataset_info = override_info def get_file_name(file_name): file_name = file_name.split('\\')[-1] file_name = file_name.split('/')[-1] return file_name try: # Use the existing file if not dataset_name and 'filename' in data_file: dataset_name = get_file_name(data_file['filename']) if not dataset_info: dataset_info = 'uploaded file' return Bunch(type='file', path=data_file['local_filename'], name=dataset_name) #return 'file', data_file['local_filename'], get_file_name( data_file.filename ), dataset_name, dataset_info except: # The uploaded file should've been persisted by the upload tool action return Bunch(type=None, path=None, name=None) #return None, None, None, None, None def get_url_paste_urls_or_filename(group_incoming, override_name=None, override_info=None): filenames = [] url_paste_file = group_incoming.get('url_paste', None) if url_paste_file is not None: url_paste = open(url_paste_file, 'r').read(1024) if url_paste.lstrip().lower().startswith( 'http://') or url_paste.lstrip().lower().startswith( 'ftp://') or url_paste.lstrip().lower().startswith( 'https://'): url_paste = url_paste.replace('\r', '').split('\n') for line in url_paste: line = line.strip() if line: if not line.lower().startswith( 'http://') and not line.lower().startswith( 'ftp://') and not line.lower( ).startswith('https://'): continue # non-url line, ignore dataset_name = override_name if not dataset_name: dataset_name = line dataset_info = override_info if not dataset_info: dataset_info = 'uploaded url' yield Bunch(type='url', path=line, name=dataset_name) #yield ( 'url', line, precreated_name, dataset_name, dataset_info ) else: dataset_name = dataset_info = precreated_name = 'Pasted Entry' #we need to differentiate between various url pastes here if override_name: dataset_name = override_name if override_info: dataset_info = override_info yield Bunch(type='file', path=url_paste_file, name=precreated_name) #yield ( 'file', url_paste_file, precreated_name, dataset_name, dataset_info ) def get_one_filename(context): data_file = context['file_data'] url_paste = context['url_paste'] ftp_files = context['ftp_files'] name = context.get('NAME', None) info = context.get('INFO', None) uuid = context.get('uuid', None) or None # Turn '' to None warnings = [] to_posix_lines = False if context.get('to_posix_lines', None) not in ["None", None, False]: to_posix_lines = True space_to_tab = False if context.get('space_to_tab', None) not in ["None", None, False]: space_to_tab = True file_bunch = get_data_file_filename(data_file, override_name=name, override_info=info) if file_bunch.path: if url_paste is not None and url_paste.strip(): warnings.append( "All file contents specified in the paste box were ignored." 
) if ftp_files: warnings.append( "All FTP uploaded file selections were ignored.") elif url_paste is not None and url_paste.strip( ): #we need to use url_paste for file_bunch in get_url_paste_urls_or_filename( context, override_name=name, override_info=info): if file_bunch.path: break if file_bunch.path and ftp_files is not None: warnings.append( "All FTP uploaded file selections were ignored.") elif ftp_files is not None and trans.user is not None: # look for files uploaded via FTP user_ftp_dir = trans.user_ftp_dir for (dirpath, dirnames, filenames) in os.walk(user_ftp_dir): for filename in filenames: for ftp_filename in ftp_files: if ftp_filename == filename: path = relpath(os.path.join(dirpath, filename), user_ftp_dir) if not os.path.islink( os.path.join(dirpath, filename)): ftp_data_file = { 'local_filename': os.path.abspath( os.path.join(user_ftp_dir, path)), 'filename': os.path.basename(path) } file_bunch = get_data_file_filename( ftp_data_file, override_name=name, override_info=info) if file_bunch.path: break if file_bunch.path: break if file_bunch.path: break file_bunch.to_posix_lines = to_posix_lines file_bunch.space_to_tab = space_to_tab file_bunch.uuid = uuid return file_bunch, warnings def get_filenames(context): rval = [] data_file = context['file_data'] url_paste = context['url_paste'] ftp_files = context['ftp_files'] uuid = context.get('uuid', None) or None # Turn '' to None name = context.get('NAME', None) info = context.get('INFO', None) to_posix_lines = False if context.get('to_posix_lines', None) not in ["None", None, False]: to_posix_lines = True space_to_tab = False if context.get('space_to_tab', None) not in ["None", None, False]: space_to_tab = True warnings = [] file_bunch = get_data_file_filename(data_file, override_name=name, override_info=info) file_bunch.uuid = uuid if file_bunch.path: file_bunch.to_posix_lines = to_posix_lines file_bunch.space_to_tab = space_to_tab rval.append(file_bunch) for file_bunch in get_url_paste_urls_or_filename( context, override_name=name, override_info=info): if file_bunch.path: file_bunch.uuid = uuid file_bunch.to_posix_lines = to_posix_lines file_bunch.space_to_tab = space_to_tab rval.append(file_bunch) # look for files uploaded via FTP valid_files = [] if ftp_files is not None: # Normalize input paths to ensure utf-8 encoding is normal form c. # This allows for comparison when the filesystem uses a different encoding than the browser. 
ftp_files = [ unicodedata.normalize('NFC', f) for f in ftp_files if isinstance(f, unicode) ] if trans.user is None: log.warning( 'Anonymous user passed values in ftp_files: %s' % ftp_files) ftp_files = [] # TODO: warning to the user (could happen if session has become invalid) else: user_ftp_dir = trans.user_ftp_dir for (dirpath, dirnames, filenames) in os.walk(user_ftp_dir): for filename in filenames: path = relpath(os.path.join(dirpath, filename), user_ftp_dir) if not os.path.islink( os.path.join(dirpath, filename)): # Normalize filesystem paths if isinstance(path, unicode): valid_files.append( unicodedata.normalize('NFC', path)) else: valid_files.append(path) else: ftp_files = [] for ftp_file in ftp_files: if ftp_file not in valid_files: log.warning( 'User passed an invalid file path in ftp_files: %s' % ftp_file) continue # TODO: warning to the user (could happen if file is already imported) ftp_data_file = { 'local_filename': os.path.abspath(os.path.join(user_ftp_dir, ftp_file)), 'filename': os.path.basename(ftp_file) } file_bunch = get_data_file_filename(ftp_data_file, override_name=name, override_info=info) if file_bunch.path: file_bunch.to_posix_lines = to_posix_lines file_bunch.space_to_tab = space_to_tab rval.append(file_bunch) return rval file_type = self.get_file_type(context) d_type = self.get_datatype(trans, context) dbkey = context.get('dbkey', None) writable_files = d_type.writable_files writable_files_offset = 0 groups_incoming = [None for filename in writable_files] for group_incoming in context.get(self.name, []): i = int(group_incoming['__index__']) groups_incoming[i] = group_incoming if d_type.composite_type is not None: #handle uploading of composite datatypes #Only one Dataset can be created dataset = Bunch() dataset.type = 'composite' dataset.file_type = file_type dataset.dbkey = dbkey dataset.datatype = d_type dataset.warnings = [] dataset.metadata = {} dataset.composite_files = {} #load metadata files_metadata = context.get(self.metadata_ref, {}) metadata_name_substition_default_dict = dict([ (composite_file.substitute_name_with_metadata, d_type.metadata_spec[ composite_file.substitute_name_with_metadata].default) for composite_file in d_type.composite_files.values() if composite_file.substitute_name_with_metadata ]) for meta_name, meta_spec in d_type.metadata_spec.iteritems(): if meta_spec.set_in_upload: if meta_name in files_metadata: meta_value = files_metadata[meta_name] if meta_name in metadata_name_substition_default_dict: meta_value = sanitize_for_filename( meta_value, default=metadata_name_substition_default_dict[ meta_name]) dataset.metadata[meta_name] = meta_value dataset.precreated_name = dataset.name = self.get_composite_dataset_name( context) if dataset.datatype.composite_type == 'auto_primary_file': #replace sniff here with just creating an empty file temp_name, is_multi_byte = sniff.stream_to_file( StringIO.StringIO(d_type.generate_primary_file(dataset)), prefix='upload_auto_primary_file') dataset.primary_file = temp_name dataset.to_posix_lines = True dataset.space_to_tab = False else: file_bunch, warnings = get_one_filename(groups_incoming[0]) writable_files_offset = 1 dataset.primary_file = file_bunch.path dataset.to_posix_lines = file_bunch.to_posix_lines dataset.space_to_tab = file_bunch.space_to_tab dataset.warnings.extend(warnings) if dataset.primary_file is None: #remove this before finish, this should create an empty dataset raise Exception( 'No primary dataset file was available for composite upload' ) keys = [value.name for value in 
writable_files.values()] for i, group_incoming in enumerate( groups_incoming[writable_files_offset:]): key = keys[i + writable_files_offset] if group_incoming is None and not writable_files[ writable_files.keys()[keys.index(key)]].optional: dataset.warnings.append( "A required composite file (%s) was not specified." % (key)) dataset.composite_files[key] = None else: file_bunch, warnings = get_one_filename(group_incoming) dataset.warnings.extend(warnings) if file_bunch.path: dataset.composite_files[key] = file_bunch.__dict__ else: dataset.composite_files[key] = None if not writable_files[writable_files.keys()[keys.index( key)]].optional: dataset.warnings.append( "A required composite file (%s) was not specified." % (key)) return [dataset] else: datasets = get_filenames(context[self.name][0]) rval = [] for dataset in datasets: dataset.file_type = file_type dataset.datatype = d_type dataset.ext = self.get_datatype_ext(trans, context) dataset.dbkey = dbkey rval.append(dataset) return rval
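# Illustrative sketch (not part of the tool code above): why unicodedata.normalize('NFC', ...)
# is applied to both the browser-supplied ftp_files values and the walked filesystem paths
# before the membership check in get_filenames.  A filename such as "résumé.txt" can arrive
# either precomposed (NFC) or decomposed (NFD, common on macOS filesystems); without
# normalizing both sides, `ftp_file not in valid_files` would reject a file that exists.
# The filenames here are invented for the example.
import unicodedata

browser_value = 'r\u00e9sum\u00e9.txt'        # "é" as one precomposed code point (NFC)
filesystem_value = 're\u0301sume\u0301.txt'   # "e" plus combining acute accent (NFD)

assert browser_value != filesystem_value
assert unicodedata.normalize('NFC', browser_value) == unicodedata.normalize('NFC', filesystem_value)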
def _resolve_item(item):
    # Might be a dataset or a composite upload.
    requested_ext = item.get("ext", None)
    registry = upload_config.registry
    datatype = registry.get_datatype_by_extension(requested_ext)
    composite = item.pop("composite", None)
    if datatype and datatype.composite_type:
        composite_type = datatype.composite_type
        writable_files = datatype.writable_files
        assert composite_type == "auto_primary_file", "basic composite uploads not yet implemented"
        # get_composite_dataset_name finds dataset name from basename of contents
        # and such but we're not implementing that here yet. yagni?
        # also need name...
        dataset_bunch = Bunch()
        name = item.get("name") or 'Composite Dataset'
        dataset_bunch.name = name
        primary_file = sniff.stream_to_file(
            StringIO(datatype.generate_primary_file(dataset_bunch)),
            prefix='upload_auto_primary_file',
            dir=".")
        extra_files_path = primary_file + "_extra"
        os.mkdir(extra_files_path)
        rval = {
            "name": name,
            "filename": primary_file,
            "ext": requested_ext,
            "link_data_only": False,
            "sources": [],
            "hashes": [],
            "extra_files": extra_files_path,
        }
        _copy_and_validate_simple_attributes(item, rval)
        composite_items = composite.get("elements", [])
        keys = [value.name for value in writable_files.values()]
        composite_item_idx = 0
        for composite_item in composite_items:
            if composite_item_idx >= len(keys):
                # raise exception - too many files?
                pass
            key = keys[composite_item_idx]
            writable_file = writable_files[key]
            _, src_target = _has_src_to_path(upload_config, composite_item)
            # do the writing
            sniff.handle_composite_file(
                datatype,
                src_target,
                extra_files_path,
                key,
                writable_file.is_binary,
                ".",
                os.path.basename(extra_files_path) + "_",
                composite_item,
            )
            composite_item_idx += 1
        writable_files_idx = composite_item_idx
        while writable_files_idx < len(keys):
            key = keys[writable_files_idx]
            writable_file = writable_files[key]
            if not writable_file.optional:
                # raise Exception, non-optional file missing
                pass
            writable_files_idx += 1
        return rval
    else:
        if composite:
            raise Exception(
                "Non-composite datatype [%s] attempting to be created with composite data." % datatype)
        return _resolve_item_with_primary(item)
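# Hedged sketch of the request "item" dictionaries that _resolve_item above branches on.
# The keys shown ("ext", "name", "composite", "elements", "src", "url", "path",
# "paste_content") are the ones the code reads; the concrete values and the "velvet"
# extension are invented purely for illustration.
example_composite_item = {
    "ext": "velvet",                  # hypothetical composite datatype extension
    "name": "Example composite run",
    "composite": {
        "elements": [
            {"src": "url", "url": "https://example.org/Sequences"},
            {"src": "path", "path": "/data/uploads/Roadmaps"},
            {"src": "pasted", "paste_content": "pasted example content"},
        ],
    },
}

example_simple_item = {
    "src": "url",
    "url": "https://example.org/data.tsv",
    "ext": "tabular",
}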
def get_uploaded_datasets(self, trans, context, override_name=None, override_info=None): def get_data_file_filename(data_file, override_name=None, override_info=None): dataset_name = override_name dataset_info = override_info def get_file_name(file_name): file_name = file_name.split('\\')[-1] file_name = file_name.split('/')[-1] return file_name try: # Use the existing file if not dataset_name and 'filename' in data_file: dataset_name = get_file_name(data_file['filename']) if not dataset_info: dataset_info = 'uploaded file' return Bunch(type='file', path=data_file['local_filename'], name=get_file_name(data_file['filename'])) #return 'file', data_file['local_filename'], get_file_name( data_file.filename ), dataset_name, dataset_info except: # The uploaded file should've been persisted by the upload tool action return Bunch(type=None, path=None, name=None) #return None, None, None, None, None def get_url_paste_urls_or_filename(group_incoming, override_name=None, override_info=None): filenames = [] url_paste_file = group_incoming.get('url_paste', None) if url_paste_file is not None: url_paste = open(url_paste_file, 'r').read(1024) if url_paste.lstrip().lower().startswith( 'http://') or url_paste.lstrip().lower().startswith( 'ftp://'): url_paste = url_paste.replace('\r', '').split('\n') for line in url_paste: line = line.strip() if line: if not line.lower().startswith( 'http://') and not line.lower().startswith( 'ftp://'): continue # non-url line, ignore precreated_name = line dataset_name = override_name if not dataset_name: dataset_name = line dataset_info = override_info if not dataset_info: dataset_info = 'uploaded url' yield Bunch(type='url', path=line, name=precreated_name) #yield ( 'url', line, precreated_name, dataset_name, dataset_info ) else: dataset_name = dataset_info = precreated_name = 'Pasted Entry' #we need to differentiate between various url pastes here if override_name: dataset_name = override_name if override_info: dataset_info = override_info yield Bunch(type='file', path=url_paste_file, name=precreated_name) #yield ( 'file', url_paste_file, precreated_name, dataset_name, dataset_info ) def get_one_filename(context): data_file = context['file_data'] url_paste = context['url_paste'] name = context.get('NAME', None) info = context.get('INFO', None) warnings = [] space_to_tab = False if context.get('space_to_tab', None) not in ["None", None]: space_to_tab = True file_bunch = get_data_file_filename(data_file, override_name=name, override_info=info) if file_bunch.path and url_paste: if url_paste.strip(): warnings.append( "All file contents specified in the paste box were ignored." 
) else: #we need to use url_paste for file_bunch in get_url_paste_urls_or_filename( context, override_name=name, override_info=info): if file_bunch.path: break return file_bunch, warnings def get_filenames(context): rval = [] data_file = context['file_data'] url_paste = context['url_paste'] name = context.get('NAME', None) info = context.get('INFO', None) space_to_tab = False if context.get('space_to_tab', None) not in ["None", None]: space_to_tab = True warnings = [] file_bunch = get_data_file_filename(data_file, override_name=name, override_info=info) if file_bunch.path: file_bunch.space_to_tab = space_to_tab rval.append(file_bunch) for file_bunch in get_url_paste_urls_or_filename( context, override_name=name, override_info=info): if file_bunch.path: file_bunch.space_to_tab = space_to_tab rval.append(file_bunch) return rval file_type = self.get_file_type(context) d_type = self.get_datatype(trans, context) dbkey = context.get('dbkey', None) writable_files = d_type.writable_files writable_files_offset = 0 groups_incoming = [None for filename in writable_files] for group_incoming in context.get(self.name, []): i = int(group_incoming['__index__']) groups_incoming[i] = group_incoming if d_type.composite_type is not None: #handle uploading of composite datatypes #Only one Dataset can be created dataset = Bunch() dataset.type = 'composite' dataset.file_type = file_type dataset.dbkey = dbkey dataset.datatype = d_type dataset.warnings = [] dataset.metadata = {} dataset.composite_files = {} #load metadata files_metadata = context.get(self.metadata_ref, {}) for meta_name, meta_spec in d_type.metadata_spec.iteritems(): if meta_spec.set_in_upload: if meta_name in files_metadata: dataset.metadata[meta_name] = files_metadata[meta_name] dataset_name = None dataset_info = None if dataset.datatype.composite_type == 'auto_primary_file': #replace sniff here with just creating an empty file temp_name, is_multi_byte = sniff.stream_to_file( StringIO.StringIO(d_type.generate_primary_file()), prefix='upload_auto_primary_file') dataset.primary_file = temp_name dataset.space_to_tab = False dataset.precreated_name = dataset.name = 'Uploaded Composite Dataset (%s)' % ( file_type) else: file_bunch, warnings = get_one_filename(groups_incoming[0]) if dataset.datatype.composite_type: precreated_name = 'Uploaded Composite Dataset (%s)' % ( file_type) writable_files_offset = 1 dataset.primary_file = file_bunch.path dataset.space_to_tab = file_bunch.space_to_tab dataset.precreated_name = file_bunch.precreated_name dataset.name = file_bunch.precreated_name dataset.warnings.extend(file_bunch.warnings) if dataset.primary_file is None: #remove this before finish, this should create an empty dataset raise Exception( 'No primary dataset file was available for composite upload' ) keys = [value.name for value in writable_files.values()] for i, group_incoming in enumerate( groups_incoming[writable_files_offset:]): key = keys[i + writable_files_offset] if group_incoming is None and not writable_files[ writable_files.keys()[keys.index(key)]].optional: dataset.warnings.append( "A required composite file (%s) was not specified." % (key)) dataset.composite_files[key] = None else: file_bunch, warnings = get_one_filename(group_incoming) if file_bunch.path: dataset.composite_files[key] = file_bunch.__dict__ else: dataset.composite_files[key] = None if not writable_files[writable_files.keys()[keys.index( key)]].optional: dataset.warnings.append( "A required composite file (%s) was not specified." 
% (key)) return [dataset] else: datasets = get_filenames(context[self.name][0]) rval = [] for dataset in datasets: dataset.file_type = file_type dataset.datatype = d_type dataset.ext = self.get_datatype_ext(trans, context) dataset.dbkey = dbkey rval.append(dataset) return rval
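# Minimal standalone sketch of the groups_incoming bookkeeping used above: each incoming
# repeat group carries an '__index__' key, and the sparse list is rebuilt so that group i
# lands at position i even if the browser posts the groups out of order.  The file names
# are invented for the example.
incoming = [
    {'__index__': '2', 'file_data': 'third.txt'},
    {'__index__': '0', 'file_data': 'first.txt'},
]
groups_incoming = [None for _ in range(3)]
for group_incoming in incoming:
    i = int(group_incoming['__index__'])
    groups_incoming[i] = group_incoming
# groups_incoming is now [{'__index__': '0', ...}, None, {'__index__': '2', ...}]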
def get_uploaded_datasets( self, trans, context, override_name=None, override_info=None ): def get_data_file_filename( data_file, override_name=None, override_info=None ): dataset_name = override_name dataset_info = override_info def get_file_name( file_name ): file_name = file_name.split( '\\' )[-1] file_name = file_name.split( '/' )[-1] return file_name try: # Use the existing file if not dataset_name and 'filename' in data_file: dataset_name = get_file_name( data_file['filename'] ) if not dataset_info: dataset_info = 'uploaded file' return Bunch( type='file', path=data_file['local_filename'], name=dataset_name ) # return 'file', data_file['local_filename'], get_file_name( data_file.filename ), dataset_name, dataset_info except: # The uploaded file should've been persisted by the upload tool action return Bunch( type=None, path=None, name=None ) # return None, None, None, None, None def get_url_paste_urls_or_filename( group_incoming, override_name=None, override_info=None ): url_paste_file = group_incoming.get( 'url_paste', None ) if url_paste_file is not None: url_paste = open( url_paste_file, 'r' ).read( 1024 ) if url_paste.lstrip().lower().startswith( 'http://' ) or url_paste.lstrip().lower().startswith( 'ftp://' ) or url_paste.lstrip().lower().startswith( 'https://' ): url_paste = url_paste.replace( '\r', '' ).split( '\n' ) for line in url_paste: line = line.strip() if line: if not line.lower().startswith( 'http://' ) and not line.lower().startswith( 'ftp://' ) and not line.lower().startswith( 'https://' ): continue # non-url line, ignore dataset_name = override_name if not dataset_name: dataset_name = line dataset_info = override_info if not dataset_info: dataset_info = 'uploaded url' yield Bunch( type='url', path=line, name=dataset_name ) # yield ( 'url', line, precreated_name, dataset_name, dataset_info ) else: dataset_name = dataset_info = precreated_name = 'Pasted Entry' # we need to differentiate between various url pastes here if override_name: dataset_name = override_name if override_info: dataset_info = override_info yield Bunch( type='file', path=url_paste_file, name=precreated_name ) # yield ( 'file', url_paste_file, precreated_name, dataset_name, dataset_info ) def get_one_filename( context ): data_file = context['file_data'] url_paste = context['url_paste'] ftp_files = context['ftp_files'] name = context.get( 'NAME', None ) info = context.get( 'INFO', None ) uuid = context.get( 'uuid', None ) or None # Turn '' to None warnings = [] to_posix_lines = False if context.get( 'to_posix_lines', None ) not in [ "None", None, False ]: to_posix_lines = True space_to_tab = False if context.get( 'space_to_tab', None ) not in [ "None", None, False ]: space_to_tab = True file_bunch = get_data_file_filename( data_file, override_name=name, override_info=info ) if file_bunch.path: if url_paste is not None and url_paste.strip(): warnings.append( "All file contents specified in the paste box were ignored." ) if ftp_files: warnings.append( "All FTP uploaded file selections were ignored." ) elif url_paste is not None and url_paste.strip(): # we need to use url_paste for file_bunch in get_url_paste_urls_or_filename( context, override_name=name, override_info=info ): if file_bunch.path: break if file_bunch.path and ftp_files is not None: warnings.append( "All FTP uploaded file selections were ignored." 
) elif ftp_files is not None and trans.user is not None: # look for files uploaded via FTP user_ftp_dir = trans.user_ftp_dir for ( dirpath, dirnames, filenames ) in os.walk( user_ftp_dir ): for filename in filenames: for ftp_filename in ftp_files: if ftp_filename == filename: path = relpath( os.path.join( dirpath, filename ), user_ftp_dir ) if not os.path.islink( os.path.join( dirpath, filename ) ): ftp_data_file = { 'local_filename' : os.path.abspath( os.path.join( user_ftp_dir, path ) ), 'filename' : os.path.basename( path ) } file_bunch = get_data_file_filename( ftp_data_file, override_name=name, override_info=info ) if file_bunch.path: break if file_bunch.path: break if file_bunch.path: break file_bunch.to_posix_lines = to_posix_lines file_bunch.space_to_tab = space_to_tab file_bunch.uuid = uuid return file_bunch, warnings def get_filenames( context ): rval = [] data_file = context['file_data'] ftp_files = context['ftp_files'] uuid = context.get( 'uuid', None ) or None # Turn '' to None name = context.get( 'NAME', None ) info = context.get( 'INFO', None ) to_posix_lines = False if context.get( 'to_posix_lines', None ) not in [ "None", None, False ]: to_posix_lines = True space_to_tab = False if context.get( 'space_to_tab', None ) not in [ "None", None, False ]: space_to_tab = True file_bunch = get_data_file_filename( data_file, override_name=name, override_info=info ) file_bunch.uuid = uuid if file_bunch.path: file_bunch.to_posix_lines = to_posix_lines file_bunch.space_to_tab = space_to_tab rval.append( file_bunch ) for file_bunch in get_url_paste_urls_or_filename( context, override_name=name, override_info=info ): if file_bunch.path: file_bunch.uuid = uuid file_bunch.to_posix_lines = to_posix_lines file_bunch.space_to_tab = space_to_tab rval.append( file_bunch ) # look for files uploaded via FTP valid_files = [] if ftp_files is not None: # Normalize input paths to ensure utf-8 encoding is normal form c. # This allows for comparison when the filesystem uses a different encoding than the browser. 
ftp_files = [unicodedata.normalize('NFC', f) for f in ftp_files if isinstance(f, unicode)] if trans.user is None: log.warning( 'Anonymous user passed values in ftp_files: %s' % ftp_files ) ftp_files = [] # TODO: warning to the user (could happen if session has become invalid) else: user_ftp_dir = trans.user_ftp_dir for ( dirpath, dirnames, filenames ) in os.walk( user_ftp_dir ): for filename in filenames: path = relpath( os.path.join( dirpath, filename ), user_ftp_dir ) if not os.path.islink( os.path.join( dirpath, filename ) ): # Normalize filesystem paths if isinstance(path, unicode): valid_files.append(unicodedata.normalize('NFC', path )) else: valid_files.append(path) else: ftp_files = [] for ftp_file in ftp_files: if ftp_file not in valid_files: log.warning( 'User passed an invalid file path in ftp_files: %s' % ftp_file ) continue # TODO: warning to the user (could happen if file is already imported) ftp_data_file = { 'local_filename' : os.path.abspath( os.path.join( user_ftp_dir, ftp_file ) ), 'filename' : os.path.basename( ftp_file ) } file_bunch = get_data_file_filename( ftp_data_file, override_name=name, override_info=info ) if file_bunch.path: file_bunch.to_posix_lines = to_posix_lines file_bunch.space_to_tab = space_to_tab rval.append( file_bunch ) return rval file_type = self.get_file_type( context ) d_type = self.get_datatype( trans, context ) dbkey = context.get( 'dbkey', None ) writable_files = d_type.writable_files writable_files_offset = 0 groups_incoming = [ None for _ in writable_files ] for group_incoming in context.get( self.name, [] ): i = int( group_incoming['__index__'] ) groups_incoming[ i ] = group_incoming if d_type.composite_type is not None: # handle uploading of composite datatypes # Only one Dataset can be created dataset = Bunch() dataset.type = 'composite' dataset.file_type = file_type dataset.dbkey = dbkey dataset.datatype = d_type dataset.warnings = [] dataset.metadata = {} dataset.composite_files = {} dataset.uuid = None # load metadata files_metadata = context.get( self.metadata_ref, {} ) metadata_name_substition_default_dict = dict( [ ( composite_file.substitute_name_with_metadata, d_type.metadata_spec[ composite_file.substitute_name_with_metadata ].default ) for composite_file in d_type.composite_files.values() if composite_file.substitute_name_with_metadata ] ) for meta_name, meta_spec in d_type.metadata_spec.iteritems(): if meta_spec.set_in_upload: if meta_name in files_metadata: meta_value = files_metadata[ meta_name ] if meta_name in metadata_name_substition_default_dict: meta_value = sanitize_for_filename( meta_value, default=metadata_name_substition_default_dict[ meta_name ] ) dataset.metadata[ meta_name ] = meta_value dataset.precreated_name = dataset.name = self.get_composite_dataset_name( context ) if dataset.datatype.composite_type == 'auto_primary_file': # replace sniff here with just creating an empty file temp_name, is_multi_byte = sniff.stream_to_file( StringIO.StringIO( d_type.generate_primary_file( dataset ) ), prefix='upload_auto_primary_file' ) dataset.primary_file = temp_name dataset.to_posix_lines = True dataset.space_to_tab = False else: file_bunch, warnings = get_one_filename( groups_incoming[ 0 ] ) writable_files_offset = 1 dataset.primary_file = file_bunch.path dataset.to_posix_lines = file_bunch.to_posix_lines dataset.space_to_tab = file_bunch.space_to_tab dataset.warnings.extend( warnings ) if dataset.primary_file is None: # remove this before finish, this should create an empty dataset raise Exception( 'No primary dataset 
file was available for composite upload' ) keys = [ value.name for value in writable_files.values() ] for i, group_incoming in enumerate( groups_incoming[ writable_files_offset : ] ): key = keys[ i + writable_files_offset ] if group_incoming is None and not writable_files[ writable_files.keys()[ keys.index( key ) ] ].optional: dataset.warnings.append( "A required composite file (%s) was not specified." % ( key ) ) dataset.composite_files[ key ] = None else: file_bunch, warnings = get_one_filename( group_incoming ) dataset.warnings.extend( warnings ) if file_bunch.path: dataset.composite_files[ key ] = file_bunch.__dict__ else: dataset.composite_files[ key ] = None if not writable_files[ writable_files.keys()[ keys.index( key ) ] ].optional: dataset.warnings.append( "A required composite file (%s) was not specified." % ( key ) ) return [ dataset ] else: datasets = get_filenames( context[ self.name ][0] ) rval = [] for dataset in datasets: dataset.file_type = file_type dataset.datatype = d_type dataset.ext = self.get_datatype_ext( trans, context ) dataset.dbkey = dbkey rval.append( dataset ) return rval
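# Side-note sketch: the Python-2-era expression
#     writable_files[ writable_files.keys()[ keys.index( key ) ] ]
# used above just retrieves the composite-file spec whose .name equals `key` from an
# ordered mapping; because the mapping is keyed by those same names it is equivalent to
# writable_files[key].  OrderedDict and the namedtuple below are stand-ins for the real
# datatype structures, used only to demonstrate the equivalence.
from collections import OrderedDict, namedtuple

CompositeFile = namedtuple('CompositeFile', ['name', 'optional'])
writable_files = OrderedDict([
    ('Sequences', CompositeFile('Sequences', False)),
    ('Log', CompositeFile('Log', True)),
])
keys = [value.name for value in writable_files.values()]
key = keys[1]
spec = writable_files[list(writable_files.keys())[keys.index(key)]]
assert spec is writable_files[key]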
dataset = UploadedDataset()
dataset.file_type = file_type
dataset.datatype = d_type
dataset.dbkey = dbkey
temp_name = None
precreated_name = None
is_multi_byte = False
space_to_tab = False
warnings = []
dataset_name = None
dataset_info = None
if dataset.datatype.composite_type == 'auto_primary_file':
    # replace sniff here with just creating an empty file
    temp_name, is_multi_byte = sniff.stream_to_file( StringIO.StringIO( d_type.generate_primary_file() ), prefix='upload_auto_primary_file' )
    precreated_name = dataset_name = 'Uploaded Composite Dataset (%s)' % ( file_type )
else:
    temp_name, precreated_name, is_multi_byte, space_to_tab, dataset_name, dataset_info, warnings = get_one_filename( groups_incoming[ 0 ] )
    writable_files_offset = 1
if temp_name is None:
    # remove this before finish, this should create an empty dataset
    raise Exception( 'No primary dataset file was available for composite upload' )
dataset.primary_file = temp_name
dataset.is_multi_byte = is_multi_byte
dataset.space_to_tab = space_to_tab
dataset.precreated_name = precreated_name
dataset.name = dataset_name
dataset.info = dataset_info
dataset.warnings.extend( warnings )
dataset.register_temp_file( temp_name )
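# Hedged sketch of the 'auto_primary_file' idea used above: the datatype composes the
# primary file's content itself (typically an index of the composite pieces), and that
# string is streamed to a temporary file just as an uploaded file would be.  The real
# generate_primary_file() lives on the datatype classes; fake_generate_primary_file and
# stream_to_tempfile below are stand-ins that only mirror the usage, not Galaxy's API.
import shutil
import tempfile
from io import StringIO


def fake_generate_primary_file():
    return '<html><body><p>Composite dataset index</p></body></html>'


def stream_to_tempfile(flo, prefix='upload_auto_primary_file'):
    fd, path = tempfile.mkstemp(prefix=prefix)
    with open(fd, 'w') as out:
        shutil.copyfileobj(flo, out)
    return path


primary_file = stream_to_tempfile(StringIO(fake_generate_primary_file()))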
def add_file( self, trans, folder_id, file_obj, name, file_format, dbkey, roles, info='no info', space_to_tab=False, replace_dataset=None, library_item_info_template=None, template_elements={}, message=None ): folder = trans.app.model.LibraryFolder.get( folder_id ) data_type = None line_count = 0 temp_name, is_multi_byte = sniff.stream_to_file( file_obj ) # See if we have an empty file if not os.path.getsize( temp_name ) > 0: raise BadFileException( "you attempted to upload an empty file." ) if is_multi_byte: ext = sniff.guess_ext( temp_name, is_multi_byte=True ) else: if not data_type: # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress on the fly. is_gzipped, is_valid = self.check_gzip( temp_name ) if is_gzipped and not is_valid: raise BadFileException( "you attempted to upload an inappropriate file." ) elif is_gzipped and is_valid: # We need to uncompress the temp_name file CHUNK_SIZE = 2**20 # 1Mb fd, uncompressed = tempfile.mkstemp() gzipped_file = gzip.GzipFile( temp_name ) while 1: try: chunk = gzipped_file.read( CHUNK_SIZE ) except IOError: os.close( fd ) os.remove( uncompressed ) raise BadFileException( 'problem uncompressing gzipped data.' ) if not chunk: break os.write( fd, chunk ) os.close( fd ) gzipped_file.close() # Replace the gzipped file with the decompressed file shutil.move( uncompressed, temp_name ) name = name.rstrip( '.gz' ) data_type = 'gzip' ext = '' if not data_type: # See if we have a zip archive is_zipped, is_valid, test_ext = self.check_zip( temp_name ) if is_zipped and not is_valid: raise BadFileException( "you attempted to upload an inappropriate file." ) elif is_zipped and is_valid: # Currently, we force specific tools to handle this case. We also require the user # to manually set the incoming file_format if ( test_ext == 'ab1' or test_ext == 'scf' ) and file_format != 'binseq.zip': raise BadFileException( "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'." ) elif test_ext == 'txt' and file_format != 'txtseq.zip': raise BadFileException( "Invalid 'File Format' for archive consisting of text files - use 'Txtseq.zip'." ) if not ( file_format == 'binseq.zip' or file_format == 'txtseq.zip' ): raise BadFileException( "you must manually set the 'File Format' to either 'Binseq.zip' or 'Txtseq.zip' when uploading zip files." ) data_type = 'zip' ext = file_format if not data_type: if self.check_binary( temp_name ): try: ext = name.split( "." )[1].strip().lower() except: ext = '' if not( ext == 'ab1' or ext == 'scf' ): raise BadFileException( "you attempted to upload an inappropriate file." ) if ext == 'ab1' and file_format != 'ab1': raise BadFileException( "you must manually set the 'File Format' to 'Ab1' when uploading ab1 files." ) elif ext == 'scf' and file_format != 'scf': raise BadFileException( "you must manually set the 'File Format' to 'Scf' when uploading scf files." ) data_type = 'binary' if not data_type: # We must have a text file if self.check_html( temp_name ): raise BadFileException( "you attempted to upload an inappropriate file." 
) if data_type != 'binary' and data_type != 'zip': if space_to_tab: line_count = sniff.convert_newlines_sep2tabs( temp_name ) elif os.stat( temp_name ).st_size < 262144000: # 250MB line_count = sniff.convert_newlines( temp_name ) else: if sniff.check_newlines( temp_name ): line_count = sniff.convert_newlines( temp_name ) else: line_count = None if file_format == 'auto': ext = sniff.guess_ext( temp_name, sniff_order=trans.app.datatypes_registry.sniff_order ) else: ext = file_format data_type = ext if info is None: info = 'uploaded %s file' % data_type if file_format == 'auto': data_type = sniff.guess_ext( temp_name, sniff_order=trans.app.datatypes_registry.sniff_order ) else: data_type = file_format if replace_dataset: # The replace_dataset param ( when not None ) refers to a LibraryDataset that is being replaced with a new version. library_dataset = replace_dataset else: # If replace_dataset is None, the Library level permissions will be taken from the folder and applied to the new # LibraryDataset, and the current user's DefaultUserPermissions will be applied to the associated Dataset. library_dataset = trans.app.model.LibraryDataset( folder=folder, name=name, info=info ) library_dataset.flush() trans.app.security_agent.copy_library_permissions( folder, library_dataset ) ldda = trans.app.model.LibraryDatasetDatasetAssociation( name=name, info=info, extension=data_type, dbkey=dbkey, library_dataset=library_dataset, user=trans.get_user(), create_dataset=True ) ldda.message = message ldda.flush() # Permissions must be the same on the LibraryDatasetDatasetAssociation and the associated LibraryDataset trans.app.security_agent.copy_library_permissions( library_dataset, ldda ) if replace_dataset: # Copy the Dataset level permissions from replace_dataset to the new LibraryDatasetDatasetAssociation.dataset trans.app.security_agent.copy_dataset_permissions( replace_dataset.library_dataset_dataset_association.dataset, ldda.dataset ) else: # Copy the current user's DefaultUserPermissions to the new LibraryDatasetDatasetAssociation.dataset trans.app.security_agent.set_all_dataset_permissions( ldda.dataset, trans.app.security_agent.user_get_default_permissions( trans.get_user() ) ) folder.add_library_dataset( library_dataset, genome_build=dbkey ) folder.flush() library_dataset.library_dataset_dataset_association_id = ldda.id library_dataset.flush() # Handle any templates included in the upload form if library_item_info_template: user = trans.get_user() library_item_info = trans.app.model.LibraryItemInfo( user=user ) library_item_info.library_item_info_template = library_item_info_template library_item_info.flush() trans.app.security_agent.copy_library_permissions( library_item_info_template, library_item_info ) for template_element in library_item_info_template.elements: info_element_value = template_elements.get( "info_element_%s_%s" % ( library_item_info_template.id, template_element.id ), None ) info_element = trans.app.model.LibraryItemInfoElement() info_element.contents = info_element_value info_element.library_item_info_template_element = template_element info_element.library_item_info = library_item_info info_element.flush() library_item_info_association = trans.app.model.LibraryDatasetDatasetInfoAssociation( user=user ) library_item_info_association.set_library_item( ldda ) library_item_info_association.library_item_info = library_item_info library_item_info_association.flush() # If roles were selected upon upload, restrict access to the Dataset to those roles if roles: for role in roles: dp = 
trans.app.model.DatasetPermissions( RBACAgent.permitted_actions.DATASET_ACCESS.action, ldda.dataset, role ) dp.flush() shutil.move( temp_name, ldda.dataset.file_name ) ldda.state = ldda.states.OK ldda.init_meta() if line_count: try: if is_multi_byte: ldda.set_multi_byte_peek( line_count=line_count ) else: ldda.set_peek( line_count=line_count ) except: if is_multi_byte: ldda.set_multi_byte_peek() else: ldda.set_peek() else: if is_multi_byte: ldda.set_multi_byte_peek() else: ldda.set_peek() ldda.set_size() if ldda.missing_meta(): ldda.datatype.set_meta( ldda ) ldda.flush() return ldda
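# Standalone sketch of the chunked decompression pattern used inside add_file above:
# read the gzip stream 1 MB at a time so large uploads never need to fit in memory,
# then replace the compressed temp file with the decompressed copy.  gunzip_in_place
# is a hypothetical helper name; only the pattern matches the code above.
import gzip
import os
import shutil
import tempfile


def gunzip_in_place(path, chunk_size=2 ** 20):  # 1 MB chunks
    fd, uncompressed = tempfile.mkstemp(prefix='upload_gunzip_')
    try:
        with gzip.GzipFile(path, 'rb') as gzipped_file:
            while True:
                chunk = gzipped_file.read(chunk_size)
                if not chunk:
                    break
                os.write(fd, chunk)
    except Exception:
        os.close(fd)
        os.remove(uncompressed)
        raise
    os.close(fd)
    shutil.move(uncompressed, path)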
def get_uploaded_datasets(self, trans, context, override_name=None, override_info=None): def get_data_file_filename(data_file, override_name=None, override_info=None, purge=True): dataset_name = override_name def get_file_name(file_name): file_name = file_name.split('\\')[-1] file_name = file_name.split('/')[-1] return file_name try: # Use the existing file if not dataset_name and 'filename' in data_file: dataset_name = get_file_name(data_file['filename']) return Bunch(type='file', path=data_file['local_filename'], name=dataset_name, purge_source=purge) except Exception: # The uploaded file should've been persisted by the upload tool action return Bunch(type=None, path=None, name=None) def get_url_paste_urls_or_filename(group_incoming, override_name=None, override_info=None): url_paste_file = group_incoming.get('url_paste', None) if url_paste_file is not None: url_paste = open(url_paste_file, 'r').read() def start_of_url(content): start_of_url_paste = content.lstrip()[0:8].lower() looks_like_url = False for url_prefix in ["http://", "https://", "ftp://", "file://"]: if start_of_url_paste.startswith(url_prefix): looks_like_url = True break return looks_like_url if start_of_url(url_paste): url_paste = url_paste.replace('\r', '').split('\n') for line in url_paste: line = line.strip() if line: if not start_of_url(line): continue # non-url line, ignore if "file://" in line: if not trans.user_is_admin: raise AdminRequiredException() elif not trans.app.config.allow_path_paste: raise ConfigDoesNotAllowException() upload_path = line[len("file://"):] dataset_name = os.path.basename(upload_path) else: dataset_name = line if override_name: dataset_name = override_name yield Bunch(type='url', path=line, name=dataset_name) else: dataset_name = 'Pasted Entry' # we need to differentiate between various url pastes here if override_name: dataset_name = override_name yield Bunch(type='file', path=url_paste_file, name=dataset_name) def get_one_filename(context): data_file = context['file_data'] url_paste = context['url_paste'] ftp_files = context['ftp_files'] name = context.get('NAME', None) info = context.get('INFO', None) uuid = context.get('uuid', None) or None # Turn '' to None file_type = context.get('file_type', None) dbkey = self.get_dbkey(context) warnings = [] to_posix_lines = False if context.get('to_posix_lines', None) not in ["None", None, False]: to_posix_lines = True auto_decompress = False if context.get('auto_decompress', None) not in ["None", None, False]: auto_decompress = True space_to_tab = False if context.get('space_to_tab', None) not in ["None", None, False]: space_to_tab = True file_bunch = get_data_file_filename(data_file, override_name=name, override_info=info) if file_bunch.path: if url_paste is not None and url_paste.strip(): warnings.append("All file contents specified in the paste box were ignored.") if ftp_files: warnings.append("All FTP uploaded file selections were ignored.") elif url_paste is not None and url_paste.strip(): # we need to use url_paste for file_bunch in get_url_paste_urls_or_filename(context, override_name=name, override_info=info): if file_bunch.path: break if file_bunch.path and ftp_files is not None: warnings.append("All FTP uploaded file selections were ignored.") elif ftp_files is not None and trans.user is not None: # look for files uploaded via FTP user_ftp_dir = trans.user_ftp_dir assert not os.path.islink(user_ftp_dir), "User FTP directory cannot be a symbolic link" for (dirpath, dirnames, filenames) in os.walk(user_ftp_dir): for filename in filenames: 
for ftp_filename in ftp_files: if ftp_filename == filename: path = relpath(os.path.join(dirpath, filename), user_ftp_dir) if not os.path.islink(os.path.join(dirpath, filename)): ftp_data_file = {'local_filename' : os.path.abspath(os.path.join(user_ftp_dir, path)), 'filename' : os.path.basename(path)} purge = getattr(trans.app.config, 'ftp_upload_purge', True) file_bunch = get_data_file_filename( ftp_data_file, override_name=name, override_info=info, purge=purge, ) if file_bunch.path: break if file_bunch.path: break if file_bunch.path: break file_bunch.to_posix_lines = to_posix_lines file_bunch.auto_decompress = auto_decompress file_bunch.space_to_tab = space_to_tab file_bunch.uuid = uuid if file_type is not None: file_bunch.file_type = file_type if dbkey is not None: file_bunch.dbkey = dbkey return file_bunch, warnings def get_filenames(context): rval = [] data_file = context['file_data'] ftp_files = context['ftp_files'] uuid = context.get('uuid', None) or None # Turn '' to None name = context.get('NAME', None) info = context.get('INFO', None) file_type = context.get('file_type', None) dbkey = self.get_dbkey(context) to_posix_lines = False if context.get('to_posix_lines', None) not in ["None", None, False]: to_posix_lines = True auto_decompress = False if context.get('auto_decompress', None) not in ["None", None, False]: auto_decompress = True space_to_tab = False if context.get('space_to_tab', None) not in ["None", None, False]: space_to_tab = True file_bunch = get_data_file_filename(data_file, override_name=name, override_info=info) file_bunch.uuid = uuid if file_bunch.path: file_bunch.to_posix_lines = to_posix_lines file_bunch.auto_decompress = auto_decompress file_bunch.space_to_tab = space_to_tab if file_type is not None: file_bunch.file_type = file_type if dbkey is not None: file_bunch.dbkey = dbkey rval.append(file_bunch) for file_bunch in get_url_paste_urls_or_filename(context, override_name=name, override_info=info): if file_bunch.path: file_bunch.uuid = uuid file_bunch.to_posix_lines = to_posix_lines file_bunch.auto_decompress = auto_decompress file_bunch.space_to_tab = space_to_tab if file_type is not None: file_bunch.file_type = file_type if dbkey is not None: file_bunch.dbkey = dbkey rval.append(file_bunch) # look for files uploaded via FTP valid_files = [] if ftp_files is not None: # Normalize input paths to ensure utf-8 encoding is normal form c. # This allows for comparison when the filesystem uses a different encoding than the browser. 
ftp_files = [unicodedata.normalize('NFC', f) for f in ftp_files if isinstance(f, text_type)] if trans.user is None: log.warning('Anonymous user passed values in ftp_files: %s' % ftp_files) ftp_files = [] # TODO: warning to the user (could happen if session has become invalid) else: user_ftp_dir = trans.user_ftp_dir assert not os.path.islink(user_ftp_dir), "User FTP directory cannot be a symbolic link" for (dirpath, dirnames, filenames) in os.walk(user_ftp_dir): for filename in filenames: path = relpath(os.path.join(dirpath, filename), user_ftp_dir) if not os.path.islink(os.path.join(dirpath, filename)): # Normalize filesystem paths if isinstance(path, text_type): valid_files.append(unicodedata.normalize('NFC', path)) else: valid_files.append(path) else: ftp_files = [] for ftp_file in ftp_files: if ftp_file not in valid_files: log.warning('User passed an invalid file path in ftp_files: %s' % ftp_file) continue # TODO: warning to the user (could happen if file is already imported) ftp_data_file = {'local_filename' : os.path.abspath(os.path.join(user_ftp_dir, ftp_file)), 'filename' : os.path.basename(ftp_file)} purge = getattr(trans.app.config, 'ftp_upload_purge', True) file_bunch = get_data_file_filename(ftp_data_file, override_name=name, override_info=info, purge=purge) if file_bunch.path: file_bunch.to_posix_lines = to_posix_lines file_bunch.auto_decompress = auto_decompress file_bunch.space_to_tab = space_to_tab if file_type is not None: file_bunch.file_type = file_type if dbkey is not None: file_bunch.dbkey = dbkey rval.append(file_bunch) return rval file_type = self.get_file_type(context) file_count = self.get_file_count(trans, context) d_type = self.get_datatype(trans, context) dbkey = self.get_dbkey(context) tag_using_filenames = context.get('tag_using_filenames', False) force_composite = asbool(context.get('force_composite', 'False')) writable_files = d_type.writable_files writable_files_offset = 0 groups_incoming = [None for _ in range(file_count)] for group_incoming in context.get(self.name, []): i = int(group_incoming['__index__']) groups_incoming[i] = group_incoming if d_type.composite_type is not None or force_composite: # handle uploading of composite datatypes # Only one Dataset can be created dataset = Bunch() dataset.type = 'composite' dataset.file_type = file_type dataset.dbkey = dbkey dataset.datatype = d_type dataset.warnings = [] dataset.metadata = {} dataset.composite_files = {} dataset.uuid = None dataset.tag_using_filenames = None # load metadata files_metadata = context.get(self.metadata_ref, {}) metadata_name_substition_default_dict = dict((composite_file.substitute_name_with_metadata, d_type.metadata_spec[composite_file.substitute_name_with_metadata].default) for composite_file in d_type.composite_files.values() if composite_file.substitute_name_with_metadata) for meta_name, meta_spec in d_type.metadata_spec.items(): if meta_spec.set_in_upload: if meta_name in files_metadata: meta_value = files_metadata[meta_name] if meta_name in metadata_name_substition_default_dict: meta_value = sanitize_for_filename(meta_value, default=metadata_name_substition_default_dict[meta_name]) dataset.metadata[meta_name] = meta_value dataset.name = self.get_composite_dataset_name(context) if dataset.datatype.composite_type == 'auto_primary_file': # replace sniff here with just creating an empty file temp_name = sniff.stream_to_file(StringIO(d_type.generate_primary_file(dataset)), prefix='upload_auto_primary_file') dataset.primary_file = temp_name dataset.to_posix_lines = True 
dataset.auto_decompress = True dataset.space_to_tab = False else: file_bunch, warnings = get_one_filename(groups_incoming[0]) writable_files_offset = 1 dataset.primary_file = file_bunch.path dataset.to_posix_lines = file_bunch.to_posix_lines dataset.auto_decompress = file_bunch.auto_decompress dataset.space_to_tab = file_bunch.space_to_tab if file_bunch.file_type: dataset.file_type = file_type if file_bunch.dbkey: dataset.dbkey = dbkey dataset.warnings.extend(warnings) if dataset.primary_file is None: # remove this before finish, this should create an empty dataset raise Exception('No primary dataset file was available for composite upload') if not force_composite: keys = [value.name for value in writable_files.values()] else: keys = [str(index) for index in range(file_count)] for i, group_incoming in enumerate(groups_incoming[writable_files_offset:]): key = keys[i + writable_files_offset] if not force_composite and group_incoming is None and not writable_files[list(writable_files.keys())[keys.index(key)]].optional: dataset.warnings.append("A required composite file (%s) was not specified." % (key)) dataset.composite_files[key] = None else: file_bunch, warnings = get_one_filename(group_incoming) dataset.warnings.extend(warnings) if file_bunch.path: if force_composite: key = group_incoming.get("NAME") or i dataset.composite_files[key] = file_bunch.__dict__ elif not force_composite: dataset.composite_files[key] = None if not writable_files[list(writable_files.keys())[keys.index(key)]].optional: dataset.warnings.append("A required composite file (%s) was not specified." % (key)) return [dataset] else: rval = [] for i, file_contexts in enumerate(context[self.name]): datasets = get_filenames(file_contexts) for dataset in datasets: override_file_type = self.get_file_type(context[self.name][i], parent_context=context) d_type = self.get_datatype(trans, context[self.name][i], parent_context=context) dataset.file_type = override_file_type dataset.datatype = d_type dataset.ext = self.get_datatype_ext(trans, context[self.name][i], parent_context=context) dataset.dbkey = self.get_dbkey(context[self.name][i], parent_context=context) dataset.tag_using_filenames = tag_using_filenames rval.append(dataset) return rval
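# Small sketch of the form-value convention the to_posix_lines / auto_decompress /
# space_to_tab checks above rely on: values posted from the tool form may arrive as the
# string "None", the real None, or False, so anything outside that set is treated as
# the option being switched on.  checkbox_is_set is a hypothetical helper for illustration.
def checkbox_is_set(value):
    return value not in ["None", None, False]


assert checkbox_is_set("true")
assert checkbox_is_set(True)
assert not checkbox_is_set("None")
assert not checkbox_is_set(None)
assert not checkbox_is_set(False)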
def add_file(dataset, registry, json_file, output_path): data_type = None line_count = None converted_path = None stdout = None link_data_only = dataset.get('link_data_only', 'copy_files') != 'copy_files' # run_as_real_user is estimated from galaxy config (external chmod indicated of inputs executed) # If this is True we always purge supplied upload inputs so they are cleaned up and we reuse their # paths during data conversions since this user already owns that path. # Older in_place check for upload jobs created before 18.01, TODO remove in 19.XX. xref #5206 run_as_real_user = dataset.get('run_as_real_user', False) or dataset.get("in_place", False) # purge_source defaults to True unless this is an FTP import and # ftp_upload_purge has been overridden to False in Galaxy's config. # We set purge_source to False if: # - the job does not have write access to the file, e.g. when running as the # real user # - the files are uploaded from external paths. purge_source = dataset.get('purge_source', True) and not run_as_real_user and dataset.type not in ('server_dir', 'path_paste') # in_place is True unless we are running as a real user or importing external paths (i.e. # this is a real upload and not a path paste or ftp import). # in_place should always be False if running as real user because the uploaded file will # be owned by Galaxy and not the user and it should be False for external paths so Galaxy doesn't # modify files not controlled by Galaxy. in_place = not run_as_real_user and dataset.type not in ('server_dir', 'path_paste', 'ftp_import') # Base on the check_upload_content Galaxy config option and on by default, this enables some # security related checks on the uploaded content, but can prevent uploads from working in some cases. check_content = dataset.get('check_content' , True) # auto_decompress is a request flag that can be swapped off to prevent Galaxy from automatically # decompressing archive files before sniffing. auto_decompress = dataset.get('auto_decompress', True) try: ext = dataset.file_type except AttributeError: raise UploadProblemException('Unable to process uploaded file, missing file_type parameter.') if dataset.type == 'url': try: page = urlopen(dataset.path) # page will be .close()ed by sniff methods temp_name = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers)) except Exception as e: raise UploadProblemException('Unable to fetch %s\n%s' % (dataset.path, str(e))) dataset.path = temp_name # See if we have an empty file if not os.path.exists(dataset.path): raise UploadProblemException('Uploaded temporary file (%s) does not exist.' % dataset.path) if not os.path.getsize(dataset.path) > 0: raise UploadProblemException('The uploaded file is empty') # Is dataset content supported sniffable binary? 
is_binary = check_binary(dataset.path) if is_binary: # Sniff the data type guessed_ext = sniff.guess_ext(dataset.path, registry.sniff_order) # Set data_type only if guessed_ext is a binary datatype datatype = registry.get_datatype_by_extension(guessed_ext) if isinstance(datatype, Binary): data_type = guessed_ext ext = guessed_ext if not data_type: root_datatype = registry.get_datatype_by_extension(dataset.file_type) if getattr(root_datatype, 'compressed', False): data_type = 'compressed archive' ext = dataset.file_type else: # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress is_gzipped, is_valid = check_gzip(dataset.path, check_content=check_content) if is_gzipped and not is_valid: raise UploadProblemException('The gzipped uploaded file contains inappropriate content') elif is_gzipped and is_valid and auto_decompress: if not link_data_only: # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format CHUNK_SIZE = 2 ** 20 # 1Mb fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) gzipped_file = gzip.GzipFile(dataset.path, 'rb') while 1: try: chunk = gzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) raise UploadProblemException('Problem decompressing gzipped data') if not chunk: break os.write(fd, chunk) os.close(fd) gzipped_file.close() # Replace the gzipped file with the decompressed file if it's safe to do so if not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.gz') data_type = 'gzip' if not data_type: # See if we have a bz2 file, much like gzip is_bzipped, is_valid = check_bz2(dataset.path, check_content) if is_bzipped and not is_valid: raise UploadProblemException('The gzipped uploaded file contains inappropriate content') elif is_bzipped and is_valid and auto_decompress: if not link_data_only: # We need to uncompress the temp_name file CHUNK_SIZE = 2 ** 20 # 1Mb fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) bzipped_file = bz2.BZ2File(dataset.path, 'rb') while 1: try: chunk = bzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) raise UploadProblemException('Problem decompressing bz2 compressed data') if not chunk: break os.write(fd, chunk) os.close(fd) bzipped_file.close() # Replace the bzipped file with the decompressed file if it's safe to do so if not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.bz2') data_type = 'bz2' if not data_type: # See if we have a zip archive is_zipped = check_zip(dataset.path) if is_zipped and auto_decompress: if not link_data_only: CHUNK_SIZE = 2 ** 20 # 1Mb uncompressed = None uncompressed_name = None unzipped = False z = zipfile.ZipFile(dataset.path) for name in z.namelist(): if name.endswith('/'): continue if unzipped: stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' 
break fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) if sys.version_info[:2] >= (2, 6): zipped_file = z.open(name) while 1: try: chunk = zipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) raise UploadProblemException('Problem decompressing zipped data') if not chunk: break os.write(fd, chunk) os.close(fd) zipped_file.close() uncompressed_name = name unzipped = True else: # python < 2.5 doesn't have a way to read members in chunks(!) try: with open(uncompressed, 'wb') as outfile: outfile.write(z.read(name)) uncompressed_name = name unzipped = True except IOError: os.close(fd) os.remove(uncompressed) raise UploadProblemException('Problem decompressing zipped data') z.close() # Replace the zipped file with the decompressed file if it's safe to do so if uncompressed is not None: if not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = uncompressed_name data_type = 'zip' if not data_type: if is_binary or registry.is_extension_unsniffable_binary(dataset.file_type): # We have a binary dataset, but it is not Bam, Sff or Pdf data_type = 'binary' parts = dataset.name.split(".") if len(parts) > 1: ext = parts[-1].strip().lower() is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext) if check_content and not is_ext_unsniffable_binary: raise UploadProblemException('The uploaded binary file contains inappropriate content') elif is_ext_unsniffable_binary and dataset.file_type != ext: err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext) raise UploadProblemException(err_msg) if not data_type: # We must have a text file if check_content and check_html(dataset.path): raise UploadProblemException('The uploaded file contains inappropriate HTML content') if data_type != 'binary': if not link_data_only and data_type not in ('gzip', 'bz2', 'zip'): # Convert universal line endings to Posix line endings if to_posix_lines is True # and the data is not binary or gzip-, bz2- or zip-compressed. if dataset.to_posix_lines: tmpdir = output_adjacent_tmpdir(output_path) tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id if dataset.space_to_tab: line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix) else: line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix) if dataset.file_type == 'auto': ext = sniff.guess_ext(converted_path or dataset.path, registry.sniff_order) else: ext = dataset.file_type data_type = ext # Save job info for the framework if ext == 'auto' and data_type == 'binary': ext = 'data' if ext == 'auto' and dataset.ext: ext = dataset.ext if ext == 'auto': ext = 'data' datatype = registry.get_datatype_by_extension(ext) if dataset.type in ('server_dir', 'path_paste') and link_data_only: # Never alter a file that will not be copied to Galaxy's local file store. if datatype.dataset_content_needs_grooming(dataset.path): err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \ '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.' 
raise UploadProblemException(err_msg) if not link_data_only and converted_path: # Move the dataset to its "real" path try: shutil.move(converted_path, output_path) except OSError as e: # We may not have permission to remove converted_path if e.errno != errno.EACCES: raise elif not link_data_only: if purge_source: shutil.move(dataset.path, output_path) else: shutil.copy(dataset.path, output_path) # Write the job info stdout = stdout or 'uploaded %s file' % data_type info = dict(type='dataset', dataset_id=dataset.dataset_id, ext=ext, stdout=stdout, name=dataset.name, line_count=line_count) if dataset.get('uuid', None) is not None: info['uuid'] = dataset.get('uuid') json_file.write(dumps(info) + "\n") if not link_data_only and datatype and datatype.dataset_content_needs_grooming(output_path): # Groom the dataset content if necessary datatype.groom_dataset_content(output_path)
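# Hedged sketch of the per-dataset job info records add_file writes above: one JSON
# object per line, so the framework can read results back a line at a time.  The file
# name and field values below are invented for illustration.
import json

record = {
    "type": "dataset",
    "dataset_id": 42,
    "ext": "tabular",
    "stdout": "uploaded tabular file",
    "name": "example.tsv",
    "line_count": 100,
}
with open("example_job_info.json", "a") as json_file:
    json_file.write(json.dumps(record) + "\n")

with open("example_job_info.json") as fh:
    records = [json.loads(line) for line in fh if line.strip()]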
def get_uploaded_datasets( self, trans, context, override_name = None, override_info = None ): def get_data_file_filename( data_file, override_name = None, override_info = None ): dataset_name = override_name dataset_info = override_info def get_file_name( file_name ): file_name = file_name.split( '\\' )[-1] file_name = file_name.split( '/' )[-1] return file_name try: # Use the existing file if not dataset_name and 'filename' in data_file: dataset_name = get_file_name( data_file['filename'] ) if not dataset_info: dataset_info = 'uploaded file' return Bunch( type='file', path=data_file['local_filename'], name=get_file_name( data_file['filename'] ) ) #return 'file', data_file['local_filename'], get_file_name( data_file.filename ), dataset_name, dataset_info except: # The uploaded file should've been persisted by the upload tool action return Bunch( type=None, path=None, name=None ) #return None, None, None, None, None def get_url_paste_urls_or_filename( group_incoming, override_name = None, override_info = None ): filenames = [] url_paste_file = group_incoming.get( 'url_paste', None ) if url_paste_file is not None: url_paste = open( url_paste_file, 'r' ).read( 1024 ) if url_paste.lstrip().lower().startswith( 'http://' ) or url_paste.lstrip().lower().startswith( 'ftp://' ): url_paste = url_paste.replace( '\r', '' ).split( '\n' ) for line in url_paste: line = line.strip() if line: if not line.lower().startswith( 'http://' ) and not line.lower().startswith( 'ftp://' ): continue # non-url line, ignore precreated_name = line dataset_name = override_name if not dataset_name: dataset_name = line dataset_info = override_info if not dataset_info: dataset_info = 'uploaded url' yield Bunch( type='url', path=line, name=precreated_name ) #yield ( 'url', line, precreated_name, dataset_name, dataset_info ) else: dataset_name = dataset_info = precreated_name = 'Pasted Entry' #we need to differentiate between various url pastes here if override_name: dataset_name = override_name if override_info: dataset_info = override_info yield Bunch( type='file', path=url_paste_file, name=precreated_name ) #yield ( 'file', url_paste_file, precreated_name, dataset_name, dataset_info ) def get_one_filename( context ): data_file = context['file_data'] url_paste = context['url_paste'] name = context.get( 'NAME', None ) info = context.get( 'INFO', None ) warnings = [] space_to_tab = False if context.get( 'space_to_tab', None ) not in ["None", None]: space_to_tab = True file_bunch = get_data_file_filename( data_file, override_name = name, override_info = info ) if file_bunch.path and url_paste: if url_paste.strip(): warnings.append( "All file contents specified in the paste box were ignored." 
) else: #we need to use url_paste for file_bunch in get_url_paste_urls_or_filename( context, override_name = name, override_info = info ): if file_bunch.path: break return file_bunch, warnings def get_filenames( context ): rval = [] data_file = context['file_data'] url_paste = context['url_paste'] name = context.get( 'NAME', None ) info = context.get( 'INFO', None ) space_to_tab = False if context.get( 'space_to_tab', None ) not in ["None", None]: space_to_tab = True warnings = [] file_bunch = get_data_file_filename( data_file, override_name = name, override_info = info ) if file_bunch.path: file_bunch.space_to_tab = space_to_tab rval.append( file_bunch ) for file_bunch in get_url_paste_urls_or_filename( context, override_name = name, override_info = info ): if file_bunch.path: file_bunch.space_to_tab = space_to_tab rval.append( file_bunch ) return rval file_type = self.get_file_type( context ) d_type = self.get_datatype( trans, context ) dbkey = context.get( 'dbkey', None ) writable_files = d_type.writable_files writable_files_offset = 0 groups_incoming = [ None for filename in writable_files ] for group_incoming in context.get( self.name, [] ): i = int( group_incoming['__index__'] ) groups_incoming[ i ] = group_incoming if d_type.composite_type is not None: #handle uploading of composite datatypes #Only one Dataset can be created dataset = Bunch() dataset.type = 'composite' dataset.file_type = file_type dataset.dbkey = dbkey dataset.datatype = d_type dataset.warnings = [] dataset.metadata = {} dataset.composite_files = {} #load metadata files_metadata = context.get( self.metadata_ref, {} ) for meta_name, meta_spec in d_type.metadata_spec.iteritems(): if meta_spec.set_in_upload: if meta_name in files_metadata: dataset.metadata[ meta_name ] = files_metadata[ meta_name ] dataset_name = None dataset_info = None if dataset.datatype.composite_type == 'auto_primary_file': #replace sniff here with just creating an empty file temp_name, is_multi_byte = sniff.stream_to_file( StringIO.StringIO( d_type.generate_primary_file() ), prefix='upload_auto_primary_file' ) dataset.primary_file = temp_name dataset.space_to_tab = False dataset.precreated_name = dataset.name = 'Uploaded Composite Dataset (%s)' % ( file_type ) else: file_bunch, warnings = get_one_filename( groups_incoming[ 0 ] ) if dataset.datatype.composite_type: precreated_name = 'Uploaded Composite Dataset (%s)' % ( file_type ) writable_files_offset = 1 dataset.primary_file = file_bunch.path dataset.space_to_tab = file_bunch.space_to_tab dataset.precreated_name = file_bunch.precreated_name dataset.name = file_bunch.precreated_name dataset.warnings.extend( file_bunch.warnings ) if dataset.primary_file is None:#remove this before finish, this should create an empty dataset raise Exception( 'No primary dataset file was available for composite upload' ) keys = [ value.name for value in writable_files.values() ] for i, group_incoming in enumerate( groups_incoming[ writable_files_offset : ] ): key = keys[ i + writable_files_offset ] if group_incoming is None and not writable_files[ writable_files.keys()[ keys.index( key ) ] ].optional: dataset.warnings.append( "A required composite file (%s) was not specified." 
% ( key ) ) dataset.composite_files[ key ] = None else: file_bunch, warnings = get_one_filename( group_incoming ) if file_bunch.path: dataset.composite_files[ key ] = file_bunch.__dict__ else: dataset.composite_files[ key ] = None if not writable_files[ writable_files.keys()[ keys.index( key ) ] ].optional: dataset.warnings.append( "A required composite file (%s) was not specified." % ( key ) ) return [ dataset ] else: datasets = get_filenames( context[ self.name ][0] ) rval = [] for dataset in datasets: dataset.file_type = file_type dataset.datatype = d_type dataset.ext = self.get_datatype_ext( trans, context ) dataset.dbkey = dbkey rval.append( dataset ) return rval
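# For reference, a minimal stand-in for the Bunch helper used throughout the upload code
# above: an attribute-accessible namespace whose __dict__ is the mapping itself, which is
# why file_bunch.__dict__ can be stored directly into dataset.composite_files.  The real
# class ships with Galaxy's utilities; this sketch only illustrates the behaviour relied on.
class Bunch(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.__dict__ = self


file_bunch = Bunch(type='file', path='/tmp/upload_example', name='example.txt')
file_bunch.space_to_tab = False
assert file_bunch.path == '/tmp/upload_example'
assert file_bunch.__dict__['space_to_tab'] is False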