def __init__( self, datatypes_registry = None, ext = 'data', dbkey = '?' ):
    self.ext = self.extension = ext
    self.dbkey = dbkey
    if datatypes_registry is None:
        datatypes_registry = Registry()
    self.datatype = datatypes_registry.get_datatype_by_extension( ext )
    self._metadata = None
    self.metadata = MetadataCollection( self )
def __init__(self, config):
    self.object_store = build_object_store_from_config(config)
    # Setup the database engine and ORM
    self.model = galaxy.config.init_models_from_config(config, object_store=self.object_store)
    registry = Registry()
    registry.load_datatypes()
    galaxy.model.set_datatypes_registry(registry)
def __main__():
    if len(sys.argv) < 4:
        print('usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...', file=sys.stderr)
        sys.exit(1)
    output_paths = parse_outputs(sys.argv[4:])
    registry = Registry()
    registry.load_datatypes(root_dir=sys.argv[1], config=sys.argv[2])
    try:
        datasets = __read_paramfile(sys.argv[3])
    except (ValueError, AssertionError):
        datasets = __read_old_paramfile(sys.argv[3])
    metadata = []
    for dataset in datasets:
        dataset = bunch.Bunch(**safe_dict(dataset))
        try:
            output_path = output_paths[int(dataset.dataset_id)][0]
        except Exception:
            print('Output path for dataset %s not found on command line' % dataset.dataset_id, file=sys.stderr)
            sys.exit(1)
        try:
            if dataset.type == 'composite':
                files_path = output_paths[int(dataset.dataset_id)][1]
                metadata.append(add_composite_file(dataset, registry, output_path, files_path))
            else:
                metadata.append(add_file(dataset, registry, output_path))
        except UploadProblemException as e:
            metadata.append(file_err(unicodify(e), dataset))
    __write_job_metadata(metadata)
def __main__(): if len(sys.argv) < 4: print >> sys.stderr, 'usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...' sys.exit(1) output_paths = parse_outputs(sys.argv[4:]) json_file = open('galaxy.json', 'w') registry = Registry() registry.load_datatypes(root_dir=sys.argv[1], config=sys.argv[2]) for line in open(sys.argv[3], 'r'): dataset = from_json_string(line) dataset = util.bunch.Bunch(**safe_dict(dataset)) try: output_path = output_paths[int(dataset.dataset_id)][0] except: print >> sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id sys.exit(1) if dataset.type == 'composite': files_path = output_paths[int(dataset.dataset_id)][1] add_composite_file(dataset, registry, json_file, output_path, files_path) else: add_file(dataset, registry, json_file, output_path) # clean up paramfile # TODO: this will not work when running as the actual user unless the # parent directory is writable by the user. try: os.remove(sys.argv[3]) except: pass
def __main__():
    if len(sys.argv) < 4:
        print('usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...', file=sys.stderr)
        sys.exit(1)
    output_paths = parse_outputs(sys.argv[4:])
    registry = Registry()
    registry.load_datatypes(root_dir=sys.argv[1], config=sys.argv[2])
    try:
        datasets = __read_paramfile(sys.argv[3])
    except (ValueError, AssertionError):
        datasets = __read_old_paramfile(sys.argv[3])
    metadata = []
    for dataset in datasets:
        dataset = bunch.Bunch(**safe_dict(dataset))
        try:
            output_path = output_paths[int(dataset.dataset_id)][0]
        except Exception:
            print('Output path for dataset %s not found on command line' % dataset.dataset_id, file=sys.stderr)
            sys.exit(1)
        try:
            if dataset.type == 'composite':
                files_path = output_paths[int(dataset.dataset_id)][1]
                metadata.append(add_composite_file(dataset, output_path, files_path))
            else:
                metadata.append(add_file(dataset, registry, output_path))
        except UploadProblemException as e:
            metadata.append(file_err(e.message, dataset))
    __write_job_metadata(metadata)
def __main__(): if len(sys.argv) < 4: print('usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...', file=sys.stderr) sys.exit(1) output_paths = parse_outputs(sys.argv[4:]) json_file = open('galaxy.json', 'w') registry = Registry() registry.load_datatypes(root_dir=sys.argv[1], config=sys.argv[2]) for line in open(sys.argv[3], 'r'): dataset = loads(line) dataset = util.bunch.Bunch(**safe_dict(dataset)) try: output_path = output_paths[int(dataset.dataset_id)][0] except: print('Output path for dataset %s not found on command line' % dataset.dataset_id, file=sys.stderr) sys.exit(1) if dataset.type == 'composite': files_path = output_paths[int(dataset.dataset_id)][1] add_composite_file(dataset, json_file, output_path, files_path) else: add_file(dataset, registry, json_file, output_path) # clean up paramfile # TODO: this will not work when running as the actual user unless the # parent directory is writable by the user. try: os.remove(sys.argv[3]) except: pass
def __main__(): filename = sys.argv[1] try: max_file_size = int( sys.argv[2] ) except: max_file_size = 0 job_params, params = load_input_parameters( filename ) if job_params is None: #using an older tabular file enhanced_handling = False job_params = dict( param_dict = params ) job_params[ 'output_data' ] = [ dict( out_data_name = 'output', ext = 'data', file_name = filename, files_path = None ) ] job_params[ 'job_config' ] = dict( GALAXY_ROOT_DIR=GALAXY_ROOT_DIR, GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE, TOOL_PROVIDED_JOB_METADATA_FILE = TOOL_PROVIDED_JOB_METADATA_FILE ) else: enhanced_handling = True json_file = open( job_params[ 'job_config' ][ 'TOOL_PROVIDED_JOB_METADATA_FILE' ], 'w' ) #specially named file for output junk to pass onto set metadata datatypes_registry = Registry() datatypes_registry.load_datatypes( root_dir = job_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config = job_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] ) URL = params.get( 'URL', None ) #using exactly URL indicates that only one dataset is being downloaded URL_method = params.get( 'URL_method', None ) simpleD = params.get('galaxyData') # The Python support for fetching resources from the web is layered. urllib uses the httplib # library, which in turn uses the socket library. As of Python 2.3 you can specify how long # a socket should wait for a response before timing out. By default the socket module has no # timeout and can hang. Currently, the socket timeout is not exposed at the httplib or urllib2 # levels. However, you can set the default timeout ( in seconds ) globally for all sockets by # doing the following. socket.setdefaulttimeout( 600 ) cur_filename = params.get('output') outputfile = open( cur_filename, 'w' ).write( simpleD )
def __init__(self, datatypes_registry=None, ext='data', dbkey='?'):
    self.ext = self.extension = ext
    self.dbkey = dbkey
    if datatypes_registry is None:
        datatypes_registry = Registry()
    self.datatype = datatypes_registry.get_datatype_by_extension(ext)
    self._metadata = None
    self.metadata = MetadataCollection(self)
def __main__(): filename = sys.argv[1] try: max_file_size = int( sys.argv[2] ) except: max_file_size = 0 job_params, params = load_input_parameters( filename ) if job_params is None: #using an older tabular file enhanced_handling = False job_params = dict( param_dict = params ) job_params[ 'output_data' ] = [ dict( out_data_name = 'output', ext = 'data', file_name = filename, extra_files_path = None ) ] job_params[ 'job_config' ] = dict( GALAXY_ROOT_DIR=GALAXY_ROOT_DIR, GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE, TOOL_PROVIDED_JOB_METADATA_FILE = TOOL_PROVIDED_JOB_METADATA_FILE ) else: enhanced_handling = True json_file = open( job_params[ 'job_config' ][ 'TOOL_PROVIDED_JOB_METADATA_FILE' ], 'w' ) #specially named file for output junk to pass onto set metadata datatypes_registry = Registry() datatypes_registry.load_datatypes( root_dir = job_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config = job_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] ) URL = params.get( 'URL', None ) #using exactly URL indicates that only one dataset is being downloaded URL_method = params.get( 'URL_method', None ) # The Python support for fetching resources from the web is layered. urllib uses the httplib # library, which in turn uses the socket library. As of Python 2.3 you can specify how long # a socket should wait for a response before timing out. By default the socket module has no # timeout and can hang. Currently, the socket timeout is not exposed at the httplib or urllib2 # levels. However, you can set the default timeout ( in seconds ) globally for all sockets by # doing the following. socket.setdefaulttimeout( 600 ) for data_dict in job_params[ 'output_data' ]: cur_filename = data_dict.get( 'file_name', filename ) cur_URL = params.get( '%s|%s|URL' % ( GALAXY_PARAM_PREFIX, data_dict[ 'out_data_name' ] ), URL ) if not cur_URL: open( cur_filename, 'w' ).write( "" ) stop_err( 'The remote data source application has not sent back a URL parameter in the request.' ) # The following calls to urllib.urlopen() will use the above default timeout try: if not URL_method or URL_method == 'get': page = urllib.urlopen( cur_URL ) elif URL_method == 'post': page = urllib.urlopen( cur_URL, urllib.urlencode( params ) ) except Exception, e: stop_err( 'The remote data source application may be off line, please try again later. Error: %s' % str( e ) ) if max_file_size: file_size = int( page.info().get( 'Content-Length', 0 ) ) if file_size > max_file_size: stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % ( file_size, max_file_size ) ) #do sniff stream for multi_byte try: cur_filename, is_multi_byte = sniff.stream_to_open_named_file( page, os.open( cur_filename, os.O_WRONLY | os.O_CREAT ), cur_filename, source_encoding=get_charset_from_http_headers( page.headers ) ) except Exception, e: stop_err( 'Unable to fetch %s:\n%s' % ( cur_URL, e ) )
def __init__(self, config):
    self.object_store = build_object_store_from_config(config)
    # Setup the database engine and ORM
    self.model = galaxy.config.init_models_from_config(
        config, object_store=self.object_store)
    registry = Registry()
    registry.load_datatypes()
    galaxy.model.set_datatypes_registry(registry)
def collect_test_data():
    registry = Registry()
    registry.load_datatypes(root_dir=GALAXY_ROOT, config=DATATYPES_CONFIG)
    test_files = os.listdir(TEST_FILE_DIR)
    files = [os.path.join(TEST_FILE_DIR, f) for f in test_files]
    datatypes = [find_datatype(registry, f) for f in test_files]
    uploadable = [datatype.file_ext in registry.upload_file_formats for datatype in datatypes]
    test_data_description = [TEST_DATA(*items) for items in zip(files, datatypes, uploadable)]
    return {os.path.basename(data.path): data for data in test_data_description}
def __init__(self, config):
    if config.database_connection is False:
        config.database_connection = "sqlite:///%s?isolation_level=IMMEDIATE" % config.database
    self.object_store = build_object_store_from_config(config)
    # Setup the database engine and ORM
    self.model = galaxy.model.mapping.init(config.file_path, config.database_connection, engine_options={}, create_tables=False, object_store=self.object_store)
    registry = Registry()
    registry.load_datatypes()
    galaxy.model.set_datatypes_registry(registry)
def __init__( self, datatypes_registry=None, ext='data', dbkey='?' ):
    self.ext = self.extension = ext
    self.dbkey = dbkey
    if datatypes_registry is None:
        # Default Value Required for unit tests
        datatypes_registry = Registry()
        datatypes_registry.load_datatypes()
    self.datatype = datatypes_registry.get_datatype_by_extension( ext )
    self._metadata = None
    self.metadata = MetadataCollection( self )
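For context, a minimal sketch of how a constructor like the one above tends to be exercised in a unit test; the class name DatasetLike is a placeholder assumption, not taken from the original snippets:

# Hypothetical test sketch; DatasetLike stands in for whichever class defines the __init__ above.
d = DatasetLike(ext='bed')        # no registry passed, so a default Registry() is built and loaded
assert d.extension == 'bed'
assert d.datatype is not None     # resolved via get_datatype_by_extension('bed')
assert d.metadata is not None     # a MetadataCollection wrapping the instance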
def __init__(self, app_name, security, model, tool_data_path, shed_tool_data_path, tool_data_tables=None, registry=None, hgweb_config_manager=None):
    self.name = app_name
    self.security = security
    self.model = model
    self.config = Bunch()
    self.config.tool_data_path = tool_data_path
    self.config.shed_tool_data_path = shed_tool_data_path
    self.temporary_path = tempfile.mkdtemp(prefix='tool_validation_')
    self.config.tool_data_table_config = os.path.join(self.temporary_path, 'tool_data_table_conf.xml')
    self.config.shed_tool_data_table_config = os.path.join(self.temporary_path, 'shed_tool_data_table_conf.xml')
    self.tool_data_tables = tool_data_tables
    self.datatypes_registry = registry or Registry()
    self.hgweb_config_manager = hgweb_config_manager
    self.config.len_file_path = os.path.join(self.temporary_path, 'chromlen.txt')
    # If the builds file path is set to None, tools/__init__.py will load the default.
    # Otherwise it will attempt to load a nonexistent file and log an error. This does
    # not appear to be an issue with the len_file_path config option.
    self.config.builds_file_path = None
    self.genome_builds = GenomeBuilds(self)
def __main__(): if len( sys.argv ) < 4: print >>sys.stderr, 'usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...' sys.exit( 1 ) output_paths = parse_outputs( sys.argv[4:] ) json_file = open( 'galaxy.json', 'w' ) registry = Registry( sys.argv[1], sys.argv[2] ) for line in open( sys.argv[3], 'r' ): dataset = from_json_string( line ) dataset = util.bunch.Bunch( **safe_dict( dataset ) ) try: output_path = output_paths[int( dataset.dataset_id )][0] except: print >>sys.stderr, 'Output path for dataset %s not found on command line' % dataset.dataset_id sys.exit( 1 ) if dataset.type == 'composite': files_path = output_paths[int( dataset.dataset_id )][1] add_composite_file( dataset, registry, json_file, output_path, files_path ) else: add_file( dataset, registry, json_file, output_path ) # clean up paramfile try: os.remove( sys.argv[1] ) except: pass
def sniff_and_handle_data_type(json_params, output_file):
    """
    The sniff.handle_uploaded_dataset_file() method in Galaxy performs dual
    functions: it sniffs the filetype and if it's a compressed archive for
    a non compressed datatype such as fasta, it will be unpacked.
    """
    try:
        datatypes_registry = Registry()
        datatypes_registry.load_datatypes(
            root_dir=json_params['job_config']['GALAXY_ROOT_DIR'],
            config=json_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])
        file_type = sniff.handle_uploaded_dataset_file(output_file, datatypes_registry)
        return file_type
    except Exception:
        return None
def get_converters_for_collection(self, trans, id, datatypes_registry: Registry, instance_type="history"):
    dataset_collection_instance = self.get_dataset_collection_instance(
        trans, id=id, instance_type=instance_type, check_ownership=True
    )
    dbkeys_and_extensions = dataset_collection_instance.dataset_dbkeys_and_extensions_summary
    suitable_converters = set()
    first_extension = True
    most_recent_datatype = None
    # TODO error checking
    for datatype in dbkeys_and_extensions[1]:
        new_converters = datatypes_registry.get_converters_by_datatype(datatype)
        set_of_new_converters = set()
        for tgt_type, tgt_val in new_converters.items():
            converter = (tgt_type, tgt_val)
            set_of_new_converters.add(converter)
        if first_extension is True:
            suitable_converters = set_of_new_converters
            most_recent_datatype = datatype
            first_extension = False
        else:
            suitable_converters = suitable_converters.intersection(set_of_new_converters)
            if suitable_converters:
                most_recent_datatype = datatype
    suitable_tool_ids = list()
    for tool in suitable_converters:
        tool_info = {
            "tool_id": tool[1].id,
            "name": tool[1].name,
            "target_type": tool[0],
            "original_type": most_recent_datatype,
        }
        suitable_tool_ids.append(tool_info)
    return suitable_tool_ids
def __main__(): filename = sys.argv[1] try: int(sys.argv[2]) except Exception: pass job_params, params = load_input_parameters(filename) if job_params is None: # using an older tabular file job_params = dict(param_dict=params) job_params["output_data"] = [ dict(out_data_name="output", ext="data", file_name=filename, files_path=None) ] job_params["job_config"] = dict( GALAXY_ROOT_DIR=GALAXY_ROOT_DIR, GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE, TOOL_PROVIDED_JOB_METADATA_FILE=TOOL_PROVIDED_JOB_METADATA_FILE, ) # else: # json_file = open( # job_params["job_config"]["TOOL_PROVIDED_JOB_METADATA_FILE"], "w" # ) # specially named file for output junk to pass onto set metadata datatypes_registry = Registry() datatypes_registry.load_datatypes( root_dir=job_params["job_config"]["GALAXY_ROOT_DIR"], config=job_params["job_config"]["GALAXY_DATATYPES_CONF_FILE"], ) # URL = params.get( # "URL", None # ) # using exactly URL indicates that only one dataset is being downloaded params.get("URL_method", None) simpleD = params.get("galaxyData") # The Python support for fetching resources from the web is layered. urllib uses the httplib # library, which in turn uses the socket library. As of Python 2.3 you can specify how long # a socket should wait for a response before timing out. By default the socket module has no # timeout and can hang. Currently, the socket timeout is not exposed at the httplib or urllib2 # levels. However, you can set the default timeout ( in seconds ) globally for all sockets by # doing the following. socket.setdefaulttimeout(600) cur_filename = params.get("output") open(cur_filename, "w").write(simpleD)
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]
    args = _arg_parser().parse_args(argv)
    registry = Registry()
    registry.load_datatypes(root_dir=args.galaxy_root, config=args.datatypes_registry)
    request_path = args.request
    assert os.path.exists(request_path)
    with open(request_path) as f:
        request = json.load(f)
    upload_config = UploadConfig(request, registry)
    galaxy_json = _request_to_galaxy_json(upload_config, request)
    with open("galaxy.json", "w") as f:
        json.dump(galaxy_json, f)
def __main__(): filename = sys.argv[1] try: max_file_size = int(sys.argv[2]) except: max_file_size = 0 job_params, params = load_input_parameters(filename) if job_params is None: #using an older tabular file enhanced_handling = False job_params = dict(param_dict=params) job_params['output_data'] = [ dict(out_data_name='output', ext='data', file_name=filename, extra_files_path=None) ] job_params['job_config'] = dict( GALAXY_ROOT_DIR=GALAXY_ROOT_DIR, GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE, TOOL_PROVIDED_JOB_METADATA_FILE=TOOL_PROVIDED_JOB_METADATA_FILE) else: enhanced_handling = True json_file = open( job_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'w' ) #specially named file for output junk to pass onto set metadata datatypes_registry = Registry() datatypes_registry.load_datatypes( root_dir=job_params['job_config']['GALAXY_ROOT_DIR'], config=job_params['job_config']['GALAXY_DATATYPES_CONF_FILE']) URL = params.get( 'URL', None ) #using exactly URL indicates that only one dataset is being downloaded URL_method = params.get('URL_method', None) simpleD = params.get('galaxyData') # The Python support for fetching resources from the web is layered. urllib uses the httplib # library, which in turn uses the socket library. As of Python 2.3 you can specify how long # a socket should wait for a response before timing out. By default the socket module has no # timeout and can hang. Currently, the socket timeout is not exposed at the httplib or urllib2 # levels. However, you can set the default timeout ( in seconds ) globally for all sockets by # doing the following. socket.setdefaulttimeout(600) cur_filename = params.get('output') outputfile = open(cur_filename, 'w').write(simpleD)
def sniff_and_handle_data_type(json_params, output_file):
    """
    The sniff.handle_uploaded_dataset_file() method in Galaxy performs dual
    functions: it sniffs the filetype and if it's a compressed archive for
    a non compressed datatype such as fasta, it will be unpacked.
    """
    try:
        datatypes_registry = Registry()
        datatypes_registry.load_datatypes(
            root_dir=json_params['job_config']['GALAXY_ROOT_DIR'],
            config=json_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])
        file_type = sniff.handle_uploaded_dataset_file(
            output_file, datatypes_registry)
        return file_type
    except:
        return None
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]
    args = _arg_parser().parse_args(argv)
    registry = Registry()
    registry.load_datatypes(root_dir=args.galaxy_root, config=args.datatypes_registry)
    request_path = args.request
    assert os.path.exists(request_path)
    with open(request_path) as f:
        request = json.load(f)
    working_directory = args.working_directory or os.getcwd()
    allow_failed_collections = request.get("allow_failed_collections", False)
    upload_config = UploadConfig(request, registry, working_directory, allow_failed_collections)
    galaxy_json = _request_to_galaxy_json(upload_config, request)
    galaxy_json_path = os.path.join(working_directory, "galaxy.json")
    with open(galaxy_json_path, "w") as f:
        json.dump(galaxy_json, f)
def _configure_datatypes_registry(self, installed_repository_manager=None):
    # Create an empty datatypes registry.
    self.datatypes_registry = Registry(self.config)
    if installed_repository_manager and self.config.load_tool_shed_datatypes:
        # Load proprietary datatypes defined in datatypes_conf.xml files in all installed tool shed repositories. We
        # load proprietary datatypes before datatypes in the distribution because Galaxy's default sniffers include some
        # generic sniffers (eg text,xml) which catch anything, so it's impossible for proprietary sniffers to be used.
        # However, if there is a conflict (2 datatypes with the same extension) between a proprietary datatype and a datatype
        # in the Galaxy distribution, the datatype in the Galaxy distribution will take precedence. If there is a conflict
        # between 2 proprietary datatypes, the datatype from the repository that was installed earliest will take precedence.
        installed_repository_manager.load_proprietary_datatypes()
    # Load the data types in the Galaxy distribution, which are defined in self.config.datatypes_config.
    datatypes_configs = self.config.datatypes_config
    for datatypes_config in listify(datatypes_configs):
        # Setting override=False would make earlier files take
        # precedence - but then they wouldn't override tool shed
        # datatypes.
        self.datatypes_registry.load_datatypes(self.config.root, datatypes_config, override=True)
def download_from_genomespace_importer(username, token, json_parameter_file, genomespace_site, gs_toolname):
    json_params = json.loads(open(json_parameter_file, 'r').read())
    datasource_params = json_params.get('param_dict')
    assert None not in [username, token], "Missing GenomeSpace username or token."
    output_filename = datasource_params.get("output_file1", None)
    dataset_id = base_dataset_id = json_params['output_data'][0]['dataset_id']
    hda_id = json_params['output_data'][0]['hda_id']
    url_opener = get_cookie_opener(username, token, gs_toolname=gs_toolname)
    # load and set genomespace format ids to galaxy exts
    genomespace_site_dict = get_genomespace_site_urls()[genomespace_site]
    set_genomespace_format_identifiers(url_opener, genomespace_site_dict['dmServer'])
    file_url_name = "URL"
    metadata_parameter_file = open(json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb')
    # setup datatypes registry for sniffing
    datatypes_registry = Registry()
    datatypes_registry.load_datatypes(
        root_dir=json_params['job_config']['GALAXY_ROOT_DIR'],
        config=json_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])
    url_param = datasource_params.get(file_url_name, None)
    used_filenames = []
    for download_url in url_param.split(','):
        using_temp_file = False
        parsed_url = urlparse.urlparse(download_url)
        query_params = urlparse.parse_qs(parsed_url[4])
        # write file to disk
        new_file_request = urllib2.Request(download_url)
        new_file_request.get_method = lambda: 'GET'
        target_download_url = url_opener.open(new_file_request)
        filename = None
        if 'Content-Disposition' in target_download_url.info():
            content_disposition = dict(
                map(lambda x: x.strip().split('=') if '=' in x else (x.strip(), ''),
                    target_download_url.info()['Content-Disposition'].split(';')))
            if 'filename' in content_disposition:
                filename = content_disposition['filename'].strip("\"'")
        if not filename:
            parsed_url = urlparse.urlparse(download_url)
            query_params = urlparse.parse_qs(parsed_url[4])
            filename = urllib.unquote_plus(parsed_url[2].split('/')[-1])
        if not filename:
            filename = download_url
        if output_filename is None:
            # need to use a temp file here, because we do not know the ext yet
            using_temp_file = True
            output_filename = tempfile.NamedTemporaryFile(prefix='tmp-genomespace-importer-').name
        output_file = open(output_filename, 'wb')
        chunk_write(target_download_url, output_file)
        output_file.close()
        # determine file format
        file_type = None
        if 'dataformat' in query_params:  # this is a converted dataset
            file_type = query_params['dataformat'][0]
            file_type = get_galaxy_ext_from_genomespace_format_url(url_opener, file_type)
        else:
            try:
                # get and use GSMetadata object
                download_file_path = download_url.split("%s/file/" % (genomespace_site_dict['dmServer']), 1)[-1]
                # FIXME: This is a very bad way to get the path for determining metadata.
                # There needs to be a way to query the API using the download URL to get to the metadata object
                metadata_request = urllib2.Request(
                    "%s/%s/filemetadata/%s" % (genomespace_site_dict['dmServer'], GENOMESPACE_API_VERSION_STRING, download_file_path))
                metadata_request.get_method = lambda: 'GET'
                metadata_url = url_opener.open(metadata_request)
                file_metadata_dict = json.loads(metadata_url.read())
                metadata_url.close()
                file_type = file_metadata_dict.get('dataFormat', None)
                if file_type and file_type.get('url'):
                    file_type = file_type.get('url')
                    file_type = get_galaxy_ext_from_genomespace_format_url(url_opener, file_type, default=None)
            except:
                pass
        if file_type is None:
            # try to sniff datatype
            try:
                file_type = sniff.handle_uploaded_dataset_file(output_filename, datatypes_registry)
            except:
                pass  # sniff failed
        if file_type is None and '.' in parsed_url[2]:
            # still no known datatype, fall back to using extension
            file_type = parsed_url[2].rsplit('.', 1)[-1]
            file_type = GENOMESPACE_EXT_TO_GALAXY_EXT.get(file_type, file_type)
        if file_type is None:
            # use default extension (e.g. 'data')
            file_type = DEFAULT_GALAXY_EXT
        # save json info for single primary dataset
        if dataset_id is not None:
            metadata_parameter_file.write("%s\n" % json.dumps(
                dict(type='dataset', dataset_id=dataset_id, ext=file_type, name="GenomeSpace importer on %s" % (filename))))
        # if using tmp file, move the file to the new file path dir to get scooped up later
        if using_temp_file:
            original_filename = filename
            filename = ''.join(c in VALID_CHARS and c or '-' for c in filename)
            while filename in used_filenames:
                filename = "-%s" % filename
            used_filenames.append(filename)
            target_output_filename = os.path.join(os.getcwd(), 'primary_%i_%s_visible_%s' % (hda_id, filename, file_type))
            shutil.move(output_filename, target_output_filename)
            metadata_parameter_file.write("%s\n" % json.dumps(
                dict(type='new_primary_dataset', base_dataset_id=base_dataset_id, ext=file_type, filename=target_output_filename, name="GenomeSpace importer on %s" % (original_filename))))
        dataset_id = None  # only one primary dataset available
        output_filename = None  # only have one filename available
    metadata_parameter_file.close()
    return True
from sqlalchemy.orm import *  # noqa
from sqlalchemy.exc import *  # noqa
from sqlalchemy.sql import label  # noqa

sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'lib')))

from galaxy.datatypes.registry import Registry
from galaxy.model import *  # noqa
from galaxy.model import set_datatypes_registry  # More explicit than `*` import
from galaxy.model.mapping import init
from galaxy.model.orm.scripts import get_config

if sys.version_info > (3,):
    long = int

registry = Registry()
registry.load_datatypes()
set_datatypes_registry(registry)
db_url = get_config(sys.argv)['db_url']
sa_session = init('/tmp/', db_url).context


# Helper function for debugging sqlalchemy queries...
# http://stackoverflow.com/questions/5631078/sqlalchemy-print-the-actual-query
def printquery(statement, bind=None):
    """
    Print a query, with values filled in for debugging purposes *only*
    for security, you should always separate queries from their values
    please also note that this function is quite slow
    """
    uploadable = [datatype.file_ext in registry.upload_file_formats for datatype in datatypes]
    test_data_description = [TestData(*items) for items in zip(files, datatypes, uploadable)]
    return {os.path.basename(data.path): data for data in test_data_description}


class UploadTestDatatypeDataTestCase(BaseUploadContentConfigurationInstance):
    framework_tool_and_types = False
    datatypes_conf_override = DATATYPES_CONFIG
    object_store_config = None
    object_store_config_path = None


instance = integration_util.integration_module_instance(UploadTestDatatypeDataTestCase)

registry = Registry()
registry.load_datatypes(root_dir=GALAXY_ROOT, config=DATATYPES_CONFIG)
TEST_CASES = collect_test_data(registry)


@pytest.mark.parametrize('test_data', TEST_CASES.values(), ids=list(TEST_CASES.keys()))
def test_upload_datatype_auto(instance, test_data, temp_file):
    upload_datatype_helper(instance, test_data, temp_file)


def upload_datatype_helper(instance, test_data, temp_file):
    is_compressed = False
    for is_method in (is_bz2, is_gzip, is_zip):
        is_compressed = is_method(test_data.path)
        if is_compressed:
            break
def download_from_genomespace_file_browser(json_parameter_file, genomespace_site):
    json_params = json.loads(open(json_parameter_file, 'r').read())
    datasource_params = json_params.get('param_dict')
    username = datasource_params.get("gs-username", None)
    token = datasource_params.get("gs-token", None)
    assert None not in [username, token], "Missing GenomeSpace username or token."
    output_filename = datasource_params.get("output", None)
    dataset_id = json_params['output_data'][0]['dataset_id']
    hda_id = json_params['output_data'][0]['hda_id']
    url_opener = get_cookie_opener(username, token)
    # load and set genomespace format ids to galaxy exts
    genomespace_site_dict = get_genomespace_site_urls()[genomespace_site]
    set_genomespace_format_identifiers(url_opener, genomespace_site_dict['dmServer'])
    file_url_prefix = "fileUrl"
    file_type_prefix = "fileFormat"
    metadata_parameter_file = open(json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb')
    # setup datatypes registry for sniffing
    datatypes_registry = Registry()
    datatypes_registry.load_datatypes(
        root_dir=json_params['job_config']['GALAXY_ROOT_DIR'],
        config=json_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])
    file_numbers = []
    for name in datasource_params.keys():
        if name.startswith(file_url_prefix):
            name = name[len(file_url_prefix):]
            file_numbers.append(int(name))
    if not file_numbers:
        if output_filename:
            open(output_filename, 'wb')  # erase contents of file
        raise Exception("You must select at least one file to import into Galaxy.")
    file_numbers.sort()
    used_filenames = []
    for file_num in file_numbers:
        url_key = "%s%i" % (file_url_prefix, file_num)
        download_url = datasource_params.get(url_key, None)
        if download_url is None:
            break
        filetype_key = "%s%i" % (file_type_prefix, file_num)
        filetype_url = datasource_params.get(filetype_key, None)
        galaxy_ext = get_galaxy_ext_from_genomespace_format_url(url_opener, filetype_url)
        formated_download_url = "%s?%s" % (download_url, urllib.urlencode([('dataformat', filetype_url)]))
        new_file_request = urllib2.Request(formated_download_url)
        new_file_request.get_method = lambda: 'GET'
        target_download_url = url_opener.open(new_file_request)
        filename = None
        if 'Content-Disposition' in target_download_url.info():
            # If the response has Content-Disposition, try to get filename from it
            content_disposition = dict(
                map(lambda x: x.strip().split('=') if '=' in x else (x.strip(), ''),
                    target_download_url.info()['Content-Disposition'].split(';')))
            if 'filename' in content_disposition:
                filename = content_disposition['filename'].strip("\"'")
        if not filename:
            parsed_url = urlparse.urlparse(download_url)
            query_params = urlparse.parse_qs(parsed_url[4])
            filename = urllib.unquote_plus(parsed_url[2].split('/')[-1])
        if not filename:
            filename = download_url
        metadata_dict = None
        original_filename = filename
        if output_filename is None:
            filename = ''.join(c in VALID_CHARS and c or '-' for c in filename)
            while filename in used_filenames:
                filename = "-%s" % filename
            used_filenames.append(filename)
            output_filename = os.path.join(os.getcwd(), 'primary_%i_%s_visible_%s' % (hda_id, filename, galaxy_ext))
            metadata_dict = dict(type='new_primary_dataset', base_dataset_id=dataset_id, ext=galaxy_ext, filename=output_filename, name="GenomeSpace import on %s" % (original_filename))
        else:
            if dataset_id is not None:
                metadata_dict = dict(type='dataset', dataset_id=dataset_id, ext=galaxy_ext, name="GenomeSpace import on %s" % (filename))
        output_file = open(output_filename, 'wb')
        chunk_write(target_download_url, output_file)
        output_file.close()
        if (galaxy_ext == AUTO_GALAXY_EXT or filetype_url == GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN) and metadata_dict:
            # try to sniff datatype
            try:
                galaxy_ext = sniff.handle_uploaded_dataset_file(output_filename, datatypes_registry)
            except:
                # sniff failed
                galaxy_ext = original_filename.rsplit('.', 1)[-1]
            if galaxy_ext not in datatypes_registry.datatypes_by_extension:
                galaxy_ext = DEFAULT_GALAXY_EXT
            metadata_dict['ext'] = galaxy_ext
        output_filename = None  # only have one filename available
        # write out metadata info
        if metadata_dict:
            metadata_parameter_file.write("%s\n" % json.dumps(metadata_dict))
    metadata_parameter_file.close()
    return True
def download_from_genomespace_importer( username, token, json_parameter_file, genomespace_site ):
    json_params = simplejson.loads( open( json_parameter_file, 'r' ).read() )
    datasource_params = json_params.get( 'param_dict' )
    assert None not in [ username, token ], "Missing GenomeSpace username or token."
    output_filename = datasource_params.get( "output_file1", None )
    dataset_id = json_params['output_data'][0]['dataset_id']
    hda_id = json_params['output_data'][0]['hda_id']
    url_opener = get_cookie_opener( username, token )
    # load and set genomespace format ids to galaxy exts
    genomespace_site_dict = get_genomespace_site_urls()[ genomespace_site ]
    set_genomespace_format_identifiers( url_opener, genomespace_site_dict['dmServer'] )
    file_url_name = "URL"
    metadata_parameter_file = open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' )
    # setup datatypes registry for sniffing
    datatypes_registry = Registry()
    datatypes_registry.load_datatypes( root_dir = json_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config = json_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] )
    url_param = datasource_params.get( file_url_name, None )
    for download_url in url_param.split( ',' ):
        using_temp_file = False
        parsed_url = urlparse.urlparse( download_url )
        query_params = urlparse.parse_qs( parsed_url[4] )
        # write file to disk
        new_file_request = urllib2.Request( download_url )
        new_file_request.get_method = lambda: 'GET'
        target_download_url = url_opener.open( new_file_request )
        filename = None
        if 'Content-Disposition' in target_download_url.info():
            content_disposition = dict( map( lambda x: x.strip().split('=') if '=' in x else ( x.strip(), '' ), target_download_url.info()['Content-Disposition'].split( ';' ) ) )
            if 'filename' in content_disposition:
                filename = content_disposition[ 'filename' ].strip( "\"'" )
        if not filename:
            parsed_url = urlparse.urlparse( download_url )
            query_params = urlparse.parse_qs( parsed_url[4] )
            filename = urllib.unquote_plus( parsed_url[2].split( '/' )[-1] )
        if output_filename is None:
            # need to use a temp file here, because we do not know the ext yet
            using_temp_file = True
            output_filename = tempfile.NamedTemporaryFile( prefix='tmp-genomespace-importer-' ).name
        output_file = open( output_filename, 'wb' )
        chunk_write( target_download_url, output_file )
        output_file.close()
        # determine file format
        file_type = None
        if 'dataformat' in query_params:  # this is a converted dataset
            file_type = query_params[ 'dataformat' ][0]
            file_type = get_galaxy_ext_from_genomespace_format_url( url_opener, file_type )
        else:
            try:
                # get and use GSMetadata object
                download_file_path = download_url.split( "%s/file/" % ( genomespace_site_dict['dmServer'] ), 1)[-1]
                # FIXME: This is a very bad way to get the path for determining metadata.
                # There needs to be a way to query the API using the download URL to get to the metadata object
                metadata_request = urllib2.Request( "%s/%s/filemetadata/%s" % ( genomespace_site_dict['dmServer'], GENOMESPACE_API_VERSION_STRING, download_file_path ) )
                metadata_request.get_method = lambda: 'GET'
                metadata_url = url_opener.open( metadata_request )
                file_metadata_dict = simplejson.loads( metadata_url.read() )
                metadata_url.close()
                file_type = file_metadata_dict.get( 'dataFormat', None )
                if file_type and file_type.get( 'url' ):
                    file_type = file_type.get( 'url' )
                    file_type = get_galaxy_ext_from_genomespace_format_url( url_opener, file_type, default = None )
            except:
                pass
        if file_type is None:
            # try to sniff datatype
            try:
                file_type = sniff.handle_uploaded_dataset_file( output_filename, datatypes_registry )
            except:
                pass  # sniff failed
        if file_type is None and '.' in parsed_url[2]:
            # still no known datatype, fall back to using extension
            file_type = parsed_url[2].rsplit( '.', 1 )[-1]
            file_type = GENOMESPACE_EXT_TO_GALAXY_EXT.get( file_type, file_type )
        if file_type is None:
            # use default extension (e.g. 'data')
            file_type = DEFAULT_GALAXY_EXT
        # save json info for single primary dataset
        if dataset_id is not None:
            metadata_parameter_file.write( "%s\n" % simplejson.dumps( dict( type = 'dataset', dataset_id = dataset_id, ext = file_type, name = "GenomeSpace importer on %s" % ( filename ) ) ) )
        # if using tmp file, move the file to the new file path dir to get scooped up later
        if using_temp_file:
            shutil.move( output_filename, os.path.join( datasource_params['__new_file_path__'], 'primary_%i_output%s_visible_%s' % ( hda_id, ''.join( c in VALID_CHARS and c or '-' for c in filename ), file_type ) ) )
        dataset_id = None  # only one primary dataset available
        output_filename = None  # only have one filename available
    metadata_parameter_file.close()
    return True
def __main__(): filename = sys.argv[1] try: max_file_size = int(sys.argv[2]) except Exception: max_file_size = 0 job_params, params = load_input_parameters(filename) if job_params is None: # using an older tabular file enhanced_handling = False job_params = dict(param_dict=params) job_params['output_data'] = [ dict(out_data_name='output', ext='data', file_name=filename, extra_files_path=None) ] job_params['job_config'] = dict( GALAXY_ROOT_DIR=GALAXY_ROOT_DIR, GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE, TOOL_PROVIDED_JOB_METADATA_FILE=TOOL_PROVIDED_JOB_METADATA_FILE) else: enhanced_handling = True json_file = open( job_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'w' ) # specially named file for output junk to pass onto set metadata datatypes_registry = Registry() datatypes_registry.load_datatypes( root_dir=job_params['job_config']['GALAXY_ROOT_DIR'], config=job_params['job_config']['GALAXY_DATATYPES_CONF_FILE']) URL = params.get( 'URL', None ) # using exactly URL indicates that only one dataset is being downloaded URL_method = params.get('URL_method', None) for data_dict in job_params['output_data']: cur_filename = data_dict.get('file_name', filename) cur_URL = params.get( '%s|%s|URL' % (GALAXY_PARAM_PREFIX, data_dict['out_data_name']), URL) if not cur_URL or urlparse(cur_URL).scheme not in ('http', 'https', 'ftp'): open(cur_filename, 'w').write("") stop_err( 'The remote data source application has not sent back a URL parameter in the request.' ) # The following calls to urlopen() will use the above default timeout try: if not URL_method or URL_method == 'get': page = urlopen(cur_URL, timeout=DEFAULT_SOCKET_TIMEOUT) elif URL_method == 'post': page = urlopen(cur_URL, urlencode(params).encode("utf-8"), timeout=DEFAULT_SOCKET_TIMEOUT) except Exception as e: stop_err( 'The remote data source application may be off line, please try again later. Error: %s' % str(e)) if max_file_size: file_size = int(page.info().get('Content-Length', 0)) if file_size > max_file_size: stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % (file_size, max_file_size)) try: cur_filename = sniff.stream_to_open_named_file( page, os.open(cur_filename, os.O_WRONLY | os.O_CREAT), cur_filename, source_encoding=get_charset_from_http_headers(page.headers)) except Exception as e: stop_err('Unable to fetch %s:\n%s' % (cur_URL, e)) # here import checks that upload tool performs if enhanced_handling: try: ext = sniff.handle_uploaded_dataset_file(filename, datatypes_registry, ext=data_dict['ext']) except Exception as e: stop_err(str(e)) info = dict(type='dataset', dataset_id=data_dict['dataset_id'], ext=ext) json_file.write("%s\n" % dumps(info))
def load_datatypes_registry(job_params):
    datatypes_registry = Registry()
    datatypes_registry.load_datatypes(
        root_dir=job_params['job_config']['GALAXY_ROOT_DIR'],
        config=job_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])
    return datatypes_registry
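The helper above condenses the Registry setup pattern that recurs throughout these snippets. A minimal usage sketch follows, assuming hypothetical job_params content and an already-staged file named upload.dat (both illustrative, not taken from the original):

# Hypothetical usage sketch: build the registry from the job config, then let
# Galaxy's sniffer pick an extension for a staged file and look up its datatype.
job_params = {'job_config': {'GALAXY_ROOT_DIR': '/srv/galaxy',
                             'GALAXY_DATATYPES_CONF_FILE': '/srv/galaxy/config/datatypes_conf.xml'}}
registry = load_datatypes_registry(job_params)
ext = sniff.handle_uploaded_dataset_file('upload.dat', registry)   # e.g. 'fasta'
datatype = registry.get_datatype_by_extension(ext)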
class ConfiguresGalaxyMixin:
    """Shared code for configuring Galaxy-like app objects."""

    config: config.GalaxyAppConfiguration
    tool_cache: ToolCache
    job_config: jobs.JobConfiguration
    toolbox: tools.ToolBox
    toolbox_search: ToolBoxSearch
    container_finder: containers.ContainerFinder

    def _configure_genome_builds(self, data_table_name="__dbkeys__", load_old_style=True):
        self.genome_builds = GenomeBuilds(self, data_table_name=data_table_name, load_old_style=load_old_style)

    def wait_for_toolbox_reload(self, old_toolbox):
        timer = ExecutionTimer()
        log.debug('Waiting for toolbox reload')
        # Wait till toolbox reload has been triggered (or more than 60 seconds have passed)
        while timer.elapsed < 60:
            if self.toolbox.has_reloaded(old_toolbox):
                log.debug('Finished waiting for toolbox reload %s', timer)
                break
            time.sleep(0.1)
        else:
            log.warning('Waiting for toolbox reload timed out after 60 seconds')

    def _configure_tool_config_files(self):
        if self.config.shed_tool_config_file not in self.config.tool_configs:
            self.config.tool_configs.append(self.config.shed_tool_config_file)
        # The value of migrated_tools_config is the file reserved for containing only those tools that have been
        # eliminated from the distribution and moved to the tool shed. If migration checking is disabled, only add it if
        # it exists (since this may be an existing deployment where migrations were previously run).
        if (os.path.exists(self.config.migrated_tools_config)
                and self.config.migrated_tools_config not in self.config.tool_configs):
            self.config.tool_configs.append(self.config.migrated_tools_config)

    def _configure_toolbox(self):
        if not isinstance(self, BasicSharedApp):
            raise Exception("Must inherit from BasicSharedApp")
        self.citations_manager = CitationsManager(self)
        self.biotools_metadata_source = get_galaxy_biotools_metadata_source(self.config)
        self.dynamic_tools_manager = DynamicToolManager(self)
        self._toolbox_lock = threading.RLock()
        self.toolbox = tools.ToolBox(self.config.tool_configs, self.config.tool_path, self)
        galaxy_root_dir = os.path.abspath(self.config.root)
        file_path = os.path.abspath(self.config.file_path)
        app_info = AppInfo(
            galaxy_root_dir=galaxy_root_dir,
            default_file_path=file_path,
            tool_data_path=self.config.tool_data_path,
            shed_tool_data_path=self.config.shed_tool_data_path,
            outputs_to_working_directory=self.config.outputs_to_working_directory,
            container_image_cache_path=self.config.container_image_cache_path,
            library_import_dir=self.config.library_import_dir,
            enable_mulled_containers=self.config.enable_mulled_containers,
            container_resolvers_config_file=self.config.container_resolvers_config_file,
            container_resolvers_config_dict=self.config.container_resolvers,
            involucro_path=self.config.involucro_path,
            involucro_auto_init=self.config.involucro_auto_init,
            mulled_channels=self.config.mulled_channels,
        )
        mulled_resolution_cache = None
        if self.config.mulled_resolution_cache_type:
            cache_opts = {
                "cache.type": self.config.mulled_resolution_cache_type,
                "cache.data_dir": self.config.mulled_resolution_cache_data_dir,
                "cache.lock_dir": self.config.mulled_resolution_cache_lock_dir,
                "cache.expire": self.config.mulled_resolution_cache_expire,
            }
            mulled_resolution_cache = CacheManager(**parse_cache_config_options(cache_opts)).get_cache('mulled_resolution')
        self.container_finder = containers.ContainerFinder(app_info, mulled_resolution_cache=mulled_resolution_cache)
        self._set_enabled_container_types()
        index_help = getattr(self.config, "index_tool_help", True)
        self.toolbox_search = ToolBoxSearch(self.toolbox, index_dir=self.config.tool_search_index_dir, index_help=index_help)

    def reindex_tool_search(self):
        # Call this when tools are added or removed.
        self.toolbox_search.build_index(tool_cache=self.tool_cache, toolbox=self.toolbox)
        self.tool_cache.reset_status()

    def _set_enabled_container_types(self):
        container_types_to_destinations = collections.defaultdict(list)
        for destinations in self.job_config.destinations.values():
            for destination in destinations:
                for enabled_container_type in self.container_finder._enabled_container_types(destination.params):
                    container_types_to_destinations[enabled_container_type].append(destination)
        self.toolbox.dependency_manager.set_enabled_container_types(container_types_to_destinations)
        self.toolbox.dependency_manager.resolver_classes.update(self.container_finder.default_container_registry.resolver_classes)
        self.toolbox.dependency_manager.dependency_resolvers.extend(self.container_finder.default_container_registry.container_resolvers)

    def _configure_tool_data_tables(self, from_shed_config):
        # Initialize tool data tables using the config defined by self.config.tool_data_table_config_path.
        self.tool_data_tables = ToolDataTableManager(
            tool_data_path=self.config.tool_data_path,
            config_filename=self.config.tool_data_table_config_path,
            other_config_dict=self.config)
        # Load additional entries defined by self.config.shed_tool_data_table_config into tool data tables.
        try:
            self.tool_data_tables.load_from_config_file(
                config_filename=self.config.shed_tool_data_table_config,
                tool_data_path=self.tool_data_tables.tool_data_path,
                from_shed_config=from_shed_config)
        except OSError as exc:
            # Missing shed_tool_data_table_config is okay if it's the default
            if exc.errno != errno.ENOENT or self.config.is_set('shed_tool_data_table_config'):
                raise

    def _configure_datatypes_registry(self, installed_repository_manager=None):
        # Create an empty datatypes registry.
        self.datatypes_registry = Registry(self.config)
        if installed_repository_manager and self.config.load_tool_shed_datatypes:
            # Load proprietary datatypes defined in datatypes_conf.xml files in all installed tool shed repositories. We
            # load proprietary datatypes before datatypes in the distribution because Galaxy's default sniffers include some
            # generic sniffers (eg text,xml) which catch anything, so it's impossible for proprietary sniffers to be used.
            # However, if there is a conflict (2 datatypes with the same extension) between a proprietary datatype and a datatype
            # in the Galaxy distribution, the datatype in the Galaxy distribution will take precedence. If there is a conflict
            # between 2 proprietary datatypes, the datatype from the repository that was installed earliest will take precedence.
            installed_repository_manager.load_proprietary_datatypes()
        # Load the data types in the Galaxy distribution, which are defined in self.config.datatypes_config.
        datatypes_configs = self.config.datatypes_config
        for datatypes_config in listify(datatypes_configs):
            # Setting override=False would make earlier files take
            # precedence - but then they wouldn't override tool shed
            # datatypes.
            self.datatypes_registry.load_datatypes(self.config.root, datatypes_config, override=True)

    def _configure_object_store(self, **kwds):
        self.object_store = build_object_store_from_config(self.config, **kwds)

    def _configure_security(self):
        self.security = IdEncodingHelper(id_secret=self.config.id_secret)
        BaseDatabaseIdField.security = self.security

    def _configure_tool_shed_registry(self):
        # Set up the tool sheds registry
        if os.path.isfile(self.config.tool_sheds_config_file):
            self.tool_shed_registry = tool_shed_registry.Registry(self.config.tool_sheds_config_file)
        else:
            self.tool_shed_registry = tool_shed_registry.Registry()

    def _configure_models(self, check_migrate_databases=False, config_file=None):
        """Preconditions: object_store must be set on self."""
        db_url = self.config.database_connection
        install_db_url = self.config.install_database_connection
        # TODO: Consider more aggressive check here that this is not the same
        # database file under the hood.
        combined_install_database = not (install_db_url and install_db_url != db_url)
        install_db_url = install_db_url or db_url
        install_database_options = self.config.database_engine_options if combined_install_database else self.config.install_database_engine_options
        if self.config.database_wait:
            self._wait_for_database(db_url)
        if getattr(self.config, "max_metadata_value_size", None):
            custom_types.MAX_METADATA_VALUE_SIZE = self.config.max_metadata_value_size
        if check_migrate_databases:
            # Initialize database / check for appropriate schema version.
            # If this is a new installation, we'll restrict the tool migration messaging.
            create_or_verify_database(db_url, config_file, self.config.database_engine_options, app=self, map_install_models=combined_install_database)
            if not combined_install_database:
                tsi_create_or_verify_database(install_db_url, install_database_options, app=self)
        self.model = init_models_from_config(
            self.config,
            map_install_models=combined_install_database,
            object_store=self.object_store,
            trace_logger=getattr(self, "trace_logger", None))
        if combined_install_database:
            log.info("Install database targetting Galaxy's database configuration.")
            self.install_model = self.model
        else:
            install_db_url = self.config.install_database_connection
            log.info(f"Install database using its own connection {install_db_url}")
            self.install_model = install_mapping.init(install_db_url, install_database_options)

    def _configure_signal_handlers(self, handlers):
        for sig, handler in handlers.items():
            signal.signal(sig, handler)

    def _wait_for_database(self, url):
        attempts = self.config.database_wait_attempts
        pause = self.config.database_wait_sleep
        for i in range(1, attempts):
            try:
                database_exists(url)
                break
            except Exception:
                log.info("Waiting for database: attempt %d of %d" % (i, attempts))
                time.sleep(pause)

    @property
    def tool_dependency_dir(self):
        return self.toolbox.dependency_manager.default_base_path
def download_from_genomespace_file_browser( json_parameter_file, genomespace_site ):
    json_params = json.loads( open( json_parameter_file, 'r' ).read() )
    datasource_params = json_params.get( 'param_dict' )
    username = datasource_params.get( "gs-username", None )
    token = datasource_params.get( "gs-token", None )
    assert None not in [ username, token ], "Missing GenomeSpace username or token."
    output_filename = datasource_params.get( "output", None )
    dataset_id = json_params['output_data'][0]['dataset_id']
    hda_id = json_params['output_data'][0]['hda_id']
    url_opener = get_cookie_opener( username, token )
    # load and set genomespace format ids to galaxy exts
    genomespace_site_dict = get_genomespace_site_urls()[ genomespace_site ]
    set_genomespace_format_identifiers( url_opener, genomespace_site_dict['dmServer'] )
    file_url_prefix = "fileUrl"
    file_type_prefix = "fileFormat"
    metadata_parameter_file = open( json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb' )
    # setup datatypes registry for sniffing
    datatypes_registry = Registry()
    datatypes_registry.load_datatypes( root_dir = json_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config = json_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] )
    file_numbers = []
    for name in datasource_params.keys():
        if name.startswith( file_url_prefix ):
            name = name[len( file_url_prefix ):]
            file_numbers.append( int( name ) )
    if not file_numbers:
        if output_filename:
            open( output_filename, 'wb' )  # erase contents of file
        raise Exception( "You must select at least one file to import into Galaxy." )
    file_numbers.sort()
    used_filenames = []
    for file_num in file_numbers:
        url_key = "%s%i" % ( file_url_prefix, file_num )
        download_url = datasource_params.get( url_key, None )
        if download_url is None:
            break
        filetype_key = "%s%i" % ( file_type_prefix, file_num )
        filetype_url = datasource_params.get( filetype_key, None )
        galaxy_ext = get_galaxy_ext_from_genomespace_format_url( url_opener, filetype_url )
        formated_download_url = "%s?%s" % ( download_url, urllib.urlencode( [ ( 'dataformat', filetype_url ) ] ) )
        new_file_request = urllib2.Request( formated_download_url )
        new_file_request.get_method = lambda: 'GET'
        target_download_url = url_opener.open( new_file_request )
        filename = None
        if 'Content-Disposition' in target_download_url.info():
            # If the response has Content-Disposition, try to get filename from it
            content_disposition = dict( map( lambda x: x.strip().split('=') if '=' in x else ( x.strip(), '' ), target_download_url.info()['Content-Disposition'].split( ';' ) ) )
            if 'filename' in content_disposition:
                filename = content_disposition[ 'filename' ].strip( "\"'" )
        if not filename:
            parsed_url = urlparse.urlparse( download_url )
            query_params = urlparse.parse_qs( parsed_url[4] )
            filename = urllib.unquote_plus( parsed_url[2].split( '/' )[-1] )
        if not filename:
            filename = download_url
        metadata_dict = None
        original_filename = filename
        if output_filename is None:
            filename = ''.join( c in VALID_CHARS and c or '-' for c in filename )
            while filename in used_filenames:
                filename = "-%s" % filename
            used_filenames.append( filename )
            output_filename = os.path.join( os.getcwd(), 'primary_%i_%s_visible_%s' % ( hda_id, filename, galaxy_ext ) )
            metadata_dict = dict( type = 'new_primary_dataset', base_dataset_id = dataset_id, ext = galaxy_ext, filename = output_filename, name = "GenomeSpace import on %s" % ( original_filename ) )
        else:
            if dataset_id is not None:
                metadata_dict = dict( type = 'dataset', dataset_id = dataset_id, ext = galaxy_ext, name = "GenomeSpace import on %s" % ( filename ) )
        output_file = open( output_filename, 'wb' )
        chunk_write( target_download_url, output_file )
        output_file.close()
        if ( galaxy_ext == AUTO_GALAXY_EXT or filetype_url == GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN ) and metadata_dict:
            # try to sniff datatype
            try:
                galaxy_ext = sniff.handle_uploaded_dataset_file( output_filename, datatypes_registry )
            except:
                # sniff failed
                galaxy_ext = original_filename.rsplit( '.', 1 )[-1]
            if galaxy_ext not in datatypes_registry.datatypes_by_extension:
                galaxy_ext = DEFAULT_GALAXY_EXT
            metadata_dict[ 'ext' ] = galaxy_ext
        output_filename = None  # only have one filename available
        # write out metadata info
        if metadata_dict:
            metadata_parameter_file.write( "%s\n" % json.dumps( metadata_dict ) )
    metadata_parameter_file.close()
    return True
pass else: # this should not happen, but it's here just in case shutil.copy(dataset.path, output_path) else: shutil.move(dataset.path, output_path) # Write the job info info = dict(type='dataset', dataset_id=dataset.dataset_id, ext=ext, stdout='uploaded %s file' % data_type, name=dataset.name, line_count=line_count) json_file.write(to_json_string(info) + "\n") # Groom the dataset content if necessary datatype = Registry().get_datatype_by_extension(ext) datatype.groom_dataset_content(output_path) def add_composite_file(dataset, json_file, output_path, files_path): if dataset.composite_files: os.mkdir(files_path) for name, value in dataset.composite_files.iteritems(): value = util.bunch.Bunch(**value) if dataset.composite_file_paths[ value.name] is None and not value.optional: file_err( 'A required composite data file was not provided (%s)' % name, dataset, json_file) break elif dataset.composite_file_paths[value.name] is not None:
from sqlalchemy.sql import label  # noqa

sys.path.insert(
    1, os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'lib')))

from galaxy.datatypes.registry import Registry
from galaxy.model import *  # noqa
from galaxy.model import set_datatypes_registry  # More explicit than `*` import
from galaxy.model.mapping import init
from galaxy.model.orm.scripts import get_config

if sys.version_info > (3, ):
    long = int

registry = Registry()
registry.load_datatypes()
set_datatypes_registry(registry)
db_url = get_config(sys.argv)['db_url']
sa_session = init('/tmp/', db_url).context


# Helper function for debugging sqlalchemy queries...
# http://stackoverflow.com/questions/5631078/sqlalchemy-print-the-actual-query
def printquery(statement, bind=None):
    """
    Print a query, with values filled in for debugging purposes *only*
    for security, you should always separate queries from their values
    please also note that this function is quite slow
    """
elif dataset.type in ("server_dir", "path_paste"): shutil.copy(dataset.path, output_path) else: shutil.move(dataset.path, output_path) # Write the job info info = dict( type="dataset", dataset_id=dataset.dataset_id, ext=ext, stdout="uploaded %s file" % data_type, name=dataset.name, line_count=line_count, ) json_file.write(to_json_string(info) + "\n") # Groom the dataset content if necessary datatype = Registry().get_datatype_by_extension(ext) datatype.groom_dataset_content(output_path) def add_composite_file(dataset, json_file, output_path, files_path): if dataset.composite_files: os.mkdir(files_path) for name, value in dataset.composite_files.iteritems(): value = util.bunch.Bunch(**value) if dataset.composite_file_paths[value.name] is None and not value.optional: file_err("A required composite data file was not provided (%s)" % name, dataset, json_file) break elif dataset.composite_file_paths[value.name] is not None: if not value.is_binary: if uploaded_dataset.composite_files[value.name].space_to_tab: sniff.convert_newlines_sep2tabs(dataset.composite_file_paths[value.name]["path"])