import logging
import os
import subprocess
import sys
from subprocess import Popen, PIPE, STDOUT

import yaml

# Project-specific imports assumed by the snippets below.
import cell_cycle_classifier.api
import datamanagement.transfer_files

log = logging.getLogger(__name__)


def rsync_file(from_path, to_path):
    make_dirs(os.path.dirname(to_path))

    subprocess_cmd = [
        "rsync",
        "--verbose",
        "--itemize-changes",
        "--progress",
        "--chmod=D555",
        "--chmod=F444",
        "--times",
        "--copy-links",
        from_path,
        to_path,
    ]
    log.info(" ".join(subprocess_cmd))

    # Stream subprocess output through the logging module. See
    # https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging.
    process = Popen(subprocess_cmd, stdout=PIPE, stderr=STDOUT)
    with process.stdout:
        for line in iter(process.stdout.readline, b""):
            log.info(line.rstrip())
    exitcode = process.wait()

    if exitcode != 0:
        raise Exception("cmd '{}' returned {}".format(
            " ".join(subprocess_cmd), exitcode))

    if os.path.getsize(to_path) != os.path.getsize(from_path):
        raise Exception("copy failed for {} to {}".format(from_path, to_path))
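
# Every snippet here calls make_dirs() without defining it. A minimal sketch,
# assuming it is just a safe wrapper around os.makedirs (hypothetical; the
# real helper may set modes or permissions differently):
def make_dirs(dirname):
    # os.makedirs("") raises FileNotFoundError, so guard against the empty
    # string that os.path.dirname() returns for a bare filename.
    if dirname:
        os.makedirs(dirname, exist_ok=True)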
def run_analysis(results_dir):
    # Read in hmmcopy and align results; _get_hmmcopy_meta, load_align_data
    # and load_hmmcopy_data are assumed to be module-level helpers, and
    # results_type / results_version module-level constants.
    logging.info(f'reading hmmcopy and align results from {results_dir}')
    hmmcopy_metadata = _get_hmmcopy_meta(results_dir)
    library_id = hmmcopy_metadata['meta']['library_id']
    sample_ids = hmmcopy_metadata['meta']['sample_ids']
    results = load_align_data(results_dir)
    results.update(load_hmmcopy_data(results_dir))

    cn_data = results['hmmcopy_reads']
    metrics_data = results['hmmcopy_metrics']
    align_metrics_data = results['align_metrics']

    # Calculate cell cycle state
    logging.info('calculating cell cycle state')
    cell_cycle_data = cell_cycle_classifier.api.train_classify(
        cn_data, metrics_data, align_metrics_data)

    # Write out cell cycle state results
    output_cellcycle_dir = os.path.join(
        results_dir, 'results', results_type,
    )
    cell_cycle_results_filename = f'{library_id}_{results_type}.csv'
    results_filepath = os.path.join(
        output_cellcycle_dir, cell_cycle_results_filename,
    )
    metadata_filepath = os.path.join(
        output_cellcycle_dir, 'metadata.yaml',
    )

    make_dirs(os.path.dirname(results_filepath))

    cell_cycle_data.to_csv(results_filepath, index=False)

    metadata = {
        'filenames': [
            cell_cycle_results_filename,
        ],
        'meta': {
            'library_id': library_id,
            'sample_id': sample_ids,
            'type': results_type,
            'version': results_version,
        }
    }

    with open(metadata_filepath, 'w') as f:
        yaml.dump(metadata, f, default_flow_style=False)

    return output_cellcycle_dir
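
# For illustration, with a hypothetical library_id "A96123" and sample_ids
# ["SA501"], the dumped metadata.yaml would look like (type and version come
# from the module-level results_type / results_version constants):
#
#   filenames:
#   - A96123_<results_type>.csv
#   meta:
#     library_id: A96123
#     sample_id:
#     - SA501
#     type: <results_type>
#     version: <results_version>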
def download_from_blob(self, file_instance, overwrite=False):
    """ Download file from blob to a server.

    This should be called on the to server.
    """
    file_resource = file_instance["file_resource"]
    cloud_filepath = file_instance["filepath"]
    if not cloud_filepath.startswith(self.from_storage["prefix"]):
        raise Exception("{} does not have storage prefix {}".format(
            cloud_filepath, self.from_storage["prefix"]))
    cloud_blobname = file_resource["filename"]
    cloud_container = self.from_storage["storage_container"]

    local_filepath = os.path.join(
        self.to_storage_prefix, file_resource["filename"])
    make_dirs(os.path.dirname(local_filepath))

    self.tantalus_api.check_file(file_instance)

    # Check any existing file: skip if it is the same, raise an error if it
    # differs and we are not overwriting.
    if os.path.isfile(local_filepath):
        if _check_file_same_local(file_resource, local_filepath):
            logging.info(
                "skipping transfer of file resource {} that matches existing file"
                .format(file_resource["filename"]))
            return
        elif not overwrite:
            error_message = "target file {filepath} already exists on {storage} with different size".format(
                filepath=local_filepath,
                storage=self.to_storage_name,
            )
            raise FileAlreadyExists(error_message)
        else:
            logging.info(f'removing existing file {local_filepath}')
            os.remove(local_filepath)

    # azcopy is assumed to be a module-level flag selecting the azcopy CLI
    # (via run_azcopy) over the Python blob client.
    if azcopy:
        blob_url = self.storage_client.get_url(cloud_blobname)
        run_azcopy(blob_url, local_filepath)
    else:
        self.block_blob_service.get_blob_to_path(
            cloud_container,
            cloud_blobname,
            local_filepath,
            progress_callback=TransferProgress().print_progress,
            max_connections=16,
        )

    os.chmod(local_filepath, 0o444)
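
# _check_file_same_local() is referenced above but not defined in these
# snippets. A minimal sketch consistent with the "different size" error
# messages, assuming the file_resource record carries a "size" field
# (hypothetical; the real helper may also compare checksums):
def _check_file_same_local(file_resource, filepath):
    # Treat the local file as "the same" when its on-disk size matches the
    # size recorded in the file resource.
    return os.path.getsize(filepath) == file_resource["size"]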
def rsync_file(from_path, to_path, sftp=None, remote_host=None):
    """ Rsyncs file and performs checks to ensure rsync was successful

    Args:
        from_path:   (string) source path of the file
        to_path:     (string) destination path of the file
        sftp:        (sftp object) sftp client if rsync is performed remotely
        remote_host: (string) name of the remote host if rsync is remote
    """
    # Prepend remote host shortcut to the source path
    if remote_host:
        transfer_from_path = remote_host + from_path
    else:
        transfer_from_path = from_path

    # Create the rsync command
    subprocess_cmd = [
        "rsync",
        "-avPL",
        "--chmod=D555",
        "--chmod=F444",
        transfer_from_path,
        to_path,
    ]

    # Copy the file if it doesn't exist
    if not os.path.isfile(to_path):
        make_dirs(os.path.dirname(to_path))
        logging.info("Copying file from {} to {}".format(from_path, to_path))
        subprocess.check_call(subprocess_cmd)
    # If the file exists and we are using sftp, check size remotely
    elif os.path.isfile(to_path) and sftp:
        remote_file = sftp.stat(from_path)
        if remote_file.st_size != os.path.getsize(to_path):
            logging.info(
                "The size of {} on the GSC does not match {} -- copying new file".format(
                    from_path,
                    to_path,
                ))
            subprocess.check_call(subprocess_cmd)
        else:
            logging.info(
                "The file already exists at {} -- skipping import".format(to_path))
    # If the file exists and we are not using sftp, check size locally
    elif os.path.isfile(to_path) and not sftp:
        if os.path.getsize(from_path) != os.path.getsize(to_path):
            logging.info(
                "The size of {} on the GSC does not match {} -- copying new file".format(
                    from_path,
                    to_path,
                ))
            subprocess.check_call(subprocess_cmd)
        else:
            logging.info(
                "The file already exists at {} -- skipping import".format(to_path))

    # Check the rsync was successful
    if sftp:
        try:
            remote_file = sftp.stat(from_path)
            if remote_file.st_size != os.path.getsize(to_path):
                raise Exception("copy failed for {} to {}".format(
                    from_path, to_path))
        except IOError:
            raise Exception("missing source file {}".format(from_path))
    else:
        if os.path.getsize(to_path) != os.path.getsize(from_path):
            raise Exception("copy failed for {} to {}".format(
                from_path, to_path))
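
# Example invocation (hypothetical paths, host alias and client). With
# remote_host set, rsync reads from "gsc:/projects/data/sample.bam"; with
# sftp provided, size checks run against the remote file rather than the
# local source:
#
#   rsync_file(
#       "/projects/data/sample.bam",
#       "/archive/data/sample.bam",
#       sftp=sftp_client,
#       remote_host="gsc:",
#   )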
def rsync_file(self, file_instance, overwrite=False):
    """ Rsync a single file from one storage to another
    """
    file_resource = file_instance["file_resource"]
    local_filepath = os.path.join(
        self.to_storage_prefix, file_resource["filename"])
    remote_filepath = file_instance["filepath"]

    if file_instance["file_resource"]["is_folder"]:
        local_filepath = local_filepath + "/"
        remote_filepath = remote_filepath + "/"

    if os.path.isfile(local_filepath):
        if overwrite:
            logging.info(f'removing existing file {local_filepath}')
            os.remove(local_filepath)
        else:
            if _check_file_same_local(file_instance["file_resource"], local_filepath):
                logging.info(
                    "skipping transfer of file resource {} that matches existing file"
                    .format(file_resource["filename"]))
                return
            error_message = "target file {filepath} already exists on {storage} with different size".format(
                filepath=local_filepath, storage=self.to_storage_name)
            raise FileAlreadyExists(error_message)

    if file_instance["storage"]["server_ip"] == self.local_transfer:
        remote_location = remote_filepath
    else:
        remote_location = file_instance["storage"]["server_ip"] + ":" + remote_filepath

    make_dirs(os.path.dirname(local_filepath))

    subprocess_cmd = [
        "rsync",
        "--progress",
        # '--info=progress2',
        "--chmod=D555",
        "--chmod=F444",
        "--times",
        "--copy-links",
        remote_location,
        local_filepath,
    ]

    if file_instance["file_resource"]["is_folder"]:
        subprocess_cmd.insert(1, "-r")

    sys.stdout.flush()
    sys.stderr.flush()
    subprocess.check_call(subprocess_cmd, stdout=sys.stdout, stderr=sys.stderr)

    if not _check_file_same_local(file_instance["file_resource"], local_filepath):
        error_message = "transfer to {filepath} on {storage} failed".format(
            filepath=local_filepath, storage=self.to_storage_name)
        raise Exception(error_message)
def copy(self, filename, new_filename, wait=None):
    # `wait` is unused; presumably retained for interface compatibility.
    filepath = os.path.join(self.storage_directory, filename)
    new_filepath = os.path.join(self.storage_directory, new_filename)
    if not os.path.exists(os.path.dirname(new_filepath)):
        make_dirs(os.path.dirname(new_filepath))
    # Hard-link rather than copy, so no file data is duplicated on disk.
    os.link(filepath, new_filepath)
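
# Example (hypothetical filenames): copy() hard-links instead of copying, so
# both names share one inode and no file data is rewritten; this only works
# when source and destination live on the same filesystem.
#
#   storage.copy("runs/run1/reads.bam", "archive/reads.bam")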
def run_analysis(
        tantalus_api, jira_ticket, cellenone_dataset, inputs_storage_name,
        results_storage_name, archive_storage_name=None, update=False):
    results_storage = tantalus_api.get(
        'storage',
        name=results_storage_name,
    )

    assert len(cellenone_dataset['libraries']) == 1
    library_pk = cellenone_dataset['libraries'][0]['id']
    library_id = cellenone_dataset['libraries'][0]['library_id']

    # analysis_type, analysis_version, results_type and results_version are
    # assumed to be module-level constants.
    results_filename = os.path.join(
        'single_cell_indexing',
        'Cellenone',
        'Cellenone_data',
        'feature_tables',
        '{results_version}',
        '{library_id}.csv',
    ).format(
        results_version=results_version,
        library_id=library_id,
    )

    results_filepath = os.path.join(
        results_storage['storage_directory'],
        results_filename,
    )

    make_dirs(os.path.dirname(results_filepath))

    cellenone_data = process_cellenone_table(
        tantalus_api, cellenone_dataset, inputs_storage_name)
    cellenone_data.to_csv(results_filepath)

    analysis_name = '{}_{}_{}'.format(analysis_type, analysis_version, library_id)

    analysis = tantalus_api.get_or_create(
        'analysis',
        name=analysis_name,
        analysis_type=analysis_type,
        version=analysis_version,
        jira_ticket=jira_ticket,
        status='complete',
        args={},
        input_results=[cellenone_dataset['id']],
    )

    results_name = '{}_{}_{}'.format(results_type, results_version, library_id)

    results_file_resource, results_file_instance = tantalus_api.add_file(
        results_storage_name, results_filepath, update=update)
    results_file_pk = results_file_resource['id']

    results = tantalus_api.get_or_create(
        'results',
        name=results_name,
        results_type=results_type,
        results_version=results_version,
        libraries=[library_pk],
        analysis=analysis['id'],
        file_resources=[results_file_pk],
    )

    if archive_storage_name is not None:
        datamanagement.transfer_files.transfer_dataset(
            tantalus_api,
            results['id'],
            'resultsdataset',
            results_storage_name,
            archive_storage_name,
        )