Example #1
import logging
import os
from subprocess import PIPE, Popen, STDOUT

log = logging.getLogger(__name__)

# make_dirs is a project helper; a sketch is given after this example.


def rsync_file(from_path, to_path):
    make_dirs(os.path.dirname(to_path))

    subprocess_cmd = [
        "rsync",
        "--verbose",
        "--itemize-changes",
        "--progress",
        "--chmod=D555",
        "--chmod=F444",
        "--times",
        "--copy-links",
        from_path,
        to_path,
    ]

    log.info(" ".join(subprocess_cmd))

    # The following is a way to use the logging module with subprocess.
    # See
    # https://stackoverflow.com/questions/21953835/run-subprocess-and-print-output-to-logging.
    process = Popen(subprocess_cmd, stdout=PIPE, stderr=STDOUT)

    with process.stdout:
        for line in iter(process.stdout.readline, b""):
            log.info(line.rstrip())

    exitcode = process.wait()

    if exitcode != 0:
        raise Exception("cmd '{}' returned {}".format(" ".join(subprocess_cmd),
                                                      exitcode))

    if os.path.getsize(to_path) != os.path.getsize(from_path):
        raise Exception("copy failed for %s to %s", from_path, to_path)
Example #2
import logging
import os

import yaml

import cell_cycle_classifier.api

# results_type and results_version are assumed to be module-level constants
# in the original project; _get_hmmcopy_meta, load_align_data,
# load_hmmcopy_data and make_dirs are project helpers not shown here.


def run_analysis(results_dir):

    # Read in hmmcopy and align results
    logging.info(f'reading hmmcopy and align results from {results_dir}')

    hmmcopy_metadata = _get_hmmcopy_meta(results_dir)
    library_id = hmmcopy_metadata['meta']['library_id']
    sample_ids = hmmcopy_metadata['meta']['sample_ids']

    results = load_align_data(results_dir)
    results.update(load_hmmcopy_data(results_dir))

    cn_data = results['hmmcopy_reads']
    metrics_data = results['hmmcopy_metrics']
    align_metrics_data = results['align_metrics']

    # Calculating cell cycle state
    logging.info('calculating cell cycle state')
    cell_cycle_data = cell_cycle_classifier.api.train_classify(cn_data, metrics_data, align_metrics_data)

    # Writing out cell cycle state results
    output_cellcycle_dir = os.path.join(
        results_dir,
        'results',
        results_type,
    )

    cell_cycle_results_filename = f'{library_id}_{results_type}.csv'

    results_filepath = os.path.join(
        output_cellcycle_dir,
        cell_cycle_results_filename,
    )

    metadata_filepath = os.path.join(
        output_cellcycle_dir,
        'metadata.yaml',
    )

    make_dirs(os.path.dirname(results_filepath))

    cell_cycle_data.to_csv(results_filepath, index=False)

    metadata = {
        'filenames': [
            cell_cycle_results_filename,
        ],
        'meta': {
            'library_id': library_id,
            'sample_id': sample_ids,
            'type': results_type,
            'version': results_version,
        }
    }

    with open(metadata_filepath, 'w') as f:
        yaml.dump(metadata, f, default_flow_style=False)

    return output_cellcycle_dir
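
For orientation, the metadata.yaml written above comes out roughly as follows; the IDs, type and version values are hypothetical placeholders, and yaml.dump sorts keys alphabetically by default:

filenames:
- LIB001_cell_state.csv
meta:
  library_id: LIB001
  sample_id:
  - SAMPLE001
  - SAMPLE002
  type: cell_state
  version: v0.0.1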
Example #3
    def download_from_blob(self, file_instance, overwrite=False):
        """ Download file from blob to a server.

        This should be called on the from server.
        """
        file_resource = file_instance["file_resource"]

        cloud_filepath = file_instance["filepath"]
        if not cloud_filepath.startswith(self.from_storage["prefix"]):
            raise Exception("{} does not have storage prefix {}".format(
                cloud_filepath, self.from_storage["prefix"]))

        cloud_blobname = file_resource["filename"]
        cloud_container = self.from_storage["storage_container"]

        local_filepath = os.path.join(self.to_storage_prefix,
                                      file_resource["filename"])

        make_dirs(os.path.dirname(local_filepath))

        self.tantalus_api.check_file(file_instance)

        # Check any existing file, skip if the same, raise error if different
        # and we are not overwriting
        if os.path.isfile(local_filepath):
            if _check_file_same_local(file_resource, local_filepath):
                logging.info(
                    "skipping transfer of file resource {} that matches existing file"
                    .format(file_resource["filename"]))
                return
            elif not overwrite:
                error_message = "target file {filepath} already exists on {storage} with different size".format(
                    filepath=local_filepath,
                    storage=self.to_storage_name,
                )
                raise FileAlreadyExists(error_message)
            else:
                logging.info(f'removing existing file {local_filepath}')
                os.remove(local_filepath)

        # azcopy is assumed to be a module-level flag in the original
        # project that selects the transfer tool.
        if azcopy:
            blob_url = self.storage_client.get_url(cloud_blobname)
            run_azcopy(blob_url, local_filepath)

        else:
            self.block_blob_service.get_blob_to_path(
                cloud_container,
                cloud_blobname,
                local_filepath,
                progress_callback=TransferProgress().print_progress,
                max_connections=16,
            )

        os.chmod(local_filepath, 0o444)
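
run_azcopy is not defined in this snippet. A minimal sketch, assuming it shells out to the azcopy v10 CLI (for a private container the blob URL would need a SAS token appended):

import subprocess

def run_azcopy(blob_url, local_filepath):
    # 'azcopy copy <source-url> <dest-path>' downloads a single blob.
    subprocess.check_call(["azcopy", "copy", blob_url, local_filepath])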
Example #4
def rsync_file(from_path, to_path, sftp=None, remote_host=None):
    """
    Rsyncs file and performs checks to ensure rsync was successful

    Args:
        from_path:      (string) source path of the file
        to_path:        (string) destination path of the file
        sftp:           (sftp object) sftp client if rsync is performed remotely
        remote_host:    (string) name of the remote host if rsync is remote
    """
    # Prepend remote host shortcut to the source path
    if remote_host:
        transfer_from_path = remote_host + from_path
    else:
        transfer_from_path = from_path

    # Create the rsync command
    subprocess_cmd = [
        "rsync",
        "-avPL",
        "--chmod=D555",
        "--chmod=F444",
        transfer_from_path,
        to_path,
    ]

    # Copy the file if it doesn't exist
    if not os.path.isfile(to_path):
        make_dirs(os.path.dirname(to_path))
        logging.info("Copying file from {} to {}".format(from_path, to_path))
        subprocess.check_call(subprocess_cmd)

    # If the file exists and we are using sftp, check size
    elif os.path.isfile(to_path) and sftp:
        remote_file = sftp.stat(from_path)
        if remote_file.st_size != os.path.getsize(to_path):
            logging.info(
                "The size of {} on the GSC does not match {} -- copying new file"
                .format(
                    from_path,
                    to_path,
                ))
            subprocess.check_call(subprocess_cmd)
        else:
            logging.info(
                "The file already exists at {} -- skipping import".format(
                    to_path))

    # If the file exists and we are not using sftp, check size
    elif os.path.isfile(to_path) and not sftp:
        if os.path.getsize(from_path) != os.path.getsize(to_path):
            logging.info(
                "The size of {} on the GSC does not match {} -- copying new file"
                .format(
                    from_path,
                    to_path,
                ))
            subprocess.check_call(subprocess_cmd)
        else:
            logging.info(
                "The file already exists at {} -- skipping import".format(
                    to_path))

    # Check the rsync was successful
    if sftp:
        try:
            remote_file = sftp.stat(from_path)
            if remote_file.st_size != os.path.getsize(to_path):
                raise Exception("copy failed for {} to {}".format(
                    from_path, to_path))
        except IOError:
            raise Exception("missing source file {}".format(from_path))
    else:
        if os.path.getsize(to_path) != os.path.getsize(from_path):
            raise Exception("copy failed for {} to {}".format(
                from_path, to_path))
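
A usage sketch for the remote case, assuming a paramiko SFTP session. Note that remote_host is prepended verbatim to the source path, so for rsync's host:path syntax it should include the trailing colon (hostname and paths here are hypothetical):

import paramiko

client = paramiko.SSHClient()
client.load_system_host_keys()
client.connect("gsc-host")
sftp = client.open_sftp()

rsync_file(
    "/remote/data/sample.bam",
    "/local/archive/sample.bam",
    sftp=sftp,
    remote_host="gsc-host:",
)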
Example #5
    def rsync_file(self, file_instance, overwrite=False):
        """ Rsync a single file from one storage to another
        """
        file_resource = file_instance["file_resource"]

        local_filepath = os.path.join(self.to_storage_prefix,
                                      file_resource["filename"])

        remote_filepath = file_instance["filepath"]

        if file_instance["file_resource"]["is_folder"]:
            local_filepath = local_filepath + "/"
            remote_filepath = remote_filepath + "/"

        if os.path.isfile(local_filepath):
            if overwrite:
                logging.info(f'removing existing file {local_filepath}')
                os.remove(local_filepath)

            else:
                if _check_file_same_local(file_instance["file_resource"],
                                          local_filepath):
                    logging.info(
                        "skipping transfer of file resource {} that matches existing file"
                        .format(file_resource["filename"]))
                    return

                error_message = "target file {filepath} already exists on {storage} with different size".format(
                    filepath=local_filepath, storage=self.to_storage_name)

                raise FileAlreadyExists(error_message)

        if file_instance["storage"]["server_ip"] == self.local_transfer:
            remote_location = remote_filepath
        else:
            remote_location = file_instance["storage"][
                "server_ip"] + ":" + remote_filepath

        make_dirs(os.path.dirname(local_filepath))

        subprocess_cmd = [
            "rsync",
            "--progress",
            # '--info=progress2',
            "--chmod=D555",
            "--chmod=F444",
            "--times",
            "--copy-links",
            remote_location,
            local_filepath,
        ]

        if file_instance["file_resource"]["is_folder"]:
            subprocess_cmd.insert(1, "-r")

        sys.stdout.flush()
        sys.stderr.flush()
        subprocess.check_call(subprocess_cmd,
                              stdout=sys.stdout,
                              stderr=sys.stderr)

        if not _check_file_same_local(file_instance["file_resource"],
                                      local_filepath):
            error_message = "transfer to {filepath} on {storage} failed".format(
                filepath=local_filepath, storage=self.to_storage_name)
            raise Exception(error_message)
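
_check_file_same_local is not shown. A minimal sketch, assuming the file resource tracks a size field, consistent with the size comparisons and "different size" error messages in the other examples (the real check may also compare checksums):

import os

def _check_file_same_local(file_resource, local_filepath):
    # Treat the files as the same if their sizes match.
    return file_resource["size"] == os.path.getsize(local_filepath)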
Example #6
    def copy(self, filename, new_filename, wait=None):
        filepath = os.path.join(self.storage_directory, filename)
        new_filepath = os.path.join(self.storage_directory, new_filename)
        if not os.path.exists(os.path.dirname(new_filepath)):
            make_dirs(os.path.dirname(new_filepath))
        # Hard-link the file into its new location; wait is unused here,
        # presumably kept for interface compatibility.
        os.link(filepath, new_filepath)
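
Because os.link creates a hard link rather than copying bytes, the copy is effectively instant and consumes no extra disk space; the tradeoff is that both paths must be on the same filesystem, and since the two names share one inode, modifying one modifies the other.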
Example #7
# analysis_type, analysis_version, results_type and results_version are
# assumed to be module-level constants in the original project.
def run_analysis(tantalus_api,
                 jira_ticket,
                 cellenone_dataset,
                 inputs_storage_name,
                 results_storage_name,
                 archive_storage_name=None,
                 update=False):

    results_storage = tantalus_api.get(
        'storage',
        name=results_storage_name,
    )

    assert len(cellenone_dataset['libraries']) == 1
    library_pk = cellenone_dataset['libraries'][0]['id']
    library_id = cellenone_dataset['libraries'][0]['library_id']

    results_filename = os.path.join(
        'single_cell_indexing',
        'Cellenone',
        'Cellenone_data',
        'feature_tables',
        f'{results_version}',
        f'{library_id}.csv',
    )

    results_filepath = os.path.join(
        results_storage['storage_directory'],
        results_filename,
    )

    make_dirs(os.path.dirname(results_filepath))

    cellenone_data = process_cellenone_table(tantalus_api, cellenone_dataset,
                                             inputs_storage_name)
    cellenone_data.to_csv(results_filepath)

    analysis_name = '{}_{}_{}'.format(analysis_type, analysis_version,
                                      library_id)

    analysis = tantalus_api.get_or_create(
        'analysis',
        name=analysis_name,
        analysis_type=analysis_type,
        version=analysis_version,
        jira_ticket=jira_ticket,
        status='complete',
        args={},
        input_results=[cellenone_dataset['id']],
    )

    results_name = '{}_{}_{}'.format(results_type, results_version, library_id)

    results_file_resource, results_file_instance = tantalus_api.add_file(
        results_storage_name, results_filepath, update=update)
    results_file_pk = results_file_resource['id']

    results = tantalus_api.get_or_create(
        'results',
        name=results_name,
        results_type=results_type,
        results_version=results_version,
        libraries=[library_pk],
        analysis=analysis['id'],
        file_resources=[results_file_pk],
    )

    if archive_storage_name is not None:
        datamanagement.transfer_files.transfer_dataset(
            tantalus_api,
            results['id'],
            'resultsdataset',
            results_storage_name,
            archive_storage_name,
        )
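
A usage sketch, assuming the Tantalus client from the same datamanagement project; the Jira ticket, dataset id and storage names are hypothetical:

from dbclients.tantalus import TantalusApi

tantalus_api = TantalusApi()
cellenone_dataset = tantalus_api.get("resultsdataset", id=1234)

run_analysis(
    tantalus_api,
    "SC-1234",
    cellenone_dataset,
    inputs_storage_name="singlecellblob",
    results_storage_name="singlecellresults",
)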