Example #1
def pipe_unweighted_edgelist_to_convert(matrix, bin_filename):
    """ Pipe an unweighted edgelist (COO sparse matrix) to Louvain's convert utility """
    devnull = open(os.devnull, 'w')

    proc = tk_subproc.Popen([
        LOUVAIN_CONVERT_BINPATH,
        '-i',
        '-',
        '-o',
        bin_filename,
    ],
                            stdin=subprocess.PIPE,
                            stdout=devnull,
                            stderr=devnull)

    # Stream text triplets to 'convert'
    print 'Writing %d elements.' % len(matrix.row)
    for ij in itertools.izip(matrix.row, matrix.col):
        proc.stdin.write('%d\t%d\n' % ij)

    proc.stdin.close()
    proc.wait()
    devnull.close()

    if proc.returncode != 0:
        raise Exception("'convert' command failed with exit code %d" %
                        proc.returncode)
def pipe_weighted_edgelist_to_convert(matrix, bin_filename, weight_filename):
    """ Pipe a weighted edgelist (COO sparse matrix) to Louvain's convert utility """
    raise ValueError('Unsupported method at the moment')

    devnull = open(os.devnull, 'w')

    proc = tk_subproc.Popen([
        LOUVAIN_CONVERT_BINPATH,
        '-i',
        '/dev/stdin',
        '-o',
        bin_filename,
        '-w',
        weight_filename,
    ],
                            stdin=subprocess.PIPE,
                            stdout=devnull,
                            stderr=devnull)

    # Stream text triplets to 'convert'
    for ijx in itertools.izip(matrix.row, matrix.col, matrix.data):
        proc.stdin.write('%d\t%d\t%f\n' % ijx)

    proc.stdin.close()
    proc.wait()
    devnull.close()
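For context, a minimal usage sketch (my illustration, not from the source): it assumes scipy is available and that LOUVAIN_CONVERT_BINPATH points at a built Louvain 'convert' binary.

# Hypothetical usage: build a small COO adjacency and pipe it to 'convert'.
import scipy.sparse as sp

edges_i = [0, 1, 2]
edges_j = [1, 2, 0]
adjacency = sp.coo_matrix(([1, 1, 1], (edges_i, edges_j)), shape=(3, 3))
pipe_unweighted_edgelist_to_convert(adjacency, 'graph.bin')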
Example #3
    def decompress(self, compressed):
        cwd = os.path.dirname(os.path.realpath(__file__))
        p = tk_subproc.Popen(['node', 'decompress.js'],
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             cwd=cwd)
        stdout, _ = p.communicate(input=compressed)
        self.assertTrue(p.returncode == 0)
        return stdout
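A round-trip usage of this test helper might look like the following; the assumption that decompress.js performs zlib inflation is mine and is not stated in the source.

# Hypothetical round-trip inside the same test case, assuming decompress.js
# inflates zlib-compressed input (an assumption, not confirmed by the source)
import zlib

original = b'hello world'
self.assertEqual(self.decompress(zlib.compress(original)), original)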
Example #4
def pipe_unweighted_edgelist_to_convert(matrix, bin_filename):
    """ Pipe an unweighted edgelist (COO sparse matrix) to Louvain's convert utility """

    proc = tk_subproc.Popen([
        LOUVAIN_CONVERT_BINPATH,
        '-i',
        '-',
        '-o',
        bin_filename,
    ],
                            stdin=subprocess.PIPE)

    # Check if the process terminated early
    time.sleep(3)
    retcode = proc.poll()
    if retcode is not None:
        proc.stdin.close()
        proc.wait()
        raise Exception(
            "'convert' command terminated early with exit code %d" %
            proc.returncode)

    # Stream text triplets to 'convert'
    print 'Writing %d elements.' % len(matrix.row)

    try:
        for ij in itertools.izip(matrix.row, matrix.col):
            proc.stdin.write('%d\t%d\n' % ij)
        proc.stdin.close()
    except IOError as e:
        if e.errno == errno.EPIPE:
            proc.stdin.close()
            proc.wait()
            raise Exception(
                "'convert' binary closed the pipe before we finished writing to it. It terminated with exit code %d"
                % proc.returncode)
        else:
            # Not a broken pipe; don't swallow the error
            raise

    proc.wait()

    if proc.returncode != 0:
        raise Exception("'convert' command failed with exit code %d" %
                        proc.returncode)

    if not os.path.exists(bin_filename):
        raise Exception(
            "'convert' failed to write the matrix file. Please see the standard error file (_stderr) to see if it emitted any errors."
        )
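The EPIPE guard in this variant generalizes to any consumer that can exit before its stdin is drained. Here is a self-contained sketch of the same pattern using only the standard library, with 'head -n 1' standing in for the 'convert' binary:

import errno
import subprocess

proc = subprocess.Popen(['head', '-n', '1'],
                        stdin=subprocess.PIPE,
                        stdout=subprocess.PIPE)
try:
    for i in xrange(100000):
        proc.stdin.write('%d\n' % i)  # raises IOError(EPIPE) once 'head' exits
    proc.stdin.close()
except IOError as e:
    if e.errno != errno.EPIPE:
        raise  # only tolerate a broken pipe
proc.wait()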
Example #5
    def __init__(self, *args, **kwargs):
        mode = kwargs.pop("mode", "r")
        if mode == "r":
            kwargs["stdout"] = subprocess.PIPE
        elif mode == "w":
            kwargs["stdin"] = subprocess.PIPE
        else:
            raise ValueError("mode %s unsupported" % self.mode)

        kwargs["preexec_fn"] = os.setsid
        print args[0]
        sys.stdout.flush()
        self.proc = tk_subproc.Popen(*args, **kwargs)

        if mode == "r":
            self.pipe = self.proc.stdout
        elif mode == "w":
            self.pipe = self.proc.stdin
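A hypothetical read-mode usage of this wrapper follows; the enclosing class name is not shown in the example, so ProcessPipe below is an assumed name. Note that preexec_fn=os.setsid starts the child in its own session, so signals aimed at the parent's process group do not reach it.

# Assumed class name 'ProcessPipe'; the zcat command is only illustrative
reader = ProcessPipe(['zcat', 'reads.fastq.gz'], mode='r')
for line in reader.pipe:
    handle(line)  # 'handle' is a placeholder for downstream processing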
Example #7
    def align(self,
              read1_fastq_fn,
              read2_fastq_fn,
              out_genome_bam_fn,
              threads,
              cwd=None,
              max_report_alignments_per_read=-1,
              read_group_tags=None):
        if cwd is None:
            cwd = os.getcwd()

        if read2_fastq_fn is None:
            read2_fastq_fn = ''

        args = [
            'STAR',
            '--genomeDir',
            self.reference_star_path,
            '--outSAMmultNmax',
            str(max_report_alignments_per_read),
            '--runThreadN',
            str(threads),
            '--readNameSeparator',
            'space',
            '--outFilterMismatchNmax',
            '0',  ## Manually added to ensure no mismatches for sgRNA libraries like those for cropseq fw_20181218
            '--outSAMunmapped',
            'Within',
            '--outSAMtype',
            'SAM',
            '--outStd',
            'SAM',
            '--outSAMorder',
            'PairedKeepInputOrder',
        ]

        if read_group_tags is not None:
            args.append('--outSAMattrRGline')
            args.extend(read_group_tags)

        args.append('--readFilesIn')
        if read1_fastq_fn.endswith(cr_constants.GZIP_SUFFIX):
            args.append('<(gzip -c -d \'%s\')' % read1_fastq_fn)
            if read2_fastq_fn:
                args.append('<(gzip -c -d \'%s\')' % read2_fastq_fn)

        elif read1_fastq_fn.endswith(cr_constants.LZ4_SUFFIX):
            args.append('<(lz4 -c -d \'%s\')' % read1_fastq_fn)
            if read2_fastq_fn:
                args.append('<(lz4 -c -d \'%s\')' % read2_fastq_fn)

        else:
            args.append(read1_fastq_fn)
            if read2_fastq_fn:
                args.append(read2_fastq_fn)

        if out_genome_bam_fn == cr_constants.BAM_FILE_STREAM:
            # stream to pipe for downstream processing
            # NOTE: this feature is unused in the standard pipeline
            # HACK: see https://github.com/pysam-developers/pysam/issues/355
            parent_read, child_write = os.pipe()
            try:
                tk_subproc.Popen(args, stdout=child_write)
            finally:
                os.close(child_write)
            os.dup2(parent_read, sys.stdin.fileno())
            # now streaming output can be read using pysam.Samfile('-', 'r')
            # NOTE: since this does not await termination of the process, we can't reliably check the return code
        else:
            # NOTE: We'd like to pipe fastq files through a decompressor and feed those
            # streams into STAR.
            # STAR provides --readFilesCommand which will do this. But it uses a named pipe which
            # breaks on some filesystems.

            # We could also use anonymous pipes but we'd need a way to refer to them
            # on the command line and apparently not all systems support the same
            # /dev/fdN or procfs-like paths.

            # So we're forced to use the shell and process substitution, as is recommended
            # here: https://groups.google.com/forum/#!msg/rna-star/MQdL1WxkAAw/eG6EatoOCgAJ

            # Wrap arguments in single quotes
            quoted_args = []
            for arg in args:
                if arg.startswith('<'):
                    # We want the shell to interpret this as a process substitution
                    quoted_args.append(arg)

                elif "'" in arg:
                    # We can't escape single quotes within single quoted strings.
                    # But we can concatenate different quoting mechanisms.
                    # ' => '"'"'
                    # This is relevant if the RG string contains quotes, which
                    # can happen if the user specifies such a library name.
                    arg = arg.replace("'", "'\"'\"'")
                    quoted_args.append("'%s'" % arg)

                else:
                    # Normal argument
                    quoted_args.append("'%s'" % arg)

            star_cmd = ' '.join(quoted_args)

            star = tk_subproc.Popen(star_cmd,
                                    stdout=subprocess.PIPE,
                                    cwd=cwd,
                                    shell=True,
                                    executable='bash')
            star_log = os.path.join(cwd, 'Log.out')

            with open(out_genome_bam_fn, 'w') as f:
                view_cmd = ['samtools', 'view', '-Sb', '-']
                view = tk_subproc.Popen(view_cmd,
                                        stdin=star.stdout,
                                        stdout=f,
                                        cwd=cwd)
                view.communicate()

            try:
                # Ensure that STAR process terminated so we can get a returncode
                star.communicate()
                cr_utils.check_completed_process(star, args[0])

                # check samtools status
                cr_utils.check_completed_process(view, ' '.join(view_cmd))

            except cr_utils.CRCalledProcessError as e:
                # Give the user the path to STAR's log
                raise cr_utils.CRCalledProcessError(
                    e.msg + ' Check STAR logs for errors: %s .' % star_log)

            # check for empty BAM
            if tk_bam.bam_is_empty(out_genome_bam_fn):
                raise Exception(
                    'Aligned BAM is empty - check STAR logs for errors: %s .' %
                    star_log)
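The quote-concatenation rule in the comments above (a single quote becomes '"'"') is the same one implemented by the standard library's pipes.quote in Python 2 (shlex.quote in Python 3). A minimal sketch of the rule in isolation:

def sq(arg):
    # Wrap in single quotes; each embedded ' becomes '"'"'
    return "'%s'" % arg.replace("'", "'\"'\"'")

assert sq("it's") == "'it'\"'\"'s'"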
Example #8
def run_plsa(matrix,
             temp_dir,
             plsa_features=None,
             plsa_bcs=None,
             n_plsa_components=None,
             random_state=None,
             threads=1,
             min_count_threshold=0):
    """ Run a PLSA on the matrix using the IRLBA matrix factorization algorithm.  Prior to the PLSA analysis, the
    matrix is not normalized at all.

    If desired, only a subset of features (e.g. sample rows) can be selected for PLSA analysis.  Each feature is ranked
    by its dispersion relative to other features that have a similar mean count.  The top `plsa_features` as ranked by
    this method will then be used for the PLSA.

    One *cannot* select to subset number of barcodes to use because of the intricacies of PLSA. It is still available as
    an optional input to match the API for lsa and pca subroutines included in this package.

    Args:
        matrix (CountMatrix): The matrix to perform PLSA on.
        plsa_features (int): Number of features to subset from matrix and use in PLSA. The top plsa_features ranked by
                            dispersion are used
        plsa_bcs (int): Number of barcodes to randomly sample for the matrix.
        n_plsa_components (int): How many PLSA components should be used.
        random_state (int): The seed for the RNG
        min_count_threshold (int): The minimum sum of each row/column for that row/column to be passed to PLSA
                                   (this filter is prior to any subsetting that occurs).
    Returns:
        A PLSA object
    """

    if not os.path.exists(temp_dir):
        raise Exception(
            'Temporary directory does not exist. Need it to run plsa binary. Aborting..'
        )

    if random_state is None:
        random_state = analysis_constants.RANDOM_STATE
    np.random.seed(random_state)

    # Threshold the rows/columns of matrix, will throw error if an empty matrix results.
    thresholded_matrix, thresholded_bcs, thresholded_features = matrix.select_axes_above_threshold(
        min_count_threshold)

    # If requested, we can subsample some of the barcodes to get a smaller matrix for PLSA
    if plsa_bcs is not None:
        msg = "PLSA method does not allow subsetting barcodes"
        print(msg)
    plsa_bcs = thresholded_matrix.bcs_dim
    plsa_bc_indices = np.arange(thresholded_matrix.bcs_dim)

    # If requested, select fewer features to use by selecting the features with highest normalized dispersion
    if plsa_features is None:
        plsa_features = thresholded_matrix.features_dim
    elif plsa_features > thresholded_matrix.features_dim:
        msg = (
            "You requested {} features but the matrix after thresholding only included {} features, "
            "so the smaller number is being used.").format(
                plsa_features, thresholded_matrix.features_dim)
        print(msg)
        plsa_features = thresholded_matrix.features_dim
    # Calc mean and variance of counts after normalizing
    # But don't transform to log space, in order to preserve the mean-variance relationship
    m = analysis_stats.normalize_by_umi(thresholded_matrix)
    # Get mean and variance of rows
    (mu, var) = analysis_stats.summarize_columns(m.T)
    dispersion = analysis_stats.get_normalized_dispersion(
        mu.squeeze(), var.squeeze())  # TODO set number of bins?
    plsa_feature_indices = np.argsort(dispersion)[-plsa_features:]

    # Now determine how many components.
    if n_plsa_components is None:
        n_plsa_components = analysis_constants.PLSA_N_COMPONENTS_DEFAULT

    likely_matrix_rank = min(plsa_features, plsa_bcs)
    if likely_matrix_rank < n_plsa_components:
        print((
            "There are fewer nonzero features or barcodes ({}) than requested "
            "PLSA components ({}); reducing the number of components.").format(
                likely_matrix_rank, n_plsa_components))
        n_plsa_components = likely_matrix_rank

    if (likely_matrix_rank * 0.5) <= float(n_plsa_components):
        print(
            "Requested number of PLSA components is large relative to the matrix size; an exact approach to matrix factorization may be faster."
        )

    plsa_mat = thresholded_matrix.select_barcodes(
        plsa_bc_indices).select_features(plsa_feature_indices)

    # Write out sparse matrix without transforms
    # code picked up from save_mex
    plsa_mat.tocoo()
    out_matrix_fn = os.path.join(temp_dir, 'matrix.mtx')
    with open(out_matrix_fn, 'w') as stream:
        stream.write(
            np.compat.asbytes('%%MatrixMarket matrix {0} {1} {2}\n%%\n'.format(
                'coordinate', 'integer', 'general')))
        stream.write(
            np.compat.asbytes(
                '%i %i %i\n' %
                (plsa_mat.m.shape[0], plsa_mat.m.shape[1], plsa_mat.m.nnz)))
        # write row, col, val in 1-based indexing
        for r, c, d in itertools.izip(plsa_mat.m.row + 1, plsa_mat.m.col + 1,
                                      plsa_mat.m.data):
            stream.write(np.compat.asbytes(("%i %i %i\n" % (r, c, d))))

    del plsa_mat

    # Run plsa module, reading in sparse matrix
    # Iters and tol are designed for 15PCs
    proc = tk_subproc.Popen([
        PLSA_BINPATH, out_matrix_fn, temp_dir, '--topics',
        str(n_plsa_components), '--iter',
        str(3000), '--tol',
        str(0.002), '--nt',
        str(threads)
    ],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    stdout_data, stderr_data = proc.communicate()
    if proc.returncode != 0:
        print stdout_data
        raise Exception(
            "plsa binary returned error code %d: %s" %
            (proc.returncode, stderr_data))

    # Read back data
    transformed_plsa_em_matrix_file = os.path.join(temp_dir,
                                                   "transformed_matrix.csv")
    n_components_file = os.path.join(temp_dir, "components.csv")
    variance_explained_file = os.path.join(temp_dir, "topic_relevance.csv")
    org_rows_used = get_original_columns_used(thresholded_bcs, plsa_bc_indices)
    transformed_plsa_em_matrix = np.zeros((matrix.bcs_dim, n_plsa_components))
    transformed_plsa_em_matrix[org_rows_used, :] = np.genfromtxt(
        transformed_plsa_em_matrix_file, delimiter=",").astype('float64')
    org_cols_used = get_original_columns_used(thresholded_features,
                                              plsa_feature_indices)
    plsa_em_components = np.zeros((n_plsa_components, matrix.features_dim))
    plsa_em_components[:, org_cols_used] = np.genfromtxt(
        n_components_file, delimiter=",").astype('float64')
    variance_explained = np.genfromtxt(variance_explained_file,
                                       delimiter=",").astype('float64')

    # reorder components by variance explained as PLSA binary gives arbitrary order
    new_order = range(n_plsa_components)
    variance_explained, new_order = zip(
        *sorted(zip(variance_explained, new_order), reverse=True))
    variance_explained = np.array(variance_explained)
    plsa_em_components = plsa_em_components[new_order, :]
    transformed_plsa_em_matrix = transformed_plsa_em_matrix[:, new_order]

    # delete files
    cr_io.remove(transformed_plsa_em_matrix_file, allow_nonexisting=True)
    cr_io.remove(n_components_file, allow_nonexisting=True)
    cr_io.remove(variance_explained_file, allow_nonexisting=True)
    cr_io.remove(out_matrix_fn, allow_nonexisting=True)

    features_selected = np.array(
        [f.id for f in matrix.feature_ref.feature_defs])[org_cols_used]

    # sanity check dimensions
    assert transformed_plsa_em_matrix.shape == (matrix.bcs_dim,
                                                n_plsa_components)
    assert plsa_em_components.shape == (n_plsa_components, matrix.features_dim)
    assert variance_explained.shape == (n_plsa_components, )

    return PLSA(transformed_plsa_em_matrix, plsa_em_components,
                variance_explained, dispersion, features_selected)
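A hedged usage sketch follows; the CountMatrix value and scratch directory are assumptions for illustration.

# Hypothetical call; 'count_matrix' is assumed to be a CountMatrix loaded elsewhere
import tempfile

scratch_dir = tempfile.mkdtemp()
plsa_result = run_plsa(count_matrix, scratch_dir,
                       plsa_features=2000,
                       n_plsa_components=15,
                       threads=4)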
Example #9
def run_cutadapt_single_end(in_reads_fn,
                            out_reads_fn,
                            trim_info_fn,
                            trim_def,
                            adapters,
                            read_id="R1"):
    """Calls cutadapt in single-end mode using the settings in trim_def[read_id]
    """
    filter_output = trim_def["discard_untrimmed"]
    if "trim_length" in trim_def:
        fixed_length = trim_def["trim_length"]
    else:
        fixed_length = None

    martian.log_info("Trim definition provided:\n{}".format(trim_def))
    martian.log_info("(Using info for read {}".format(read_id))
    martian.log_info("Adapter sequences provided:\n{}".format(adapters))

    seqs_to_trim = {}
    for direction in ["5prime", "3prime"]:
        if read_id in trim_def and direction in trim_def[read_id]:
            seqs_to_trim[direction] = "".join(
                adapters[idx] for idx in trim_def[read_id][direction])
        else:
            seqs_to_trim[direction] = None

    cmd = ["cutadapt"]
    if fixed_length is not None:
        cmd.extend(["--length", "{}".format(fixed_length)])

    start_trim = seqs_to_trim["5prime"]
    end_trim = seqs_to_trim["3prime"]
    read_trim = start_trim is not None or end_trim is not None
    if start_trim is not None and end_trim is not None:
        # This is a linked adapter trim that anchors the 5prime end adapter to the beginning of the read
        # and lets the 3prime adapter float
        cmd.extend(["-a", "{}...{}".format(start_trim, end_trim)])
    elif start_trim is not None:
        # Just the anchored 5prime end adapter
        cmd.extend(["-g", "^{}".format(start_trim)])
    elif end_trim is not None:
        # Just the floating 3prime end adapter
        cmd.extend(["-a", end_trim])

    if filter_output and read_trim:
        cmd.append("--discard-untrimmed")

    cmd.extend(["--info-file", trim_info_fn])
    cmd.extend(["-o", out_reads_fn])
    cmd.append(in_reads_fn)

    martian.log_info("Cutadapt command: \n{}".format(" ".join(cmd)))

    process = tk_subproc.Popen(cmd, stdin=None, stdout=PIPE, stderr=PIPE)
    (stdout, stderr) = process.communicate()
    if process.returncode != 0:
        martian.log_info("Error while running cutadapt: \n{}".format(stderr))
        raise ValueError("Cutadapt failed")

    martian.log_info("Cutadapt output: \n{}".format(stdout))

    input_read_pairs, output_read_pairs = None, None
    for line in stdout.split("\n"):
        if line.startswith("Total reads processed:"):
            input_read_pairs = int(line.split(":")[1].replace(",", ""))
        if line.startswith("Reads written (passing filters):"):
            output_read_pairs = int(
                line.split(":")[1].split("(")[0].replace(",", ""))

    return input_read_pairs, output_read_pairs
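For reference, a hypothetical trim_def/adapters pair that exercises the linked-adapter branch above; the dictionary shapes are inferred from how the function indexes them, not taken from source documentation.

# Hypothetical inputs; keys and shapes inferred from the function body above
adapters = {'P5': 'AATGATACGGCGACCACCGAGATCT',
            'P7': 'ATCTCGTATGCCGTCTTCTGCTTG'}
trim_def = {
    'discard_untrimmed': True,
    'trim_length': 90,
    'R1': {'5prime': ['P5'], '3prime': ['P7']},
}
run_cutadapt_single_end('in_R1.fastq', 'out_R1.fastq', 'trim_info.txt',
                        trim_def, adapters)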
Example #10
def run_plsa(matrix,
             temp_dir,
             plsa_features=None,
             plsa_bcs=None,
             n_plsa_components=None,
             random_state=None,
             threads=1):
    if not os.path.exists(temp_dir):
        raise Exception(
            'Temporary directory does not exist. Need it to run plsa binary. Aborting..'
        )

    if plsa_features is None:
        plsa_features = matrix.features_dim
    if plsa_bcs is None:
        plsa_bcs = matrix.bcs_dim
    if n_plsa_components is None:
        n_plsa_components = analysis_constants.PLSA_N_COMPONENTS_DEFAULT
    if n_plsa_components > plsa_features:
        print "There are fewer nonzero features than PLSA components; reducing the number of components."
        n_plsa_components = plsa_features
    if random_state is None:
        random_state = analysis_constants.RANDOM_STATE

    np.random.seed(random_state)

    # initialize PLSA subsets
    plsa_bc_indices = np.arange(matrix.bcs_dim)
    plsa_feature_indices = np.arange(matrix.features_dim)

    # NOTE: This is retained simply to follow PCA code
    # Calc mean and variance of counts after normalizing
    # Don't transform to log space in PLSA
    # Dispersion is not exactly meaningful after idf transform.
    m = analysis_stats.normalize_by_idf(matrix)
    (mu, var) = analysis_stats.summarize_columns(m.T)
    dispersion = analysis_stats.get_normalized_dispersion(
        mu.squeeze(), var.squeeze())  # TODO set number of bins?

    plsa_feature_indices = np.argsort(dispersion)[-plsa_features:]

    if plsa_bcs < matrix.bcs_dim:
        plsa_bc_indices = np.sort(
            np.random.choice(np.arange(matrix.bcs_dim),
                             size=plsa_bcs,
                             replace=False))

    plsa_mat, _, plsa_features_nonzero = matrix.select_barcodes(
        plsa_bc_indices).select_features(
            plsa_feature_indices).select_nonzero_axes()
    plsa_feature_nonzero_indices = plsa_feature_indices[plsa_features_nonzero]

    if plsa_mat.features_dim < 2 or plsa_mat.bcs_dim < 2:
        print "Matrix is too small for further downsampling - num_plsa_bcs and num_plsa_features will be ignored."
        plsa_mat, _, plsa_features_nonzero = matrix.select_nonzero_axes()
        plsa_feature_nonzero_indices = plsa_features_nonzero

    ### Write out sparse matrix without transforms
    plsa_mat.tocoo()
    out_matrix_fn = os.path.join(temp_dir, 'matrix.mtx')
    sp_io.mmwrite(out_matrix_fn,
                  plsa_mat.m,
                  field='integer',
                  symmetry='general')

    ### Run plsa module, reading in sparse matrix
    proc = tk_subproc.Popen([
        PLSA_BINPATH,
        out_matrix_fn,
        temp_dir,
        '--topics',
        str(n_plsa_components),
        '--nt',
        str(threads),
    ],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    stdout_data, stderr_data = proc.communicate()
    if proc.returncode != 0:
        print stdout_data
        raise Exception(
            "plsa binary returned error code %d: %s" %
            (proc.returncode, stderr_data))

    ### Read back data
    transformed_plsa_em_matrix_file = os.path.join(temp_dir,
                                                   "transformed_matrix.csv")
    n_components_file = os.path.join(temp_dir, "components.csv")
    variance_explained_file = os.path.join(temp_dir, "topic_relevance.csv")
    transformed_plsa_em_matrix = np.genfromtxt(transformed_plsa_em_matrix_file,
                                               delimiter=",").astype('float64')
    plsa_em_components = np.zeros((n_plsa_components, matrix.features_dim))
    plsa_em_components[:, plsa_feature_nonzero_indices] = np.genfromtxt(
        n_components_file, delimiter=",").astype('float64')
    variance_explained = np.genfromtxt(variance_explained_file,
                                       delimiter=",").astype('float64')

    ### reorder components by variance explained as PLSA binary gives arbitrary order
    new_order = range(n_plsa_components)
    variance_explained, new_order = zip(
        *sorted(zip(variance_explained, new_order), reverse=True))
    variance_explained = np.array(variance_explained)
    plsa_em_components = plsa_em_components[new_order, :]
    transformed_plsa_em_matrix = transformed_plsa_em_matrix[:, new_order]

    ### delete files
    cr_io.remove(transformed_plsa_em_matrix_file, allow_nonexisting=True)
    cr_io.remove(n_components_file, allow_nonexisting=True)
    cr_io.remove(variance_explained_file, allow_nonexisting=True)
    cr_io.remove(out_matrix_fn, allow_nonexisting=True)

    features_selected = np.array([
        f.id for f in matrix.feature_ref.feature_defs
    ])[plsa_feature_nonzero_indices]

    # sanity check dimensions
    assert transformed_plsa_em_matrix.shape == (matrix.bcs_dim,
                                                n_plsa_components)
    assert plsa_em_components.shape == (n_plsa_components, matrix.features_dim)
    assert variance_explained.shape == (n_plsa_components, )

    return PLSA(transformed_plsa_em_matrix, plsa_em_components,
                variance_explained, dispersion, features_selected)
Example #11
    def align(self, read1_fastq_fn, read2_fastq_fn,
              out_genome_bam_fn,
              threads, cwd=None,
              max_report_alignments_per_read=-1,
              read_group_tags=None):
        if cwd is None:
            cwd = os.getcwd()

        if read2_fastq_fn is None:
            read2_fastq_fn = ''

        args = [
            'STAR', '--genomeDir', self.reference_star_path,
            '--readFilesIn', read1_fastq_fn, read2_fastq_fn,
            '--outSAMmultNmax', str(max_report_alignments_per_read),
            '--runThreadN', str(threads),
            '--readNameSeparator', 'space',
            '--outSAMunmapped', 'Within',
            '--outSAMtype', 'SAM',
            '--outStd', 'SAM',
            '--outSAMorder', 'PairedKeepInputOrder',
        ]

        if read_group_tags is not None:
            args.append('--outSAMattrRGline')
            args.extend(read_group_tags)

        if read1_fastq_fn.endswith(cr_constants.GZIP_SUFFIX):
            args.extend(['--readFilesCommand', 'gzip -c -d'])
        if read1_fastq_fn.endswith(cr_constants.LZ4_SUFFIX):
            args.extend(['--readFilesCommand', 'lz4 -c -d'])

        if out_genome_bam_fn == cr_constants.BAM_FILE_STREAM:
            # stream to pipe for downstream processing
            # NOTE: this feature is unused in the standard pipeline
            # HACK: see https://github.com/pysam-developers/pysam/issues/355
            parent_read, child_write = os.pipe()
            try:
                tk_subproc.Popen(args, stdout=child_write)
            finally:
                os.close(child_write)
            os.dup2(parent_read, sys.stdin.fileno())
            # now streaming output can be read using pysam.Samfile('-', 'r')
            # NOTE: since this does not await termination of the process, we can't reliably check the return code
        else:
            star = tk_subproc.Popen(args, stdout=subprocess.PIPE, cwd=cwd)
            star_log = os.path.join(cwd, 'Log.out')

            with open(out_genome_bam_fn, 'w') as f:
                view_cmd = ['samtools', 'view', '-Sb', '-']
                view = tk_subproc.Popen(view_cmd, stdin=star.stdout, stdout=f, cwd=cwd)
                view.communicate()

            try:
                # Ensure that STAR process terminated so we can get a returncode
                star.communicate()
                cr_utils.check_completed_process(star, args[0])

                # check samtools status
                cr_utils.check_completed_process(view, ' '.join(view_cmd))

            except cr_utils.CRCalledProcessError as e:
                # Give the user the path to STAR's log
                raise cr_utils.CRCalledProcessError(e.msg + ' Check STAR logs for errors: %s .' % star_log)

            # check for empty BAM
            if tk_bam.bam_is_empty(out_genome_bam_fn):
                raise Exception('Aligned BAM is empty - check STAR logs for errors: %s .' % star_log)
Example #12
def run_command_safely(cmd, args):
    p = tk_subproc.Popen([cmd] + args, stderr=subprocess.PIPE)
    _, stderr_data = p.communicate()
    if p.returncode != 0:
        raise Exception("%s returned error code %d: %s" %
                        (p, p.returncode, stderr_data))
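A minimal usage sketch (the samtools invocation is illustrative only):

# Hypothetical call: raises if 'samtools index' exits nonzero
run_command_safely('samtools', ['index', 'sorted.bam'])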