Example #1
File: io.py  Project: xtmgah/seqc
    def check_links(cls, input_args: list) -> None:
        """determine if valid arguments were passed before initiating run,
        specifically whether s3 links exist

        :param input_args: list of files that should be checked
        """

        s3 = boto3.resource('s3')
        for infile in input_args:
            try:
                if infile.startswith('s3://'):
                    if not infile.endswith(
                            '/'):  # check that s3 link for file exists
                        bucket, key = cls.split_link(infile)
                        s3.meta.client.head_object(Bucket=bucket, Key=key)
                    else:
                        cmd = 'aws s3 ls ' + infile  # directory specified in s3 link
                        res = check_output(cmd.split())
                        if b'PRE ' in res:  # subdirectories present
                            raise ValueError
            except CalledProcessError:
                log.notify(
                    'Failed to access %s with "aws s3 ls", check your link' %
                    infile)
                raise
            except ValueError:
                log.notify(
                    'Error: Provided s3 link "%s" does not contain the proper '
                    'input files to SEQC.' % infile)
                raise
            except ClientError:
                raise ValueError('s3 file %s not found.' % infile)
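A minimal usage sketch (hypothetical bucket and key names; assumes the classmethod above lives on the S3 helper class in io.py and that AWS credentials are configured):

    from seqc import io

    # validate every remote input before starting a run; a bad link raises
    io.S3.check_links([
        's3://my-bucket/genomic_fastq/',
        's3://my-bucket/barcodes/barcode_files.p',
    ])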
Example #2
 def check_key_file(self):
     """Checks the rsa file is present"""
     if not self.rsa_key:
         log.notify('The key %s was not found!' % self.rsa_key)
         raise FileNotFoundError(
             self._error_msg,
             'The key file %s does not exist' % self.rsa_key)
Example #3
def low_count(molecules, is_invalid, plot=False, ax=None):
    """
    updates is_invalid to reflect cells whose molecule counts are below the inflection
    point of an ECDF constructed from cell molecule counts. Typically this reflects cells
    whose molecule counts are approximately <= 100.

    :param molecules: scipy.sparse.coo_matrix, molecule count matrix
    :param is_invalid:  np.ndarray(dtype=bool), declares valid and invalid cells
    :param bool plot: if True, plot a summary of the filter
    :param ax: Must be passed if plot is True. Indicates the axis on which to plot the
      summary.
    :return: is_invalid, np.ndarray(dtype=bool), updated valid and invalid cells
    """

    # copy, sort, and normalize molecule sums
    ms = np.ravel(molecules.tocsr()[~is_invalid, :].sum(axis=1))
    idx = np.argsort(ms)[::-1]  # largest cells first
    norm_ms = ms[idx] / ms[idx].sum()  # sorted, normalized array

    # identify inflection point from second derivative
    cms = np.cumsum(norm_ms)
    d1 = np.diff(pd.Series(cms).rolling(10).mean()[10:])
    d2 = np.diff(pd.Series(d1).rolling(10).mean()[10:])
    try:
        # throw out an extra 5% of cells from where the inflection point is found.
        # these cells are empirically determined to have "transition" library sizes
        # that confound downstream analysis
        inflection_pt = np.min(np.where(np.abs(d2) == 0)[0])
        inflection_pt = int(inflection_pt * 0.9)
    except ValueError as e:
        if e.args[0] == (
            "zero-size array to reduction operation minimum which has no " "identity"
        ):
            log.notify(
                "Low count filter passed-through; too few cells to estimate "
                "inflection point."
            )
            return is_invalid  # can't estimate validity
        else:
            raise

    vcrit = ms[idx][inflection_pt]

    is_invalid = is_invalid.copy()
    is_invalid[ms < vcrit] = True

    if plot and ax:
        cms /= np.max(cms)  # normalize to one
        ax.plot(np.arange(len(cms))[:inflection_pt], cms[:inflection_pt])
        ax.plot(np.arange(len(cms))[inflection_pt:], cms[inflection_pt:], c="indianred")
        ax.hlines(cms[inflection_pt], *ax.get_xlim(), linestyle="--")
        ax.vlines(inflection_pt, *ax.get_ylim(), linestyle="--")
        ax.set_xticklabels([])
        ax.set_xlabel("putative cell")
        ax.set_ylabel("ECDF (Cell Size)")
        ax.set_title("Cell Size")
        ax.set_ylim((0, 1))
        ax.set_xlim((0, len(cms)))

    return is_invalid
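A minimal usage sketch with synthetic data (hypothetical values; assumes low_count is importable, e.g. from seqc.filter, alongside its numpy/pandas dependencies):

    import numpy as np
    from scipy.sparse import coo_matrix

    rng = np.random.RandomState(0)
    # 500 putative cells x 200 genes; a Poisson draw stands in for real counts
    molecules = coo_matrix(rng.poisson(lam=5, size=(500, 200)))
    is_invalid = np.zeros(molecules.shape[0], dtype=bool)

    is_invalid = low_count(molecules, is_invalid)
    print(int(is_invalid.sum()), 'cells flagged as invalid')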
Example #4
File: ec2.py  Project: hisplan/seqc
 def wrapper(*args, **kwargs):
     retries = self.retries
     while True:
         try:
             return function(*args, **kwargs)
         except self.exceptions_to_catch:
             if retries > 0:
                 retries -= 1
                 if self.verbose:
                     log.notify(
                         "Non fatal error in function {} (retrying in "
                         "{!s}s):\n{}".format(
                             function.__qualname__,
                             self.delay_retry,
                             traceback.format_exc(),
                         ))
                 time.sleep(self.delay_retry)
             else:
                 raise RetryLimitExceeded(
                     "fatal error in function {} occurred {} times at {!s}s call "
                     "interval:\n{}".format(
                         function.__qualname__,
                         self.retries,
                         self.delay_retry,
                         traceback.format_exc(),
                     ))
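The wrapper above is the body of a retry decorator. A self-contained sketch of the same pattern (illustrative only, not the project's implementation; the attribute names retries, delay_retry and exceptions_to_catch mirror the snippet):

    import time
    import traceback

    class RetryLimitExceeded(Exception):
        pass

    class Retry:
        def __init__(self, retries=3, delay_retry=1.0, catch=(Exception,), verbose=True):
            self.retries = retries
            self.delay_retry = delay_retry
            self.exceptions_to_catch = catch
            self.verbose = verbose

        def __call__(self, function):
            def wrapper(*args, **kwargs):
                retries = self.retries
                while True:
                    try:
                        return function(*args, **kwargs)
                    except self.exceptions_to_catch:
                        if retries > 0:
                            retries -= 1
                            if self.verbose:
                                print('retrying %s in %ss' % (
                                    function.__qualname__, self.delay_retry))
                            time.sleep(self.delay_retry)
                        else:
                            raise RetryLimitExceeded(traceback.format_exc())
            return wrapper

    # usage: retry a flaky call up to 5 times, waiting 2s between attempts
    @Retry(retries=5, delay_retry=2, catch=(ConnectionError,))
    def flaky_call():
        ...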
Example #5
File: io.py  Project: xtmgah/seqc
    def download_awscli(cls,
                        link,
                        prefix='./',
                        overwrite=True,
                        recursive=False):
        """download file(s) at s3 address link to prefix using aws cli

        :param link:
        :param prefix:
        :param overwrite:
        :param recursive:
        :return list: all downloaded filenames
        """
        if prefix == '':
            prefix = './'

        if overwrite is False:
            raise ValueError(
                'downloads with awscli cannot control overwrite behavior; '
                'any existing files will be overwritten.')
        if not recursive and link.endswith('/'):
            raise ValueError(
                'provided link %s was a prefix but download was not called recursively. '
                'Please provide a filename or download recursively.' % link)

        cmd = 'aws s3 cp %s %s' % (link, prefix)
        if recursive:
            cmd += ' --recursive'

        omit_cmd = cmd + cls.awscli_omit_existing_files(
            link, prefix, recursive)

        # download the files
        p = Popen(omit_cmd, shell=True, stdout=PIPE, stderr=PIPE)
        out, err = p.communicate()
        if err:
            raise ChildProcessError(err.decode())

        # get the names of the files that were downloaded or already present
        d = Popen(cmd + ' --dryrun', shell=True, stdout=PIPE, stderr=PIPE)
        out, err = d.communicate()
        if err:
            raise ChildProcessError(err.decode())
        downloaded_files = sorted([
            line.strip().split()[-1]
            for line in out.decode().strip().split('\n') if line.strip()
        ])
        if log:
            log.notify('downloaded files:\n\t[%s]' %
                       ',\n\t'.join(downloaded_files))
        return downloaded_files
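A minimal usage sketch (hypothetical S3 paths; requires a configured aws CLI and assumes the classmethod above lives on the S3 helper class in io.py):

    from seqc import io

    # single file
    files = io.S3.download_awscli('s3://my-bucket/index/annotations.gtf', prefix='index/')

    # whole prefix, downloaded recursively
    files = io.S3.download_awscli('s3://my-bucket/barcodes/', prefix='barcodes/',
                                  recursive=True)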
Example #6
File: ec2.py  Project: hisplan/seqc
 def restart(self):
     """restarts a stopped instance"""
     if self.instance_id is None:
         raise RuntimeError(
             "Instance not yet created, nothing to be restarted.")
     instance = self.ec2.Instance(self.instance_id)
     if instance.state["Name"] == "stopped":
         instance.start()
         instance.wait_until_running()
         log.notify("Stopped instance %s has restarted." % self.instance_id)
     else:
         log.notify(
             'Instance %s in state "%s" must be in a stopped state to be '
             "restarted." % (self.instance_id, instance.state["Name"]))
Example #7
 def restart(self):
     """restarts a stopped instance"""
     if self.instance_id is None:
         raise RuntimeError(
             'Instance not yet created, nothing to be restarted.')
     instance = self.ec2.Instance(self.instance_id)
     if instance.state['Name'] == 'stopped':
         instance.start()
         instance.wait_until_running()
         log.notify('Stopped instance %s has restarted.' % self.instance_id)
     else:
         log.notify(
             'Instance %s in state "%s" must be in a stopped state to be '
             'restarted.' % (self.instance_id, instance.state['Name']))
Example #8
 def enable_ssh(cls, security_group_id):
     security_group = cls.ec2.SecurityGroup(security_group_id)
     try:
         security_group.authorize_ingress(IpProtocol="tcp",
                                          CidrIp="0.0.0.0/0",
                                          FromPort=22,
                                          ToPort=22)
         security_group.authorize_ingress(
             SourceSecurityGroupName=security_group.description)
     except ClientError as e:  # todo figure out why this is happening
         if 'InvalidPermission.Duplicate' not in e.args[0]:
             raise
     log.notify('Enabled ssh access via port 22 for security group %s' %
                security_group_id)
Example #9
    def create_security_group(cls, name=None):
        """creates a new security group

        :param str name: optional, name of the security group to create. Note that this
          name must be unique or an error will be thrown. Default: 'SEQC-' followed by a
          random 7-digit number.
        """

        # todo get list of existing groups; check against
        if name is None:
            name = 'SEQC-%07d' % random.randint(1, int(1e7))
        sg = cls.ec2.create_security_group(GroupName=name, Description=name)
        log.notify('Created new security group: %s (name=%s).' % (sg.id, name))
        return sg.id
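A minimal usage sketch combining this with enable_ssh from Example #8 (EC2Helper is a hypothetical stand-in for whichever class defines these classmethods; boto3 credentials must be configured):

    sg_id = EC2Helper.create_security_group()  # creates e.g. 'SEQC-0123456'
    EC2Helper.enable_ssh(sg_id)                # open port 22 on the new group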
Example #10
 def __enter__(self):
     try:
         self.setup_seqc()
     except:
         if self.synchronous and self.instance_id:
             log.notify(
                  'error occurred during setup, attempting instance termination'
             )
             log.exception()
             try:
                 self.terminate()
             except ClientError:
                 pass
         raise
     return self
Example #11
 def create_instance(self) -> None:
     if self.instance_id is not None:
         raise RuntimeError('instance %s already exists.' %
                            self.instance_id)
     if self.spot_bid:
         self.create_spot_instance()
     else:
         specification = self.launch_specification()
         specification['MinCount'] = specification['MaxCount'] = 1
         instance = self.ec2.create_instances(**specification)[0]
         self.instance_id = instance.id
         log.notify('instance %s created, waiting until running' %
                    self.instance_id)
         instance.wait_until_running()
         log.notify('instance %s in running state' % self.instance_id)
Example #12
File: ec2.py  Project: hisplan/seqc
    def write_script(cls, argv, function) -> str:
        """generate a bash script that runs SEQC

        :param list argv: the original command-line arguments supplied by user
        :param object function: function to be called
        :return str: filename of the python script
        """
        script_name = "{}{!s}_{}.py".format(os.environ["TMPDIR"],
                                            random.randint(0, 1e9),
                                            function.__name__)
        script_body = (
            "#!/bin/bash -x" + "\n"
            "\n"
            "SEQC " + " ".join(argv) + " --local" +
            (" --terminate" if "--no-terminate" not in argv else "") + "\n")

        with open(script_name, "wt") as f:
            log.notify("writing script to file:\n%s" % script_body)
            f.write(script_body)
        return script_name
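A minimal usage sketch (hypothetical; TMPDIR must be set in the environment because the method builds the script path from os.environ["TMPDIR"], and EC2Helper stands in for whichever class defines the method):

    import os
    import sys

    os.environ.setdefault('TMPDIR', '/tmp/')

    def run():  # stand-in for the function whose name tags the script
        pass

    script_path = EC2Helper.write_script(sys.argv[1:], run)
    print(open(script_path).read())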
Example #13
    def __exit__(self, exc_type, exc_val, exc_tb):
        """If an exception occurs, log the exception, email if possible, then terminate
        the aws instance if requested by the user

        :param exc_type: type of exception encountered
        :param exc_val: value of exception
        :param exc_tb: exception traceback
        """

        # log any exceptions, set email body based on error / terminate status

        if exc_type is not None:
            log.exception()
            email_body = 'Process interrupted -- see attached error message'
        elif self.terminate:
            email_body = 'Process completed successfully -- see attached log'
            log.info(
                'Execution completed successfully, instance will be terminated.'
            )
        else:
            email_body = 'Process completed successfully -- see attached log'
            log.info('Execution completed successfully, but user requested no '
                     'termination. Instance will continue to run.')

        # todo this is the source of the second email for successful runs
        # email user if possible; catch exceptions if email fails.
        if self.email and self.mutt:
            log.notify('Emailing user.')
            try:
                self.email_user(attachment=self.log_name,
                                email_body=email_body,
                                email_address=self.email)
            except ChildProcessError:
                log.exception()

        # upload data if requested
        if self.aws_upload_key:
            log.notify('Uploading log to {}'.format(self.aws_upload_key))
            bucket, key = io.S3.split_link(self.aws_upload_key)

            @Retry(catch=Exception)
            def upload_file():
                io.S3.upload_file(self.log_name, bucket, key)

            upload_file()

        # terminate if no errors and debug is False
        if self.terminate:
            if exc_type and self.debug:
                return  # don't terminate if an error was raised and debug was set
            instance_id = self._get_instance_id()
            if instance_id is None:
                return  # todo notify if verbose
            ec2 = boto3.resource('ec2')
            instance = ec2.Instance(instance_id)
            log.notify(
                'instance %s termination requested. If successful, this is the '
                'final log entry.' % instance_id)
            instance.terminate()
            instance.wait_until_terminated()
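The __exit__ above is the teardown half of a context manager; callers wrap an entire run in a with-block, as the run() function in Example #19 does:

    with ec2.instance_clean_up(
            email=args.email, upload=args.upload_prefix, log_name=args.log_name,
            debug=args.debug, terminate=args.terminate):
        ...  # exceptions raised here are logged, emailed if possible, and the
             # instance is terminated according to the flags above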
Example #14
    def mount_volume(ssh, directory='/home/ec2-user'):
        """mount /dev/xvdf to /data given an ssh client with access to an instance

        :param str directory: directory to mount the drive to. Note that odd behavior may
          be encountered if the home directory is not the mount target, since paramiko
          automatically uses the home directory as the point of execution.
        """
        try:  # pass errors related to the drive already being mounted
            log.notify("Formatting and mounting /dev/xvdf to %s" % directory)
            ssh.execute(
                "sudo mkfs -t ext4 /dev/xvdf 2>&1")  # redir; errors invisible
            ssh.execute("sudo cp -a %s/. /tmp/directory/" %
                        directory)  # copy original
            ssh.execute("sudo mkdir -p %s" % directory)
            ssh.execute(
                "sudo mount /dev/xvdf %s && sudo cp -a /tmp/directory/. %s/" %
                (directory, directory))
            ssh.execute("sudo chown ec2-user:ec2-user %s/lost+found && "
                        "chmod 755 %s/lost+found" % (directory, directory))
            log.notify("Successfully mounted new volume onto %s." % directory)
        except ChildProcessError as e:
            if not ('mount: according to mtab, /dev/xvdf is already mounted on %s'
                    % directory in ' '.join(e.args[0])):
                raise
Example #15
File: ec2.py  Project: hisplan/seqc
 def stop(self):
     """stops a running instance"""
     if self.instance_id is None:
         raise RuntimeError(
             "Instance not yet created, nothing to be stopped.")
     instance = self.ec2.Instance(self.instance_id)
     if instance.state["Name"] not in ("stopped", "terminated",
                                       "shutting-down"):
         log.notify("requesting termination of instance {id}".format(
             id=self.instance_id))
         instance.stop()
         instance.wait_until_stopped()
         log.notify("instance {id} stopped.".format(id=self.instance_id))
     else:
         log.notify("instance is not running")
Example #16
 def stop(self):
     """stops a running instance"""
     if self.instance_id is None:
         raise RuntimeError(
             'Instance not yet created, nothing to be stopped.')
     instance = self.ec2.Instance(self.instance_id)
     if instance.state['Name'] not in ('stopped', 'terminated',
                                       'shutting-down'):
          log.notify('requesting stop of instance {id}'.format(
             id=self.instance_id))
         instance.stop()
         instance.wait_until_stopped()
         log.notify('instance {id} stopped.'.format(id=self.instance_id))
     else:
         log.notify('instance is not running')
Example #17
 def terminate(self):
     """terminates an instance in any state (including stopped)"""
     if self.instance_id is None:
         raise RuntimeError(
              'Instance not yet created, nothing to be terminated.')
     instance = self.ec2.Instance(self.instance_id)
     if instance.state['Name'] not in ('terminated', 'shutting-down'):
         log.notify('requesting termination of instance {id}'.format(
             id=self.instance_id))
         instance.terminate()
         instance.wait_until_terminated()
         log.notify('instance {id} terminated.'.format(id=self.instance_id))
     else:
         log.notify(
             'Instance %s in state "%s" must be running to be stopped.' %
             (self.instance_id, instance.state['Name']))
Example #18
 def create_spot_instance(self):
     if not self.spot_bid:
         raise ValueError(
             'must pass constructor spot_bid price (float) to create a '
             'spot bid request.')
     response = self.client.request_spot_instances(
         DryRun=False,
         SpotPrice=str(self.spot_bid),
         LaunchSpecification=self.launch_specification())
     sir_id = response['SpotInstanceRequests'][0]['SpotInstanceRequestId']
     log.notify(
         'spot instance requested (%s), waiting for bid to be accepted.' %
         sir_id)
     self.instance_id = self.verify_spot_bid_fulfilled(sir_id)
     if self.instance_id is None:
         raise InstanceNotRunningError(
              'spot bid of %f was not fulfilled, please try a higher bid.' %
              self.spot_bid)
     log.notify(
         'spot bid accepted, waiting for instance (id=%s) to attain running '
         'state.' % self.instance_id)
     self.ec2.Instance(self.instance_id).wait_until_running()
     log.notify('spot instance (id=%s) in running state' % self.instance_id)
Example #19
File: run.py  Project: baranwa2/seqc
def run(args) -> None:
    """Run SEQC on the files provided in args, given specifications provided on the
    command line

    :param args: parsed argv, produced by seqc.parser(). This function is only called
      when args.subprocess_name is "run".
    """

    # import inside module for pickle functionality
    # top 2 only needed for post-filtering

    import os
    import multiprocessing
    from seqc import log, ec2, platforms, io
    from seqc.sequence import fastq
    from seqc.alignment import star
    from seqc.email_ import email_user
    from seqc.read_array import ReadArray
    from seqc.core import verify, download
    from seqc import filter
    from seqc.sequence.gtf import GeneIntervals
    from seqc.summary.summary import Section, Summary
    import numpy as np
    import scipy.io
    from shutil import copyfile
    from seqc.summary.summary import MiniSummary
    from seqc.stats.mast import run_mast
    import logging
    logger = logging.getLogger('weasyprint')
    logger.handlers = []  # Remove the default stderr handler
    logger.setLevel(100)
    logger.addHandler(logging.FileHandler('weasyprint.log'))

    def determine_start_point(arguments) -> (bool, bool, bool):
        """
        determine where seqc should start based on which parameters were passed.

        :param arguments: Namespace object, result of ArgumentParser.parse_args()
        :returns merge, align, process_bamfile: indicates whether merging, alignment, and
          processing bamfiles should be executed.
        """
        if arguments.read_array:
            return False, False, False
        if arguments.alignment_file:
            return False, False, True
        if arguments.merged_fastq:
            return False, True, True
        else:
            return True, True, True

    def download_input(dir_, arguments):
        """parse input arguments and download any necessary data

        :param str dir_: directory to download data to
        :param arguments: namespace object from argparse
        :return args: updated namespace object reflecting local file paths of downloaded
          files
        """
        # download basespace data if necessary
        if arguments.basespace:
            arguments.barcode_fastq, arguments.genomic_fastq = io.BaseSpace.download(
                arguments.platform, arguments.basespace, dir_, arguments.basespace_token)

        # check for remote fastq file links
        arguments.genomic_fastq = download.s3_data(
            arguments.genomic_fastq, dir_ + '/genomic_fastq/')
        arguments.barcode_fastq = download.s3_data(
            arguments.barcode_fastq, dir_ + '/barcode_fastq/')

        # get merged fastq file, unzip if necessary
        arguments.merged_fastq = (
            download.s3_data([arguments.merged_fastq], dir_ + '/')[0] if
            arguments.merged_fastq is not None else None)

        # check if the index must be downloaded
        if any((arguments.alignment_file, arguments.read_array)):
            index_link = arguments.index + 'annotations.gtf'
        else:
            index_link = arguments.index
        download.s3_data([index_link], dir_ + '/index/')
        arguments.index = dir_ + '/index/'

        # check if barcode files must be downloaded
        arguments.barcode_files = download.s3_data(
            arguments.barcode_files, dir_ + '/barcodes/')

        # check if alignment_file needs downloading
        if arguments.alignment_file:
            arguments.alignment_file = download.s3_data(
                [arguments.alignment_file], dir_ + '/')[0]

        # check if readarray needs downloading
        if arguments.read_array:
            arguments.read_array = download.s3_data([arguments.read_array], dir_ + '/')[0]

        return arguments

    def merge_fastq_files(
            technology_platform, barcode_fastq: [str], output_stem: str,
            genomic_fastq: [str]) -> (str, int):
        """annotates genomic fastq with barcode information; merging the two files.

        :param technology_platform: class from platforms.py that defines the
          characteristics of the data being processed
        :param barcode_fastq: list of str names of fastq files containing barcode
          information
        :param output_stem: str, stem for output files
        :param genomic_fastq: list of str names of fastq files containing genomic
          information
        :returns str merged_fastq: name of merged fastq file
        """

        log.info('Merging genomic reads and barcode annotations.')
        merged_fastq = fastq.merge_paired(
            merge_function=technology_platform.merge_function,
            fout=output_stem + '_merged.fastq',
            genomic=genomic_fastq,
            barcode=barcode_fastq)

        # delete genomic/barcode fastq files after merged.fastq creation
        log.info('Removing original fastq file for memory management.')
        delete_fastq = ' '.join(['rm'] + genomic_fastq + barcode_fastq)
        io.ProcessManager(delete_fastq).run_all()

        return merged_fastq

    def align_fastq_records(
            merged_fastq, dir_, star_args, star_index, n_proc,
            aws_upload_key) -> (str, io.ProcessManager):
        """
        Align fastq records.

        :param merged_fastq: str, path to merged .fastq file
        :param dir_: str, stem for output files
        :param star_args: dict, extra keyword arguments for STAR
        :param star_index: str, file path to directory containing STAR index
        :param n_proc: int, number of STAR processes to initiate
        :param aws_upload_key: str, location to upload files, or None if seqc was
          initiated from a merged fastq file.
        :return bamfile, upload_manager: (str, io.ProcessManager)
          name of the .bam file containing aligned reads, and a ProcessManager handling
          the merged fastq file (upload or removal)
        """
        log.info('Aligning merged fastq records.')
        alignment_directory = dir_ + '/alignments/'
        os.makedirs(alignment_directory, exist_ok=True)
        if star_args is not None:
            star_kwargs = dict(a.strip().split('=') for a in star_args)
        else:
            star_kwargs = {}
        bamfile = star.align(
            merged_fastq, star_index, n_proc, alignment_directory,
            **star_kwargs)

        if aws_upload_key:
            log.info('Gzipping merged fastq file.')
            if pigz:
                pigz_zip = "pigz --best -k -f {fname}".format(fname=merged_fastq)
            else:
                pigz_zip = "gzip -kf {fname}".format(fname=merged_fastq)
            pigz_proc = io.ProcessManager(pigz_zip)
            pigz_proc.run_all()
            pigz_proc.wait_until_complete()  # prevents slowing down STAR alignment
            merged_fastq += '.gz'  # reflect gzipped nature of file

            log.info('Uploading gzipped merged fastq file to S3.')
            merge_upload = 'aws s3 mv {fname} {s3link}'.format(
                fname=merged_fastq, s3link=aws_upload_key)
            upload_manager = io.ProcessManager(merge_upload)
            upload_manager.run_all()
        else:
            log.info('Removing merged fastq file for memory management.')
            rm_merged = 'rm %s' % merged_fastq
            io.ProcessManager(rm_merged).run_all()

            upload_manager = None
        return bamfile, upload_manager

    def create_read_array(bamfile, index, aws_upload_key, min_poly_t,
                          max_transcript_length):
        """Create or download a ReadArray object.

        :param max_transcript_length:
        :param str bamfile: filename of .bam file
        :param str index: directory containing index files
        :param str aws_upload_key: key where aws files should be uploaded
        :param int min_poly_t: minimum number of poly_t nucleotides for a read to be valid
        :returns ReadArray, UploadManager: ReadArray object, bamfile ProcessManager
        """
        log.info('Filtering aligned records and constructing record database.')
        # Construct translator
        translator = GeneIntervals(
            index + 'annotations.gtf', max_transcript_length=max_transcript_length)
        read_array = ReadArray.from_alignment_file(
            bamfile, translator, min_poly_t)

        # converting sam to bam and uploading to S3, else removing bamfile
        if aws_upload_key:
            log.info('Uploading bam file to S3.')
            upload_bam = 'aws s3 mv {fname} {s3link}{prefix}_Aligned.out.bam'.format(
                fname=bamfile, s3link=aws_upload_key, prefix=args.output_prefix)
            print(upload_bam)
            upload_manager = io.ProcessManager(upload_bam)
            upload_manager.run_all()
        else:
            log.info('Removing bamfile for memory management.')
            rm_bamfile = 'rm %s' % bamfile
            io.ProcessManager(rm_bamfile).run_all()
            upload_manager = None
        return read_array, upload_manager

    # ######################## MAIN FUNCTION BEGINS HERE ################################

    log.setup_logger(args.log_name)

    with ec2.instance_clean_up(
            email=args.email, upload=args.upload_prefix, log_name=args.log_name,
            debug=args.debug, terminate=args.terminate
    ):
        pigz, mutt = verify.executables('pigz', 'mutt')
        if mutt:
            log.notify('mutt executable identified, email will be sent when run '
                       'terminates. ')
        else:
            log.notify('mutt was not found on this machine; an email will not be sent to '
                       'the user upon termination of SEQC run.')

        # turn off lower coverage filter for 10x
        if (args.platform == "ten_x") or (args.platform == "ten_x_v2") or (args.platform == "ten_x_v3"):
            args.filter_low_coverage = False

        max_insert_size = args.max_insert_size
        if args.filter_mode == "scRNA-seq":
            # for scRNA-seq
            if (args.platform == "ten_x") or (args.platform == "ten_x_v2") or (args.platform == "ten_x_v3"):
                # set max_transcript_length (max_insert_size) = 10000
                max_insert_size = 10000
                log.notify("Full length transcripts are used for read mapping in 10x data.")
        elif args.filter_mode == "snRNA-seq":
            # for snRNA-seq
            # e.g. 2304700 # hg38
            # e.g. 4434881 # mm38
            max_insert_size = args.max_insert_size
        else:
            # all others
            max_insert_size = args.max_insert_size

        log.notify("max_insert_size is set to {}".format(max_insert_size))

        log.args(args)

        output_dir, output_prefix = os.path.split(args.output_prefix)
        if not output_dir:
            output_dir = '.'

        # check if the platform name provided is supported by seqc
        # todo move into verify for run
        platform_name = verify.platform_name(args.platform)
        platform = platforms.AbstractPlatform.factory(platform_name)  # returns platform

        n_processes = multiprocessing.cpu_count() - 1  # get number of processors

        merge, align, process_bamfile = determine_start_point(args)

        args = download_input(output_dir, args)

        if args.platform == "in_drop_v5":
            platform = platform.build_cb2_barcodes(args.barcode_files)
            log.notify("Built cb2 barcode hash for v5 barcodes.")

        if merge:
            if args.min_poly_t is None:  # estimate min_poly_t if it was not provided
                args.min_poly_t = filter.estimate_min_poly_t(
                    args.barcode_fastq, platform)
                log.notify('Estimated min_poly_t={!s}'.format(args.min_poly_t))

            args.merged_fastq = merge_fastq_files(
                platform, args.barcode_fastq, args.output_prefix, args.genomic_fastq)

        # SEQC was started from input other than fastq files
        if args.min_poly_t is None:
            args.min_poly_t = 0
            log.notify('Warning: SEQC started from step other than unmerged fastq with '
                       'empty --min-poly-t parameter. Continuing with --min-poly-t 0.')

        if align:
            upload_merged = args.upload_prefix if merge else None
            args.alignment_file, manage_merged = align_fastq_records(
                args.merged_fastq, output_dir, args.star_args,
                args.index, n_processes, upload_merged)
        else:
            manage_merged = None

        if process_bamfile:
            upload_bamfile = args.upload_prefix if align else None

            ra, manage_bamfile = create_read_array(
                args.alignment_file, args.index, upload_bamfile, args.min_poly_t,
                max_insert_size)

        else:
            manage_bamfile = None
            ra = ReadArray.load(args.read_array)

        # create the first summary section here
        status_filters_section = Section.from_status_filters(ra, 'initial_filtering.html')
        sections = [status_filters_section]

        # Skip over the corrections if read array is specified by the user
        if not args.read_array:

            # Correct barcodes
            log.info('Correcting barcodes and estimating error rates.')
            error_rate = platform.apply_barcode_correction(ra, args.barcode_files)

            # Resolve multimapping
            log.info('Resolving ambiguous alignments.')
            mm_results = ra.resolve_ambiguous_alignments()

            # correct errors
            log.info('Identifying RMT errors.')
            platform.apply_rmt_correction(ra, error_rate)

            # Apply low coverage filter
            if platform.filter_lonely_triplets:
                log.info('Filtering lonely triplet reads')
                ra.filter_low_coverage(alpha=args.low_coverage_alpha)

            log.info('Saving read array.')
            ra.save(args.output_prefix + '.h5')

            # Summary sections
            # create the sections for the summary object
            sections += [
                Section.from_cell_barcode_correction(ra, 'cell_barcode_correction.html'),
                Section.from_rmt_correction(ra, 'rmt_correction.html'),
                Section.from_resolve_multiple_alignments(mm_results, 'multialignment.html')]

        # create a dictionary to store output parameters
        mini_summary_d = dict()

        # filter non-cells
        log.info('Creating counts matrix.')
        sp_reads, sp_mols = ra.to_count_matrix(
            sparse_frame=True, genes_to_symbols=args.index + 'annotations.gtf')

        # Save sparse matrices
        log.info('Saving sparse matrices')
        scipy.io.mmwrite(args.output_prefix + '_sparse_read_counts.mtx', sp_reads.data)
        scipy.io.mmwrite(args.output_prefix + '_sparse_molecule_counts.mtx', sp_mols.data)
        # Indices
        df = np.array([np.arange(sp_reads.shape[0]), sp_reads.index]).T
        np.savetxt(
            args.output_prefix + '_sparse_counts_barcodes.csv', df,
            fmt='%d', delimiter=',')
        # Columns
        df = np.array([np.arange(sp_reads.shape[1]), sp_reads.columns]).T
        np.savetxt(
            args.output_prefix + '_sparse_counts_genes.csv', df,
            fmt='%s', delimiter=',')

        log.info('Creating filtered counts matrix.')
        cell_filter_figure = args.output_prefix + '_cell_filters.png'

        # By pass low count filter for mars seq
        sp_csv, total_molecules, molecules_lost, cells_lost, cell_description = (
            filter.create_filtered_dense_count_matrix(
                sp_mols, sp_reads, mini_summary_d, plot=True, figname=cell_filter_figure,
                filter_low_count=platform.filter_low_count,
                filter_mitochondrial_rna=args.filter_mitochondrial_rna,
                filter_low_coverage=args.filter_low_coverage,
                filter_low_gene_abundance=args.filter_low_gene_abundance))

        # Output files
        files = [cell_filter_figure,
                 args.output_prefix + '.h5',
                 args.output_prefix + '_sparse_read_counts.mtx',
                 args.output_prefix + '_sparse_molecule_counts.mtx',
                 args.output_prefix + '_sparse_counts_barcodes.csv',
                 args.output_prefix + '_sparse_counts_genes.csv']

        # Summary sections
        # create the sections for the summary object
        sections += [
            Section.from_cell_filtering(cell_filter_figure, 'cell_filtering.html'),
            Section.from_run_time(args.log_name, 'seqc_log.html')]

        # get alignment summary
        if os.path.isfile(output_dir + '/alignments/Log.final.out'):
            os.rename(output_dir + '/alignments/Log.final.out',
                      output_dir + '/' + args.output_prefix + '_alignment_summary.txt')

            # Upload files and summary sections
            files += [output_dir + '/' + args.output_prefix + '_alignment_summary.txt']
            sections.insert(
                0, Section.from_alignment_summary(
                    output_dir + '/' + args.output_prefix + '_alignment_summary.txt',
                    'alignment_summary.html'))

        cell_size_figure = 'cell_size_distribution.png'
        index_section = Section.from_final_matrix(
            sp_csv, cell_size_figure, 'cell_distribution.html')
        seqc_summary = Summary(
            output_dir + '/' + args.output_prefix + '_summary', sections, index_section)
        seqc_summary.prepare_archive()
        seqc_summary.import_image(cell_filter_figure)
        seqc_summary.import_image(cell_size_figure)
        seqc_summary.render()
        summary_archive = seqc_summary.compress_archive()
        files += [summary_archive]

        # Create a mini summary section
        alignment_summary_file = output_dir + '/' + args.output_prefix + '_alignment_summary.txt'
        seqc_mini_summary = MiniSummary(
            args.output_prefix, mini_summary_d, alignment_summary_file, cell_filter_figure,
            cell_size_figure)
        seqc_mini_summary.compute_summary_fields(ra, sp_csv)
        seqc_mini_summary_json, seqc_mini_summary_pdf = seqc_mini_summary.render()
        files += [seqc_mini_summary_json, seqc_mini_summary_pdf]

        # Running MAST for differential analysis
        # file storing the list of differentially expressed genes for each cluster
        de_gene_list_file = run_mast(
            seqc_mini_summary.get_counts_filtered(), seqc_mini_summary.get_clustering_result(),
            args.output_prefix)
        files += [de_gene_list_file]

        # adding the cluster column and write down gene-cell count matrix
        dense_csv = args.output_prefix + '_dense.csv'
        sp_csv.insert(loc=0, column='CLUSTER', value=seqc_mini_summary.get_clustering_result())
        sp_csv.to_csv(dense_csv)
        files += [dense_csv]

        if args.upload_prefix:
            # Upload count matrices files, logs, and return
            bucket, key = io.S3.split_link(args.upload_prefix)
            for item in files:
                try:
                    ec2.Retry(retries=5)(io.S3.upload_file)(item, bucket, key)
                    item_name = item.split('/')[-1]
                    log.info('Successfully uploaded %s to the specified S3 location '
                             '"%s%s".' % (item, args.upload_prefix, item_name))
                except FileNotFoundError:
                    log.notify('Item %s was not found! Continuing with upload...' % item)

        if manage_merged:
            manage_merged.wait_until_complete()
            log.info('Successfully uploaded %s to the specified S3 location "%s"' %
                     (args.merged_fastq, args.upload_prefix))
        if manage_bamfile:
            manage_bamfile.wait_until_complete()
            log.info('Successfully uploaded %s to the specified S3 location "%s"'
                     % (args.alignment_file, args.upload_prefix))

        log.info('SEQC run complete. Cluster will be terminated')

        # upload logs
        if args.upload_prefix:
            # Upload count matrices files, logs, and return
            bucket, key = io.S3.split_link(args.upload_prefix)
            for item in [args.log_name, './nohup.log']:
                try:
                    # Make a copy of the file with the output prefix
                    copyfile(item, args.output_prefix + '_' + item)
                    print(args.output_prefix + '_' + item)
                    ec2.Retry(retries=5)(io.S3.upload_file)(
                        args.output_prefix + '_' + item, bucket, key)
                    log.info('Successfully uploaded %s to the specified S3 location '
                             '"%s".' % (item, args.upload_prefix))
                except FileNotFoundError:
                    log.notify('Item %s was not found! Continuing with upload...' % item)

        # todo local test does not send this email
        if mutt:
            email_body = (
                '<font face="Courier New, Courier, monospace">'
                'SEQC RUN COMPLETE.\n\n'
                'The run log has been attached to this email and '
                'results are now available in the S3 location you specified: '
                '"%s"\n\n' % args.upload_prefix)
            email_body = email_body.replace('\n', '<br>').replace('\t', '&emsp;')
            email_user(summary_archive, email_body, args.email)
Example #20
    def setup_seqc(self):
        if self.instance_id is None:
            self.create_instance()
        with SSHConnection(instance_id=self.instance_id,
                           rsa_key=self.rsa_key) as ssh:
            self.mount_volume(ssh)
            log.notify('setting aws credentials.')
            self.set_credentials(ssh)

            try:  # test the installation
                ssh.execute('SEQC -h')
                log.notify('SEQC found preinstalled.')
            except:
                log.notify(
                    'uploading local SEQC installation to remote instance.')
                seqc_distribution = os.path.expanduser('~/.seqc/seqc.tar.gz')
                ssh.execute('mkdir -p software/seqc')
                ssh.put_file(seqc_distribution, 'software/seqc.tar.gz')
                ssh.execute(
                    'tar -m -xvf software/seqc.tar.gz -C software/seqc --strip-components 1'
                )
                log.notify(
                    "Sources are uploaded and decompressed, installing seqc.")
                try:
                    ssh.execute('sudo -H pip3 install -e software/seqc/')
                except ChildProcessError as e:
                    if 'pip install --upgrade pip' in str(e):
                        pass
                    else:
                        raise

                try:  # test the installation
                    ssh.execute('SEQC -h')
                except:
                    log.notify('SEQC installation failed.')
                    log.exception()
                    raise
            log.notify('SEQC setup complete.')
            log.notify('instance login: %s' % ssh.login_command())
Example #21
def index(args):
    """create an index for SEQC.

    :param args: parsed arguments. This function is only called if subprocess_name is
      'index'
    """

    # functions to be pickled and run remotely must import all their own modules
    import sys
    import logging
    from seqc import ec2, log, io
    from seqc.sequence.index import Index
    from seqc.alignment import star
    from seqc import version

    logging.basicConfig(
        level=logging.DEBUG,
        handlers=[
            logging.FileHandler(args.log_name),
            logging.StreamHandler(sys.stdout),
        ],
    )

    log.info("SEQC=v{}".format(version.__version__))
    log.info("STAR=v{}".format(star.get_version()))
    log.args(args)

    with ec2.instance_clean_up(
        email=args.email,
        upload=args.upload_prefix,
        log_name=args.log_name,
        debug=args.debug,
        terminate=args.terminate,
        running_remote=args.remote,
    ):

        idx = Index(args.organism, args.ids, args.folder)
        idx.create_index(
            s3_location=args.upload_prefix,
            ensemble_release=args.ensemble_release,
            read_length=args.read_length,
            valid_biotypes=args.valid_biotypes,
        )

        # upload the log file (seqc_log.txt, nohup.log, Log.out)
        if args.upload_prefix:
            bucket, key = io.S3.split_link(args.upload_prefix)
            for item in [args.log_name, "./nohup.log", "./Log.out"]:
                try:
                    ec2.Retry(retries=5)(io.S3.upload_file)(item, bucket, key)
                    log.info(
                        "Successfully uploaded {} to {}".format(
                            item, args.upload_prefix
                        )
                    )
                except FileNotFoundError:
                    log.notify(
                        "Item {} was not found! Continuing with upload...".format(item)
                    )

    log.info("DONE.")
Example #22
File: run.py  Project: dpeerlab/seqc
def run(args) -> None:
    """Run SEQC on the files provided in args, given specifications provided on the
    command line

    :param args: parsed argv, produced by seqc.parser(). This function is only called
      when args.subprocess_name is "run".
    """

    # import inside module for pickle functionality
    # top 2 only needed for post-filtering

    import os
    import multiprocessing
    from seqc import log, ec2, platforms, io, version
    from seqc.sequence import fastq
    from seqc.alignment import star
    from seqc.alignment import sam
    from seqc.email_ import email_user
    from seqc.read_array import ReadArray
    from seqc.core import verify, download
    from seqc import filter
    from seqc.sequence.gtf import GeneIntervals
    from seqc.summary.summary import Section, Summary
    import numpy as np
    import scipy.io
    from shutil import copyfile
    from shutil import move as movefile
    from seqc.summary.summary import MiniSummary
    from seqc.stats.mast import run_mast
    import logging
    import pickle
    import pendulum

    # logger = logging.getLogger('weasyprint')
    # logger.handlers = []  # Remove the default stderr handler
    # logger.setLevel(100)
    # logger.addHandler(logging.FileHandler('weasyprint.log'))

    def determine_start_point(arguments) -> (bool, bool, bool):
        """
        determine where seqc should start based on which parameters were passed.

        :param arguments: Namespace object, result of ArgumentParser.parse_args()
        :returns merge, align, process_bamfile: indicates whether merging, alignment, and
          processing bamfiles should be executed.
        """
        if arguments.read_array:
            return False, False, False
        if arguments.alignment_file:
            return False, False, True
        if arguments.merged_fastq:
            return False, True, True
        else:
            return True, True, True

    def download_input(dir_, arguments):
        """parse input arguments and download any necessary data

        :param str dir_: directory to download data to
        :param arguments: namespace object from argparse
        :return args: updated namespace object reflecting local file paths of downloaded
          files
        """
        # download basespace data if necessary
        if arguments.basespace:
            arguments.barcode_fastq, arguments.genomic_fastq = io.BaseSpace.download(
                arguments.platform, arguments.basespace, dir_,
                arguments.basespace_token)

        # get a list of input FASTQ files
        # download from AWS S3 if the URI is prefixed with s3://
        arguments.genomic_fastq = download.s3_data(arguments.genomic_fastq,
                                                   dir_ + "/genomic_fastq/")
        arguments.barcode_fastq = download.s3_data(arguments.barcode_fastq,
                                                   dir_ + "/barcode_fastq/")

        # get merged fastq file, unzip if necessary
        arguments.merged_fastq = (download.s3_data(
            [arguments.merged_fastq], dir_ +
            "/")[0] if arguments.merged_fastq is not None else None)

        # get a path to the STAR index files
        # download from AWS S3 if the URI is prefixed with s3://
        if any((arguments.alignment_file, arguments.read_array)):
            index_link = arguments.index + "annotations.gtf"
        else:
            index_link = arguments.index
        index_files = download.s3_data([index_link], dir_ + "/index/")
        # use the first filename in the list to get the index directory
        # add a trailing slash so the rest of the code keeps working;
        # e.g. test-data/index/chrStart.txt --> test-data/index/
        arguments.index = os.path.dirname(index_files[0]) + "/"

        # get a list of whitelisted barcodes files
        # download from AWS S3 if the URI is prefixed with s3://
        arguments.barcode_files = download.s3_data(arguments.barcode_files,
                                                   dir_ + "/barcodes/")

        # check if `alignment_file` is specified
        if arguments.alignment_file:
            # get the alignment filename (*.bam)
            # download from AWS S3 if the URI is prefixed with s3://
            arguments.alignment_file = download.s3_data(
                [arguments.alignment_file], dir_ + "/")[0]

        # check if `read_array` is specified
        if arguments.read_array:
            # get the readarray filename (*.h5)
            # download from AWS S3 if the URI is prefixed with s3://
            arguments.read_array = download.s3_data([arguments.read_array],
                                                    dir_ + "/")[0]

        return arguments

    def merge_fastq_files(
        technology_platform,
        barcode_fastq: [str],
        output_stem: str,
        genomic_fastq: [str],
    ) -> (str, int):
        """annotates genomic fastq with barcode information; merging the two files.

        :param technology_platform: class from platforms.py that defines the
          characteristics of the data being processed
        :param barcode_fastq: list of str names of fastq files containing barcode
          information
        :param output_stem: str, stem for output files
        :param genomic_fastq: list of str names of fastq files containing genomic
          information
        :returns str merged_fastq: name of merged fastq file
        """

        # hack:
        # because glob ordering is not platform agnostic,
        # it is possible that L001_R1 is merged with L002_R2 (not L001_R2).
        # To avoid this problem, we first sort the file lists.
        # This is a temporary workaround.
        barcode_fastq = sorted(barcode_fastq)
        genomic_fastq = sorted(genomic_fastq)

        log.info("Merging genomic reads and barcode annotations.")
        for bar_fq, gen_fq in zip(barcode_fastq, genomic_fastq):
            log.info("Merge {} with {}".format(os.path.basename(bar_fq),
                                               os.path.basename(gen_fq)))

        merged_fastq = fastq.merge_paired(
            merge_function=technology_platform.merge_function,
            fout=output_stem + "_merged.fastq",
            genomic=genomic_fastq,
            barcode=barcode_fastq,
        )

        # delete genomic/barcode fastq files after merged.fastq creation
        # log.info('Removing original fastq file for memory management.')
        # delete_fastq = ' '.join(['rm'] + genomic_fastq + barcode_fastq)
        # io.ProcessManager(delete_fastq).run_all()

        return merged_fastq

    def align_fastq_records(merged_fastq, dir_, star_args, star_index, n_proc,
                            aws_upload_key) -> (str, io.ProcessManager):
        """
        Align fastq records.

        :param merged_fastq: str, path to merged .fastq file
        :param dir_: str, stem for output files
        :param star_args: dict, extra keyword arguments for STAR
        :param star_index: str, file path to directory containing STAR index
        :param n_proc: int, number of STAR processes to initiate
        :param aws_upload_key: str, location to upload files, or None if seqc was
          initiated from a merged fastq file.
        :return bamfile, upload_manager: (str, io.ProcessManager)
          name of the .bam file containing aligned reads, and a ProcessManager handling
          the merged fastq file upload (or None)
        """
        log.info("Aligning merged fastq records.")
        alignment_directory = dir_ + "/alignments/"
        os.makedirs(alignment_directory, exist_ok=True)
        if star_args is not None:
            star_kwargs = dict(a.strip().split("=") for a in star_args)
        else:
            star_kwargs = {}
        bamfile = star.align(merged_fastq, star_index, n_proc,
                             alignment_directory, **star_kwargs)

        log.info("Gzipping merged fastq file.")
        if pigz:
            pigz_zip = "pigz --best -f {fname}".format(fname=merged_fastq)
        else:
            pigz_zip = "gzip -f {fname}".format(fname=merged_fastq)
        pigz_proc = io.ProcessManager(pigz_zip)
        pigz_proc.run_all()
        pigz_proc.wait_until_complete()  # prevents slowing down STAR alignment
        merged_fastq += ".gz"  # reflect gzipped nature of file

        if aws_upload_key:
            log.info("Uploading gzipped merged fastq file to S3.")
            merge_upload = "aws s3 mv {fname} {s3link}".format(
                fname=merged_fastq, s3link=aws_upload_key)
            upload_manager = io.ProcessManager(merge_upload)
            upload_manager.run_all()
        else:
            #     log.info('Removing merged fastq file for memory management.')
            #     rm_merged = 'rm %s' % merged_fastq
            #     io.ProcessManager(rm_merged).run_all()

            upload_manager = None
        return bamfile, upload_manager

    def create_read_array(bamfile, index, aws_upload_key, min_poly_t,
                          max_transcript_length):
        """Create or download a ReadArray object.

        :param max_transcript_length:
        :param str bamfile: filename of .bam file
        :param str index: directory containing index files
        :param str aws_upload_key: key where aws files should be uploaded
        :param int min_poly_t: minimum number of poly_t nucleotides for a read to be valid
        :returns ReadArray, UploadManager: ReadArray object, bamfile ProcessManager
        """
        log.info("Filtering aligned records and constructing record database.")
        # Construct translator
        translator = GeneIntervals(index + "annotations.gtf",
                                   max_transcript_length=max_transcript_length)
        read_array, read_names = ReadArray.from_alignment_file(
            bamfile, translator, min_poly_t)

        # converting sam to bam and uploading to S3, else removing bamfile
        if aws_upload_key:
            log.info("Uploading bam file to S3.")
            upload_bam = "aws s3 mv {fname} {s3link}{prefix}_Aligned.out.bam".format(
                fname=bamfile,
                s3link=aws_upload_key,
                prefix=args.output_prefix)
            print(upload_bam)
            upload_manager = io.ProcessManager(upload_bam)
            upload_manager.run_all()
        else:
            if os.path.exists(bamfile):
                movefile(bamfile, args.output_prefix + "_Aligned.out.bam")
            #     log.info('Removing bamfile for memory management.')
            #     rm_bamfile = 'rm %s' % bamfile
            #     io.ProcessManager(rm_bamfile).run_all()
            upload_manager = None
        return read_array, upload_manager, read_names

    # ######################## MAIN FUNCTION BEGINS HERE ################################

    log.setup_logger(args.log_name, args.debug)

    with ec2.instance_clean_up(
            email=args.email,
            upload=args.upload_prefix,
            log_name=args.log_name,
            debug=args.debug,
            terminate=args.terminate,
            running_remote=args.remote,
    ):

        start_run_time = pendulum.now()

        log.notify("SEQC=v{}".format(version.__version__))
        log.notify("STAR=v{}".format(star.get_version()))
        log.notify("samtools=v{}".format(sam.get_version()))

        pigz, mutt = verify.executables("pigz", "mutt")
        if mutt:
            log.notify(
                "mutt executable identified, email will be sent when run "
                "terminates. ")
        else:
            log.notify(
                "mutt was not found on this machine; an email will not be sent to "
                "the user upon termination of SEQC run.")

        # turn off lower coverage filter for 10x
        if ((args.platform == "ten_x") or (args.platform == "ten_x_v2")
                or (args.platform == "ten_x_v3")):
            args.filter_low_coverage = False

        if args.platform == "ten_x_v2" or args.platform == "ten_x_v3":
            log.notify("Setting min_poly_t=0 for 10x v2 & v3")
            args.min_poly_t = 0

        # max_insert_size defaults to the user-supplied value for snRNA-seq
        # (e.g. 2304700 for hg38, 4434881 for mm38) and all other modes
        max_insert_size = args.max_insert_size
        if args.filter_mode == "scRNA-seq" and args.platform in (
                "ten_x", "ten_x_v2", "ten_x_v3"):
            # use full-length transcripts: set max_transcript_length (max_insert_size) to 10000
            max_insert_size = 10000
            log.notify(
                "Full length transcripts are used for read mapping in 10x data.")

        log.notify("max_insert_size is set to {}".format(max_insert_size))

        log.args(args)

        # e.g.
        # --output-prefix=test-data/_outs/test
        # output_dir=test-data
        # output_prefix=test
        output_dir, output_prefix = os.path.split(args.output_prefix)
        if not output_dir:
            output_dir = "."
        else:
            os.makedirs(output_dir, exist_ok=True)

        # check if the platform name provided is supported by seqc
        # todo move into verify for run
        platform_name = verify.platform_name(args.platform)
        platform = platforms.AbstractPlatform.factory(platform_name)

        n_processes = multiprocessing.cpu_count() - 1  # use all but one available core

        merge, align, process_bamfile = determine_start_point(args)

        args = download_input(output_dir, args)

        if args.platform == "in_drop_v5":
            platform = platform.build_cb2_barcodes(args.barcode_files)
            log.notify("Built cb2 barcode hash for v5 barcodes.")

        if merge:
            if args.min_poly_t is None:  # estimate min_poly_t if it was not provided
                args.min_poly_t = filter.estimate_min_poly_t(
                    args.barcode_fastq, platform)
                log.notify("Estimated min_poly_t={!s}".format(args.min_poly_t))

            args.merged_fastq = merge_fastq_files(platform, args.barcode_fastq,
                                                  args.output_prefix,
                                                  args.genomic_fastq)

        # SEQC was started from input other than fastq files
        if args.min_poly_t is None:
            args.min_poly_t = 0
            log.warn(
                "Warning: SEQC started from step other than unmerged fastq with "
                "empty --min-poly-t parameter. Continuing with --min-poly-t 0."
            )

        if align:
            upload_merged = args.upload_prefix if merge else None
            args.alignment_file, manage_merged = align_fastq_records(
                args.merged_fastq,
                output_dir,
                args.star_args,
                args.index,
                n_processes,
                upload_merged,
            )
        else:
            manage_merged = None

        if process_bamfile:
            # if the starting point was a BAM file (i.e. args.alignment_file=*.bam & align=False)
            # do not upload by setting this to None
            upload_bamfile = args.upload_prefix if align else None

            ra, manage_bamfile, read_names = create_read_array(
                args.alignment_file,
                args.index,
                upload_bamfile,
                args.min_poly_t,
                max_insert_size,
            )
        else:
            manage_bamfile = None
            ra = ReadArray.load(args.read_array)
            # fixme: the old read_array doesn't have read_names
            read_names = None

        # create the first summary section here
        status_filters_section = Section.from_status_filters(
            ra, "initial_filtering.html")
        sections = [status_filters_section]

        # Skip over the corrections if read array is specified by the user
        if not args.read_array:

            # Correct barcodes
            log.info("Correcting barcodes and estimating error rates.")
            error_rate, df_cb_correction = platform.apply_barcode_correction(
                ra, args.barcode_files)
            if df_cb_correction is not None and len(df_cb_correction) > 0:
                df_cb_correction.to_csv(
                    args.output_prefix + "_cb-correction.csv.gz",
                    index=False,
                    compression="gzip",
                )

            # Resolve multimapping
            log.info("Resolving ambiguous alignments.")
            mm_results = ra.resolve_ambiguous_alignments()

            # 121319782799149 / 614086965 / pos=49492038 / AAACATAACG
            # 121319782799149 / 512866590 / pos=49490848 / TCAATTAATC (1 hemming dist away from TCAATTAATT)
            # ra.data["rmt"][91490] = 512866590
            # ra.positions[91490] = 49492038

            # correct errors
            log.info("Identifying RMT errors.")
            df_umi_correction = platform.apply_rmt_correction(ra, error_rate)
            if df_umi_correction is not None and len(df_umi_correction) > 0:
                df_umi_correction.to_csv(
                    args.output_prefix + "_umi-correction.csv.gz",
                    index=False,
                    compression="gzip",
                )

            # Apply low coverage filter
            if platform.filter_lonely_triplets:
                log.info("Filtering lonely triplet reads")
                ra.filter_low_coverage(alpha=args.low_coverage_alpha)

            log.info("Saving read array.")
            ra.save(args.output_prefix + ".h5")

            # generate a file with read_name, corrected cb, corrected umi
            # read_name already has pre-corrected cb & umi
            # log.info("Saving correction information.")
            # ra.create_readname_cb_umi_mapping(
            #     read_names, args.output_prefix + "_correction.csv.gz"
            # )

            # Summary sections
            # create the sections for the summary object
            sections += [
                Section.from_cell_barcode_correction(
                    ra, "cell_barcode_correction.html"),
                Section.from_rmt_correction(ra, "rmt_correction.html"),
                Section.from_resolve_multiple_alignments(
                    mm_results, "multialignment.html"),
            ]

        # create a dictionary to store output parameters
        mini_summary_d = dict()

        # filter non-cells
        log.info("Creating counts matrix.")
        sp_reads, sp_mols = ra.to_count_matrix(
            sparse_frame=True, genes_to_symbols=args.index + "annotations.gtf")

        # generate 10x compatible count matrix
        log.info("Creating 10x compatible counts matrix.")
        ra.to_10x_count_matrix(genes_to_symbols=args.index + "annotations.gtf")

        # Save sparse matrices
        log.info("Saving sparse matrices")
        scipy.io.mmwrite(args.output_prefix + "_sparse_read_counts.mtx",
                         sp_reads.data)
        scipy.io.mmwrite(args.output_prefix + "_sparse_molecule_counts.mtx",
                         sp_mols.data)
        # row indices: cell barcodes
        df = np.array([np.arange(sp_reads.shape[0]), sp_reads.index]).T
        np.savetxt(
            args.output_prefix + "_sparse_counts_barcodes.csv",
            df,
            fmt="%d",
            delimiter=",",
        )
        # column indices: gene symbols
        df = np.array([np.arange(sp_reads.shape[1]), sp_reads.columns]).T
        np.savetxt(args.output_prefix + "_sparse_counts_genes.csv",
                   df,
                   fmt="%s",
                   delimiter=",")

        log.info("Creating filtered counts matrix.")
        cell_filter_figure = args.output_prefix + "_cell_filters.png"

        # bypass the low count filter for MARS-seq (controlled by platform.filter_low_count)
        (
            sp_csv,
            total_molecules,
            molecules_lost,
            cells_lost,
            cell_description,
        ) = filter.create_filtered_dense_count_matrix(
            sp_mols,
            sp_reads,
            mini_summary_d,
            plot=True,
            figname=cell_filter_figure,
            filter_low_count=platform.filter_low_count,
            filter_mitochondrial_rna=args.filter_mitochondrial_rna,
            filter_low_coverage=args.filter_low_coverage,
            filter_low_gene_abundance=args.filter_low_gene_abundance,
        )

        # Output files
        files = [
            cell_filter_figure,
            args.output_prefix + ".h5",
            args.output_prefix + "_sparse_read_counts.mtx",
            args.output_prefix + "_sparse_molecule_counts.mtx",
            args.output_prefix + "_sparse_counts_barcodes.csv",
            args.output_prefix + "_sparse_counts_genes.csv",
            "raw_feature_bc_matrix/matrix.mtx.gz",
            "raw_feature_bc_matrix/barcodes.tsv.gz",
            "raw_feature_bc_matrix/features.tsv.gz",
        ]

        if os.path.exists(args.output_prefix + "_cb-correction.csv.gz"):
            files.append(args.output_prefix + "_cb-correction.csv.gz")
        if os.path.exists(args.output_prefix + "_umi-correction.csv.gz"):
            files.append(args.output_prefix + "_umi-correction.csv.gz")

        # Summary sections
        # create the sections for the summary object
        sections += [
            Section.from_cell_filtering(cell_filter_figure,
                                        "cell_filtering.html"),
            Section.from_run_time(args.log_name, "seqc_log.html"),
        ]

        # get alignment summary
        if os.path.isfile(output_dir + "/alignments/Log.final.out"):
            os.rename(
                output_dir + "/alignments/Log.final.out",
                args.output_prefix + "_alignment_summary.txt",
            )

            # Upload files and summary sections
            files += [args.output_prefix + "_alignment_summary.txt"]
            sections.insert(
                0,
                Section.from_alignment_summary(
                    args.output_prefix + "_alignment_summary.txt",
                    "alignment_summary.html",
                ),
            )

        cell_size_figure = args.output_prefix + "_cell_size_distribution.png"
        index_section = Section.from_final_matrix(sp_csv, cell_size_figure,
                                                  "cell_distribution.html")
        seqc_summary = Summary(args.output_prefix + "_summary", sections,
                               index_section)
        seqc_summary.prepare_archive()
        seqc_summary.import_image(cell_filter_figure)
        seqc_summary.import_image(cell_size_figure)
        seqc_summary.render()

        # create a .tar.gz with `test_summary/*`
        summary_archive = seqc_summary.compress_archive()
        files += [summary_archive]

        # Create a mini summary section
        alignment_summary_file = args.output_prefix + "_alignment_summary.txt"
        seqc_mini_summary = MiniSummary(
            output_dir,
            output_prefix,
            mini_summary_d,
            alignment_summary_file,
            cell_filter_figure,
            cell_size_figure,
        )
        seqc_mini_summary.compute_summary_fields(ra, sp_csv)
        seqc_mini_summary_json, seqc_mini_summary_pdf = seqc_mini_summary.render()
        files += [seqc_mini_summary_json, seqc_mini_summary_pdf]

        # Running MAST for differential analysis
        # file storing the list of differentially expressed genes for each cluster
        de_gene_list_file = run_mast(
            seqc_mini_summary.get_counts_filtered(),
            seqc_mini_summary.get_clustering_result(),
            args.output_prefix,
        )
        files += [de_gene_list_file]

        # add the CLUSTER column and write out the dense gene-by-cell count matrix
        dense_csv = args.output_prefix + "_dense.csv"
        sp_csv.insert(loc=0,
                      column="CLUSTER",
                      value=seqc_mini_summary.get_clustering_result())
        sp_csv.to_csv(dense_csv)
        files += [dense_csv]

        if args.upload_prefix:
            # Upload count matrices files, logs, and return
            bucket, key = io.S3.split_link(args.upload_prefix)
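            # ec2.Retry(retries=5)(io.S3.upload_file) wraps the upload in a retrying
            # callable, re-attempting the call on failure up to five times before the
            # exception propagates (behavior inferred from its usage here).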
            for item in files:
                try:
                    ec2.Retry(retries=5)(io.S3.upload_file)(item, bucket, key)
                    item_name = item.split("/")[-1]
                    log.info('Successfully uploaded %s to "%s%s".' %
                             (item, args.upload_prefix, item_name))
                except FileNotFoundError:
                    log.notify(
                        "Item %s was not found! Continuing with upload..." %
                        item)

        if manage_merged:
            manage_merged.wait_until_complete()
            log.info('Successfully uploaded %s to "%s"' %
                     (args.merged_fastq, args.upload_prefix))
        if manage_bamfile:
            manage_bamfile.wait_until_complete()
            log.info('Successfully uploaded %s to "%s"' %
                     (args.alignment_file, args.upload_prefix))

        log.info("SEQC run complete.")

        end_run_time = pendulum.now()
        running_time = end_run_time - start_run_time
        log.info("Running Time={}".format(running_time.in_words()))

        # upload logs
        if args.upload_prefix:
            # upload logs (seqc_log.txt, nohup.log)
            bucket, key = io.S3.split_link(args.upload_prefix)
            for item in [args.log_name, "./nohup.log"]:
                try:
                    # Make a copy of the file with the output prefix
                    copyfile(item, args.output_prefix + "_" + item)
                    print(args.output_prefix + "_" + item)
                    ec2.Retry(retries=5)(io.S3.upload_file)(
                        args.output_prefix + "_" + item, bucket, key)
                    log.info('Successfully uploaded %s to "%s".' %
                             (item, args.upload_prefix))
                except FileNotFoundError:
                    log.notify(
                        "Item %s was not found! Continuing with upload..." %
                        item)
        else:
            # move the log to output directory
            movefile(args.log_name, args.output_prefix + "_" + args.log_name)

        # todo local test does not send this email
        if mutt:
            email_body = (
                '<font face="Courier New, Courier, monospace">'
                "SEQC RUN COMPLETE.\n\n"
                "The run log has been attached to this email and "
                "results are now available in the S3 location you specified: "
                '"%s"\n\n' % args.upload_prefix)
            email_body = email_body.replace("\n", "<br>").replace("\t", "&emsp;")
            email_user(summary_archive, email_body, args.email)
Exemplo n.º 23
0
def low_coverage(molecules,
                 reads,
                 is_invalid,
                 plot=False,
                 ax=None,
                 filter_on=True):
    """
    Fits a two-component Gaussian mixture model to the reads-per-molecule ratio. If a
    component is found to fit a low-coverage fraction of the data, that fraction is set
    as invalid; not all datasets contain such a fraction.

    For best results, this should be run after filter.low_count().

    :param molecules: scipy.sparse.coo_matrix, molecule count matrix
    :param reads: scipy.sparse.coo_matrix, read count matrix
    :param is_invalid: np.ndarray(dtype=bool), declares valid and invalid cells
    :param bool plot: if True, plot a summary of the filter
    :param ax: Must be passed if plot is True. Indicates the axis on which to plot the
      summary.
    :param bool filter_on: if False, the mixture model is fit but no cells are marked invalid
    :return: is_invalid, np.ndarray(dtype=bool), updated valid and invalid cells
    """
    ms = np.ravel(molecules.tocsr()[~is_invalid, :].sum(axis=1))
    rs = np.ravel(reads.tocsr()[~is_invalid, :].sum(axis=1))

    if ms.shape[0] < 10 or rs.shape[0] < 10:
        log.notify(
            'Low coverage filter passed-through; too few cells to calculate '
            'mixture model.')
        return is_invalid

    # get read / cell ratio, filter out low coverage cells
    ratio = rs / ms

    # fit two GMMs on one and two modes
    col_ratio = ratio[:, np.newaxis]
    gmm1 = GaussianMixture(n_components=1)
    gmm2 = GaussianMixture(n_components=2)
    gmm1.fit(col_ratio)
    gmm2.fit(col_ratio)

    if filter_on:
        # check if adding a second component is necessary; if not, filter is pass-through
        filter_on = gmm2.bic(col_ratio) / gmm1.bic(col_ratio) < 0.95

    if filter_on:
        res = gmm2.predict(col_ratio)

        # Molecule sum means
        means = [np.mean(ms[res == 0]), np.mean(ms[res == 1])]
        failing = np.where(res == np.argmin(means))[0]

        # set smaller mean as invalid
        is_invalid = is_invalid.copy()
        is_invalid[np.where(~is_invalid)[0][failing]] = True
    else:
        res, means = None, None

    if plot and ax:
        logms = np.log10(ms)
        try:
            seqc.plot.scatter.continuous(logms,
                                         ratio,
                                         colorbar=False,
                                         ax=ax,
                                         s=3)
        except LinAlgError:
            warnings.warn(
                'SEQC: Insufficient number of cells to calculate density for '
                'coverage plot')
            ax.scatter(logms, ratio, s=3)
        ax.set_xlabel('log10(molecules)')
        ax.set_ylabel('reads / molecule')
        if filter_on:
            # `failing` holds cell indices here, so report its length relative to the
            # number of cells rather than summing the index values
            ax.set_title('Coverage: {:.2}%'.format(
                len(failing) / len(ms) * 100))
        else:
            ax.set_title('Coverage')
        xmin, xmax = np.min(logms), np.max(logms)
        ymax = np.max(ratio)
        ax.set_xlim((xmin, xmax))
        ax.set_ylim((0, ymax))
        seqc.plot.xtick_vertical(ax=ax)

        # plot 1d conditional densities of two-component model
        # todo figure out how to do this!!

        # plot the discarded cells in red, like other filters
        if filter_on:
            ax.scatter(logms[res == np.argmin(means)],
                       ratio[res == np.argmin(means)],
                       s=4,
                       c='indianred')

    return is_invalid
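
For readers less familiar with the model-selection step above, here is a minimal, self-contained sketch of the same GMM/BIC comparison on invented reads-per-molecule ratios (scikit-learn and NumPy assumed; the 0.95 BIC threshold mirrors the snippet, but the toy data and the choice to flag the smaller-mean component are illustrative, not the SEQC API):

import numpy as np
from sklearn.mixture import GaussianMixture

# toy reads-per-molecule ratios: a main population plus a low-coverage shoulder
rng = np.random.default_rng(0)
ratio = np.concatenate([rng.normal(8, 1, 900), rng.normal(2, 0.5, 100)])
col_ratio = ratio[:, np.newaxis]

gmm1 = GaussianMixture(n_components=1).fit(col_ratio)
gmm2 = GaussianMixture(n_components=2).fit(col_ratio)

# keep the two-component model only if it improves BIC by the margin used above
if gmm2.bic(col_ratio) / gmm1.bic(col_ratio) < 0.95:
    labels = gmm2.predict(col_ratio)
    # here the component with the smaller mean ratio is flagged; the SEQC filter
    # instead compares the mean molecule counts of the two predicted groups
    low_component = np.argmin(gmm2.means_)
    print('cells flagged as low coverage:', np.sum(labels == low_component))
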
Exemplo n.º 24
0
    def __enter__(self):
        log.notify('connecting to instance %s via ssh' % self.instance_id)
        self.connect()
        return self
Exemplo n.º 25
0
def high_mitochondrial_rna(molecules,
                           gene_ids,
                           is_invalid,
                           mini_summary_d,
                           max_mt_content=0.2,
                           plot=False,
                           ax=None,
                           filter_on=True):
    """
    Sets any cell with a fraction of mitochondrial mRNA greater than max_mt_content to
    invalid.

    :param molecules: scipy.sparse.coo_matrix, molecule count matrix
    :param gene_ids: np.ndarray(dtype=str) containing string gene identifiers
    :param is_invalid: np.ndarray(dtype=bool), declares valid and invalid cells
    :param mini_summary_d: a dictionary to store output parameters
    :param max_mt_content: float, maximum fraction of molecules that can come from
      mitochondrial genes in a valid cell
    :param bool plot: if True, plot a summary of the filter
    :param ax: Must be passed if plot is True. Indicates the axis on which to plot the
      summary.
    :param bool filter_on: if False, the filter is evaluated but no cells are marked invalid
    :return: is_invalid, np.ndarray(dtype=bool), updated valid and invalid cells
    """
    # identify % genes that are mitochondrial
    mt_genes = np.fromiter(map(lambda x: x.startswith('MT-'), gene_ids),
                           dtype=bool)
    mt_molecules = np.ravel(
        molecules.tocsr()[~is_invalid, :].tocsc()[:, mt_genes].sum(axis=1))
    ms = np.ravel(molecules.tocsr()[~is_invalid, :].sum(axis=1))
    ratios = mt_molecules / ms

    if filter_on:
        failing = ratios > max_mt_content
        is_invalid = is_invalid.copy()
        is_invalid[np.where(~is_invalid)[0][failing]] = True
    else:
        is_invalid = is_invalid.copy()

    if plot and ax:
        if ms.shape[0] and ratios.shape[0]:
            try:
                seqc.plot.scatter.continuous(ms,
                                             ratios,
                                             colorbar=False,
                                             ax=ax,
                                             s=3)
            except LinAlgError:
                log.notify(
                    'Inadequate number of cells or MT gene abundance to plot the MT '
                    'filter; no visual will be produced, but the filter has been '
                    'applied.')
                return is_invalid
        else:
            return is_invalid  # nothing else to do here
        if filter_on and (np.sum(failing) != 0):
            ax.scatter(ms[failing], ratios[failing], c='indianred',
                       s=3)  # failing cells
        xmax = np.max(ms)
        ymax = np.max(ratios)
        ax.set_xlim((0, xmax))
        ax.set_ylim((0, ymax))
        ax.hlines(max_mt_content,
                  *ax.get_xlim(),
                  linestyle='--',
                  colors='indianred')
        ax.set_xlabel('total molecules')
        ax.set_ylabel('mtRNA fraction')
        if filter_on:
            ax.set_title('mtRNA Fraction: {:.2}%'.format(
                np.sum(failing) / len(failing) * 100))
            mini_summary_d['mt_rna_fraction'] = (np.sum(failing) * 1.0 /
                                                 len(failing)) * 100.0
        else:
            ax.set_title('mtRNA Fraction')
            mini_summary_d['mt_rna_fraction'] = 0.0
        seqc.plot.xtick_vertical(ax=ax)

    return is_invalid
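
The mtRNA-fraction test above reduces to a little sparse-matrix arithmetic. A self-contained sketch on toy data (the 'MT-' prefix convention and the 0.2 cutoff mirror the snippet; the matrix and gene names are invented):

import numpy as np
from scipy.sparse import coo_matrix

gene_ids = np.array(['MT-ND1', 'MT-CO1', 'ACTB', 'GAPDH'])
molecules = coo_matrix(np.array([[5, 3, 40, 52],      # ~8% mitochondrial
                                 [30, 25, 10, 15]]))  # ~69% mitochondrial

mt_genes = np.fromiter((g.startswith('MT-') for g in gene_ids), dtype=bool)
mt_counts = np.ravel(molecules.tocsc()[:, mt_genes].sum(axis=1))
totals = np.ravel(molecules.sum(axis=1))
print((mt_counts / totals) > 0.2)  # [False  True]
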
Exemplo n.º 26
0
def low_gene_abundance(molecules,
                       is_invalid,
                       plot=False,
                       ax=None,
                       filter_on=True):
    """
    Fits a linear model to the relationship between number of genes detected and number
    of molecules detected. Cells with a lower than expected number of detected genes
    are set as invalid.

    :param molecules: scipy.sparse.coo_matrix, molecule count matrix
    :param is_invalid: np.ndarray(dtype=bool), declares valid and invalid cells
    :param bool plot: if True, plot a summary of the filter
    :param ax: Must be passed if plot is True. Indicates the axis on which to plot the
      summary.
    :param bool filter_on: if False, the regression is fit but no cells are marked invalid
    :return: is_invalid, np.ndarray(dtype=bool), updated valid and invalid cells
    """

    ms = np.ravel(molecules.tocsr()[~is_invalid, :].sum(axis=1))
    genes = np.ravel(molecules.tocsr()[~is_invalid, :].getnnz(axis=1))
    x = np.log10(ms)[:, np.newaxis]
    y = np.log10(genes)

    if not (x.shape[0] or y.shape[0]):
        return is_invalid

    # get line of best fit
    # ignore scipy LinAlg warning about LAPACK bug
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        regr = LinearRegression()
        regr.fit(x, y)

    # mark cells with fewer detected genes than the fit predicts
    # (residual > 0.15 in log10 units) as failing
    yhat = regr.predict(x)
    residuals = yhat - y
    failing = residuals > 0.15

    is_invalid = is_invalid.copy()
    if filter_on:
        is_invalid[np.where(~is_invalid)[0][failing]] = True

    if plot and ax:
        m, b = regr.coef_, regr.intercept_
        try:
            seqc.plot.scatter.continuous(x, y, ax=ax, colorbar=False, s=3)
        except LinAlgError:
            log.notify(
                'Inadequate number of cells to plot the low gene abundance filter; no '
                'visual will be produced, but the filter has been applied.')
            return is_invalid
        xmin, xmax = np.min(x), np.max(x)
        ymin, ymax = np.min(y), np.max(y)
        lx = np.linspace(xmin, xmax, 200)
        ly = m * lx + b
        ax.plot(lx, np.ravel(ly), linestyle='--', c='indianred')
        if filter_on:
            ax.scatter(x[failing], y[failing], c='indianred', s=3)
        ax.set_ylim((ymin, ymax))
        ax.set_xlim((xmin, xmax))
        ax.set_xlabel('molecules (cell)')
        ax.set_ylabel('genes (cell)')
        if filter_on:
            ax.set_title('Low Complexity: {:.2}%'.format(
                np.sum(failing) / len(failing) * 100))
        else:
            ax.set_title('Low Complexity')
        seqc.plot.xtick_vertical(ax=ax)

    return is_invalid
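
A minimal sketch of the residual-based complexity test used above, on invented counts (scikit-learn assumed; the 0.15 residual cutoff in log10 units mirrors the snippet):

import numpy as np
from sklearn.linear_model import LinearRegression

molecules_per_cell = np.array([100, 200, 400, 800, 1600, 1600])
genes_per_cell = np.array([80, 160, 320, 640, 1280, 120])  # last cell is low complexity

x = np.log10(molecules_per_cell)[:, np.newaxis]
y = np.log10(genes_per_cell)

regr = LinearRegression().fit(x, y)
residuals = regr.predict(x) - y  # positive residual = fewer genes than the fit predicts
print(residuals > 0.15)          # only the last (low-complexity) cell exceeds the cutoff
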
Exemplo n.º 27
0
    def remove_security_group(cls, security_group_id) -> None:
        cls.ec2.SecurityGroup(security_group_id).delete()
        log.notify('security group %s successfully removed.' % security_group_id)
Exemplo n.º 28
0
    def start(self):
        self.setup_seqc()
        log.notify('Instance set-up complete.')
Exemplo n.º 29
0
    def setup_seqc(self):

        if self.instance_id is None:
            self.create_instance()

        # tag the instance
        tags = self.construct_ec2_tags()

        self.ec2.create_tags(Resources=[self.instance_id], Tags=tags)

        with SSHConnection(instance_id=self.instance_id,
                           rsa_key=self.rsa_key) as ssh:

            self.mount_volume(ssh)

            log.notify('setting aws credentials.')
            self.set_credentials(ssh)

            # use the local SEQC package (.tar.gz) to update the remote instance
            # this will overwrite whatever SEQC version exists in the remote instance
            if self.remote_update:

                log.notify(
                    'uploading local SEQC installation to remote instance.')
                seqc_distribution = os.path.expanduser('~/.seqc/seqc.tar.gz')
                ssh.execute('mkdir -p software/seqc')
                ssh.put_file(seqc_distribution, 'software/seqc.tar.gz')
                ssh.execute(
                    'tar -m -xvf software/seqc.tar.gz -C software/seqc --strip-components 1'
                )
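                # tar flags: -m skips restoring file modification times, -C unpacks
                # into software/seqc, and --strip-components 1 drops the archive's
                # top-level directory so the package contents land directly there.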
                log.notify(
                    "Sources are uploaded and decompressed, installing seqc.")

                try:
                    ssh.execute('sudo -H pip3 install software/seqc/')
                except ChildProcessError as e:
                    if 'pip install --upgrade pip' in str(e):
                        pass
                    else:
                        raise

                try:  # test the installation
                    ssh.execute('SEQC -h')
                except Exception:
                    log.notify('SEQC installation failed.')
                    log.exception()
                    raise

            try:
                # retrieves the SEQC version information
                seqc_version, _ = ssh.execute('SEQC --version')
                # this returns an array
                seqc_version = seqc_version[0]

                # update the Name tag (e.g. SEQC 0.2.3)
                self.ec2.create_tags(Resources=[self.instance_id],
                                     Tags=[{
                                         "Key": "Name",
                                         "Value": seqc_version
                                     }])
            except Exception:
                # just warn and proceed
                log.notify("Unable to retrieve SEQC version.")

            log.notify('SEQC setup complete.')
            log.notify('instance login: %s' % ssh.obscure_login_command())
Exemplo n.º 30
0
    def verify_instance_running(self, instance_id):
        """check that the instance has reached the 'running' state; raise
        InstanceNotRunningError if it has not"""
        instance = self.ec2.Instance(id=instance_id)
        if instance.state['Name'] != 'running':
            raise InstanceNotRunningError
        log.notify('instance %s in running state' % instance_id)