Exemplo n.º 1
0
 def __subset_check(subset):
     """Validate that a subset label selects the normal or the tumor data.

     Only the first character is inspected, case-insensitively, so labels
     such as 'N', 'normal', 'T' and 'tumor' are all accepted.

     Args:
         subset: Subset label (string).

     Returns:
         True when the label starts with 'N' or 'T'. Otherwise an error is
         reported through tprint() and False is returned.
     """
     # Slice instead of indexing so that an empty label is reported as
     # invalid rather than raising IndexError.
     if subset[:1].upper() not in ('N', 'T'):
         tprint(
             'Locus() error: Please specify (N)ormal or (T)umor as subset')
         return False
     return True
Exemplo n.º 2
0
 def correct_off_by_one_errors(self, locus):
     """Shift a locus by one base when the reference shows its kmer there.

     The reference is fetched with one extra base on each side of the
     locus. The kmer is then looked for at the expected offset and one base
     to either side; locus.start and locus.end are moved together when the
     kmer sits one base early or late. When the kmer is found at none of
     the three offsets an error is reported through tprint() and the locus
     is left unchanged.

     Args:
         locus: Object providing chromosome, start, end, kmer and
             kmer_length; start and end may be modified in place.
     """
     kmer = locus.kmer
     width = locus.kmer_length
     padded_region = '{0}:{1}-{2}'.format(
         locus.chromosome, locus.start - 1, locus.end + 1)

     # Newer versions of PySam make get_sequence() return the chromosome
     # and position too: keep what follows the last colon, then apply
     # strip_coord_re (presumably removing the leftover coordinates).
     fetched = self.get_sequence(padded_region).split(":")[-1]
     window = MSILocusLoader.strip_coord_re.sub("", fetched)

     if window[1:1 + width] == kmer:
         # The locus already starts where expected.
         return

     # Pairs of (kmer offset inside the padded window, coordinate shift).
     for offset, shift in ((0, -1), (2, 1)):
         if window[offset:offset + width] == kmer:
             locus.start += shift
             locus.end += shift
             return

     message = ('Error: Specified locus does not appear '
         + ' to be the starting point for kmer {kmer}.'.format(
             kmer=locus.kmer))
     tprint(message)
Exemplo n.º 3
0
    def __init__(self, config):
        """Store the reference genome path found in the configuration.

        Args:
            config: Settings container; its 'genome' entry, when present,
                becomes self.genome_path.

        When 'genome' is absent, genome_path stays None and a warning is
        reported through tprint().
        """
        self.genome_path = None

        if 'genome' not in config:
            tprint('MSILocusLoader warning: You need to specify a path' +
                   ' to the reference genome!')
            return

        self.genome_path = config['genome']
Exemplo n.º 4
0
    def __init__(self, config):
        """Store the reference genome path found in the configuration.

        Args:
            config: Settings container; its 'genome' entry, when present,
                becomes self.genome_path.

        When 'genome' is absent, genome_path stays None and a warning is
        reported through tprint().
        """
        self.genome_path = None

        if 'genome' not in config:
            tprint('MSILocusLoader warning: You need to specify a path' +
                   ' to the reference genome!')
            return

        self.genome_path = config['genome']
Exemplo n.º 5
0
def startRunning():
    """Run the CLI_START command and report whether autossh was started."""
    try:
        check_output(CLI_START)
    except CalledProcessError as failure:
        tprint("Error when trying to start autossh : %s" % failure)
    else:
        # Only reached when the command exited successfully.
        tprint("Started autossh")
Exemplo n.º 6
0
def checkIfRunning():
    """Start autossh via startRunning() when the CLI_CHECK command fails."""
    is_running = True
    try:
        check_output(CLI_CHECK)
    except CalledProcessError:
        # A failing check command is taken to mean no autossh process.
        is_running = False

    if not is_running:
        tprint("autossh is not running; restarting")
        startRunning()
Exemplo n.º 7
0
 def get_sequence(self, locus):
     """Fetch the upper-cased reference bases of a region.

     Args:
         locus: Region string handed straight to pysam.faidx().

     Returns:
         The lines of the faidx output after the first one (the FASTA
         header), stripped, joined into one string and upper-cased.

     Exits the process with status 1 (after reporting through tprint())
     when no reference genome path has been configured.
     """
     if self.genome_path is None:
         tprint('MSILocusLoader error: Can not use .get_sequence() without'
             + ' specifying the path to the reference genome!')
         exit(1)

     fasta = pysam.faidx(self.genome_path, locus)
     if isinstance(fasta, str):
         # Newer PySam versions return the whole output as one string
         # instead of a list of lines. Split it so that [1:] drops the
         # header line rather than a single character, which would leak
         # the chromosome and position into the returned sequence.
         fasta = fasta.splitlines()
     return ''.join(line.strip() for line in fasta[1:]).upper()
Exemplo n.º 8
0
    def get_sequence(self, locus):
        """Fetch the upper-cased reference bases of a region.

        Args:
            locus: Region string handed straight to pysam.faidx().

        Returns:
            The lines of the faidx output after the first one (the FASTA
            header), stripped, joined into one string and upper-cased.

        Exits the process with status 1 (after reporting through tprint())
        when no reference genome path has been configured.
        """
        if self.genome_path is None:
            tprint(
                'MSILocusLoader error: Can not use .get_sequence() without' +
                ' specifying the path to the reference genome!')
            exit(1)

        fasta = pysam.faidx(self.genome_path, locus)
        if isinstance(fasta, str):
            # Newer PySam versions return the whole output as one string
            # instead of a list of lines. Split it so that [1:] drops the
            # header line rather than a single character, which would leak
            # the chromosome and position into the returned sequence.
            fasta = fasta.splitlines()
        return ''.join(line.strip() for line in fasta[1:]).upper()
Exemplo n.º 9
0
def do_reboot():
    """Run 'sudo reboot' and report a failed or user-aborted attempt."""
    reboot_command = ['sudo', 'reboot']
    try:
        check_output(reboot_command)
    except (CalledProcessError, KeyboardInterrupt) as problem:
        # Both outcomes are only reported; nothing is re-raised.
        if isinstance(problem, KeyboardInterrupt):
            tprint("User aborted reboot")
        else:
            tprint("Error when trying to reboot")
Exemplo n.º 10
0
    def status_check(self, qsize):
        """Abort the run when the producer is alive but no consumer is.

        Args:
            qsize: Number of items on the output queue; only used in the
                diagnostic message.

        Returns:
            None. When self.has_live_producer() is true and
            self.has_live_consumers() is false, the failure is reported
            through tprint() and the process exits with status 1.
        """
        # ANSI escape sequences wrapped around the failure messages.
        reset = '\033[0m'
        fail = '\033[91m'

        # Compare by value: `is 0` only works because CPython caches small
        # integers, and it triggers a SyntaxWarning on Python 3.8+.
        if qsize == 0:
            queue_status = 'EMPTY'
        else:
            queue_status = '{0} ITEMS'.format(qsize)

        if self.has_live_producer() and not self.has_live_consumers():
            # NOTE(review): the condition reads "producer alive, consumers
            # dead", while the message reports the analyzers as LIVE and the
            # extractor as DEAD - confirm which component is the producer.
            tprint(fail + 'Main> Analyzer(s) LIVE, Extractor DEAD, Queue ' +
                   queue_status + '. ' + reset)
            tprint(fail + 'Main> Terminating process due to ' +
                   'multiprocessing failure.' + reset)
            exit(1)
Exemplo n.º 11
0
    def status_check(self, qsize):
        """Abort the run when the producer is alive but no consumer is.

        Args:
            qsize: Number of items on the output queue; only used in the
                diagnostic message.

        Returns:
            None. When self.has_live_producer() is true and
            self.has_live_consumers() is false, the failure is reported
            through tprint() and the process exits with status 1.
        """
        # ANSI escape sequences wrapped around the failure messages.
        reset = '\033[0m'
        fail = '\033[91m'

        # Compare by value: `is 0` only works because CPython caches small
        # integers, and it triggers a SyntaxWarning on Python 3.8+.
        if qsize == 0:
            queue_status = 'EMPTY'
        else:
            queue_status = '{0} ITEMS'.format(qsize)

        if self.has_live_producer() and not self.has_live_consumers():
            # NOTE(review): the condition reads "producer alive, consumers
            # dead", while the message reports the analyzers as LIVE and the
            # extractor as DEAD - confirm which component is the producer.
            tprint(fail + 'Main> Analyzer(s) LIVE, Extractor DEAD, Queue ' +
                   queue_status + '. ' + reset)
            tprint(fail + 'Main> Terminating process due to ' +
                   'multiprocessing failure.' + reset)
            exit(1)
Exemplo n.º 12
0
    def read_analyzer(self, queue_in, queue_out, full, empty, mutex_in,
                      mutex_out, queue_full):
        """Consumer loop: take (locus, read) items and emit repeat counts.

        Args:
            queue_in: Queue delivering [locus, read] items. An item whose
                locus is falsy is the termination signal.
            queue_out: Queue receiving [locus.locus(), repeat_count].
            full: Semaphore acquired before an item is taken from queue_in.
            empty: Semaphore released after an item was taken from queue_in.
            mutex_in: Lock held while reading from queue_in.
            mutex_out: Lock held while writing to queue_out.
            queue_full: Semaphore acquired before every put on queue_out;
                it is not released here.

        Returns:
            True once the termination signal has been received.
        """
        n = 0
        if self.debug_output:
            tprint('Analyzer> Thread {0} started.'.format(os.getpid()))
        while True:
            n += 1
            # Compare by value; `is 0` relies on CPython caching small
            # integers and triggers a SyntaxWarning on Python 3.8+.
            if self.debug_output and (n % 10000 == 0):
                tprint('Analyzer> Thread {0}'.format(os.getpid()) +
                       ' still ALIVE, loop {0}'.format(n))

            # Wait until an item is available, then take it while holding
            # the input mutex.
            full.acquire()
            mutex_in.acquire()
            item = queue_in.get()
            locus = item[0]
            read = item[1]
            mutex_in.release()
            empty.release()

            if not locus:
                # Element was set to FALSE; signals thread termination.
                if self.debug_output:
                    tprint('Analyzer> Thread {0}'.format(os.getpid()) +
                           ' received termination signal.')
                break

            if self.passes_qc_filter(read):
                # Get the repeat count for the repeat unit, along with the
                # internal offset (how many bases into the read the locus
                # actually starts at).
                repeat_count, repeat_start = self.locus_repeat_count(
                    read, locus)

                # A repeat_count of -1 would mean the read did not contain
                # the target locus, or the locus position within the read
                # could not be determined.
                if repeat_count >= 0:
                    # Figure out where the repeat region ends (offset-wise).
                    repeat_end = repeat_start + (repeat_count *
                                                 locus.kmer_length)

                    # Make sure the locus itself has sufficient quality.
                    if self.passes_locus_qc_filter(read, repeat_start,
                                                   repeat_end):
                        # Passed locus QC filter; read is processed. Acquire
                        # semaphore to return repeat count as output.
                        queue_full.acquire()
                        mutex_out.acquire()
                        queue_out.put([locus.locus(), repeat_count])
                        mutex_out.release()

        if self.debug_output:
            tprint('Analyzer> Ending thread {0}'.format(os.getpid()))
        return True
Exemplo n.º 13
0
    def read_analyzer(self, queue_in, queue_out, full, empty, mutex_in, mutex_out, queue_full):
        """Consumer loop: take (locus, read) items and emit repeat counts.

        Args:
            queue_in: Queue delivering [locus, read] items. An item whose
                locus is falsy is the termination signal.
            queue_out: Queue receiving [locus.locus(), repeat_count].
            full: Semaphore acquired before an item is taken from queue_in.
            empty: Semaphore released after an item was taken from queue_in.
            mutex_in: Lock held while reading from queue_in.
            mutex_out: Lock held while writing to queue_out.
            queue_full: Semaphore acquired before every put on queue_out;
                it is not released here.

        Returns:
            True once the termination signal has been received.
        """
        n = 0
        if self.debug_output:
            tprint('Analyzer> Thread {0} started.'.format(os.getpid()))
        while True:
            n += 1
            # Compare by value; `is 0` relies on CPython caching small
            # integers and triggers a SyntaxWarning on Python 3.8+.
            if self.debug_output and (n % 10000 == 0):
                tprint('Analyzer> Thread {0}'.format(os.getpid()) +
                    ' still ALIVE, loop {0}'.format(n))

            # Wait until an item is available, then take it while holding
            # the input mutex.
            full.acquire()
            mutex_in.acquire()
            item = queue_in.get()
            locus = item[0]
            read = item[1]
            mutex_in.release()
            empty.release()

            if not locus:
                # Element was set to FALSE; signals thread termination.
                if self.debug_output:
                    tprint('Analyzer> Thread {0}'.format(os.getpid()) +
                    ' received termination signal.')
                break

            if self.passes_qc_filter(read):
                # Get the repeat count for the repeat unit, along with the
                # internal offset (how many bases into the read the locus
                # actually starts at).
                repeat_count, repeat_start = self.locus_repeat_count(read, locus)

                # A repeat_count of -1 would mean the read did not contain
                # the target locus, or the locus position within the read
                # could not be determined.
                if repeat_count >= 0:
                    # Figure out where the repeat region ends (offset-wise).
                    repeat_end = repeat_start + (repeat_count * locus.kmer_length)

                    # Make sure the locus itself has sufficient quality.
                    if self.passes_locus_qc_filter(read, repeat_start, repeat_end):
                        # Passed locus QC filter; read is processed. Acquire
                        # semaphore to return repeat count as output.
                        queue_full.acquire()
                        mutex_out.acquire()
                        queue_out.put([locus.locus(), repeat_count])
                        mutex_out.release()

        if self.debug_output:
            tprint('Analyzer> Ending thread {0}'.format(os.getpid()))
        return True
Exemplo n.º 14
0
    def add(self, line):
        """Record the normal and tumor read counts for one repeat count.

        Args:
            line: Whitespace-separated text in the format
                'locus  k  normal  tumor'.

        Returns:
            True when the counts were stored. False, after reporting
            through tprint(), when the data has already been normalized or
            the line names a different locus (compared case-insensitively).
        """
        if self.is_normalized():
            tprint('Error: Cannot add more data once data has been normalized.')
            return False

        fields = line.strip().split()
        if fields[0].lower() != self.locus().lower():
            # The message used to lack its closing parenthesis.
            tprint('Error: Invalid locus specified '
                + '(expected {0}, got {1})'.format(self.locus(), fields[0]))
            return False

        # Each line is expected to be in the format of:
        # locus     k   normal  tumor

        k = int(fields[1])
        self.__k.add(k)
        self.__normal[k] = int(fields[2])
        self.__tumor[k] = int(fields[3])
        # New data arrived; presumably values derived from it must be
        # recomputed before use - confirm against the readers of up_to_date.
        self.up_to_date = False
        return True
Exemplo n.º 15
0
        def add(self, line):
            """Store the normal and tumor counts for one repeat count.

            Args:
                line: Either a tab-separated string or an already split
                    sequence holding locus, repeat count, normal count and
                    tumor count, in that order.

            Returns:
                True when the entry was stored. False, after reporting
                through tprint(), when the entry belongs to a different
                locus or its repeat count was already added.

            The first entry fed to an instance whose locus is '' sets the
            locus.
            """
            # isinstance() also accepts str subclasses, which the previous
            # exact type comparison treated as pre-split sequences.
            if isinstance(line, str):
                line = line.strip().split('\t')
            if self.locus == '':
                self.locus = line[0]
            elif line[0] != self.locus:
                tprint('MANTIS_Filter.Locus error: Fed data for ' +
                       '{0} to locus {1}!'.format(line[0], self.locus))
                return False
            k = int(line[1])
            n = int(line[2])
            t = int(line[3])
            if k in self.k:
                tprint('MANTIS_Filter.Locus error: Duplicate entry for ' +
                       'repeat count {0}!'.format(k))
                return False

            self.k.add(k)
            self.n[k] = n
            self.t[k] = t
            return True
Exemplo n.º 16
0
        def add(self, line):
            """Store the normal and tumor counts for one repeat count.

            Args:
                line: Either a tab-separated string or an already split
                    sequence holding locus, repeat count, normal count and
                    tumor count, in that order.

            Returns:
                True when the entry was stored. False, after reporting
                through tprint(), when the entry belongs to a different
                locus or its repeat count was already added.

            The first entry fed to an instance whose locus is '' sets the
            locus.
            """
            # isinstance() also accepts str subclasses, which the previous
            # exact type comparison treated as pre-split sequences.
            if isinstance(line, str):
                line = line.strip().split('\t')
            if self.locus == '':
                self.locus = line[0]
            elif line[0] != self.locus:
                tprint('MANTIS_Filter.Locus error: Fed data for ' +
                    '{0} to locus {1}!'.format(line[0], self.locus))
                return False
            k = int(line[1])
            n = int(line[2])
            t = int(line[3])
            if k in self.k:
                tprint('MANTIS_Filter.Locus error: Duplicate entry for ' +
                    'repeat count {0}!'.format(k))
                return False

            self.k.add(k)
            self.n[k] = n
            self.t[k] = t
            return True
Exemplo n.º 17
0
    def correct_off_by_one_errors(self, locus):
        """Shift a locus by one base when the reference shows its kmer there.

        The reference is fetched with one extra base on each side of the
        locus. The kmer is then looked for at the expected offset and one
        base to either side; locus.start and locus.end are moved together
        when the kmer sits one base early or late. When the kmer is found at
        none of the three offsets an error is reported through tprint() and
        the locus is left unchanged.

        Args:
            locus: Object providing chromosome, start, end, kmer and
                kmer_length; start and end may be modified in place.
        """
        kmer = locus.kmer
        width = locus.kmer_length
        padded_region = '{0}:{1}-{2}'.format(
            locus.chromosome, locus.start - 1, locus.end + 1)
        window = self.get_sequence(padded_region)

        if window[1:1 + width] == kmer:
            # The locus already starts where expected.
            return

        # Pairs of (kmer offset inside the padded window, coordinate shift).
        for offset, shift in ((0, -1), (2, 1)):
            if window[offset:offset + width] == kmer:
                locus.start += shift
                locus.end += shift
                return

        message = ('Error: Specified locus does not appear ' +
                   ' to be the starting point for kmer {kmer}.'.format(
                       kmer=locus.kmer))
        tprint(message)
Exemplo n.º 18
0
    def add(self, line):
        """Record the normal and tumor read counts for one repeat count.

        Args:
            line: Whitespace-separated text in the format
                'locus  k  normal  tumor'.

        Returns:
            True when the counts were stored. False, after reporting
            through tprint(), when the data has already been normalized or
            the line names a different locus (compared case-insensitively).
        """
        if self.is_normalized():
            tprint(
                'Error: Cannot add more data once data has been normalized.')
            return False

        fields = line.strip().split()
        if fields[0].lower() != self.locus().lower():
            # The message used to lack its closing parenthesis.
            tprint('Error: Invalid locus specified ' +
                   '(expected {0}, got {1})'.format(self.locus(), fields[0]))
            return False

        # Each line is expected to be in the format of:
        # locus     k   normal  tumor

        k = int(fields[1])
        self.__k.add(k)
        self.__normal[k] = int(fields[2])
        self.__tumor[k] = int(fields[3])
        # New data arrived; presumably values derived from it must be
        # recomputed before use - confirm against the readers of up_to_date.
        self.up_to_date = False
        return True
Exemplo n.º 19
0
    def load_loci(self, bedfile):
        """Read the MSI loci listed in a BED file.

        Lines starting with '@' are skipped. Every other line is parsed
        into an MSILocus, its chromosome is given a 'chr' prefix when it
        lacks one, and its coordinates are passed through
        correct_off_by_one_errors().

        Args:
            bedfile: Path of the BED file.

        Returns:
            List of MSILocus objects in file order. The list is empty when
            the file does not exist, which is reported through tprint().
        """
        loci = []
        bedfile = os.path.abspath(bedfile)
        if not os.path.isfile(bedfile):
            tprint('MSILocusLoader error: File {0}'.format(bedfile) +
                ' does not exist!')
            return loci

        # The with-statement closes the file, so no explicit close() is
        # needed; iterating the handle avoids loading every line at once.
        with open(bedfile, 'r') as filein:
            for line in filein:
                if line.startswith('@'):
                    continue

                locus = MSILocus(line)
                if not locus.chromosome.startswith('chr'):
                    # Force-prepend the chr prefix
                    locus.chromosome = 'chr{0}'.format(locus.chromosome)

                # Correct any off-by-one errors that may occur because of
                # unstandardized open- and closed-endedness of bed file
                # coordinates.
                self.correct_off_by_one_errors(locus)
                loci.append(locus)
        return loci
Exemplo n.º 20
0
    def load_loci(self, bedfile):
        """Read the MSI loci listed in a BED file.

        Lines starting with '@' are skipped. Every other line is parsed
        into an MSILocus, its chromosome is given a 'chr' prefix when it
        lacks one, and its coordinates are passed through
        correct_off_by_one_errors().

        Args:
            bedfile: Path of the BED file.

        Returns:
            List of MSILocus objects in file order. The list is empty when
            the file does not exist, which is reported through tprint().
        """
        loci = []
        bedfile = os.path.abspath(bedfile)
        if not os.path.isfile(bedfile):
            tprint('MSILocusLoader error: File {0}'.format(bedfile) +
                   ' does not exist!')
            return loci

        # The with-statement closes the file, so no explicit close() is
        # needed; iterating the handle avoids loading every line at once.
        with open(bedfile, 'r') as filein:
            for line in filein:
                if line.startswith('@'):
                    continue

                locus = MSILocus(line)
                if not locus.chromosome.startswith('chr'):
                    # Force-prepend the chr prefix
                    locus.chromosome = 'chr{0}'.format(locus.chromosome)

                # Correct any off-by-one errors that may occur because of
                # unstandardized open- and closed-endedness of bed file
                # coordinates.
                self.correct_off_by_one_errors(locus)
                loci.append(locus)
        return loci
Exemplo n.º 21
0
    def correct_off_by_one_errors(self, locus):
        """Shift a locus by one base when the reference shows its kmer there.

        The reference is fetched with one extra base on each side of the
        locus. The kmer is then looked for at the expected offset and one
        base to either side; locus.start and locus.end are moved together
        when the kmer sits one base early or late. When the kmer is found at
        none of the three offsets an error is reported through tprint() and
        the locus is left unchanged.

        Args:
            locus: Object providing chromosome, start, end, kmer and
                kmer_length; start and end may be modified in place.
        """
        kmer = locus.kmer
        width = locus.kmer_length
        padded_region = '{0}:{1}-{2}'.format(
            locus.chromosome, locus.start - 1, locus.end + 1)

        # Newer versions of PySam make get_sequence() return the chromosome
        # and position too: keep what follows the last colon, then apply
        # strip_coord_re (presumably removing the leftover coordinates).
        fetched = self.get_sequence(padded_region).split(":")[-1]
        window = MSILocusLoader.strip_coord_re.sub("", fetched)

        if window[1:1 + width] == kmer:
            # The locus already starts where expected.
            return

        # Pairs of (kmer offset inside the padded window, coordinate shift).
        for offset, shift in ((0, -1), (2, 1)):
            if window[offset:offset + width] == kmer:
                locus.start += shift
                locus.end += shift
                return

        message = ('Error: Specified locus does not appear ' +
                   ' to be the starting point for kmer {kmer}.'.format(
                       kmer=locus.kmer))
        tprint(message)
Exemplo n.º 22
0
def generate_config(args):
    """Validate the parsed command-line arguments and build the run config.

    Args:
        args: Parsed arguments providing genome, threads, bedfile, normal,
            tumor, output, mrq, mrl, mlq and debug_output.

    Returns:
        Dict with the keys 'genome', 'threads', 'bedfile',
        'normal_filepath', 'tumor_filepath', 'output_filepath',
        'min_read_quality', 'min_read_length', 'min_locus_quality' and
        'debug_output'. All paths are absolute.

    Every validation failure (missing file, fewer than one thread, missing
    required argument) is reported through tprint() and ends the process
    with status 1. The output folder is created when it does not exist.
    """
    config = {}

    config['genome'] = os.path.abspath(args.genome)
    if not os.path.isfile(config['genome']):
        tprint('Error: {0} does not exist!'.format(config['genome']))
        exit(1)

    config['threads'] = int(args.threads)
    if config['threads'] < 1:
        tprint('Error: Cannot specify less than one thread. ' +
               '(Provided {0}).'.format(config['threads']))
        exit(1)

    # Test for a missing BED file before building its path:
    # os.path.abspath(None) raises TypeError, which used to pre-empt the
    # intended error message.
    if args.bedfile is None:
        tprint('Error: BED file not provided!')
        exit(1)
    else:
        config['bedfile'] = os.path.abspath(args.bedfile)
        if not os.path.isfile(config['bedfile']):
            tprint('Error: {0} does not exist!'.format(config['bedfile']))
            exit(1)

    if args.normal is None:
        tprint('Error: Normal BAM/SAM file not provided!')
        exit(1)
    else:
        config['normal_filepath'] = os.path.abspath(args.normal)
        if not os.path.isfile(config['normal_filepath']):
            tprint('Error: {0} does not exist!'.format(
                config['normal_filepath']))
            exit(1)

    if args.tumor is None:
        tprint('Error: Tumor BAM/SAM file not provided!')
        exit(1)
    else:
        config['tumor_filepath'] = os.path.abspath(args.tumor)
        if not os.path.isfile(config['tumor_filepath']):
            tprint('Error: {0} does not exist!'.format(
                config['tumor_filepath']))
            exit(1)

    if args.output is None:
        tprint('Error: Output filepath must be specified!')
        # Stop here like every other validation failure; continuing would
        # return a config without 'output_filepath'.
        exit(1)
    else:
        config['output_filepath'] = os.path.abspath(args.output)
        # Make sure output folder exists
        output_dir = os.path.dirname(config['output_filepath'])
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

    config['min_read_quality'] = float(args.mrq)
    config['min_read_length'] = int(args.mrl)
    config['min_locus_quality'] = float(args.mlq)
    config['debug_output'] = args.debug_output
    return config
Exemplo n.º 23
0
    parser.add_argument('--difference-threshold', dest='dif_threshold', type=float,
        help='Default difference threshold value for calling a sample unstable.')

    parser.add_argument('--distance-threshold', dest='euc_threshold', type=float,
        help='Default distance threshold value for calling a sample unstable.')

    parser.add_argument('--dissimilarity-threshold', dest='cos_threshold', type=float,
        help='Default dissimilarity threshold value for calling a sample unstable.')


    args = parser.parse_args()

    input_filepath = os.path.abspath(args.input)
    if not os.path.isfile(input_filepath):
        tprint('Error! Input file {0} does not exist.'.format(input_filepath))
        exit(1)

    # Make sure default threshold values have been specified.
    thresholds = {}
    if args.dif_threshold is None:
        tprint('Error: Default difference threshold must be specified!')
        exit(1)
    else:
        thresholds['DIF'] = float(args.dif_threshold)

    if args.euc_threshold is None:
        tprint('Error: Default distance threshold must be specified!')
        exit(1)
    else:
        thresholds['EUC'] = float(args.euc_threshold)
Exemplo n.º 24
0
        return is_header
        # end .line_is_header()


    # end MANTIS_Filter class definition

"""
Attempts to filter out undesirable noise in the filter, in
an attempt to compare results with more confidence. The filter
will try to get rid of reads that are obvious outliers, loci that
don't meet a minimum coverage depth requirement, and repeat counts
that don't have enough supporting reads.
"""
if __name__ == "__main__":
    prog_name = 'MSI Locus Kmer Counter Filter'
    tprint(prog_name)

    parser = argparse.ArgumentParser(description=prog_name)

    parser.add_argument('-i', '--input', dest='input', type=str, required=True,
        help='Input file (.kmer_counts)')

    parser.add_argument('-o', '--output', dest='output', type=str, required=True,
        help='Output filename.')

    parser.add_argument('-mlc', '--min-locus-coverage', dest='mlc', type=int,
        default=20, help='Minimum coverage required for each of the normal ' +
        'and tumor results.')

    parser.add_argument('-mrr', '--min-repeat-reads', dest='mrr', type=int,
        default=5, help='Minimum reads supporting a specific repeat count.')
Exemplo n.º 25
0
    def process(self, input_filepath, msi_loci, config):
        """Count repeat lengths per locus using one producer and N consumers.

        A producer process runs extract_reads() and consumer processes run
        read_analyzer(); this method collects the consumers' results from
        the output queue until the queue is empty and
        self.has_live_threads() is false.

        Args:
            input_filepath: Path of the alignment file handed to
                extract_reads().
            msi_loci: Loci to analyze; each provides locus().
            config: Dict whose 'threads' entry sets the process budget. One
                is reserved for the producer and at least one consumer is
                always started.

        Returns:
            Dict mapping locus -> {repeat_count: number of reads}. Every
            locus in msi_loci has an entry, possibly empty.
        """
        self.__reset()

        # Generate dictionary read counts for loci
        counts = {}
        for locus in msi_loci:
            counts[locus.locus()] = {}

        # Generate input and output queues and (mutex) semaphores
        # for each.
        queue_out = Queue()
        queue_in = Queue()
        # Released below once per collected result, so it caps how many
        # results can be acquired for before the collector catches up.
        queue_full = BoundedSemaphore(100)
        full = Semaphore(0)
        empty = BoundedSemaphore(40)
        mutex_out = Semaphore(1)
        mutex_in = Semaphore(1)

        # Set amount of consumer threads; minimum one.
        consumer_threads = config['threads'] - 1
        if consumer_threads < 1:
            consumer_threads = 1

        # Create producer thread; currently only using single thread
        # since I/O is more of the limiter than CPU bound processes.
        self.__producer = Process(target=self.extract_reads,
                                  args=(input_filepath, msi_loci, full, empty,
                                        mutex_out, queue_in, consumer_threads))
        self.__producer.start()

        # Spawn the set amount of threads/processes
        if self.debug_output:
            tprint('Main> Generating {0} analyzer process(es).'.format(
                consumer_threads))
        for i in range(0, consumer_threads):
            p = Process(target=self.read_analyzer,
                        args=(queue_in, queue_out, full, empty, mutex_in,
                              mutex_out, queue_full))
            self.__consumers.append(p)
            self.__consumers[-1].start()

        # Iterate through the loci, fetching any reads and pushing them to
        # the pool of threads, collecting the output as they process it.
        query_delay = 0.050  # In seconds

        loop_counter = 0
        proc_check_interval = 100
        while (not queue_out.empty() or self.has_live_threads()):
            # Sleep for the set amount of time so the queue isn't constantly
            # getting hammered with queries
            time.sleep(query_delay)
            loop_counter += 1
            # Compare by value; `is 0` relies on CPython caching small
            # integers and triggers a SyntaxWarning on Python 3.8+.
            if loop_counter % proc_check_interval == 0:
                # Time to check that the consumers
                # didn't die while the producer is still producing
                mutex_out.acquire()
                self.status_check(queue_out.qsize())
                mutex_out.release()

            while not queue_out.empty():
                # There is data on the queue to be processed;
                # the return from the queue should be a pair
                # of (locus, repeat_count)
                mutex_out.acquire()
                result = queue_out.get()
                locus = result[0]
                repeat_count = result[1]
                if repeat_count >= 0:
                    locus_counts = counts.setdefault(locus, {})
                    locus_counts[repeat_count] = (
                        locus_counts.get(repeat_count, 0) + 1)
                mutex_out.release()
                queue_full.release()

            if not self.has_live_threads():
                # All processes should have terminated.
                if self.debug_output:
                    tprint('Main> All processes complete.')
                break
        # end while loop

        return counts
Exemplo n.º 26
0
    def extract_reads(self, filename, loci, full, empty, mutex, queue,
                      consumers):
        """Producer: fetch the reads around every locus and queue them.

        Args:
            filename: Path of the alignment file, opened with pysam in
                'rb' mode.
            loci: Loci to fetch reads for; each provides chromosome, start
                and end.
            full: Semaphore released after every put on the queue.
            empty: Semaphore acquired before every put on the queue.
            mutex: Lock held while writing to the queue.
            queue: Queue receiving [locus, SAMRead] items followed by
                [False, False] termination signals.
            consumers: Number of consumers; twice as many termination
                signals are queued.

        Returns:
            True once all termination signals have been queued. The process
            exits with status 1 when
            KmerRepeatCounter.bam_uses_chr_prefixes() returns None.
        """
        use_chr_prefix = KmerRepeatCounter.bam_uses_chr_prefixes(
            filename, loci)
        if use_chr_prefix is None:
            tprint('Fatal error! Could not find any matched reads.')
            exit(1)

        if self.debug_output:
            tprint('Extractor> Thread starting for {0}'.format(filename))
        source = pysam.AlignmentFile(filename, 'rb')

        # try/finally so the file is closed even when fetching fails.
        try:
            # Store the names themselves; comparing hashes instead could
            # report a false match on a hash collision.
            available_chromosomes = set(str(x) for x in source.references)
            for locus in loci:

                # Format the chromosome to be compatible with how the reads
                # are stored in the BAM file (i.e. with or without 'chr'
                # prefix).
                chromosome = locus.chromosome
                if use_chr_prefix and chromosome[0:3] != 'chr':
                    # Prepend the 'chr' prefix.
                    chromosome = 'chr{0}'.format(chromosome)
                elif not use_chr_prefix and chromosome[0:3] == 'chr':
                    # Remove the 'chr' prefix.
                    chromosome = chromosome[3:]

                if chromosome not in available_chromosomes:
                    continue

                # Fetch with 5 extra bases on each side of the locus; make
                # sure the start coordinate isn't below 1.
                start_pos = locus.start - 5
                if start_pos < 1:
                    start_pos = 1
                end_pos = locus.end + 5

                for read in source.fetch(chromosome, start_pos, end_pos):
                    # CIGAR of None means it was likely an asterisk (*), so
                    # the read is ignored since something was wrong with
                    # the alignment. Checking first avoids building data
                    # that would be thrown away.
                    if read.cigarstring is None:
                        continue

                    # Use AlignedSegment object to create a list, which is
                    # then used in the creation of the SAMRead object,
                    # since the AlignedSegment objects are C-structs and
                    # cannot be passed to the consumer threads.
                    data = [
                        read.query_name, read.flag, chromosome,
                        read.reference_start, read.mapping_quality,
                        read.cigarstring, '', '', '', read.query_sequence,
                        KmerRepeatCounter.quality_scores_to_symbols(
                            read.query_qualities)
                    ]
                    sam_read = SAMRead('\t'.join([str(x) for x in data]))
                    item = [locus, sam_read]

                    # Use semaphores to handle proper writing into the
                    # queue.
                    empty.acquire()
                    mutex.acquire()
                    queue.put(item)
                    mutex.release()
                    full.release()
        finally:
            source.close()

        if self.debug_output:
            tprint('Extractor> Extracted all reads for all target loci.')
        # Add set amount of end signals to queue to end consumers.
        # NOTE(review): two signals per consumer - presumably a safety
        # margin; confirm.
        signal_count = consumers * 2
        for i in range(0, signal_count):
            empty.acquire()
            mutex.acquire()
            queue.put([False, False])
            mutex.release()
            full.release()
        if self.debug_output:
            # Report the number actually queued (previously `consumers`).
            tprint('Extractor> Queued up {0} termination signals.'.format(
                signal_count))
        return True
Exemplo n.º 27
0
        
    except KeyboardInterrupt:
        tprint("User aborted reboot")

if __name__ == "__main__":
    # Pings a given host once every so often and reboots the machine as
    # soon as a ping fails.

    args = ArgumentParser(description="Pings a given host just once every so often; and reboots machine if ping fails")
    args.add_argument("--host", help="Host address to ping", required=True)
    # Parse the interval as a number: without type= argparse yields a
    # string, sleep() then raises TypeError and the script stops with
    # "Something went wrong" after the first successful ping.
    args.add_argument("--time-to-wait", help="Time to wait between ping attempts", required=False, type=float)
    args = args.parse_args()

    if args.time_to_wait is not None:
        TIME_TO_WAIT = args.time_to_wait

    while True:
        try:
            check_output(['ping', '-c1', args.host])
            sleep(TIME_TO_WAIT)

        except CalledProcessError:
            # ping exited with a non-zero status.
            tprint("Host is down, rebooting ..")
            do_reboot()
            break

        except KeyboardInterrupt:
            exit("User aborted script")

        except Exception as error:
            # Name the cause instead of hiding it behind a bare except.
            exit("Something went wrong: %s" % error)
            
Exemplo n.º 28
0
def startRunning():
    """Run the CLI_START command and report whether autossh was started."""
    try:
        check_output(CLI_START)
    except CalledProcessError as failure:
        tprint("Error when trying to start autossh : %s" % failure)
    else:
        # Only reached when the command exited successfully.
        tprint("Started autossh")
        
def checkIfRunning():
    """Start autossh via startRunning() when the CLI_CHECK command fails."""
    is_running = True
    try:
        check_output(CLI_CHECK)
    except CalledProcessError:
        # A failing check command is taken to mean no autossh process.
        is_running = False

    if not is_running:
        tprint("autossh is not running; restarting")
        startRunning()
        
if __name__ == "__main__":
    # Watchdog loop: once a minute, make sure an autossh session is running
    # and start one if it is not. Ctrl-C ends the loop.
    keep_watching = True
    while keep_watching:
        try:
            checkIfRunning()
            sleep(60)
        except KeyboardInterrupt:
            tprint("User aborted script")
            keep_watching = False
        
Exemplo n.º 29
0
        dest='euc_threshold',
        type=float,
        help='Default distance threshold value for calling a sample unstable.')

    parser.add_argument(
        '--dissimilarity-threshold',
        dest='cos_threshold',
        type=float,
        help=
        'Default dissimilarity threshold value for calling a sample unstable.')

    args = parser.parse_args()

    input_filepath = os.path.abspath(args.input)
    if not os.path.isfile(input_filepath):
        tprint('Error! Input file {0} does not exist.'.format(input_filepath))
        exit(1)

    # Make sure default threshold values have been specified.
    thresholds = {}
    if args.dif_threshold is None:
        tprint('Error: Default difference threshold must be specified!')
        exit(1)
    else:
        thresholds['DIF'] = float(args.dif_threshold)

    if args.euc_threshold is None:
        tprint('Error: Default distance threshold must be specified!')
        exit(1)
    else:
        thresholds['EUC'] = float(args.euc_threshold)
Exemplo n.º 30
0
"""


def generate_index_if_needed(filepath):
    """Create a ``.bai`` index for *filepath* with pysam unless one exists.

    An index counts as present when either ``<filepath>.bai`` or the path
    with its last four characters replaced by ``.bai`` is an existing file.
    Always returns True.
    """
    bam_path = os.path.abspath(filepath)
    index_file = bam_path + '.bai'
    # Alternate naming: drop the trailing four characters before adding
    # the '.bai' suffix.
    alternate_index = bam_path[:-4] + '.bai'

    index_present = (os.path.isfile(index_file)
                     or os.path.isfile(alternate_index))
    if not index_present:
        # Neither candidate exists; build the index next to the input.
        pysam.index(filepath, index_file)
    return True
    # end .generate_index_if_needed()


if __name__ == "__main__":
    # Script entry point: print the program name, then build the
    # command-line parser using that same name as its description.
    prog_name = 'MANTIS K-Mer Repeat Counter'
    tprint(prog_name)

    parser = argparse.ArgumentParser(description=prog_name)

    # Required option: path to the normal sample's SAM/BAM file.
    parser.add_argument('-n',
                        '--normal',
                        dest='normal',
                        type=str,
                        required=True,
                        help='Normal input (SAM/BAM) file.')

    parser.add_argument('-t',
                        '--tumor',
                        dest='tumor',
                        type=str,
                        required=True,
Exemplo n.º 31
0
        return is_header
        # end .line_is_header()

    # end MANTIS_Filter class definition


"""
Attempts to filter out undesirable noise in the filter, in
an attempt to compare results with more confidence. The filter
will try to get rid of reads that are obvious outliers, loci that
don't meet a minimum coverage depth requirement, and repeat counts
that don't have enough supporting reads.
"""
if __name__ == "__main__":
    # Script entry point: print the program name, then build the
    # command-line parser using that same name as its description.
    prog_name = 'MSI Locus Kmer Counter Filter'
    tprint(prog_name)

    parser = argparse.ArgumentParser(description=prog_name)

    # Required option: path to the .kmer_counts file to be filtered.
    parser.add_argument('-i',
                        '--input',
                        dest='input',
                        type=str,
                        required=True,
                        help='Input file (.kmer_counts)')

    parser.add_argument('-o',
                        '--output',
                        dest='output',
                        type=str,
                        required=True,
Exemplo n.º 32
0
    # end .generate_locus_output()

"""
Uses PySAM to generate index for BAM file if one doesn't exist.
"""
def generate_index_if_needed(filepath):
    """Ensure a BAM index exists for *filepath*, building one if necessary.

    Two index naming conventions are recognised: ``<name>.bam.bai`` (the
    input path plus ``.bai``) and ``<name>.bai`` (the input's extension
    replaced by ``.bai``). If neither file exists, ``pysam.index`` is
    called to write ``<filepath>.bai``.

    Args:
        filepath: Path to the alignment file that needs an index.

    Returns:
        True, always.
    """
    bam_path = os.path.abspath(filepath)
    index_file = bam_path + '.bai'
    # Also accept an index named after the file without its extension, so
    # an already-indexed input is not needlessly re-indexed.
    sibling_index = os.path.splitext(bam_path)[0] + '.bai'
    if not (os.path.isfile(index_file) or os.path.isfile(sibling_index)):
        # Index file doesn't exist; generate it
        pysam.index(filepath, index_file)
    return True
    # end .generate_index_if_needed()

if __name__ == "__main__":
    # Script entry point: print the program name and declare the CLI.
    prog_name = 'MANTIS K-Mer Repeat Counter'
    tprint(prog_name)

    parser = argparse.ArgumentParser(description=prog_name)

    # One row per string option:
    # (short flag, long flag, destination, required?, help text).
    for opt_short, opt_long, opt_dest, opt_required, opt_help in (
            ('-n', '--normal', 'normal', True,
             'Normal input (SAM/BAM) file.'),
            ('-t', '--tumor', 'tumor', True,
             'Tumor input (SAM/BAM) file.'),
            ('-b', '--bedfile', 'bedfile', True,
             'Input BED file.'),
            ('-o', '--output', 'output', False,
             'Output BED filename.')):
        parser.add_argument(
            opt_short,
            opt_long,
            dest=opt_dest,
            type=str,
            required=opt_required,
            help=opt_help)
Exemplo n.º 33
0
def _require_existing_file(path, label):
    """Return the absolute form of *path*; exit with status 1 if it is unusable.

    Prints an error and exits when *path* is None or does not name an
    existing regular file. *label* names the input in the "not provided"
    message.
    """
    if path is None:
        tprint('Error: {0} not provided!'.format(label))
        exit(1)
    absolute_path = os.path.abspath(path)
    if not os.path.isfile(absolute_path):
        tprint('Error: {0} does not exist!'.format(absolute_path))
        exit(1)
    return absolute_path


def generate_config(args):
    """Validate parsed command-line arguments and build the run configuration.

    Args:
        args: Namespace-like object with the attributes ``genome``,
            ``threads``, ``bedfile``, ``normal``, ``tumor``, ``output``,
            ``mrq``, ``mrl``, ``mlq`` and ``debug_output``.

    Returns:
        dict with the keys ``genome``, ``threads``, ``bedfile``,
        ``normal_filepath``, ``tumor_filepath``, ``output_filepath``,
        ``min_read_quality``, ``min_read_length``, ``min_locus_quality``
        and ``debug_output``. All file paths are absolute.

    Any invalid or missing input prints an error via ``tprint`` and
    terminates the process with exit status 1. The directory that will
    hold the output file is created if it does not exist.
    """
    config = {}

    # Every input file is checked for None *before* os.path.abspath() is
    # applied, because abspath(None) raises TypeError rather than reaching
    # the intended error message.
    config['genome'] = _require_existing_file(args.genome, 'Reference genome')

    config['threads'] = int(args.threads)
    if config['threads'] < 1:
        tprint('Error: Cannot specify less than one thread. '
            + '(Provided {0}).'.format(config['threads']))
        exit(1)

    config['bedfile'] = _require_existing_file(args.bedfile, 'BED file')
    config['normal_filepath'] = _require_existing_file(
        args.normal, 'Normal BAM/SAM file')
    config['tumor_filepath'] = _require_existing_file(
        args.tumor, 'Tumor BAM/SAM file')

    if args.output is None:
        tprint('Error: Output filepath must be specified!')
        # Abort like every other validation failure; continuing would
        # return a config without the 'output_filepath' key.
        exit(1)
    config['output_filepath'] = os.path.abspath(args.output)
    # Make sure output folder exists
    output_dir = os.path.dirname(config['output_filepath'])
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    config['min_read_quality'] = float(args.mrq)
    config['min_read_length'] = int(args.mrl)
    config['min_locus_quality'] = float(args.mlq)
    config['debug_output'] = args.debug_output
    return config
Exemplo n.º 34
0
 def __subset_check(subset):
     """Return True if *subset* starts with 'N' or 'T' (case-insensitive).

     Any other value, including an empty string, prints an error via
     ``tprint`` and returns False.
     """
     # startswith() with a tuple is safe on an empty string, where
     # indexing subset[0] would raise IndexError.
     if not subset.upper().startswith(('N', 'T')):
         tprint('Locus() error: Please specify (N)ormal or (T)umor as subset')
         return False
     return True
Exemplo n.º 35
0
    def process(self, input_filepath, msi_loci, config):
        """Count repeat lengths per locus using producer/consumer processes.

        One producer process runs ``self.extract_reads`` and a pool of
        consumer processes runs ``self.read_analyzer``. This method then
        polls the output queue until it is empty and no worker is alive,
        folding each ``(locus, repeat_count)`` result into a histogram.

        Args:
            input_filepath: Path handed to ``extract_reads``.
            msi_loci: Iterable of locus objects; each must provide
                ``locus()``, whose return value is used as the key.
            config: Mapping; only ``config['threads']`` is read here.

        Returns:
            dict mapping each locus key to a dict of
            ``{repeat_count: number_of_results}``. Results with a negative
            repeat count are ignored.
        """
        self.__reset()

        # Generate dictionary read counts for loci
        counts = {locus.locus(): {} for locus in msi_loci}

        # Generate input and output queues and (mutex) semaphores
        # for each.
        queue_out = Queue()
        queue_in = Queue()
        queue_full = BoundedSemaphore(100)
        full = Semaphore(0)
        empty = BoundedSemaphore(40)
        mutex_out = Semaphore(1)
        mutex_in = Semaphore(1)

        # Set amount of consumer threads; minimum one.
        consumer_threads = max(1, config['threads'] - 1)

        # Create producer thread; currently only using single thread
        # since I/O is more of the limiter than CPU bound processes.
        self.__producer = Process(target=self.extract_reads, args=(
            input_filepath,
            msi_loci,
            full,
            empty,
            mutex_out,
            queue_in,
            consumer_threads))
        self.__producer.start()

        # Spawn the set amount of threads/processes
        if self.debug_output:
            tprint('Main> Generating {0} analyzer process(es).'.format(consumer_threads))
        for _ in range(consumer_threads):
            p = Process(target=self.read_analyzer, args=(
                queue_in,
                queue_out,
                full,
                empty,
                mutex_in,
                mutex_out,
                queue_full))
            self.__consumers.append(p)
            self.__consumers[-1].start()

        # Iterate through the loci, fetching any reads and pushing them to
        # the pool of threads, collecting the output as they process it.
        query_delay = 0.050 # In seconds

        loop_counter = 0
        proc_check_interval = 100
        while (not queue_out.empty() or self.has_live_threads()):
            # Sleep for the set amount of time so the queue isn't constantly
            # getting hammered with queries
            time.sleep(query_delay)
            loop_counter += 1
            # Compare by value: `is 0` tests object identity and only
            # works by accident of small-int caching.
            if loop_counter % proc_check_interval == 0:
                # Time to check that the consumers
                # didn't die while the producer is still producing
                with mutex_out:
                    self.status_check(queue_out.qsize())

            while not queue_out.empty():
                # There is data on the queue to be processed;
                # the return from the queue should be a tuple
                # with (locus, repeat_count)
                with mutex_out:
                    result = queue_out.get()
                    locus = result[0]
                    repeat_count = result[1]
                    if repeat_count >= 0:
                        histogram = counts.setdefault(locus, {})
                        histogram[repeat_count] = histogram.get(repeat_count, 0) + 1
                # One result has been consumed; release a slot on the
                # bounded output semaphore.
                queue_full.release()

            if not self.has_live_threads():
                # All processes should have terminated.
                if self.debug_output:
                    tprint('Main> All processes complete.')
                break
        # end while loop

        return counts
Exemplo n.º 36
0
    def extract_reads(self, filename, loci, full, empty, mutex, queue, consumers):
        """Producer: fetch the reads around each locus and queue them for analysis.

        For every locus whose chromosome is present in the alignment file,
        reads overlapping the locus (padded by 5 positions on each side)
        are converted to ``SAMRead`` objects and put on *queue* as
        ``[locus, read]``. Afterwards ``consumers * 2`` ``[False, False]``
        termination markers are queued.

        Args:
            filename: Path of the alignment file opened with pysam.
            loci: Iterable of locus objects providing ``chromosome``,
                ``start`` and ``end``.
            full: Semaphore released once per queued item.
            empty: Semaphore acquired before each item is queued.
            mutex: Lock held while writing to *queue*.
            queue: Queue receiving the work items.
            consumers: Number of consumer processes.

        Returns:
            True on completion. Exits the process with status 1 if
            ``bam_uses_chr_prefixes`` returns None.
        """
        use_chr_prefix = KmerRepeatCounter.bam_uses_chr_prefixes(filename, loci)
        if use_chr_prefix is None:
            tprint('Fatal error! Could not find any matched reads.')
            exit(1)

        if self.debug_output:
            tprint('Extractor> Thread starting for {0}'.format(filename))
        source = pysam.AlignmentFile(filename, 'rb')

        try:
            # Store the reference names themselves; comparing hash() values
            # would treat two different names with equal hashes as a match.
            available_chromosomes = {str(x) for x in source.references}
            for locus in loci:

                # Format the chromosome to be compatible with how the reads are
                # stored in the BAM file (i.e. with or without 'chr' prefix).
                chromosome = locus.chromosome
                has_prefix = chromosome[0:3] == 'chr'
                if use_chr_prefix and not has_prefix:
                    # Prepend the 'chr' prefix.
                    chromosome = 'chr{0}'.format(chromosome)
                elif not use_chr_prefix and has_prefix:
                    # Remove the 'chr' prefix.
                    chromosome = chromosome[3:]

                if chromosome not in available_chromosomes:
                    continue

                # Make sure the start coordinate isn't below 1
                start_pos = max(1, locus.start - 5)
                end_pos = locus.end + 5

                for aligned in source.fetch(chromosome, start_pos, end_pos):
                    # CIGAR of None means it was likely an asterisk (*), so the read
                    # gets ignored since something was wrong with the alignment.
                    # Checked first so no work is spent on a discarded read.
                    if aligned.cigarstring is None:
                        continue

                    # Use AlignedSegment object to create a list, which is
                    # then used in the creation of the SAMRead object,
                    # since the AlignedSegment objects are C-structs and cannot
                    # be passed to the consumer threads.
                    fields = [
                        aligned.query_name,
                        aligned.flag,
                        chromosome,
                        aligned.reference_start,
                        aligned.mapping_quality,
                        aligned.cigarstring,
                        '',
                        '',
                        '',
                        aligned.query_sequence,
                        KmerRepeatCounter.quality_scores_to_symbols(aligned.query_qualities)
                    ]
                    sam_read = SAMRead('\t'.join([str(x) for x in fields]))
                    item = [locus, sam_read]

                    # Use semaphores to handle proper writing into the queue.
                    empty.acquire()
                    with mutex:
                        queue.put(item)
                    full.release()
        finally:
            # Close the alignment file even if fetching or queuing fails.
            source.close()

        if self.debug_output:
            tprint('Extractor> Extracted all reads for all target loci.')
        # Add set amount of end signals to queue to end consumers
        # NOTE(review): two markers are queued per consumer; the reason for
        # the factor of two is not visible here. Confirm against read_analyzer.
        signal_count = consumers * 2
        for _ in range(signal_count):
            empty.acquire()
            with mutex:
                queue.put([False,False])
            full.release()
        if self.debug_output:
            # Report the number actually queued, not the consumer count.
            tprint('Extractor> Queued up {0} termination signals.'.format(signal_count))
        return True