Example 1
def concatSeq(genome_file, dir):
    """
    Concatenate the separate CDS fasta files located in dir into a single file
    """
    util.checkDir(dir)
    if os.path.exists(genome_file):
        os.remove(genome_file)
    cmd = "cat %s/*.faa > %s" % (dir, genome_file)
    util.runProcess(cmd)
    logger.info("concatSeq finished")
Example 2
    def __init__(
        self, data, results, client=None, use_client=False, xcorr_append="", append="", create_symbolic_links=True
    ):
        # raise an Exception if __init__ is not called from a child class
        if self.__class__ == Data:
            raise NotImplementedError("This function has to be called from or " "implemented by the daughter class.")

        self.data = data
        self.results = results
        self.raw = data + "/raw/%s_%d_%03d.mseed"
        self.getstr = self.x_prep = data + "/xcorr%s" % xcorr_append + "/prep/%s_%d_%03d"
        self.x_res = xcorr_results = results + "/xcorr%s" % xcorr_append

        self.xcorr = xcorr_results + "/xcorr/%s_%s%s_%s"  # period, correlation, filter, time ->1
        # self.x_filter = xcorr_results + '/filter/%s_%s%s_%s' # 1
        self.x_stack = xcorr_results + "/stack/%s_%s%s_stack%s"  # period, correlation, filter, number of stacks -> 2
        self.x_plot = xcorr_results + "/plots/%s_%s%s_%s"  # 1
        self.x_plot_stack = xcorr_results + "/plots_stack/%s_%s%s_stack%s"  # 2

        self.x_sac = xcorr_results + "/sac/%s_%s.SAC"

        # self.x_ev_prep = self.x_ev_getstr = (data + '/xcorr%s' % xcorr_append +
        #                                      '/prep/%d')
        # self.x_ev_corr = xcorr_results + '/xcorr/%s%s_%s' # correlation, filter, time ->1
        # self.x_stack = xcorr_results + '/stack/%s%s_stack%s' # correlation, filter, number of stacks -> 2
        # self.x_plot = xcorr_results + '/plots/%s%s_%s' # 1
        # self.x_plot_stack = xcorr_results + '/plots_stack/%s%s_stack%s' # 2

        #        self.x_day = xcorr_results + '/day/%s_day_%s'
        #        self.x_day_stack = xcorr_results + '/stack/%s_stack_%s'
        #        self.x_plot_day = xcorr_results + '/plots/%s_day_%s'
        #        self.x_plot_day_stack = xcorr_results + '/plots_stack/%s_stack_%s'
        #        self.x_hour = xcorr_results + '/hour/%s_hour_%d_%03d'

        self.rf_events = data + "/receiver/events/%s_%s" + append  # M5.5_events
        self.rf_results = results + "/receiver/results/%s" + append
        self.rf_sac = self.rf_results + "/sac/%s_%s.SAC"
        self.client = client
        self.use_client = use_client

        self.stations = self.eventfile = None
        if create_symbolic_links:
            util.checkDir(self.x_res + "/bla")
            try:
                util.checkDir(self.x_prep)
            except OSError:
                import warnings

                warnings.warn("Error with external HD")
            else:
                prepdir = os.path.dirname(self.x_prep)
                if not os.path.islink(prepdir + "/to_results"):
                    os.symlink(self.x_res, prepdir + "/to_results")
                    if not os.path.islink(self.x_res + "/to_prep"):
                        os.symlink(prepdir, self.x_res + "/to_prep")
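The attributes set here are printf-style path templates that are filled in later by the write/get helpers (see writeRFEvents below). A minimal usage sketch, assuming a hypothetical concrete subclass and assuming self.raw takes station, year and Julian day in that order:

# Hypothetical subclass and argument order, for illustration only.
class MyData(Data):
    pass

d = MyData("data", "results", create_symbolic_links=False)
raw_file = d.raw % ("ST01", 2010, 5)        # -> 'data/raw/ST01_2010_005.mseed'
event_file = d.rf_events % ("ST01", 2010)   # -> 'data/receiver/events/ST01_2010'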
Example 3
def topFastaHits(res_dir, extractedseq_dir):
    """
    Extract top fasta alignment hits that cover at least 80% of the length of 
    both sequences with at least 30% identity.
    Creates an in-house fasta sequence file for each hit
    Returns a dictionary of hits
    """
    # Identity cutoff for reciprocal searches
    ident_cutoff = 0.3
    # Length of hit cutoff for reciprocal searches
    len_cutoff = 0.8
    # Extracted sequence directory
    util.createDir(extractedseq_dir)
    # TODO Create MSP crunch file
    # Top hits dictionary
    fastahits_dict = {}
    # Loop over the fasta results
    util.checkDir(res_dir)
    for (path, dirs, files) in os.walk(res_dir):
        for file in files:
            if not '.fa' in file:
                continue
            res_file = path + "/" + file
            logger.info("Reading... " +  res_file)
            # Read the fasta alignment results with biopython AlignIO fasta-m10     
            alignments = AlignIO.parse(open(res_file), "fasta-m10", seq_count=2)
            for alignment in alignments:
                # Select the hit based on cutoffs
                if float(alignment._annotations["sw_ident"]) < ident_cutoff:
                    continue
                record_query = alignment[0]
                record_match = alignment[1]
                overlap = float(alignment._annotations["sw_overlap"])
                if overlap/float(record_query.annotations["original_length"]) < len_cutoff and overlap/float(record_match.annotations["original_length"]) < len_cutoff:
                    continue
                # Create SeqRecord of selected hit
                extractedseq_record = SeqRecord(seq=Seq(str(record_match.seq).replace('-', '')), id=record_match.id, description=res_file)
                extractedseq_file = "%s/%s.faa" % (extractedseq_dir, record_match.id)
                # Print match sequence of selected hit into fasta file
                output_handle = open(extractedseq_file, "w")
                SeqIO.write([extractedseq_record], output_handle, "fasta")
                output_handle.close()
                logger.info("    ...sequence extracted into %s" % extractedseq_file)
                record_query_region = "%s-%s" % (record_query._al_start, record_query._al_stop)
                record_match_region = "%s-%s" % (record_match._al_start, record_match._al_stop)
                # add hit into dictionary
                key = "%s||%s" % (record_query.id, record_match.id)
                # value in MSP crunch format
                value = "%s %s %s %s %s %s" % (alignment._annotations["sw_score"], alignment._annotations["sw_ident"], record_query_region, record_query.id, record_match_region, record_match.id)
                fastahits_dict[key] = value
    logger.info("Extract fasta alignment hits finished")
    return fastahits_dict
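The TODO above mentions creating an MSP crunch file; since each dictionary value is already one MSP-crunch-formatted line, a minimal sketch of such a writer could look like the following (a hypothetical helper, not necessarily how printMSPCrunch, used in main(), is implemented):

def writeMSPCrunch(fastahits_dict, crunch_file):
    # One MSP-crunch-formatted line per hit value.
    with open(crunch_file, "w") as handle:
        for value in fastahits_dict.values():
            handle.write(value + "\n")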
Example 4
    def writeX(self, stream, *args, **kwargs):
        """
        Write file for xcorr of 1 hour.

        The parameters are passed to getX().
        :param stream: stream to write
        :param st1: first station
        :param st2: second station
        :param time: UTCDateTime object (year, julday and hour properties)
        """
        filename = self.getX(*args, **kwargs)
        util.checkDir(filename)
        stream.write(filename, "Q")
Example 5
    def writeRFEvents(self, stream, station, time):
        """
        Write file with extracted traces around onsets of events.
        The filename is defined by completing self.rf_events with the arguments
        station and time.year.
        :param stream: stream to write
        :param station: station
        :param time: UTCDateTime object (only year property is used)
        """

        filename = self.rf_events % (station, time.year)
        util.checkDir(filename)
        stream.write(filename, 'Q')
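A minimal usage sketch, assuming obspy is installed (its UTCDateTime provides the year property used above) and assuming d is an instance of a concrete Data subclass:

from obspy import read, UTCDateTime

stream = read()  # obspy's built-in example stream
# The filename is built from d.rf_events and written in Seismic Handler 'Q' format.
d.writeRFEvents(stream, "ST01", UTCDateTime(2010, 1, 1))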
Example 6
    def writeX(self, stream, *args, **kwargs):
        """
        Write file for xcorr of 1 hour.

        The parameters are passed to getX().
        :param stream: stream to write
        :param st1: first station
        :param st2: second station
        :param time: UTCDateTime object (year, julday and hour properties)
        """
        filename = self.getX(*args, **kwargs)
        util.checkDir(filename)
        stream.write(filename, 'Q')
Example 7
    def writeRFEvents(self, stream, station, time):
        """
        Write file with extracted traces around onsets of events.
        The filename is defined by completing self.rf_events with the arguments
        station and time.year.
        :param stream: stream to write
        :param station: station
        :param time: UTCDateTime object (only year property is used)
        """

        filename = self.rf_events % (station, time.year)
        util.checkDir(filename)
        stream.write(filename, "Q")
Example 8
def translateSeq(dir):
    """
    Translate nucleic acid sequence in fasta format into protein sequence using
    EMBOSS transeq
    
    Usage: transeq
    Online documentation: http://emboss.open-bio.org/wiki/Appdoc:Transeq
    
      Standard (Mandatory) qualifiers:
      [-sequence]          seqall     Nucleotide sequence(s) filename and optional
                                      format, or reference (input USA)
      [-outseq]            seqoutall  [.] Protein sequence
                                      set(s) filename and optional format (output
                                      USA)
      Additional (Optional) qualifiers:
       -table              menu       [0] Code to use (Values: 0 (Standard); 1
                                      (Standard (with alternative initiation
                                      codons)); 2 (Vertebrate Mitochondrial); 3
                                      (Yeast Mitochondrial); 4 (Mold, Protozoan,
                                      Coelenterate Mitochondrial and
                                      Mycoplasma/Spiroplasma); 5 (Invertebrate
                                      Mitochondrial); 6 (Ciliate Macronuclear and
                                      Dasycladacean); 9 (Echinoderm
                                      Mitochondrial); 10 (Euplotid Nuclear); 11
                                      (Bacterial); 12 (Alternative Yeast Nuclear);
                                      13 (Ascidian Mitochondrial); 14 (Flatworm
                                      Mitochondrial); 15 (Blepharisma
                                      Macronuclear); 16 (Chlorophycean
                                      Mitochondrial); 21 (Trematode
                                      Mitochondrial); 22 (Scenedesmus obliquus);
                                      23 (Thraustochytrium Mitochondrial))

      The basic USA syntax is one of:
        "file"
        "file:entry"
        "format::file"
        "format::file:entry"
        "database:entry"
        "database"
        "@file"
    """ 
    util.checkDir(dir)
    for file in os.listdir(dir):
        if '.ffn' in file:
            infasta = file
            outpep = file.split(".")[0] + ".faa"
            cmd = "transeq -sequence fasta::%s/%s -outseq fasta::%s/%s -table 11" % (dir, infasta, dir, outpep)
            util.runProcess(cmd)
    logger.info("Sequences translated.")
Example 9
def runReciprocalFasta(seq_dir, genome_file, fasta_dir):
    """
    Run FASTA between extracted in-house protein sequences against new genome 
    
    FASTA searches a protein or DNA sequence data bank
     version 35.04 Aug. 25, 2009
     W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
    """
    util.createDir(fasta_dir)
    # Check new genome
    util.checkFile(genome_file)
    # Check ref genome extracted sequences
    util.checkDir(seq_dir)
    res_dir = fasta_dir
    if IS_LSF:
        # Rename new genome sequences for job array to be refgenome_1.faa refgenome_2.faa ...
        seq_num = 0
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            seq_num += 1
            if 'refgenome_' in seq_file and '.faa' in seq_file:
                continue
            seq_newfilepath = "%s/refgenome_%s.faa" % (seq_dir, seq_num)
            seq_filepath = "%s/%s" % (seq_dir, seq_file)
            os.rename(seq_filepath, seq_newfilepath)
        # Submit bsub job array on refgenome_${LSB_JOBINDEX}.faa against mygenome
        bsub_dir = "bsub"
        util.checkDir(bsub_dir)
        cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/refgenome_${LSB_JOBINDEX}.faa %s > %s/refgenome_${LSB_JOBINDEX}.fa" % (seq_dir, genome_file, res_dir)
        util.submitJobArray(jobname="genepy-recipfasta", jobnum=seq_num, jobdir=bsub_dir, cmd=cmd)
        util.submitJobDependency('genepy-recipfasta')
        logger.info("Reciprocal Fasta on LSF finished")
    else:
        # List of inhouse extracted genome sequences
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            res_file = seq_file.split(".")[0] + ".fa"
            cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/%s %s > %s/%s" % (seq_dir, seq_file, genome_file, res_dir, res_file)
            util.runProcess(cmd)
            logger.info(seq_file)
        logger.info("Reciprocal Fasta finished")
Example 10
def topReciprocalFastaHits(res_dir):
    """
    Extract top hits that cover at least 80% of the length of both sequences
    with at least 30% identity.
    Returns a dictionary of hits
    """
    # Identity cutoff for reciprocal searches
    ident_cutoff = 0.3
    # Length of hit cutoff for reciprocal searches
    len_cutoff = 0.8
    # TODO Create MSP crunch file
    # Top hits dictionary
    fastahits_dict = {}
    # Loop over the fasta results
    util.checkDir(res_dir)
    for (path, dirs, files) in os.walk(res_dir):
        for file in files:
            if not '.fa' in file:
                continue
            res_file = path + "/" + file
            logger.info("Reading... " +  res_file)
            # Read the fasta alignment results with biopython AlignIO fasta-m10     
            alignments = AlignIO.parse(open(res_file), "fasta-m10", seq_count=2)
            for alignment in alignments:
                # Select the hit based on cutoffs
                if float(alignment._annotations["sw_ident"]) < ident_cutoff:
                    continue
                record_query = alignment[0]
                record_match = alignment[1]
                overlap = float(alignment._annotations["sw_overlap"])
                if overlap/float(record_query.annotations["original_length"]) < len_cutoff and overlap/float(record_match.annotations["original_length"]) < len_cutoff:
                    continue
                
                record_query_region = "%s-%s" % (record_query._al_start, record_query._al_stop)
                record_match_region = "%s-%s" % (record_match._al_start, record_match._al_stop)
                # add hit into dictionary
                key = "%s||%s" % (record_match.id, record_query.id) # inverted key to be comparable with fasta hits
                value = "%s %s %s %s %s %s" % (alignment._annotations["sw_score"], alignment._annotations["sw_ident"], record_query_region, record_query.id, record_match_region, record_match.id)
                fastahits_dict[key] = value
    logger.info("Extract reciprocal fasta alignment hits finished")
    return fastahits_dict
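Because the keys here are inverted, a forward hit from topFastaHits and its reciprocal hit end up under the same "query||match" key. A minimal sketch of intersecting the two dictionaries (not necessarily how getHits, used in main(), separates orthologs from similarities):

def intersectHits(fasta_hits, reciprocalfasta_hits):
    # Keep only hits found in both search directions.
    orthologs = {}
    for key, value in fasta_hits.items():
        if key in reciprocalfasta_hits:
            orthologs[key] = value
    return orthologs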
Example 11
def runHamapScan(seq_dir, hamap_dir):
    """
    HAMAP: High-quality Automated and Manual Annotation of microbial Proteomes
    ftp download site: ftp://ftp.expasy.org/databases/hamap/
     
    pfscan compares a protein or nucleic acid sequence against a profile 
    library. The result is an unsorted list of profile-sequence matches.
    download site: http://www.isrec.isb-sib.ch/ftp-server/pftools/pft2.3/
    """
    util.createDir(hamap_dir)
    util.checkDir(seq_dir)
    hamap_profile_file = "%s/hamap/hamap.prf" % os.path.dirname(__file__)
    if IS_LSF:
        # Rename new genome sequences for job array to be mygenome_1.faa mygenome_2.faa ...
        seq_num = 0
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            seq_num += 1
            if 'mygenome_' in seq_file and '.faa' in seq_file:
                continue
            seq_newfilepath = "%s/mygenome_%s.faa" % (seq_dir, seq_num)
            seq_filepath = "%s/%s" % (seq_dir, seq_file)
            os.rename(seq_filepath, seq_newfilepath)
        # Submit bsub job array on mygenome_${LSB_JOBINDEX}.faa against hamap profile
        bsub_dir = "bsub"
        util.checkDir(bsub_dir)
        cmd = "pfscan -klf %s/mygenome_${LSB_JOBINDEX}.faa %s > %s/mygenome_${LSB_JOBINDEX}.out" % (seq_dir, hamap_profile_file, hamap_dir)
        util.submitJobArray(jobname='genepy-hamap', jobnum=seq_num, jobdir=bsub_dir, cmd=cmd)
        util.submitJobDependency('genepy-hamap')
        logger.info("HAMAP scan on LSF finished")
    else:
        # List of new genome sequences
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            res_file = seq_file.split(".")[0] + ".out"
            cmd = "pfscan -klf %s/%s %s > %s/%s" % (seq_dir, seq_file, hamap_profile_file, hamap_dir, res_file)
            util.runProcess(cmd)
        logger.info("HAMAP scan finished")
Example 12
def runFasta(seq_dir, genomes_dir, fasta_dir):
    """
    Run FASTA on protein sequences between new genome against all in house genomes
    
    FASTA searches a protein or DNA sequence data bank
     version 35.04 Aug. 25, 2009
     W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
    """
    util.createDir(fasta_dir)
    # List of in-house genomes
    util.checkDir(genomes_dir)
    genome_files = []
    logger.info("Create fasta results directory for each in-house reference genome")
    for genome_file in os.listdir(genomes_dir):
        if '.faa' in genome_file:
            genome_files.append(genome_file)
            # Create fasta results directory for each in-house genome
            util.createDir("%s/%s" % (fasta_dir, genome_file.split(".")[0]))
            logger.info(genome_file)

    util.checkDir(seq_dir)
    if IS_LSF:
        # Rename new genome sequences for job array to be mygenome_1.faa mygenome_2.faa ...
        seq_num = 0
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            seq_num += 1
            if 'mygenome_' in seq_file and '.faa' in seq_file:
                continue
            seq_newfilepath = "%s/mygenome_%s.faa" % (seq_dir, seq_num)
            seq_filepath = "%s/%s" % (seq_dir, seq_file)
            os.rename(seq_filepath, seq_newfilepath)
        # Submit bsub job array on mygenome_${LSB_JOBINDEX}.faa against one refgenome at a time
        bsub_dir = "bsub"
        util.checkDir(bsub_dir)
        for genome_file in genome_files:
            res_dir = "%s/%s" % (fasta_dir, genome_file.split(".")[0])
            cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/mygenome_${LSB_JOBINDEX}.faa %s/%s > %s/mygenome_${LSB_JOBINDEX}.fa" % (seq_dir, genomes_dir, genome_file, res_dir)
            util.submitJobArray(jobname="genepy-fasta", jobnum=seq_num, jobdir=bsub_dir, cmd=cmd)
        util.submitJobDependency('genepy-fasta')
        logger.info("Fasta on LSF finished")
    else:
        # List of new genome sequences
        for seq_file in os.listdir(seq_dir):
            if not '.faa' in seq_file:
                continue
            res_file = seq_file.split(".")[0] + ".fa"
            for genome_file in genome_files:
                res_dir = "%s/%s" % (fasta_dir, genome_file.split(".")[0])
                cmd = "fasta35 -z 1 -Q -H -S -m 10 %s/%s %s/%s > %s/%s" % (seq_dir, seq_file, genomes_dir, genome_file, res_dir, res_file)
                util.runProcess(cmd)
            logger.info(seq_file)
        logger.info("Fasta finished")
Example 13
    def __init__(self,
                 data,
                 results,
                 client=None,
                 use_client=False,
                 xcorr_append='',
                 append='',
                 create_symbolic_links=True):
        # raise an Exception if __init__ is not called from a child class
        if self.__class__ == Data:
            raise NotImplementedError('This function has to be called from or '
                                      'implemented by the daughter class.')

        self.data = data
        self.results = results
        self.raw = data + '/raw/%s_%d_%03d.mseed'
        self.getstr = self.x_prep = (data + '/xcorr%s' % xcorr_append +
                                     '/prep/%s_%d_%03d')
        self.x_res = xcorr_results = results + '/xcorr%s' % xcorr_append

        self.xcorr = xcorr_results + '/xcorr/%s_%s%s_%s'  # period, correlation, filter, time ->1
        #self.x_filter = xcorr_results + '/filter/%s_%s%s_%s' # 1
        self.x_stack = xcorr_results + '/stack/%s_%s%s_stack%s'  # period, correlation, filter, number of stacks -> 2
        self.x_plot = xcorr_results + '/plots/%s_%s%s_%s'  # 1
        self.x_plot_stack = xcorr_results + '/plots_stack/%s_%s%s_stack%s'  # 2

        self.x_sac = xcorr_results + '/sac/%s_%s.SAC'

        #self.x_ev_prep = self.x_ev_getstr = (data + '/xcorr%s' % xcorr_append +
        #                                      '/prep/%d')
        #self.x_ev_corr = xcorr_results + '/xcorr/%s%s_%s' # correlation, filter, time ->1
        #self.x_stack = xcorr_results + '/stack/%s%s_stack%s' # correlation, filter, number of stacks -> 2
        #self.x_plot = xcorr_results + '/plots/%s%s_%s' # 1
        #self.x_plot_stack = xcorr_results + '/plots_stack/%s%s_stack%s' # 2

        #        self.x_day = xcorr_results + '/day/%s_day_%s'
        #        self.x_day_stack = xcorr_results + '/stack/%s_stack_%s'
        #        self.x_plot_day = xcorr_results + '/plots/%s_day_%s'
        #        self.x_plot_day_stack = xcorr_results + '/plots_stack/%s_stack_%s'
        #        self.x_hour = xcorr_results + '/hour/%s_hour_%d_%03d'

        self.rf_events = data + '/receiver/events/%s_%s' + append  # M5.5_events
        self.rf_results = results + '/receiver/results/%s' + append
        self.rf_sac = self.rf_results + '/sac/%s_%s.SAC'
        self.client = client
        self.use_client = use_client

        self.stations = self.eventfile = None
        if create_symbolic_links:
            util.checkDir(self.x_res + '/bla')
            try:
                util.checkDir(self.x_prep)
            except OSError:
                import warnings
                warnings.warn('Error with external HD')
            else:
                prepdir = os.path.dirname(self.x_prep)
                if not os.path.islink(prepdir + '/to_results'):
                    os.symlink(self.x_res, prepdir + '/to_results')
                    if not os.path.islink(self.x_res + '/to_prep'):
                        os.symlink(prepdir, self.x_res + '/to_prep')
Example 14
    def writeXEv(self, stream, *args, **kwargs):
        filename = self.getXEv(*args, **kwargs)
        util.checkDir(filename)
        stream.write(filename, 'Q')
Example 15
def main():
    # Fasta file extension: 
    # .ffn for the untranslated nucleotide sequences for each CDS; .faa for protein coding sequences (CDS)
    # .fa for the fasta alignment results
    # .fna for whole genomic DNA sequences; .frn for nucleotide sequences of RNA related features
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-d", "--dna", metavar="FILE", help="input dna FILE in fasta format", action="store", type="string", dest="dna")
    parser.add_option("-t", "--tab", metavar="FILE", help="input tab FILE in embl format", action="store", type="string", dest="tab")
    parser.add_option("-e", "--embl", metavar="FILE", help="input embl FILE with CDS features in embl format", action="store", type="string", dest="embl")
    parser.add_option("--genedb", help="extract reference genome protein sequences from geneDB", action="store_true", dest="db")
    parser.add_option("--fasta", help="run fasta against each extracted in-house genomes", action="store_true", dest="fasta")
    parser.add_option("--hamap", help="run pfscan against HAMAP profiles", action="store_true", dest="hamap")
    parser.add_option("--clean", help="delete all results without deleting reference genomes", action="store_true", dest="clean")
    parser.add_option("--deepclean", help="delete all reference genomes and results", action="store_true", dest="deepclean")
    (options, args) = parser.parse_args()
    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()
    # Print command line
    cmdline = "$ python "
    for argv in sys.argv:
        cmdline += argv + " " 
    logger.debug(cmdline)
    
    # >>> ---------------------------------------------------------------------
    # >>> DATA PREPARATION
    # >>> ---------------------------------------------------------------------
    # List of needed software
    for softname in soft_lists:
        util.checkSoft(softname)
    # Prepare new genome data
    if options.dna and options.tab and not options.embl:
        util.checkFile(options.dna)
        mygenome_emblfile = fasta2embl(options.dna)
        mygenome_emblfile_withcds = concatFeatures(mygenome_emblfile, options.tab)
        splitSeq(mygenome_dir, mygenome_emblfile_withcds, "CDS")
        translateSeq(mygenome_dir)
    elif not options.dna and not options.tab and options.embl:
        mygenome_emblfile_withcds = options.embl
        splitSeq(mygenome_dir, mygenome_emblfile_withcds, "CDS")
        #splitSeqWithBiopython(mygenome_emblfile_withcds, "CDS") # does not work with testdata_01
        translateSeq(mygenome_dir)
    elif not options.deepclean:
        util.checkDir(mygenome_dir)
    # Extract in house genomes from chado db
    if options.db:
        chadoDump(refgenomes_dir)
    elif not options.deepclean:
        util.checkDir(refgenomes_dir)
    # bsub output directory
    if IS_LSF and not (options.clean or options.deepclean):
        util.createDir(bsub_dir)

    # >>> ---------------------------------------------------------------------
    # >>> ORTHOLOG SEARCH
    # >>> ---------------------------------------------------------------------
    # Run fasta & reciprocal fasta
    if options.fasta:
        runFasta(mygenome_dir, refgenomes_dir, fasta_dir)
        fasta_hits = topFastaHits(fasta_dir, refgenomes_extractedseq_dir)
        concatSeq(mygenome_fastafile_allcds, mygenome_dir)
        runReciprocalFasta(refgenomes_extractedseq_dir, mygenome_fastafile_allcds, reciprocalfasta_dir)
        reciprocalfasta_hits = topReciprocalFastaHits(reciprocalfasta_dir)
        printMSPCrunch(fasta_hits, reciprocalfasta_hits)
        hits = getHits(fasta_hits, reciprocalfasta_hits)
        logger.info("ORTHOLOGS")
        logger.info(hits['ortholog'])
        logger.info("SIMILARITY")
        logger.info(hits['similarity'])
        transferFeatures(hits['ortholog'])
    # Run hamap scan
    if options.hamap:
        runHamapScan(mygenome_dir, hamap_dir)

    # >>> ---------------------------------------------------------------------
    # >>> CLEANING OUTPUT DATA
    # >>> ---------------------------------------------------------------------
    # Clean results before a re-run
    if options.clean:
        # fasta results
        util.rmDir(fasta_dir)
        util.rmDir(reciprocalfasta_dir)
        util.rmDir(refgenomes_extractedseq_dir)
        util.rmFile(mygenome_fastafile_allcds)
        # hamap results
        util.rmDir(hamap_dir)
        # bsub outputs
        if IS_LSF:
            util.rmDir(bsub_dir)
    # Deep clean - remove all
    if options.deepclean:
        util.rmDir(refgenomes_dir)
        util.rmDir(mygenome_dir)
        util.rmDir(fasta_dir)
        util.rmDir(reciprocalfasta_dir)
        util.rmDir(refgenomes_extractedseq_dir)
        util.rmFile(mygenome_fastafile_allcds)
        util.rmDir(hamap_dir)
Example 16
def PlotCompare(Config):
    rootfile = rt.TFile(Config.General['input'], 'READ')
    for cut in Config.CutList:
        lg.logging('Processing cut %s' % (cut))
        for var in Config.Vars[cut]:
            lg.logging('\tProcessing var %s' % (var))

            rt.gROOT.ProcessLine('SetAtlasStyle()')

            canvas = RC.HtbCompCanvas()
            canvas.canvas.Draw()
            canvas.pad1.Draw()
            canvas.pad2.Draw()
            canvas.pad1.cd()

            fHists = util.GetHists(Config, rootfile, cut, var)
            hStack = rt.THStack('hStack', 'hStack')
            h_tot = None
            g_tot = None

            xtitle = Config.fConfig[cut][var]['xname']
            ytitle = Config.fConfig[cut][var]['yname']

            h_data = None
            hasData = False

            if fHists.DATA:
                h_data = fHists.DATA
                hasData = True

            i_color = 2

            for _name, hist in fHists.STACK.items():
                if i_color == 5 or i_color == 8:
                    i_color += 1
                if i_color == 10:
                    i_color = 41
                hist.SetFillColor(i_color)
                hist.SetLineWidth(0)
                hist.SetLineColor(rt.kBlack)
                hStack.Add(hist)
                if h_tot is None:
                    h_tot = hist.Clone('allmc')
                else:
                    h_tot.Add(hist)
                i_color += 1

            b_ShowYields = Config.General['ShowYields']
            b_logy = False
            if 'logy' in Config.fConfig[cut][var]:
                b_logy = Config.fConfig[cut][var]['logy']
            rt.gStyle.SetEndErrorSize(4.0)
            h_dummy = h_tot.Clone('h_dummy')
            h_dummy.Scale(0)
            h_dummy.Draw('HIST')
            hStack.Draw('same HIST')

            g_tot = rt.TGraphAsymmErrors(h_tot)
            g_tot.SetFillStyle(3354)
            g_tot.SetFillColor(rt.kBlack)
            g_tot.SetLineColor(rt.kWhite)
            g_tot.SetLineWidth(0)
            g_tot.SetMarkerSize(0)
            g_tot.Draw('same E2')

            g_data = None
            if hasData:
                h_data.SetMarkerStyle(20)
                h_data.SetLineColor(rt.kBlack)
                h_data.SetLineWidth(2)
                h_data.SetMarkerSize(1.4)
                g_data = rt.TGraphAsymmErrors(h_data)
                g_data.SetMarkerSize(h_data.GetMarkerSize())
                g_data.SetMarkerColor(h_data.GetMarkerColor())
                g_data.SetMarkerStyle(h_data.GetMarkerStyle())
                g_data.SetLineWidth(h_data.GetLineWidth())
            else:
                h_data = h_tot.Clone('dummyData')
                h_data.SetTitle('Asimov Data')
                g_data = rt.TGraphAsymmErrors(h_data)

            if fHists.SINGLE:
                i_color = 2
                for _name, hist in fHists.SINGLE.items():
                    if i_color == 5:
                        i_color += 1
                    hist.SetLineColor(rt.TColor.GetColorBright(i_color))
                    hist.SetLineStyle(2)
                    hist.SetLineWidth(3)
                    ntotal = h_tot.Integral()
                    nhist = hist.Integral()
                    if nhist != 0:
                        hist.Scale(ntotal / nhist)
                    hist.Draw('same HIST')
                    i_color += 1

            if hasData:
                g_data.Draw('same Ep1')

            h_dummy.GetXaxis().SetTitle(xtitle)
            h_dummy.GetYaxis().SetTitle(ytitle)
#            h_dummy.GetYaxis().SetTitleOffset(2.3)
            if b_logy:
                h_dummy.SetMinimum(0.1)
            else:
                h_dummy.SetMinimum(0)
            if hasData:
                ymax = rt.TMath.Max(
                    h_tot.GetMaximum(),
                    h_data.GetMaximum()
                )
                if fHists.SINGLE:
                    for _name, hist in fHists.SINGLE.items():
                        if hist.Integral() != 0:
                            hist_max = hist.GetMaximum() * h_tot.Integral() / hist.Integral()
                            if ymax < hist_max:
                                ymax = hist_max
                if b_logy:
                    h_dummy.SetMaximum(800 * ymax)
                    canvas.pad1.SetLogy(True)
                else:
                    h_dummy.SetMaximum(1.5 * ymax)
            else:
                ymax = h_tot.GetMaximum()
                if fHists.SINGLE:
                    for _name, hist in fHists.SINGLE.items():
                        if hist.Integral() != 0:
                            hist_max = hist.GetMaximum() * h_tot.Integral() / hist.Integral()
                            if ymax < hist_max:
                                ymax = hist_max
                if b_logy:
                    h_dummy.SetMaximum(500 * ymax)
                    canvas.pad1.SetLogy(True)
                else:
                    h_dummy.SetMaximum(1.5 * ymax)

            canvas.pad1.RedrawAxis()
            canvas.pad1.SetTickx()
            canvas.pad1.SetTicky()

            legX1 = 1 - 0.41 * (596.0 / canvas.pad1.GetWw()) - 0.08
            legX2 = 0.91
            legXmid = legX1 + 0.5 * (legX2 - legX1)

            if b_ShowYields:
                legXmid = legX1 + 0.6 * (legX2 - legX1)
                leg = rt.TLegend(legX1,
                                 0.93 - (
                                     len(fHists.STACK) +
                                     len(fHists.SINGLE) + 2) * 0.04,
                                 legXmid,
                                 0.93)
                leg1 = rt.TLegend(legXmid, leg.GetY1(), legX2, leg.GetY2())

                leg.SetFillStyle(0)
                leg.SetBorderSize(0)
                leg.SetTextAlign(32)
                leg.SetTextFont(rt.gStyle.GetTextFont())
                leg.SetTextSize(rt.gStyle.GetTextSize() * 0.6)
                leg.SetMargin(0.22)

                leg1.SetFillStyle(0)
                leg1.SetBorderSize(0)
                leg1.SetTextAlign(32)
                leg1.SetTextFont(rt.gStyle.GetTextFont())
                leg1.SetTextSize(rt.gStyle.GetTextSize() * 0.6)
                leg1.SetMargin(0.0)

                if hasData:
                    leg.AddEntry(h_data, 'DATA', 'lep')
                    leg1.AddEntry(
                        None,
                        str('%.1f' % (h_data.Integral())),
                        ''
                    )
                if fHists.SINGLE:
                    for _name, hist in fHists.SINGLE.items():
                        leg.AddEntry(hist, _name, 'f')
                        leg1.AddEntry(
                            None,
                            str('%.1f' % (hist.Integral())),
                            ''
                        )
                if fHists.STACK:
                    for _name, hist in fHists.STACK.items():
                        leg.AddEntry(hist, _name, 'f')
                        leg1.AddEntry(
                            None,
                            str('%.1f' % (hist.Integral())),
                            ''
                        )
                leg.AddEntry(None, 'Total', '')
                leg1.AddEntry(None, str('%.1f' % (h_tot.Integral())), '')
                leg.AddEntry(g_tot, 'Uncertainty', 'f')
                leg1.AddEntry(None, '  ', '')
                leg.Draw()
                leg1.Draw()
            else:
                leg = rt.TLegend(legX1,
                                 0.93 -
                                 (
                                     (len(fHists.STACK) +
                                      len(fHists.SINGLE) + 2) / 2
                                 ) * 0.06,
                                 legX2,
                                 0.93)
                leg.SetNColumns(2)
                leg.SetFillStyle(0)
                leg.SetBorderSize(0)
                leg.SetTextAlign(32)
                leg.SetTextFont(rt.gStyle.GetTextFont())
                leg.SetTextSize(rt.gStyle.GetTextSize() * 0.55)
                leg.SetMargin(0.22)

                if hasData:
                    leg.AddEntry(h_data, 'DATA', 'lep')
                if fHists.SINGLE:
                    for _name, hist in fHists.SINGLE.items():
                        leg.AddEntry(hist, _name, 'f')
                if fHists.STACK:
                    for _name, hist in fHists.STACK.items():
                        leg.AddEntry(hist, _name, 'f')
                leg.AddEntry(g_tot, 'Uncertainty', 'f')
                leg.Draw()

            for textObj in Config.Text[cut]:
                canvas.DrawText(textObj)
            sqrts = {"text": "#sqrt{s} = 13TeV", "xPos": 0.18, "yPos": 0.82, "size": 0.035, "color": 1}
            atlas = {"text": "#bf{#it{ATLAS}} Work in Progress", "xPos": 0.16, "yPos": 0.89, "size": 0.06, "color": 1}
            lumi = {"text": "#intLdt =" + Config.General['lumi'] + " pb^{-1}", "xPos": 0.32, "yPos": 0.82, "size": 0.035, "color": 1}
            canvas.DrawText(sqrts)
            canvas.DrawText(atlas)
            canvas.DrawText(lumi)
            canvas.pad2.cd()
            canvas.pad2.GetFrame().SetY1(2)
            h_dummy2 = h_tot.Clone('h_dummy2')
            h_dummy2.Scale(0)
            h_dummy2.Draw('HIST')
#            h_dummy2.GetYaxis().SetTitleOffset(
#                1.0 * h_dummy.GetYaxis().GetTitleOffset()
#            )
            h_ratio = h_data.Clone('h_ratio')
            h_tot_noerr = h_tot.Clone('h_tot_noerr')
            for i_bin in range(1, h_tot_noerr.GetNbinsX() + 1):
                h_tot_noerr.SetBinError(i_bin, 0)
            g_ratio2 = g_tot.Clone('g_ratio2')
            for i_bin in range(1, h_tot_noerr.GetNbinsX() + 1):
                if h_tot_noerr.GetBinContent(i_bin) == 0:
                    continue
                g_ratio2.SetPoint(
                    i_bin - 1,
                    g_ratio2.GetX()[i_bin - 1],
                    g_ratio2.GetY()[i_bin - 1] /
                    h_tot_noerr.GetBinContent(i_bin)
                )
                g_ratio2.SetPointEXlow(
                    i_bin - 1,
                    g_ratio2.GetEXlow()[i_bin - 1]
                )
                g_ratio2.SetPointEXhigh(
                    i_bin - 1,
                    g_ratio2.GetEXhigh()[i_bin - 1]
                )
                g_ratio2.SetPointEYlow(
                    i_bin - 1,
                    g_ratio2.GetEYlow()[i_bin - 1] /
                    h_tot_noerr.GetBinContent(i_bin)
                )
                g_ratio2.SetPointEYhigh(
                    i_bin - 1,
                    g_ratio2.GetEYhigh()[i_bin - 1] /
                    h_tot_noerr.GetBinContent(i_bin))

            h_dummy2.SetTitle('Data/MC')
            h_dummy2.GetYaxis().CenterTitle()
            h_dummy2.GetYaxis().SetTitle('Data/Bkg.')
#            h_dummy2.GetYaxis().SetLabelSize(
#                1.0 * h_ratio.GetYaxis().GetLabelSize()
#            )
            h_dummy2.GetYaxis().SetLabelOffset(0.02)
            h_dummy.GetYaxis().SetLabelOffset(0.02)
            h_dummy2.GetYaxis().SetNdivisions(504, False)
            rt.gStyle.SetEndErrorSize(4.0)
            canvas.pad1.SetTicky()

            h_ratio.Divide(h_tot_noerr)
            h_ratio.SetMarkerStyle(20)
            h_ratio.SetMarkerSize(1.4)
            h_ratio.SetMarkerColor(rt.kBlack)
            h_ratio.SetLineWidth(2)
            g_ratio = rt.TGraphAsymmErrors(h_ratio)
            g_ratio.SetMarkerStyle(h_ratio.GetMarkerStyle())
            g_ratio.SetMarkerSize(h_ratio.GetMarkerSize())
            g_ratio.SetMarkerColor(h_ratio.GetMarkerColor())
            g_ratio.SetLineWidth(h_ratio.GetLineWidth())
            g_ratio.SetLineColor(h_ratio.GetLineColor())
            g_ratio.SetLineStyle(h_ratio.GetLineStyle())

            hline = rt.TLine(
                h_dummy2.GetXaxis().GetXmin(),
                1,
                h_dummy2.GetXaxis().GetXmax(),
                1
                )
            hline.SetLineColor(rt.kRed)
            hline.SetLineWidth(2)
            hline.SetLineStyle(2)
            if hasData:
                g_ratio.Draw('Ep1 same')
            hline.Draw()

            h_dummy2.SetMinimum(0.5)
            h_dummy2.SetMaximum(1.5)
            h_dummy2.GetXaxis().SetTitle(h_dummy.GetXaxis().GetTitle())
#            h_dummy2.GetXaxis().SetTitleOffset(5.0)
            h_dummy.GetXaxis().SetTitle('')
            h_dummy.GetXaxis().SetLabelSize(0)

            labelsize = h_dummy.GetYaxis().GetLabelSize()
            titlesize = h_dummy.GetYaxis().GetTitleSize()
            titleoffset = h_dummy.GetYaxis().GetTitleOffset()

            h_dummy.GetYaxis().SetLabelSize(0.7 * labelsize)
            h_dummy2.GetYaxis().SetLabelSize(1.5 * labelsize)
            h_dummy.GetYaxis().SetTitleSize(0.75 * titlesize)
            h_dummy2.GetYaxis().SetTitleSize(1.7 * titlesize)
            h_dummy2.GetXaxis().SetTitleSize(2.0 * titlesize)
            h_dummy2.GetXaxis().SetLabelSize(1.7 * labelsize)
            h_dummy2.GetYaxis().SetTitleOffset(0.45 * titleoffset)
            h_dummy.GetYaxis().SetTitleOffset(1.1 * titleoffset)
            h_dummy2.GetXaxis().SetLabelOffset(0.02)

            g_ratio2.Draw('same E2')
            canvas.pad2.RedrawAxis()
            plotname = var + '.png'
            outDir = util.checkDir(Config.General['plotdir'])
            outDir = outDir + cut + '/'
            util.MakeDir(outDir)
            canvas.SavePrint(outDir + plotname)
            lg.logging('\t%s Done' % (var), 'SUCCESS')
            del canvas
        lg.logging('%s Done' % (cut), 'SUCCESS')
    rootfile.Close()
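The loop that builds g_ratio2 above divides the total-MC uncertainty band by the MC central values so it can be overlaid in the Data/Bkg. panel; a minimal stand-alone sketch of that step (a hypothetical helper, assuming PyROOT is imported as rt as in the function above):

def normalizeBand(g_tot, h_tot_noerr):
    # Return a copy of the uncertainty graph scaled bin-by-bin to the MC central value.
    g_band = g_tot.Clone("g_band")
    for i_bin in range(1, h_tot_noerr.GetNbinsX() + 1):
        content = h_tot_noerr.GetBinContent(i_bin)
        if content == 0:
            continue
        i = i_bin - 1
        g_band.SetPoint(i, g_band.GetX()[i], g_band.GetY()[i] / content)
        g_band.SetPointEYlow(i, g_band.GetEYlow()[i] / content)
        g_band.SetPointEYhigh(i, g_band.GetEYhigh()[i] / content)
    return g_band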
Example 17
    def writeXEv(self, stream, *args, **kwargs):
        filename = self.getXEv(*args, **kwargs)
        util.checkDir(filename)
        stream.write(filename, "Q")