예제 #1
0
    def closeFile(self, file):
        """Close *file*, logging the failure instead of raising it.

        Args:
            file: an open file-like object exposing ``close()``.

        An IOError raised by ``close()`` is reported through ``Log.log`` and
        is not propagated to the caller.
        """
        try:
            file.close()
        except IOError as exce:
            # A file object cannot be concatenated to a string: report its
            # name when it has one, otherwise fall back to its repr.
            file_label = getattr(file, "name", None)
            if file_label is None:
                file_label = repr(file)
            Log.log("MAFProcessor.closeFile : Unable to close file '" +
                    str(file_label) + "'. From:\n\t--> " + str(exce))
예제 #2
0
    def createLogos(self, input_commstruct):
        """Write a ``.tf`` definition file for each motif and build its logo.

        The motif names are the keys of ``input_commstruct.motifStatistics``.
        Their definitions are looked up in the files ``self.dbFiles`` located
        in ``self.dbPath``. For every motif that has a definition, the
        definition lines are written to ``<motif name>.tf`` in the logos
        directory (``FinalOutputProcessor.LOGOS_DIR_NAME`` under
        ``self.outPath``) and ``RSATUtils.createLogoFromTF`` is called on that
        file. Motifs without a definition are only logged.

        Args:
            input_commstruct: object exposing a ``motifStatistics`` mapping
                keyed by motif name.
        """
        db_file_path = [os.path.join(self.dbPath, db_file)
                        for db_file in self.dbFiles]

        motif_name_list = input_commstruct.motifStatistics.keys()
        motif_definition = MotifUtils.getMotifsDefinitionFromTF(
            motif_name_list, db_file_path)
        logos_path = os.path.join(self.outPath,
                                  FinalOutputProcessor.LOGOS_DIR_NAME)
        FileUtils.createDirectory(logos_path)

        for motif_name in motif_name_list:
            if motif_name not in motif_definition:
                Log.log(
                    "FinalOutputProcessor.createLogos : No definition found to create logo for motif : "
                    + motif_name)
                continue

            file_name = motif_name + ".tf"
            def_file_path = os.path.join(logos_path, file_name)
            # The context manager flushes and closes the file even when a
            # write fails, so the logo tool always reads a complete file.
            with open(def_file_path, "w") as def_file:
                for line in motif_definition[motif_name]:
                    def_file.write(line)
            RSATUtils.createLogoFromTF(logos_path, file_name, motif_name)
예제 #3
0
    def toXMLFile( outpath, pipelines):
        """Serialize *pipelines* to a single XML file written in *outpath*.

        One pipeline element is created per pipeline, holding one component
        element per entry of ``pipeline.componentList``, itself holding one
        parameter element per entry of ``component.parameters``. Parameter
        names and values are converted with ``str``.

        The output file is named after the LAST pipeline of the list
        (``<name>.xml``), as in the historical behavior.

        Args:
            outpath: directory in which the XML file is written.
            pipelines: iterable of pipeline objects exposing ``name`` and
                ``componentList``.

        An empty *pipelines* is logged and nothing is written. An IOError
        raised while writing is logged and not propagated.
        """
        pipelines_element = Element( PipelineXMLParser.PIPELINES_TAG)
        last_pipeline = None

        for pipeline in pipelines:
            last_pipeline = pipeline
            pipeline_element = Element( PipelineXMLParser.PIPELINE_TAG)
            pipelines_element.append( pipeline_element)
            pipeline_element.attrib[ PipelineXMLParser.PIPELINE_NAME_ATT] = pipeline.name

            for component in pipeline.componentList:
                component_element = Element( PipelineXMLParser.COMPONENT_TAG)
                pipeline_element.append( component_element)
                component_element.attrib[ PipelineXMLParser.COMPONENT_PROCESSOR_ATT] = component.processorName
                for param_name, param_value in component.parameters.items():
                    param_element = Element( PipelineXMLParser.PARAM_TAG)
                    component_element.append( param_element)
                    param_element.attrib[ PipelineXMLParser.PARAM_NAME_ATT] = str( param_name)
                    param_element.attrib[ PipelineXMLParser.PARAM_VALUE_ATT] = str( param_value)

        # Without any pipeline there is no name to build the file name from
        if last_pipeline is None:
            Log.log( "PipelineXMLParser.toXMLFile : No pipeline to write to XML file")
            return

        try:
            PipelineXMLParser.indent( pipelines_element, 0)
            outfile = os.path.join( outpath, last_pipeline.name + ".xml")
            ElementTree( pipelines_element).write( outfile)
        except IOError as exce:
            Log.log( "PipelineXMLParser.toXMLFile : Unable to write Pipelines to XML file. From:\n\t---> " + str( exce))
예제 #4
0
 def fromXMLFile( input_filepath):
     """Build a CommStruct from the XML file *input_filepath*.

     Returns the value of ``getCommStructFromXML``, or None when a
     ParsingException is raised (the exception is logged, not propagated).
     """
     comm_struct = None
     try:
         comm_struct = BedSeqAlignmentStatsCommStruct.getCommStructFromXML( input_filepath)
     except ParsingException as parsing_error:
         message = "BedSeqAlignmentStatsCommStruct.fromXMLFile : Unable to get CommStruct from XML file '"
         message = message + input_filepath + "'. From:\n\t---> " + str( parsing_error)
         Log.log( message)
     return comm_struct
예제 #5
0
 def toXMLFile( self, output_filepath):
     """Write this CommStruct as indented XML to *output_filepath*.

     An IOError is logged through ``Log.log`` and not propagated.
     """
     try:
         xml_root = self.convertCommStructToElementTree()
         self.indent( xml_root, 0)
         xml_tree = ElementTree( xml_root)
         xml_tree.write( output_filepath)
     except IOError as io_error:
         Log.log( "BedSeqAlignmentStatsCommStruct.toXMLFile : Unable to write CommStruct to XML file. From:\n\t---> " + str( io_error))
예제 #6
0
    def addSequence(self, species, sequence):
        """Register *sequence* for *species* in this alignment.

        Nothing happens when either argument is None. The first sequence
        added (while ``self.totalLength`` is 0) sets the alignment length.
        A later sequence of a different length is logged, and a sequence that
        is too short is padded in place with
        ``Constants.SEQUENCE_INSERTION_CHAR`` up to the alignment length.
        """
        if species is None or sequence is None:
            return

        self.sequences[species] = sequence
        added_length = len(sequence)

        # First sequence: it defines the length of the alignment
        if self.totalLength == 0:
            self.totalLength = added_length
            return

        if added_length == self.totalLength:
            return

        Log.log(
            "SequenceAlignment.addSequence : Added sequence does not have the right lenght for this alignment : Alignement length = "
            + str(self.totalLength) + " DNA sequence length = " +
            str(added_length))
        # A negative padding (sequence too long) leaves the sequence untouched
        padding = self.totalLength - added_length
        for _ in range(padding):
            sequence.append(Constants.SEQUENCE_INSERTION_CHAR)
예제 #7
0
    def outputClassification(self, input_commstruct, analysis, limit_value,
                             parameter_dic):
        """Write the motif classification to XML and copy its display resources.

        The XML tree is built by ``self.toXML`` from the four arguments and
        written to ``<pipelineName>_MotifClassification.xml`` in
        ``self.outPath``, preceded by an XML declaration and an
        ``xml-stylesheet`` instruction referencing ``classification.xsl``.
        The stylesheet and the JS/CSS files it is shipped with are then
        copied from the installation directory to ``self.outPath``.

        An IOError raised while writing or copying is logged and not
        propagated.
        """
        # Files copied next to the XML so that it can be displayed in place
        resource_files = (
            "resources/xsl/classification/classification.xsl",
            "resources/xsl/classification/RSAT_menu.js",
            "resources/xsl/classification/jquery.dataTables.js",
            "resources/xsl/classification/results.css",
            "resources/xsl/classification/peak-footprints.css",
        )

        try:
            # Create the XML element
            root_element = self.toXML(input_commstruct, analysis, limit_value,
                                      parameter_dic)
            self.indent(root_element, 0)
            doc = ET.ElementTree(root_element)

            # Output the XML to file; the context manager closes the file
            # even when one of the writes fails
            classification_file_path = os.path.join(
                self.outPath,
                self.component.pipelineName + "_MotifClassification.xml")
            with open(classification_file_path, 'w') as outfile:
                outfile.write('<?xml version="1.0" encoding="utf-8"?>\n')
                outfile.write(
                    '<?xml-stylesheet type="text/xsl" href="classification.xsl"?>\n'
                )
                doc.write(outfile)

            # Copy the XSL file and its resources in the same directory as the XML
            install_dir = self.component.getParameter(
                Constants.INSTALL_DIR_PARAM)
            for resource_file in resource_files:
                shutil.copy(os.path.join(install_dir, resource_file),
                            self.outPath)
        except IOError as exce:
            Log.log(
                "ClassificationProcessor.outputClassification : Unable to write classification to XML file. From:\n\t---> "
                + str(exce))
예제 #8
0
    def getMotifsDetailsFromJaspar():
        """Read motif details from the JASPAR flat files MATRIX and MATRIX_ANNOTATION.

        Both files are looked up in ``MotifUtils.JASPAR_FLAT_DB_PATH``.

        Each line of ``MATRIX.txt`` must hold at least 5 whitespace-separated
        columns: column 1 is the motif number, columns 3 and 4 joined by "."
        form the motif name and the remaining columns form its ID.
        Each line of ``MATRIX_ANNOTATION.txt`` holds a motif number, a key and
        a value; only the keys "family", "class" and "type" are kept.

        IOError and ParsingException are logged and not propagated.
        """
        # NOTE(review): the dictionaries built below are not returned by the
        # code visible here - confirm what callers expect from this method.
        matrix_path = os.path.join( MotifUtils.JASPAR_FLAT_DB_PATH, "MATRIX.txt")
        matrix_annotation_path = os.path.join( MotifUtils.JASPAR_FLAT_DB_PATH, "MATRIX_ANNOTATION.txt")

        names = {}
        motif_ids = {}
        family = {}
        motif_types = {}
        classe = {}

        try:
            # Both files are opened before any parsing, and are closed by the
            # context managers even when parsing fails.
            with open( matrix_path, "r") as matrix_file:
                with open( matrix_annotation_path, "r") as matrix_annotation_file:

                    for line in matrix_file:
                        tokens = line.split()
                        if len( tokens) < 5:
                            raise ParsingException( "MotifUtils.getMotifsDetailsFromJaspar : Matrix file is not correctly formatted: 5 columns required while " + str( len( tokens)) + " columns are found")
                        current_num = tokens[ 0]
                        current_name = tokens[ 2] + "." + tokens[3]
                        names[ current_num] = current_name
                        motif_ids[ current_name] = "".join( tokens[ 4:])

                    for line in matrix_annotation_file:
                        tokens = line.split()
                        # A blank line carries no annotation
                        if not tokens:
                            continue
                        current_num = tokens[ 0]
                        if current_num not in names:
                            Log.log( "MotifUtils.getMotifsDetailsFromJaspar : Motif number was not detected in matrix file : " + current_num)
                            continue
                        current_key = tokens[ 1]
                        current_value = "".join( tokens[2:])
                        if current_key == "family":
                            family[ names[ current_num]] = current_value
                        elif current_key == "class":
                            classe[ names[ current_num]] = current_value
                        elif current_key == "type":
                            motif_types[ names[ current_num]] = current_value
        except (IOError, ParsingException) as exce:
            Log.log( "MotifUtils.getMotifsDetailsFromJaspar : unable to read motifs definition. From:\n\t---> " + str( exce))
예제 #9
0
    def fixIndex(self, text_index):
        """Return *text_index* minus the insertion characters found up to it.

        The insertion characters (``Constants.SEQUENCE_INSERTION_CHAR``) are
        counted in the reference species sequence, from position 0 to
        *text_index* included, bounded by the sequence length.

        A negative index is returned unchanged. When the reference species
        has no sequence in this alignment, the problem is logged and the
        index is returned unchanged.
        """
        if text_index < 0:
            return text_index

        if self.referenceSpecies not in self.sequences.keys():
            Log.log(
                "SequenceAlignement.fixIndex : Reference species is not set for Sequence Alignement : "
                + self.name)
            return text_index

        reference_sequence = self.sequences[self.referenceSpecies]
        scan_end = min(text_index + 1, len(reference_sequence))
        insertion_total = sum(
            1 for position in range(scan_end)
            if reference_sequence[position] == Constants.SEQUENCE_INSERTION_CHAR)
        return text_index - insertion_total
예제 #10
0
    def getBEDSequenceDictionnary(species, bed_filepath, extension_5p,
                                  extension_3p):
        """Parse a BED file into a dictionary of extended BEDSequence.

        Lines whose first column starts with "#" are ignored. A chromosome
        name (lower-cased) shorter than 4 characters is prefixed with "chr".
        The start coordinate is decreased by *extension_5p* (and clamped to 0)
        and the end coordinate is increased by *extension_3p*. Lines with too
        few columns or with start >= end are logged and skipped.

        Args:
            species: species passed to every created BEDSequence.
            bed_filepath: path of the BED file to read.
            extension_5p: number of positions subtracted from each start.
            extension_3p: number of positions added to each end.

        Returns:
            A dictionary mapping each ``BEDSequence.getKey()`` value to the
            list of BEDSequence sharing that key.

        Raises:
            ParsingException: when a ParsingException is raised while
                reading the coordinates (re-raised with context).
        """
        sequence_dic = {}

        try:
            # The context manager guarantees the file is closed, including
            # when a coordinate cannot be parsed
            with open(bed_filepath) as input_file:
                for line in input_file:
                    tokens = line.split()
                    if len(tokens) <= BEDParser._endindex_col:
                        Log.log("No 'chr' in line :" + line)
                        continue

                    chrom = tokens[BEDParser._chrom_col].lower()
                    if chrom[0:1] == "#":
                        continue
                    if len(chrom) < 4:
                        chrom = "chr" + chrom

                    start = BEDParser.getTokenAsint(
                        tokens[BEDParser._startindex_col])
                    end = BEDParser.getTokenAsint(
                        tokens[BEDParser._endindex_col])
                    if start >= end:
                        Log.log(
                            "BEDParser.getBEDSequenceDictionnary : A sequence has inversed start and end coordinates : "
                            + line)
                        continue

                    start = max(start - extension_5p, 0)
                    end = end + extension_3p
                    bedsequence = BEDSequence(species, chrom, start, end)
                    if len(tokens) > BEDParser._id_col:
                        bedsequence.id = tokens[BEDParser._id_col]
                    sequence_dic.setdefault(bedsequence.getKey(),
                                            []).append(bedsequence)
        except ParsingException as par_exce:
            raise ParsingException(
                "BEDParser.getBEDSequenceDictionnary : Some attributes are mor numbers. From:\n\t-->  "
                + str(par_exce))

        # The dictionary was previously built and then dropped
        return sequence_dic
    def getMotifStatistics(node_motif, motif):
        """Build a MotifStatistics from the parameter children of *node_motif*.

        Only the children whose lower-cased tag equals ``PARAM_TAG`` are read.
        The chi2 parameter is converted with ``getTokenAsfloat``, the
        histogram graph path is stored as is, and both histograms are split
        on ``HISTOGRAM_ENTRY_SEPARATOR_CHAR``. Unknown parameter names are
        logged.

        Args:
            node_motif: iterable of XML nodes exposing ``tag``.
            motif: motif whose ``name`` is reported in error messages.

        Raises:
            ParsingException: when a parameter has no name or no value.
        """
        owner = MotifStatisticsCommStruct
        statistics = MotifStatistics()

        for node_param in node_motif:
            if node_param.tag.lower() != owner.PARAM_TAG:
                continue

            param_name = owner.getAttribute(node_param, owner.PARAM_NAME_ATT,
                                            False)
            param_value = owner.getAttribute(node_param,
                                             owner.PARAM_VALUE_ATT, False)

            if param_name is None or len(param_name) == 0:
                raise ParsingException(
                    "MotifStatisticsCommStruct.getMotifAttributes : Malformed parameter - unable to retrieve parameter name in motif '"
                    + motif.name + "'")
            if param_value is None or len(param_value) == 0:
                raise ParsingException(
                    "MotifStatisticsCommStruct.getMotifAttributes : Malformed parameter - unable to retrieve parameter value in motif '"
                    + motif.name + "'")

            if param_name == owner.CHI2_PARAM_NAME:
                statistics.chi2 = owner.getTokenAsfloat(param_value, False)
            elif param_name == owner.HISTOGRAM_GRAPH_PATH_PARAM_NAME:
                statistics.histogramGraphPath = param_value
            elif param_name == owner.HISTOGRAM_PARAM_NAME:
                statistics.histogram = param_value.split(
                    owner.HISTOGRAM_ENTRY_SEPARATOR_CHAR)
            elif param_name == owner.NULL_HISTOGRAM_PARAM_NAME:
                statistics.nullHistogram = param_value.split(
                    owner.HISTOGRAM_ENTRY_SEPARATOR_CHAR)
            else:
                Log.log(
                    "MotifStatisticsCommStruct.getMotifAttributes : Unknown attribute name : "
                    + param_name)

        return statistics
예제 #12
0
    def generateRandomSites(self, motif, motif_file_path, site_number):
        """Run the RSAT ``random-sites`` script and read the produced sites.

        The script writes ``<motif>_sites.fasta`` in the component output
        directory. Every line of that file that is neither blank nor a FASTA
        header (starting with ">") contributes its first token, upper-cased.

        Args:
            motif: motif name, used to build the output file name.
            motif_file_path: path of the motif file given to ``-m``.
            site_number: number of sites requested with ``-n``.

        Returns:
            The list of site strings read from the output file.

        Raises:
            ExecutionException: when the command returns a non-zero status or
                when the output file cannot be read.
        """
        # Retrieve method required parameters
        RSAT_PATH = self.component.getParameter(Constants.RSAT_DIR_PARAM)
        dir_path = os.path.join(self.component.outputDir,
                                self.component.getComponentPrefix())
        output_path = os.path.join(dir_path, motif + "_sites.fasta")

        # Compose the RSAT random-sites command
        cmd = os.path.join(RSAT_PATH, "python-scripts/random-sites")
        cmd += " -m " + motif_file_path
        cmd += " -n " + str(site_number)
        cmd += " -o " + output_path

        # Execute the command
        cmd_result = commands.getstatusoutput(cmd)
        if cmd_result[0] != 0:
            Log.log(
                "ImplantSitesProcessor.generateSites : status returned is :" +
                str(cmd_result[0]) + " for command '" + cmd + "'")
            Log.log(
                "ImplantSitesProcessor.generateSites : command output is = \n"
                + str(cmd_result[1]))
            raise ExecutionException(
                "ImplantSitesProcessor.generateSites : Cannot execute random-sites commands. See logs for more details"
            )

        # Parse the result of the command
        sites = []
        try:
            # The context manager closes the file even when reading fails
            with open(output_path, "r") as site_file:
                for line in site_file:
                    if not line.isspace() and line[0] != ">":
                        sites.append(line.split()[0].upper())
        except IOError as io_exce:
            raise ExecutionException(
                "ImplantSitesProcessor.generateSites : Unable to read motif sites from file '"
                + output_path + "'. From:\n\t---> " + str(io_exce))

        # The parsed sites were previously built and then dropped
        return sites
class MotifStatisticsCommStruct(CommStruct):
    """CommStruct holding a list of motifs and the statistics of each motif."""

    # --------------------------------------------------------------------------------------
    def __init__(self):
        """Create a structure with no motif and no statistics."""
        CommStruct.__init__(self)
        self.motifToStatistics = {}
        self.motifList = []

    # --------------------------------------------------------------------------------------
    def addMotif(self, motif):
        """Append *motif* to the motif list; None is ignored."""
        if motif is None:
            return
        self.motifList.append(motif)

    # --------------------------------------------------------------------------------------
    def addMotifStatistics(self, motif, statistics):
        """Associate *statistics* to *motif*; ignored when either is None."""
        if motif is None or statistics is None:
            return
        self.motifToStatistics[motif] = statistics

    # --------------------------------------------------------------------------------------
    def toXMLFile(self, output_filepath):
        """Write this CommStruct as indented XML to *output_filepath*.

        IOError and ParsingException are logged and not propagated.
        """
        try:
            xml_root = self.convertCommStructToElementTree()
            self.indent(xml_root, 0)
            xml_tree = ElementTree(xml_root)
            xml_tree.write(output_filepath)
        except IOError as io_error:
            Log.log(
                "MotifStatisticsCommStruct.toXMLFile : Unable to write CommStruct to XML file. From:\n\t---> "
                + str(io_error))
        except ParsingException as parsing_error:
            Log.log(
                "MotifStatisticsCommStruct.toXMLFile : Unable to save CommStruct to XML file. From:\n\t---> "
                + str(parsing_error))
예제 #14
0
    def outputProgression(pipeline):
        """Write the progression of *pipeline* and of its components to XML.

        The pipeline element carries the pipeline name, status, start time,
        end time ("0" while the pipeline is running or not started) and,
        when positive, the elapsed time. One component element is added per
        entry of the pipeline ``componentProgressions``:

        - a running component gets one task element per task with its
          progression, or a progression attribute on the component itself
          when it has no task;
        - a component that is neither running nor 'not started' gets its
          output file path as result attribute.

        The document is written to ``Constants.PROGRESSION_XML_FILE`` in the
        directory named after the pipeline under
        ``ProgressionManager.instance.outputPath``, preceded by an XML
        declaration and an ``xml-stylesheet`` instruction.

        Args:
            pipeline: pipeline object exposing ``name``, used as key of
                ``ProgressionManager.instance.pipelinesProgressions``.

        An IOError is logged and not propagated.
        """
        time_format = "%b %d %Y %H:%M:%S"

        def format_percentage(ratio):
            # Percentage rounded up to one decimal, e.g. 0.1234 -> "12.4%"
            return str(int(math.ceil(ratio * 1000.0)) / float(10)) + "%"

        try:
            # create the pipeline element and set its attributes
            pipeline_element = Element(ProgressionManager.PIPELINE_TAG)
            pipeline_element.attrib[
                ProgressionManager.NAME_ATT] = pipeline.name
            pipeline_prog = ProgressionManager.instance.pipelinesProgressions[
                pipeline]
            pipeline_element.attrib[
                ProgressionManager.STATUS_ATT] = pipeline_prog.status
            pipeline_element.attrib[
                ProgressionManager.START_TIME_ATT] = time.strftime(
                    time_format, time.localtime(pipeline_prog.startTime))
            if pipeline_prog.status in (ProgressionManager.RUNNING_STATUS,
                                        ProgressionManager.NOT_STARTED_STATUS):
                pipeline_element.attrib[ProgressionManager.END_TIME_ATT] = "0"
            else:
                pipeline_element.attrib[
                    ProgressionManager.END_TIME_ATT] = time.strftime(
                        time_format, time.localtime(pipeline_prog.endTime))
            pipeline_elapsed_time = pipeline_prog.getElapsedTime()
            if pipeline_elapsed_time > 0:
                pipeline_element.attrib[
                    ProgressionManager.ELAPSED_TIME_ATT] = str(
                        pipeline_elapsed_time)

            # Parse the component list to create the component element and set their attributes
            for component_prog in pipeline_prog.componentProgressions:
                component = component_prog.component
                component_element = Element(ProgressionManager.COMPONENT_TAG)
                pipeline_element.append(component_element)
                component_element.attrib[
                    ProgressionManager.NAME_ATT] = component.processorShortName
                component_element.attrib[
                    ProgressionManager.
                    DISPLAY_NAME_ATT] = component.processorDisplayName
                component_element.attrib[
                    ProgressionManager.BRANCH_ATT] = component.branch
                component_element.attrib[
                    ProgressionManager.RANK_ATT] = component.rank
                component_element.attrib[
                    ProgressionManager.STATUS_ATT] = component_prog.status
                component_elapsed_time = component_prog.getElapsedTime()
                if component_elapsed_time >= 0:
                    component_element.attrib[
                        ProgressionManager.
                        ELAPSED_TIME_ATT] = ProgressionManager.convertTime(
                            component_elapsed_time)

                if component_prog.status == ProgressionManager.RUNNING_STATUS:
                    # If component is running, look for tasks to create corresponding elements and attributes
                    if len(component_prog.tasks) > 0:
                        for task in component_prog.tasks:
                            task_element = Element(ProgressionManager.TASK_TAG)
                            component_element.append(task_element)
                            task_element.attrib[
                                ProgressionManager.NAME_ATT] = task
                            task_element.attrib[
                                ProgressionManager.
                                PROGRESSION_VALUE_ATT] = format_percentage(
                                    component_prog.taskProgression[task])
                    else:
                        # If no task exists, set the progression attribute at the component level
                        component_element.attrib[
                            ProgressionManager.
                            PROGRESSION_VALUE_ATT] = format_percentage(
                                component_prog.getProgression())
                # If component is not running and is not 'not started', set the output result
                elif component_prog.status != ProgressionManager.NOT_STARTED_STATUS:
                    component_element.attrib[
                        ProgressionManager.
                        RESULT_ATT] = component.getOutputFilePath()

            ProgressionManager.indent(pipeline_element, 0)
            doc = ET.ElementTree(pipeline_element)
            pipeline_output_dir = os.path.join(
                ProgressionManager.instance.outputPath, pipeline.name)
            progression_file = os.path.join(pipeline_output_dir,
                                            Constants.PROGRESSION_XML_FILE)
            outfile = FileUtils.openFile(progression_file, 'w')
            try:
                outfile.write('<?xml version="1.0" encoding="utf-8"?>\n')
                outfile.write('<?xml-stylesheet type="text/xsl" href="' +
                              ProgressionManager.instance.stylesheetPath +
                              '"?>\n')
                doc.write(outfile)
            finally:
                # Release the file handle even when a write fails
                outfile.close()

        except IOError as exce:
            Log.log(
                "ProgressionManager.outputProgression : Unable to write progresssion to XML file. From:\n\t---> "
                + str(exce))
    def compareMotifs(self, reference_motifs, identified_motifs):
        """Run RSAT ``compare-matrices`` on every reference/identified motif pair.

        The component output directory is emptied and recreated, and becomes
        the working directory while the comparisons run. Each motif is
        written with ``self.outputMotifToTransfacFile`` before being
        compared. An identified motif carrying the same name as the reference
        motif is skipped. The result of each comparison is written by the
        command to ``<reference name>_<identified name>``, suffixed with
        ``_<rank>`` (starting at 1) when several motifs share the identified
        name. A failing command is logged and the next comparison is run.

        Args:
            reference_motifs: sequence of motifs exposing ``name``.
            identified_motifs: dictionary mapping a motif name to the list of
                identified motifs carrying that name.
        """
        # Retrieve required parameters
        RSAT_PATH = self.component.getParameter(Constants.RSAT_DIR_PARAM)

        # Prepare outputdir
        dir_path = os.path.join(self.component.outputDir,
                                self.component.getComponentPrefix())
        shutil.rmtree(dir_path, True)
        os.mkdir(dir_path)
        old_working_dir = os.getcwd()
        os.chdir(dir_path)

        try:
            # Establish the progression
            task_name = "Comparing motifs"
            total_length = len(reference_motifs) * len(identified_motifs)
            ProgressionManager.setTaskProgression(task_name, self.component,
                                                  0.0)
            compare_command = os.path.join(RSAT_PATH,
                                           "perl-scripts/compare-matrices")

            progress = 0
            for reference_motif in reference_motifs:
                ref_file_info = self.outputMotifToTransfacFile(
                    reference_motif, dir_path)
                for identified_motif_name, identified_motif_list in identified_motifs.items():
                    progress += 1
                    if reference_motif.name != identified_motif_name:
                        several_motifs = len(identified_motif_list) > 1
                        for rank, identified_motif in enumerate(
                                identified_motif_list, 1):
                            ident_file_info = self.outputMotifToTransfacFile(
                                identified_motif, dir_path)

                            # Compose the compare-matrices command line with all required options
                            output_name = reference_motif.name + "_" + identified_motif_name
                            if several_motifs:
                                # str() is required: the rank is an integer
                                output_name += "_" + str(rank)
                            cmd = compare_command
                            cmd += " -file1 " + ref_file_info[1] + " -format1 tf"
                            cmd += " -file2 " + ident_file_info[1] + " -format2 tf"
                            cmd += " -mode matches"
                            cmd += " -return all"
                            cmd += " -o " + output_name

                            # Execute the command
                            cmd_result = commands.getstatusoutput(cmd)
                            if cmd_result[0] != 0:
                                Log.log(
                                    "CompareIdentifiedMotifsProcessor.compareMotifs : status returned is :"
                                    + str(cmd_result[0]) + " for command '" +
                                    cmd + "'")
                                Log.log(
                                    "CompareIdentifiedMotifsProcessor.compareMotifs : command output is = \n"
                                    + str(cmd_result[1]))

                    # Update the task registered above, every 10 pairs
                    if progress % 10 == 0:
                        ProgressionManager.setTaskProgression(
                            task_name, self.component,
                            progress / float(total_length))
        finally:
            # returns to initial working dir, even when a comparison fails
            os.chdir(old_working_dir)
예제 #16
0
    def executeClustalW(self, input_commstruct):
        """Realign every alignment of *input_commstruct* with ClustalW.

        Each alignment of ``input_commstruct.bedToMA`` is written to a FASTA
        file with ``self.outputAlignmentToFASTAFile``, ClustalW is run on that
        file and its output is parsed with ``self.parseClustalWResult``. For
        every BED sequence, the list of alignments is replaced by the list of
        parsed results; an alignment whose command fails is logged and left
        out of that list.

        The output directory returned by ``self.prepareOutputDir`` is the
        working directory while the commands run; the previous working
        directory is always restored.

        Args:
            input_commstruct: object exposing a ``bedToMA`` dictionary
                mapping a BED sequence to a list of alignments.
        """
        # Retrieve the method parameters
        desired_species_line = self.getParameter(
            MSAProcessor.DESIRED_SPECIES_LIST_PARAM, False)
        if desired_species_line is not None:
            desired_species_list = desired_species_line.split()
        else:
            desired_species_list = []

        command_options = self.getParameter(
            MSAProcessor.COMMAND_OPTIONS_PARAM, False)
        if command_options is None:
            command_options = ""

        # Prepare the outputdir for FASTA file export
        file_info = self.prepareOutputDir()
        dir_path = file_info[0]
        file_name = file_info[1]
        file_path = os.path.join(dir_path, file_name + ".fasta")

        # Change directory to output dir
        working_dir = os.getcwd()
        os.chdir(dir_path)

        try:
            command = self.component.getParameter(
                Constants.CLUSTALW_COMMAND_PARAM)

            # Compose the ClustalW command line with all required options.
            # The same input and output files are reused for every alignment.
            output_filepath = file_path + "result.txt"
            cmd = command
            cmd += " -INFILE=" + file_path
            cmd += " -ALIGN"
            cmd += " -TYPE=DNA"
            cmd += " -OUTFILE=" + output_filepath
            cmd += " " + command_options

            for bed_sequence in input_commstruct.bedToMA.keys():
                final_result = []
                for alignment in input_commstruct.bedToMA[bed_sequence]:
                    # output the alignment to FASTA file
                    self.outputAlignmentToFASTAFile(alignment, file_path,
                                                    desired_species_list)
                    # Execute the command
                    cmd_result = commands.getstatusoutput(cmd)
                    if cmd_result[0] != 0:
                        Log.log(
                            "MSAProcessor.executeClustalW : status returned is :"
                            + str(cmd_result[0]) + " for command '" + cmd +
                            "'")
                        Log.log(
                            "MSAProcessor.executeClustalW : command output is = \n"
                            + str(cmd_result[1]))
                        continue

                    # Parse the result of the ClustalW command
                    final_result.append(
                        self.parseClustalWResult(output_filepath,
                                                 desired_species_list))
                # final_result is always a list (possibly empty), so the
                # alignments are always replaced
                input_commstruct.bedToMA[bed_sequence] = final_result
        finally:
            # Change dir to previous working dir, even when an error occurs
            os.chdir(working_dir)
예제 #17
0
    def buildHistogramsAndGraphs(self, input_commstruct, histogram_interval):

        # Retrieve the algorithm parameters
        RSAT_PATH = self.component.getParameter(Constants.RSAT_DIR_PARAM)

        # Compute the statistics of the motifs
        Log.info(
            "HistogramProcessor.buildHistogramsAndGraphs : collecting motifs statistics"
        )
        statistics = self.computeMotifStatistics(input_commstruct, )

        hits_distances = statistics[0]
        motif_size_min = statistics[1]
        motif_size_max = statistics[2]
        hits_peakscore = statistics[3]

        #print "motif_size_max = " + str( motif_size_max)

        # Compute the uniform distribution probabilities
        Log.info(
            "HistogramProcessor.buildHistogramsAndGraphs : computing uniform distribution"
        )
        uniform_distributions = self.computeUniformDistributions(
            input_commstruct, histogram_interval, motif_size_min,
            motif_size_max)

        # Build the output CommStruct
        Log.info(
            "HistogramProcessor.buildHistogramsAndGraphs : building histogram and graphs"
        )

        # Execute the RSAT commands and computations
        try:
            # Prepare the output directories
            dir_path = os.path.join(self.component.outputDir,
                                    self.component.getComponentPrefix())
            shutil.rmtree(dir_path, True)
            FileUtils.createDirectory(dir_path, 0777)

            # Parse the motif list and execute the computations and commands for each of them
            ProgressionManager.setTaskProgression("Building motifs histogram",
                                                  self.component, 0.0)
            total_motif_number = len(hits_distances.keys())
            count_motif = 0
            for motif_name in hits_distances.keys():
                count_motif += 1
                motif_stats = input_commstruct.motifStatistics[motif_name]
                motif_id = motif_stats.motifID
                motif_size = motif_stats.motifSize
                hit_number = motif_stats.getAttributeAsint(
                    MotifStatistics.MOTIF_HIT_SCORE)

                # Initialize the motif prefix ID
                #if motif_id != None and len( motif_id) > 0:
                #    prefix_id = "_" + motif_id
                #else:
                #    prefix_id = ""
                prefix_id = ""

                # save the stats to a tabbed file for classfreq command
                input_path = os.path.join(
                    dir_path, motif_name + prefix_id + "_Distances.tab")
                self.outputMotifStatistics(hits_distances[motif_name],
                                           input_path)

                # execute the classfreq command
                histo_path = os.path.join(
                    dir_path,
                    motif_name + prefix_id + "_Distances_histogram.tab")

                cmd = os.path.join(RSAT_PATH, "perl-scripts/classfreq")
                cmd += " -i '" + input_path + "'"
                cmd += " -col 1"
                cmd += " -ci " + str(histogram_interval)
                cmd += " -o '" + histo_path + "'"

                cmd_result = commands.getstatusoutput(cmd)
                if cmd_result[0] != 0:
                    Log.log(
                        "HistogramProcessor.buildHistogramsAndGraphs : status returned is :"
                        + str(cmd_result[0]) + " for command '" + cmd + "'")
                    Log.log("  Command output is = \n" + str(cmd_result[1]))
                    continue

                # retrieve the classfreq results from output file
                motif_distribution = self.parseClassfreqResults(histo_path)

                # compute the homogen distribution for the current motif
                null_distribution = self.computeMotifNullDistribution(
                    uniform_distributions[motif_size], hit_number)

                # Save both histograms to same file to create a common graph
                all_histo_path = os.path.join(
                    dir_path,
                    motif_name + prefix_id + "_Distances_histograms.tab")
                label1 = motif_name
                label2 = "Homogeneous model"
                self.outputAllHistograms(motif_distribution, label1,
                                         null_distribution, label2,
                                         histogram_interval, all_histo_path)
                motif_stats.setAttribute(
                    MotifStatistics.MOTIF_DISTANCE_HISTOGRAM, all_histo_path)

                # Execute a chi2 test on the motif distribution against the motif homogen distribution
                chi2_test = RSATUtils.executeChi2Test(all_histo_path, 4, 5)
                if chi2_test != None:
                    motif_stats.setAttribute(MotifStatistics.MOTIF_CHI2,
                                             chi2_test[0])
                    motif_stats.setAttribute(MotifStatistics.MOTIF_CHI2_PVALUE,
                                             chi2_test[1])
                else:
                    motif_stats.setAttribute(MotifStatistics.MOTIF_CHI2, "0.0")
                    motif_stats.setAttribute(MotifStatistics.MOTIF_CHI2_PVALUE,
                                             "1.0")

                # Build the PNG graph corresponding to all histograms using RSAT XYGraph command
                graph_path = os.path.join(
                    dir_path, motif_name + prefix_id + "_Distances.png")
                cmd = os.path.join(RSAT_PATH, "perl-scripts/XYgraph")
                cmd += " -i '" + all_histo_path + "'"
                cmd += " -title1 '" + self.component.pipelineName + "'"
                cmd += " -title2 ''"
                #cmd += " -xcol 3 -ycol 4,5"
                cmd += " -xcol 3 -ycol 4"
                cmd += " -xleg1 'Distance to peak maximum'"
                cmd += " -yleg1 'Number of motif hits'"
                cmd += " -legend -header -format png -fhisto"
                cmd += " -o '" + graph_path + "'"

                cmd_result = commands.getstatusoutput(cmd)
                if cmd_result[0] != 0:
                    Log.log(
                        "HistogramProcessor.buildHistogramsAndGraphs : status returned is :"
                        + str(cmd_result[0]) + " for command '" + cmd + "'")
                    Log.log("  Command output is = \n" + str(cmd_result[1]))
                    continue

                motif_stats.setAttribute(
                    MotifStatistics.MOTIF_DISTANCE_HISTOGRAM_GRAPH, graph_path)

                # Build the PDF graph corresponding to all histograms using RSAT XYGraph command
                graph_path_pdf = os.path.join(
                    dir_path, motif_name + prefix_id + "_Distances.pdf")
                cmd = os.path.join(RSAT_PATH, "perl-scripts/XYgraph")
                cmd += " -i '" + all_histo_path + "'"
                cmd += " -title1 '" + self.component.pipelineName + "'"
                cmd += " -title2 ''"
                #cmd += " -xcol 3 -ycol 4,5"
                cmd += " -xcol 3 -ycol 4"
                cmd += " -xleg1 'Distance to peak maximum'"
                cmd += " -yleg1 'Number of motif hits'"
                cmd += " -legend -header -format pdf -fhisto"
                cmd += " -o '" + graph_path_pdf + "'"

                cmd_result = commands.getstatusoutput(cmd)
                if cmd_result[0] != 0:
                    Log.log(
                        "HistogramProcessor.buildHistogramsAndGraphs : status returned is :"
                        + str(cmd_result[0]) + " for command '" + cmd + "'")
                    Log.log("  Command output is = \n" + str(cmd_result[1]))
                    continue

                motif_stats.setAttribute(
                    MotifStatistics.MOTIF_DISTANCE_HISTOGRAM_GRAPH_PDF,
                    graph_path_pdf)

                # Output the histogram of motif peak scores
                if len(hits_peakscore[motif_name]) > 1:
                    valuable = False
                    for value in hits_peakscore[motif_name]:
                        if value != 0:
                            valuable = True
                            break
                    if valuable:
                        score_histo_prefix = motif_name + prefix_id + "_PeakScores"
                        title1 = self.component.pipelineName
                        title2 = "Distribution of peak score for " + motif_name + prefix_id
                        legendx = "Peak Score"
                        legendy = "Number of occurence"
                        pathes = RSATUtils.outputHistogram(
                            hits_peakscore[motif_name], histogram_interval,
                            dir_path, score_histo_prefix, title1, title2,
                            legendx, legendy, None, True)
                        motif_stats.setAttribute(
                            MotifStatistics.MOTIF_PEAK_SCORE_HISTOGRAM,
                            pathes[0])
                        motif_stats.setAttribute(
                            MotifStatistics.MOTIF_PEAK_SCORE_HISTOGRAM_GRAPH,
                            pathes[1])

                # Update the progression
                if count_motif % 10 == 0:
                    ProgressionManager.setTaskProgression(
                        "Building motifs histogram", self.component,
                        count_motif / float(total_motif_number))

        except IOError, io_exce:
            raise ExecutionException(
                "HistogramProcessor.buildHistogramsAndGraphs : Unable to build histogram and graph. From:\n\t---> "
                + str(io_exce))
예제 #18
0
    def testFinalMSA(self, final_seq_align, maf_blocks, bed_sequence):
        """Cross-check the composed MSA reference sequence against the MAF blocks.

        The reference-species texts of `maf_blocks` are chained into one long
        list of characters (padded where the blocks do not cover the BED
        interval). The slice expected to correspond to `bed_sequence` is then
        extracted and compared with the reference-species sequence stored in
        `final_seq_align`.

        Nothing is returned: a mismatch is only reported through `Log.log`,
        together with a hint telling whether the composed sequence can still
        be found somewhere in the chained text.
        """
        reference = self.referenceSpecies

        # ---- Part 1: every index below is a genome coordinate ----
        first_start = maf_blocks[0].sequences[reference].indexStart
        bed_start = bed_sequence.indexStart
        bed_end = bed_sequence.indexEnd
        insertion = Constants.SEQUENCE_INSERTION_CHAR

        # Pad the beginning when the BED sequence starts before the first block
        long_seq = []
        if first_start > bed_start:
            long_seq += [Constants.SEQUENCE_INIT_CHAR] * (first_start -
                                                          bed_start)

        # Chain the reference text of each block
        covered_until = first_start
        for block in maf_blocks:
            ref_sequence = block.sequences[reference]
            block_start = ref_sequence.indexStart
            # dots fill the gap between two non-successive blocks
            long_seq += ['.'] * (block_start - covered_until)

            chars = list(ref_sequence.text)
            # skip the insertion characters leading the text
            head = 0
            while head < len(chars) and chars[head] == insertion:
                head += 1
            # skip the insertion characters trailing the text
            # (the very first character is never examined here)
            tail = len(chars)
            while tail > 1 and chars[tail - 1] == insertion:
                tail -= 1
            long_seq += chars[head:tail]

            covered_until = block_start + ref_sequence.textLength

        # Pad the end when the BED sequence stops after the last block
        if covered_until < bed_end:
            long_seq += [Constants.SEQUENCE_INIT_CHAR] * (bed_end -
                                                          covered_until)

        # Window of the BED sequence in long_seq, insertion characters ignored
        window_start = max(0, bed_start - first_start)
        window_end = window_start + bed_end - bed_start

        # ---- Part 2: take the insertion characters into account ----
        def insertions_before(residue_target):
            # Walk long_seq until `residue_target` non-insertion characters
            # have been passed and return how many insertion characters were
            # met on the way.
            residues = 0
            insertions = 0
            cursor = 0
            while residues < residue_target:
                if long_seq[cursor] == insertion:
                    insertions += 1
                else:
                    residues += 1
                cursor += 1
            return insertions

        # the start index is shifted by the insertions met up to and
        # including the first character of the window
        seq_index_start = window_start + insertions_before(window_start + 1)
        # the end index is shifted by the insertions met before the window end
        seq_index_end = window_end + insertions_before(window_end)

        # Compare the extracted slice with the sequence of the composed MSA
        str_result = "".join(long_seq[seq_index_start:seq_index_end])
        str_final = "".join(final_seq_align.sequences[reference])
        if str_result == str_final:
            return

        # The two strings differ: something is wrong
        Log.log(
            "MAFProcessor.testFinalMSA: an error has been detected on the recomposed sequence"
        )
        Log.log("Composed MSA sequence = " + str_final)
        Log.log("Test MSA sequence = " + str_result)

        # Try to find the MSA sequence directly in the chained text
        str_long = "".join(long_seq)
        index_test = str_long.find(str_final)
        if index_test >= 0:
            # found: the mismatch comes from the index computation
            Log.log(
                "The error is not confirmed since the composed MSA sequence appears at index "
                + str(index_test) +
                " in the string composed by the succession of sequences from MAF file:"
            )
            Log.log(str_long)
            return

        # not found: this is a true issue
        Log.log(
            "  The error is confirmed since the composed MSA sequence does not appear in the string composed by the succession of sequences from MAF file : "
        )
        Log.log("  Succession sequence = " + str_long)
        Log.log("  Bed seq start = " + str(bed_sequence.indexStart))
        Log.log("  Bed seq end = " + str(bed_sequence.indexEnd))
        Log.log("  Associated MAF blocks : ")
        for block in maf_blocks:
            Log.log(block.toString())
예제 #19
0
    def generateRandomMSA(self, msa_length, bedseq_number, max_length,
                          output_commstruct):
        """Attach an alignment made of random DNA to every BED sequence.

        The RSAT `random-seq` script is run to write the random sequences to
        "random_sequences.txt" in the component output directory. The first
        token of each non-blank line of that file is taken as one sequence.
        For each BED sequence of `output_commstruct.bedSequencesDict`, a
        `SequenceAlignment` named "<bed name>_1" is built in which every
        species receives the same random sequence, truncated to the BED
        sequence length, and is registered with `addSequenceAlignment`.

        Parameters:
            msa_length: number of species in each alignment (the base
                species followed by "Species1", "Species2", ...).
            bedseq_number: number of sequences requested from random-seq.
            max_length: the sequences are generated with a length of
                int(max_length * 1.5).
            output_commstruct: structure providing `baseSpecies` and
                `bedSequencesDict`, and receiving the alignments.

        Raises:
            ExecutionException: if random-seq returns a non-zero status, if
                it produced fewer sequences than there are BED sequences, or
                if the sequence file cannot be read (IOError).
        """
        # Retrieve method required parameters
        RSAT_PATH = self.component.getParameter(Constants.RSAT_DIR_PARAM)
        dir_path = os.path.join(self.component.outputDir,
                                self.component.getComponentPrefix())
        file_path = os.path.join(dir_path, "random_sequences.txt")

        try:
            # Compose the RSAT random-seq command
            cmd = os.path.join(RSAT_PATH, "perl-scripts/random-seq")
            cmd += " -l " + str(int(max_length * 1.5))
            cmd += " -n " + str(bedseq_number)
            cmd += " -a a:t 0.3 c:g 0.2"
            cmd += " -type DNA"
            cmd += " -format multi"
            # quote the path, as the other RSAT commands of the file do,
            # so that a directory name containing spaces is supported
            cmd += " -o '" + file_path + "'"

            Log.info(
                "GenerateMSAProcessor.generateMSA : starting random sequence generation. Command used is : "
                + cmd)

            # Execute the command
            status, output = commands.getstatusoutput(cmd)
            if status != 0:
                Log.log(
                    "GenerateMSAProcessor.generateMSA : status returned is :" +
                    str(status) + " for command '" + cmd + "'")
                Log.log(
                    "GenerateMSAProcessor.generateMSA : command output is = \n"
                    + str(output))
                raise ExecutionException(
                    "GenerateMSAProcessor.generateMSA : Cannot execute random-seq commands. See logs for more details"
                )

            # Read the output file to get the random sequences. The file is
            # closed even on error, and blank lines are skipped instead of
            # raising an IndexError.
            sequence_list = []
            with open(file_path, "r") as sequence_file:
                for line in sequence_file:
                    tokens = line.split()
                    if tokens:
                        sequence_list.append(tokens[0])

            # Generate the species list: base species first, then placeholders
            species_list = [output_commstruct.baseSpecies]
            species_list.extend("Species" + str(index + 1)
                                for index in range(msa_length - 1))

            # Create and fill the MSA for each BED sequence
            count_seq = 0
            for chrom in output_commstruct.bedSequencesDict:
                for bedseq in output_commstruct.bedSequencesDict[chrom]:
                    if count_seq >= len(sequence_list):
                        raise ExecutionException(
                            "GenerateMSAProcessor.generateMSA : only " +
                            str(len(sequence_list)) +
                            " random sequences are available, which is less than the number of BED sequences"
                        )
                    msa = SequenceAlignment()
                    msa.name = bedseq.name + "_1"
                    msa.referenceSpecies = output_commstruct.baseSpecies
                    seq_length = bedseq.indexEnd - bedseq.indexStart
                    sequence = list(sequence_list[count_seq][:seq_length])
                    # NOTE(review): the very same list object is handed to
                    # every species, as in the original code - confirm that
                    # addSequence copies it or never mutates it.
                    for species in species_list[:msa_length]:
                        msa.addSequence(species, sequence)
                    msa.finalizeSequences()
                    output_commstruct.addSequenceAlignment(bedseq, msa)
                    count_seq += 1

        except IOError as io_exce:
            raise ExecutionException(
                "GenerateMSAProcessor.generateMSA : Unable to save/read random sequences file. From:\n\t---> "
                + str(io_exce))
예제 #20
0
    def parseBlock(self, input_file, indexed=False):
        """Parse one MAF block, reading `input_file` from its current position.

        Lines are read until the first sequence line (type 's') is found.
        That line must belong to the reference species and must intersect at
        least one BED sequence; in that case the following 's' lines, up to
        the first blank line or the end of file, are added to the block.
        Only species listed in `self.desiredSpeciesList` are added (all of
        them when the list is empty), and every species met is recorded in
        `self.parsedSpeciesList`.

        `indexed` tells that the block was reached through an index file, in
        which case a block that matches no BED sequence is treated as an
        error.

        Returns True when the parsing of the file may go on with the next
        block, and False otherwise.
        """
        # Only a flag equal to True switches on the index-specific behaviour
        index_mode = (indexed == True)
        new_block = None

        # Phase 1: search the first sequence line of the block and verify
        # that the block matches at least one of the BED sequences
        while True:
            line = input_file.readline()
            if len(line) == 0:
                # end of file
                break
            if line.isspace():
                # This block is void but the parsing has to continue
                return True

            tokens = line.split()
            # Ignore the lines that are too short or that are not sequences
            if len(tokens) <= MAFProcessor._text_col:
                continue
            if tokens[MAFProcessor._lineType_col] != 's':
                continue

            # The first sequence must be the one of the reference species
            spec_chrom = SequenceUtils.getSpeciesAndChrom(
                tokens[MAFProcessor._speciesChrom_col])
            species = spec_chrom[0]
            chromosom = spec_chrom[1]
            if species != self.referenceSpecies:
                Log.log(
                    "MAFProcessor.parseBlock : The first sequence of the parsed block does not correspond to the reference species : "
                    + line)
                return False

            # Search for BED sequences having the same <species>.<chromosom>
            bed_sequences = self.getAssociatedBEDSequences(species + "." +
                                                           chromosom)
            if bed_sequences is None or len(bed_sequences) == 0:
                # No BED sequence on this chromosom: the parsing must be
                # stopped if the file contains only one chromosom. An alert
                # is raised when an index file is used.
                if index_mode:
                    Log.log(
                        "MAFProcessor.parseBlock : No BED sequences corresponds to this MSA Block"
                    )
                return False

            # Search for the BED sequences intersecting the current sequence
            strand = tokens[MAFProcessor._strand_col]
            bp_start = self.computeStartIndex(tokens, strand)
            text_length = self.getIntValue(
                tokens[MAFProcessor._textlength_col])
            new_block = self.findMatchingBEDSequences(bed_sequences, bp_start,
                                                      text_length, strand)
            if new_block is None:
                # No intersection: with an index, something is wrong
                return not index_mode

            new_block.addSequence(species, chromosom, bp_start, text_length,
                                  tokens[MAFProcessor._text_col])
            break

        if new_block is None:
            Log.log(
                "MAFProcessor.parseBlock : The parsed block does not contains any sequence (line starting with 's')"
            )
            return False

        # Phase 2: the block matches, parse its remaining sequence lines
        while True:
            line = input_file.readline()
            # The block ends at the first empty line (or at the end of file)
            if len(line) == 0 or line.isspace():
                break

            tokens = line.split()
            if len(tokens) <= MAFProcessor._text_col:
                continue
            if tokens[MAFProcessor._lineType_col] != 's':
                continue

            spec_chrom = SequenceUtils.getSpeciesAndChrom(
                tokens[MAFProcessor._speciesChrom_col])
            species = spec_chrom[0]
            chromosom = spec_chrom[1]

            # An empty list of desired species means that all are accepted
            if (len(self.desiredSpeciesList) == 0
                    or species in self.desiredSpeciesList):
                bp_start = self.getIntValue(
                    tokens[MAFProcessor._startindex_col])
                text_length = self.getIntValue(
                    tokens[MAFProcessor._textlength_col])
                new_block.addSequence(species, chromosom, bp_start,
                                      text_length,
                                      tokens[MAFProcessor._text_col])

            if species not in self.parsedSpeciesList:
                self.parsedSpeciesList.append(species)

        return True
예제 #21
0
    def parseBlockListWithIndex(self, index_file, input_file):
        """Parse the MAF blocks that the index file maps onto BED sequences.

        The first line of `index_file` is read as a header. When its first
        token is `Constants.COMMENT_CHAR`, the second token (unless equal to
        `Constants.MIXED`) is the <species>.<chromosom> key the whole file is
        dedicated to, and a third token equal to `Constants.ORDERED` tells
        that the index entries are sorted. Each following line with exactly
        four tokens (key, start, end, position) describes one block: when
        its [start, end] range overlaps a BED sequence, `input_file` is
        positioned at `position` and the block is parsed with `parseBlock`.

        Nothing is returned. The method returns early when the index file is
        empty or when no BED sequence matches the chromosom of the file.

        Raises:
            ExecutionException: if `parseBlock` rejects an indexed block.
        """
        is_chrom_file = False
        ordered = False
        spec_chrom = None

        # Read the index file header to know if the file is chromosom
        # specialized and ordered
        # NOTE(review): the first line is always consumed as a header, even
        # when it does not start with the comment character - confirm that
        # index files always begin with a header line.
        header = index_file.readline()
        if len(header) == 0:
            Log.log("MAFProcessor.parseBlockListWithIndex : index file '" +
                    index_file.name + "' has no header line : skipping it")
            return
        tokens = header.split()
        # A blank header line gives no token at all: test the list before
        # reading its first element
        if tokens and tokens[0] == Constants.COMMENT_CHAR:
            if len(tokens) > 1 and tokens[1] != Constants.MIXED:
                is_chrom_file = True
                spec_chrom = tokens[1]
                if len(tokens) > 2 and tokens[2] == Constants.ORDERED:
                    ordered = True

        # If the file is specialized by chromosom, get once for all the bed
        # sequences concerned by this species and chromosom
        bed_sequences = None
        if is_chrom_file:
            bed_sequences = self.getAssociatedBEDSequences(spec_chrom)
            if bed_sequences is None or len(bed_sequences) == 0:
                Log.info(
                    "MAFProcessor.parseBlockListWithIndex : No BED sequences matching for file :"
                    + index_file.name)
                return

        # If the file is ordered, compute the extent of the peaks in order to
        # optimize the parsing (an ordered file is always chromosom
        # specialized, so bed_sequences is a non-empty list here)
        if ordered:
            min_start = min(bed_sequence.indexStart
                            for bed_sequence in bed_sequences)
            max_end = max(bed_sequence.indexEnd
                          for bed_sequence in bed_sequences)

        # Parse the index file
        while True:
            line = index_file.readline()
            if len(line) == 0:
                break
            tokens = line.split()
            if len(tokens) != 4:
                continue

            # retrieve the index information
            spec_chrom = tokens[0]
            start = self.getIntValue(tokens[1])
            end = self.getIntValue(tokens[2])
            position = self.getIntValue(tokens[3])

            if ordered:
                # The block lies before every BED sequence: skip the line
                if end <= min_start:
                    continue
                # The block lies after every BED sequence: since the file is
                # ordered, the remaining lines can be skipped too
                if start >= max_end:
                    break
                # If the indexes are at least in one of the BED sequences
                # index range, the corresponding MSA block is parsed
                for bed_sequence in bed_sequences:
                    if end > bed_sequence.indexStart and start < bed_sequence.indexEnd:
                        input_file.seek(position, 0)
                        if not self.parseBlock(input_file, True):
                            raise ExecutionException(
                                "MAFFile.parseBlockListWithIndex : Indexed MSA block seems not correct. You should have not updated indexes. Please see logs for more information"
                            )
                        break
            else:
                # If the file is not chromosom specialized, the bed sequence
                # list must be retrieved for each new index
                if not is_chrom_file:
                    bed_sequences = self.getAssociatedBEDSequences(spec_chrom)
                    if bed_sequences is None or len(bed_sequences) == 0:
                        continue

                # If the indexes are at least in one of the BED sequences
                # index range, the corresponding MSA block is parsed
                for bed_sequence in bed_sequences:
                    if end > bed_sequence.indexStart and start < bed_sequence.indexEnd:
                        input_file.seek(position, 0)
                        if not self.parseBlock(input_file, True):
                            Log.log(
                                "MAFFile.parseBlockListWithIndex : Indexed MSA block seems not correct. You should have not updated indexes"
                            )
                            raise ExecutionException(
                                "MAFFile.parseBlockListWithIndex : Indexed MSA block seems not correct. You should have not updated indexes. Please, see logs for more information"
                            )
                        break
    # --------------------------------------------------------------------------------------
    # Write the CommStruct to the given XML file
    def toXMLFile(self, output_filepath):
        """Write this CommStruct as an indented XML document.

        The element tree built by `convertCommStructToElementTree` is
        indented with `indent` and written to `output_filepath`. Errors are
        not propagated: an IOError or a ParsingException is only reported
        through `Log.log`.
        """
        try:
            tree_root = self.convertCommStructToElementTree()
            self.indent(tree_root, 0)
            document = ElementTree(tree_root)
            document.write(output_filepath)
        except IOError as write_error:
            Log.log(
                "BedSeqAlignmentStatsCommStruct.toXMLFile : Unable to write CommStruct to XML file. From:\n\t---> "
                + str(write_error))
        except ParsingException as parsing_error:
            Log.log(
                "BedSeqAlignmentStatsCommStruct.toXMLFile : Unable to save CommStruct to XML file. From:\n\t---> "
                + str(parsing_error))

    # #############################
    # METHODS TO READ THE XLM FILE
    # #############################

    # --------------------------------------------------------------------------------------
    @staticmethod
    def getCommStructFromXML(commstruct_filepath):

        commstruct_file = None
        root_element = None

        try:
            commstruct_file = open(commstruct_filepath, "r")
예제 #23
0
                pipeline_element.append( component_element)
                component_element.attrib[ PipelineXMLParser.COMPONENT_PROCESSOR_ATT] = component.processorName
                for param_name, param_value in component.parameters.iteritems():
                    param_element = Element( PipelineXMLParser.PARAM_TAG)
                    component_element.append( param_element)
                    param_element.attrib[ PipelineXMLParser.PARAM_NAME_ATT] = str( param_name)
                    param_element.attrib[ PipelineXMLParser.PARAM_VALUE_ATT] = str( param_value)

        try:
            PipelineXMLParser.indent( pipelines_element, 0)
	    outfile = os.path.join( outpath, pipeline.name + ".xml")
            ElementTree( pipelines_element).write( outfile)
        except IOError, exce:
            Log.log( "PipelineXMLParser.toXMLFile : Unable to write Pipelines to XML file. From:\n\t---> " + str( exce))
        except ParsingException, par_exce:
            Log.log( "PipelineXMLParser.toXMLFile : Unable to save Pipelines to XML file. From:\n\t---> " + str( par_exce))


    # --------------------------------------------------------------------------------------
    # Add indentation to the ElementTree in order to have a pretty print
    # in the XML file (used by subclasses)
    @staticmethod
    def indent( elem, level=0):
            
        i = "\n" + level*"  "
        if len(elem):
            if not elem.text or not elem.text.strip():
                elem.text = i + "  "
            if not elem.tail or not elem.tail.strip():
                elem.tail = i
            for elem in elem:
예제 #24
0
    def getRequiredParameters():
        """Placeholder meant to be redefined by the inheriting classes.

        This default version only logs a reminder and returns None.
        """
        reminder = ("The method 'getRequiredParameters' must be implemented "
                    "at the inherited class level")
        Log.log(reminder)
        return None
예제 #25
0
    def getOutputCommStructClass():
        """Placeholder meant to be redefined by the inheriting classes.

        This default version only logs a reminder and returns the one-element
        tuple ("Not defined", ).
        """
        reminder = ("The method 'getOutputCommStructClass' must be "
                    "implemented at the inherited class level")
        Log.log(reminder)
        undefined_class = ("Not defined", )
        return undefined_class
예제 #26
0
                        Log.trace("##")
                        Log.trace("## Starting component '" +
                                  current_component.getComponentPrefix() + "'")
                        if resume:
                            Log.trace("## Resume mode")
                        else:
                            Log.trace("## Forcing mode")
                        Log.trace("##")

                        # Execute the component
                        try:
                            executed = current_component.start(
                                pipeline, pipeline_output, self.config, resume)
                        except ExecutionException, exe_exce:
                            Log.log(
                                "PipelineManager.executePipelines : Aborting execution of component : '"
                                + current_component.getComponentPrefix() +
                                "' . From:\n\t---> " + str(exe_exce))

                        # If the component was not correctly executed, its next component are not passed to queue
                        if not executed:
                            Log.log(
                                "PipelineManager.executePipelines : Component : '"
                                + current_component.getComponentPrefix +
                                "' was not executed. See logs for more details"
                            )
                        else:
                            # remove the component from queue and add its following component to the queue start (depth-first execution)
                            component_queue_list.remove(current_component)
                            components_to_add = []
                            for next_component in current_component.nextComponents:
                                if not next_component in component_queue_list:
예제 #27
0
    def getDisplayName():
        """Placeholder meant to be redefined by the inheriting classes.

        This default version logs a reminder and returns a fallback name
        built from the name of the `Processor` class.
        """
        # The reminder names this method: the original text was copied from
        # getOutputCommStructClass and pointed to the wrong method.
        Log.log(
            "The method 'getDisplayName' must be implemented at the inherited class level"
        )
        # Processor.__class__ is the type of the class object itself, not
        # the class: read the class name directly instead.
        return Processor.__name__ + " (no display name defined)"
    def compareMotifHistogram(self, input_commstructs):
        """Write a merged histogram table and its graph for each common motif.

        For every motif returned by getCommonMotifs, the 'histogram' and
        'nullHistogram' token lists of each input CommStruct are merged on
        their x value. The result is written to the tab-delimited file
        '<motif>_full_histogram.tab' in the component output directory: one
        column per input for the motif, then one column per input for the
        'Homogeneous model'. The RSAT 'XYgraph' script is then run on that
        file to produce '<motif>_full_histogram_graph.png'. A non-zero status
        of that command is logged and the next motif is processed.

        Parameters:
            input_commstructs: sequence of CommStructs; each one is read
                through its 'motifToStatistics' mapping. The output directory
                is deleted and re-created before anything is written.

        Raises:
            ExecutionException: if an IOError occurs while producing the
                table or the graph of a motif.
        """

        # Retrieve the required parameters
        RSAT_PATH = self.component.getParameter(Constants.RSAT_DIR_PARAM)

        # List the motifs that are common to all input CommStructs
        common_motifs = self.getCommonMotifs(input_commstructs)

        # List the label of each origin
        labels = self.getLabels(input_commstructs)

        number_inputs = len(input_commstructs)
        separator = MotifStatisticsCommStruct.HISTOGRAM_VALUE_SEPARATOR_CHAR

        def merge_tokens(tokens, merged, column):
            # Store the y value of every 'x<separator>y' token in the column
            # of the given input; inputs without a value for x keep 0.0
            if tokens is None:
                return
            for token in tokens:
                xy = token.split(separator)
                x = xy[0]
                y = self.getTokenAsfloat(xy[1])
                if x not in merged:
                    merged[x] = [0.0] * number_inputs
                merged[x][column] = y

        # Prepare the output directories
        dir_path = os.path.join(self.component.outputDir,
                                self.component.getComponentPrefix())
        shutil.rmtree(dir_path, True)
        FileUtils.createDirectory(dir_path, 0o777)

        for motif_name in common_motifs:
            full_histogram = {}
            full_null_histogram = {}
            for index, motif in enumerate(common_motifs[motif_name]):
                if motif is None:
                    continue
                statistics = input_commstructs[index].motifToStatistics[motif]
                merge_tokens(statistics.histogram, full_histogram, index)
                merge_tokens(statistics.nullHistogram, full_null_histogram,
                             index)

            if not full_histogram:
                continue

            file_name = motif_name + "_full_histogram"
            file_path = os.path.join(dir_path, file_name + ".tab")
            try:
                # Output the histogram values to file
                tab_file = open(file_path, "w")
                try:
                    # Write the headers in the file
                    tab_file.write("# x")
                    for label in labels:
                        tab_file.write("\t" + motif_name + " (" + label + ")")
                    for label in labels:
                        tab_file.write("\tHomogeneous model (" + label + ")")
                    tab_file.write("\n")

                    # Write the rows by increasing x. Sorting replaces the
                    # former repeated search for the smallest key, which
                    # started from a fixed bound of 10000 and failed as soon
                    # as only keys at or above that bound remained.
                    for x in sorted(full_histogram, key=self.getTokenAsfloat):
                        tab_file.write(x)
                        # y values of the motif histograms for this x
                        for y in full_histogram[x]:
                            tab_file.write("\t" + str(y))
                        # y values of the null histograms for this x, or
                        # zeros when the null histograms have no such x
                        null_values = full_null_histogram.get(x)
                        if null_values is None:
                            tab_file.write("\t0.0" * number_inputs)
                        else:
                            for y in null_values:
                                tab_file.write("\t" + str(y))
                        tab_file.write("\n")
                finally:
                    # Release the handle even when a write fails
                    tab_file.close()

                # Draw the histogram graph: column 1 is x, the following
                # 2 * number_inputs columns are the y series
                graph_path = os.path.join(dir_path,
                                          file_name + "_graph.png")
                value_cols = ",".join(
                    str(column)
                    for column in range(2, 2 * number_inputs + 2))
                # NOTE(review): the command line is run through a shell and
                # embeds motif_name and paths unquoted -- confirm these values
                # can never contain shell metacharacters.
                cmd = os.path.join(RSAT_PATH, "perl-scripts/XYgraph")
                cmd += " -i " + file_path
                cmd += " -title1 'Global distribution over peaks for " + motif_name + "'"
                cmd += " -xcol 1 -ycol " + value_cols
                cmd += " -xleg1 'Position against peak maximum' -lines"
                cmd += " -yleg1 'Number of occurence'"
                cmd += " -legend -header -format png -histo"
                cmd += " -o " + graph_path

                cmd_result = commands.getstatusoutput(cmd)
                if cmd_result[0] != 0:
                    Log.log(
                        "CompareStatisticsProcessor.compareMotifHistogram : status returned is :"
                        + str(cmd_result[0]) + " for command '" + cmd +
                        "'")
                    Log.log("  Command output is = \n" +
                            str(cmd_result[1]))
            except IOError as io_exce:
                raise ExecutionException(
                    "CompareStatisticsProcessor.compareMotifHistogram : Unable to save histogram to tab file : '"
                    + file_path + "'. From:\n\t---> " + str(io_exce))
예제 #29
0
    def execute(self, comm_struct, pipeline):
        """Placeholder that inherited classes are expected to implement.

        Logs a reminder that 'execute' is not implemented and returns None.
        Neither 'comm_struct' nor 'pipeline' is used by this version.
        """

        reminder = "The method 'execute' must be implemented at the inherited class level"
        Log.log(reminder)
        return None
예제 #30
0
    def execute(self, input_commstructs):
        
        if input_commstructs == None or len(input_commstructs) == 0:
            raise ExecutionException("BEDOutputProcessor.execute : No inputs")
        
        input_commstruct = input_commstructs[0]
        
        # Retrieve the processor parameters
        reference_motif = self.getParameter(BEDOutputProcessor.REFERENCE_MOTIF)
                
        color_method = self.getParameter(BEDOutputProcessor.COLOR_METHOD, False)
        if color_method == None:
            color_method = BEDOutputProcessor.COLOR_METHOD_SCORE
        else:
            color_method = color_method.lower()
            if color_method != BEDOutputProcessor.COLOR_METHOD_SCORE and color_method != BEDOutputProcessor.COLOR_METHOD_FAMILY:
                color_method = BEDOutputProcessor.COLOR_METHOD_SCORE
                
        score_min = self.getParameterAsfloat(BEDOutputProcessor.SCORE_MIN)
        score_max = self.getParameterAsfloat(BEDOutputProcessor.SCORE_MAX)
        
        # Prepare the processor output dir
        out_path = os.path.join(self.component.outputDir, self.component.getComponentPrefix())
        shutil.rmtree(out_path, True)
        FileUtils.createDirectory( out_path, 0777)

        # Retrieve the JASPAR motifs details
        motif_details = MotifUtils.getMotifsDetailsFromJaspar()
        motif_id = motif_details[ 0]
        motif_family = motif_details[ 1]
        family_rgb = {}

        # build the bed output file path
        bed_file_path = os.path.join(out_path, self.component.pipelineName + "_Motifs.bed")

        try:
            bed_file = open(bed_file_path, "w")

            #bed_file.write("track name='" + self.component.pipelineName + "' visibility=3 itemRgb='On' use_score=1\n")
            #bed_file.write("browser dense RSAT\n")
            #bed_file.write("browser dense\n") 
            #bed_file.write("## seq_name	start	end	feature_name	score	strand	thickStart	thickEnd	itemRgb	blockCount	blockSizes	blckStarts\n")

            current_color = None
            bedseq_list = input_commstruct.bedToMA.keys()
            bedseq_list.sort(BEDSequence.compare)
            previous_line_start = 0
            previous_line_key = ""
            for bed_seq in bedseq_list:
                for msa in input_commstruct.bedToMA[ bed_seq]:
                    for motif in msa.motifs:
                        motif_name = motif.name
                        if not input_commstruct.motifStatistics.has_key(motif_name):
                            continue
                        if motif_name in motif_id.keys():
                            out_name = motif_id[ motif_name]
                            chromosom = bed_seq.chromosom
                            start_position = bed_seq.indexStart + msa.fixIndex(motif.indexStart)
                            end_position = bed_seq.indexStart + msa.fixIndex(motif.indexEnd)
                            score = motif.score
                            
                            # Commented : Black is assigned to the reference motif
                            #if motif_name == reference_motif:
                            #    item_rgb = "0,0,0"
                            # for the other motif, color depends on the chosen method
                            #else:
                            if color_method == BEDOutputProcessor.COLOR_METHOD_FAMILY:
                                if motif_name in motif_family.keys():
                                    #print("-----------------------------")
                                    #print "Current color = " + str(current_color)
                                    #print "Motif name=" + motif_name
                                    #print "Motif family=" + motif_family[ motif_name]
                                    family_rgb = self.updateFamilyRGB(motif_family[ motif_name], family_rgb, current_color)
                                    #print "Family RGB = " + str(family_rgb)
                                    item_rgb = family_rgb[ motif_family[ motif_name]]
                                    #print "Item rgb = ", str(item_rgb)
                                    current_color = item_rgb
                                else:
                                    item_rgb = BEDOutputProcessor.COLORS[ 0]
                            else:
                                item_rgb = self.getColorForScore(score, score_min, score_max)
                            
                            # Write the lines to output file
                            if len( chromosom) <4:
                                line_out = "chr" + chromosom
                            else:
                                line_out = chromosom
                            line_out += "\t" + str(start_position)
                            line_out += "\t" + str(end_position)
                            line_out += "\t" + out_name
                            line_out += "\t" + str(int(score * 1000))
                            line_out += "\t" + motif.strand
                            line_out += "\t" + str(start_position)           # ThickStart
                            line_out += "\t" + str(end_position)            # ThickEnd
                            line_out += "\t" + item_rgb        # itemRGB
                            #line_out += "\t" + "0"            # BlockCount
                            #line_out += "\t" + "0"            # BlockSizes
                            #line_out += "\t" + "0"            # BlockStarts
                            
                            # Build a key that represent the motif chrom,  name and positions
                            line_key = chromosom + ":" + str(start_position) + ":" + str(end_position) + ":" + out_name
                            
                            # If the new line has the same key has the previous one, we must keep only one of the two lines
                            # i.e. the one with the highest score (the tell() and seek() method permits to overwrite the old line
                            # line if required.
                            # If the new line and the previous one has different keys the new line is simply written
                            if previous_line_key != line_key:
                                previous_line_start = bed_file.tell()
                                bed_file.write(line_out)
                                bed_file.write("\n")
                                bed_file.flush
                                previous_line_key = line_key
                                previous_score = score
                            else:
                                if score > previous_score:
                                    bed_file.seek(previous_line_start)
                                    bed_file.write(line_out)
                                    bed_file.write("\n")
                                    bed_file.flush
                                    previous_score = score     

            bed_file.close()
            input_commstruct.paramStatistics[ BedSeqAlignmentStatsCommStruct.BED_OUTPUT_PATH] = bed_file_path
            
            # Sort bed_file (used for bigBed conversion)
            sorted_bed_file_path = os.path.join(out_path, self.component.pipelineName + "_Motifs_sorted.bed")
            cmd = "sort -k1,1 -k2,2n"
            cmd += " " + bed_file_path
            cmd += " > " + sorted_bed_file_path
            
            Log.info( "BEDOuputProcessor.execute : Sorting BED file")
            Log.info( "BEDOuputProcessor.execute  : command used is : " + cmd)
            
            cmd_result = commands.getstatusoutput( cmd)
            Log.trace( "BEDOuputProcessor.execute : " + threading.currentThread().getName() + " : status returned is :" + str( cmd_result[0]))
            if cmd_result[0] != 0:
                Log.log( "BEDOuputProcessor.execute : status returned is :" + str( cmd_result[0]) + " for command '" + cmd + "'" )
                Log.log( "BEDOuputProcessor.execute : command output is = \n" + str( cmd_result[1]))
                return input_commstruct
                        
            # Fetch the chrom sizes that will be use to convert BED file to bigBed file
            chrom_sizes_path = os.path.join(out_path, self.component.pipelineName + "_chrom_size.txt")
            
            RSAT_PATH = self.component.getParameter( Constants.RSAT_DIR_PARAM)
            cmd = os.path.join( RSAT_PATH , "contrib/peak-footprints/tools/fetchChromSizes")
            cmd += " " + input_commstruct.paramStatistics[ BedSeqAlignmentStatsCommStruct.REFERENCE_SPECIES]
            cmd += " > " + chrom_sizes_path
            
            Log.info( "BEDOuputProcessor.execute : Fetching Chrom sizes for species : " + input_commstruct.paramStatistics[ BedSeqAlignmentStatsCommStruct.REFERENCE_SPECIES])
            Log.info( "BEDOuputProcessor.execute  : command used is : " + cmd)
            
            cmd_result = commands.getstatusoutput( cmd)
            Log.trace( "BEDOuputProcessor.execute : " + threading.currentThread().getName() + " : status returned is :" + str( cmd_result[0]))
            if cmd_result[0] != 0:
                Log.log( "BEDOuputProcessor.execute : status returned is :" + str( cmd_result[0]) + " for command '" + cmd + "'" )
                Log.log( "BEDOuputProcessor.execute : command output is = \n" + str( cmd_result[1]))
                return input_commstruct
            
            # Build the bigBed file
            # sudo ln -s /lib/x86_64-linux-gnu/libssl.so.1.0.0 /usr/lib/libssl.so.10
            # sudo ln -s /lib/x86_64-linux-gnu/libcrypto.so.1.0.0 /usr/lib/libcrypto.so.10
            
            big_bed_path = os.path.join(out_path, self.component.pipelineName + "_Motifs.bb")
            
            RSAT_PATH = self.component.getParameter( Constants.RSAT_DIR_PARAM)
            cmd = os.path.join( RSAT_PATH , "contrib/peak-footprints/tools/bedToBigBed")
            cmd += " " + sorted_bed_file_path
            cmd += " " + chrom_sizes_path
            cmd += " " + big_bed_path
            
            Log.info( "BEDOuputProcessor.execute : Converting BED file to bigBed file")
            Log.info( "BEDOuputProcessor.execute  : command used is : " + cmd)
            
            cmd_result = commands.getstatusoutput( cmd)
            Log.trace( "BEDOuputProcessor.execute : " + threading.currentThread().getName() + " : status returned is :" + str( cmd_result[0]))
            if cmd_result[0] != 0:
                Log.log( "BEDOuputProcessor.execute : status returned is :" + str( cmd_result[0]) + " for command '" + cmd + "'" )
                Log.log( "BEDOuputProcessor.execute : command output is = \n" + str( cmd_result[1]))
                return input_commstruct
                        
            input_commstruct.paramStatistics[ BedSeqAlignmentStatsCommStruct.BIGBED_OUTPUT_PATH] = big_bed_path
            
        except IOError, io_exce:
            Log.log("BEDOutputProcessor.execute : Unable to save the BED file of recognized motifs : " + str(io_exce))