Пример #1
0
    def parseClustalWResult(self, file_path, desired_species_list):
        """Parse a ClustalW result file into a SequenceAlignment.

        Every line that splits into exactly two whitespace-separated tokens
        is read as "<species> <residues>". The residues found on successive
        lines for the same species are concatenated in file order. Any other
        line is ignored.

        Args:
            file_path: path of the ClustalW result file to read.
            desired_species_list: species names to keep; None or an empty
                collection keeps every species found in the file.

        Returns:
            A SequenceAlignment holding one sequence per retained species,
            built from the output of self.removeFirstAndLastNoInfoColumns.

        Raises:
            ExecutionException: if an IOError occurs while the file is
                opened, read or processed.
        """
        try:
            result = {}
            length = 0
            keep_all = (desired_species_list is None
                        or len(desired_species_list) == 0)

            # The context manager closes the file even if parsing fails,
            # which the previous explicit close() did not guarantee.
            with open(file_path, "r") as result_file:
                for line in result_file:
                    tokens = line.split()
                    if len(tokens) != 2:
                        continue
                    species = tokens[0]
                    if not keep_all and species not in desired_species_list:
                        continue
                    # Store the residues one character per list element
                    result.setdefault(species, []).extend(tokens[1])
                    # Length of the most recently extended sequence
                    length = len(result[species])

            result = self.removeFirstAndLastNoInfoColumns(result, length)

            alignment = SequenceAlignment()
            for species in result:
                alignment.addSequence(species, result[species])
            return alignment

        except IOError as io_exce:
            raise ExecutionException(
                "MSAProcessor.parseClustalWResult : Unable to open the ClustalW result file : '"
                + file_path + "'. From:\n\t---> " + str(io_exce))
Пример #2
0
 def getAlignment(align_node, bedseq, comm_struct):
     """Create a SequenceAlignment for `bedseq` and register it.

     The alignment name is read from `align_node` through
     CommStruct.getAttribute, using the
     BedSeqAlignmentStatsCommStruct.ALIGNMENT_NAME_ATT attribute key. The
     reference species is copied from `bedseq.species`. The new alignment
     is attached to `comm_struct` with addSequenceAlignment and returned.
     """
     alignment_name = CommStruct.getAttribute(
         align_node, BedSeqAlignmentStatsCommStruct.ALIGNMENT_NAME_ATT)

     alignment = SequenceAlignment()
     alignment.name = alignment_name
     alignment.referenceSpecies = bedseq.species

     comm_struct.addSequenceAlignment(bedseq, alignment)
     return alignment
Пример #3
0
    def generateTrivialMSA(self, msa_length, bedseq_number, output_commstruct):
        """Attach an alignment made only of '.' characters to each BED sequence.

        For every BED sequence found in output_commstruct.bedSequencesDict, a
        SequenceAlignment named "<bedseq name>_1" is created. It receives
        `msa_length` sequences, each a run of '.' whose length is
        bedseq.indexEnd - bedseq.indexStart. The alignment is then finalized
        and registered with output_commstruct.addSequenceAlignment.

        Args:
            msa_length: number of species (sequences) in each alignment.
            bedseq_number: not used by this method.
            output_commstruct: object providing baseSpecies, bedSequencesDict
                and addSequenceAlignment.
        """
        # Base species first, then generated names "Species1", "Species2", ...
        species_names = [output_commstruct.baseSpecies]
        species_names += ["Species" + str(rank)
                          for rank in range(1, msa_length)]

        for chrom in output_commstruct.bedSequencesDict.keys():
            for bedseq in output_commstruct.bedSequencesDict[chrom]:
                trivial_msa = SequenceAlignment()
                trivial_msa.name = bedseq.name + "_1"

                # One list of dots, handed to every species of this alignment
                dots = ['.'] * (bedseq.indexEnd - bedseq.indexStart)
                for species in species_names[:msa_length]:
                    trivial_msa.addSequence(species, dots)

                trivial_msa.finalizeSequences()
                output_commstruct.addSequenceAlignment(bedseq, trivial_msa)
Пример #4
0
    def composeSequenceAlignment(self, bed_sequence, keep_gaps):
        """Assemble the alignment of a BED sequence from its MAF blocks.

        A SequenceAlignment named "<bed_sequence.name>_1" is initialized with
        initializeWithDots (BED sequence length, self.referenceSpecies,
        self.desiredSpeciesList). Then, for each MAF block stored in
        self.mafBlockDic[bed_sequence], the slice of every species' text that
        matches the BED sequence is inserted into that alignment.

        Args:
            bed_sequence: object providing name, indexStart, indexEnd and
                getLength(); also used as key into self.mafBlockDic.
            keep_gaps: forwarded unchanged to finalizeSequences.

        Returns:
            The composed SequenceAlignment, after finalizeSequences(keep_gaps).
        """

        final_seq_align = SequenceAlignment()
        final_seq_align.name = bed_sequence.name + "_1"
        final_seq_align.initializeWithDots(bed_sequence.getLength(),
                                           self.referenceSpecies,
                                           self.desiredSpeciesList)
        for maf_block in self.mafBlockDic[bed_sequence]:
            # Sequence of the reference species in this block; all offsets
            # below are computed relative to it
            ref_dna_seq = maf_block.sequences[self.referenceSpecies]
            # Determine the start and end indexes (related to the string) of the intersection between
            # the BED sequence and the reference DNA sequence
            dna_index_start = max(
                bed_sequence.indexStart - ref_dna_seq.indexStart, 0)
            dna_index_end = min(bed_sequence.indexEnd - ref_dna_seq.indexStart,
                                ref_dna_seq.textLength)

            # Compute the start and end indexes at which the block must be placed in the final alignment text
            bed_index_start = max(
                ref_dna_seq.indexStart - bed_sequence.indexStart, 0)
            bed_index_end = bed_index_start + (dna_index_end - dna_index_start)

            # Modify the start and end indexes according to the number of "-" present
            # in the sequence text to catch all the required letters
            # First pass: every insertion character met up to dna_index_start
            # shifts both ends of the slice by one.
            # NOTE(review): the condition is "<=", so the character located at
            # dna_index_start itself is inspected too - confirm this is intended
            current_index = 0
            while current_index <= dna_index_start:
                if ref_dna_seq.text[
                        current_index] == Constants.SEQUENCE_INSERTION_CHAR:
                    dna_index_start += 1
                    dna_index_end += 1
                current_index += 1

            # Second pass: continue from where the first pass stopped; every
            # insertion character inside the slice extends its end by one
            while current_index < dna_index_end:
                if ref_dna_seq.text[
                        current_index] == Constants.SEQUENCE_INSERTION_CHAR:
                    dna_index_end += 1
                current_index += 1

            # Modify the bed_index_start and bed_index_end according to
            # the number of "-" present in the sequence text
            # of the reference species currently present in the alignment

            # count   : insertion characters met so far
            # indice  : non-insertion characters met so far
            # current : position in the reference sequence of the alignment
            count = 0
            indice = 0
            current = 0
            while indice <= bed_index_start:
                if final_seq_align.sequences[self.referenceSpecies][
                        current] == Constants.SEQUENCE_INSERTION_CHAR:
                    count += 1
                else:
                    indice += 1
                current += 1
            # Shift both ends by the number of insertion characters counted
            bed_index_start += count
            bed_index_end += count

            # Insert the block of sequence of each species in the final alignment
            for species in maf_block.sequences.keys():
                final_seq_align.insertSequenceBlock(
                    species, bed_index_start, bed_index_end, maf_block.
                    sequences[species].text[dna_index_start:dna_index_end])

        # Cross algorithm to test the composition of the sequence
        #self.testFinalMSA( final_seq_align, self.mafBlockDic[ bed_sequence], bed_sequence)

        final_seq_align.finalizeSequences(keep_gaps)

        return final_seq_align
Пример #5
0
    def generateRandomMSA(self, msa_length, bedseq_number, max_length,
                          output_commstruct):
        """Attach an alignment built from a random sequence to each BED sequence.

        The RSAT "random-seq" script is run to write `bedseq_number` random
        DNA sequences of length int(max_length * 1.5) to the file
        "random_sequences.txt" in the component output directory. Each BED
        sequence of output_commstruct.bedSequencesDict then receives a
        SequenceAlignment named "<bedseq name>_1" in which all `msa_length`
        species carry the same random sequence, truncated to
        bedseq.indexEnd - bedseq.indexStart characters.

        Args:
            msa_length: number of species (sequences) in each alignment.
            bedseq_number: number of random sequences requested from RSAT.
            max_length: length used to size the random sequences.
            output_commstruct: object providing baseSpecies, bedSequencesDict
                and addSequenceAlignment.

        Raises:
            ExecutionException: if the random-seq command returns a non-zero
                status, or if an IOError occurs while the sequences file is
                read.
        """
        # Retrieve method required parameters
        rsat_path = self.component.getParameter(Constants.RSAT_DIR_PARAM)
        dir_path = os.path.join(self.component.outputDir,
                                self.component.getComponentPrefix())
        file_path = os.path.join(dir_path, "random_sequences.txt")

        try:
            # Build the RSAT random-seq command line.
            # NOTE(review): the command is passed as one string, so paths
            # containing spaces or shell metacharacters would break it.
            cmd = os.path.join(rsat_path, "perl-scripts/random-seq")
            cmd += " -l " + str(int(max_length * 1.5))
            cmd += " -n " + str(bedseq_number)
            cmd += " -a a:t 0.3 c:g 0.2"
            cmd += " -type DNA"
            cmd += " -format multi"
            cmd += " -o " + file_path

            Log.info(
                "GenerateMSAProcessor.generateMSA : starting random sequence generation. Command used is : "
                + cmd)

            # Execute the command
            status, output = commands.getstatusoutput(cmd)
            if status != 0:
                Log.log(
                    "GenerateMSAProcessor.generateMSA : status returned is :" +
                    str(status) + " for command '" + cmd + "'")
                Log.log(
                    "GenerateMSAProcessor.generateMSA : command output is = \n"
                    + str(output))
                raise ExecutionException(
                    "GenerateMSAProcessor.generateMSA : Cannot execute random-seq commands. See logs for more details"
                )

            # Read the output file to get the random sequences. The context
            # manager closes the file, which was previously left open.
            sequence_list = []
            with open(file_path, "r") as sequence_file:
                for line in sequence_file:
                    tokens = line.split()
                    # Blank lines carry no sequence; indexing them would
                    # raise IndexError
                    if tokens:
                        sequence_list.append(tokens[0])

            # Generate the species list: base species first, then
            # "Species1", "Species2", ...
            species_list = [output_commstruct.baseSpecies]
            for index in range(msa_length - 1):
                species_list.append("Species" + str(index + 1))

            # Create and fill the MSA for each BED sequence; the n-th BED
            # sequence visited uses the n-th random sequence
            count_seq = 0
            for chrom in output_commstruct.bedSequencesDict.keys():
                for bedseq in output_commstruct.bedSequencesDict[chrom]:
                    msa = SequenceAlignment()
                    msa.name = bedseq.name + "_1"
                    msa.referenceSpecies = output_commstruct.baseSpecies
                    seq_length = bedseq.indexEnd - bedseq.indexStart
                    sequence = list(sequence_list[count_seq][:seq_length])
                    for index in range(msa_length):
                        msa.addSequence(species_list[index], sequence)
                    msa.finalizeSequences()
                    output_commstruct.addSequenceAlignment(bedseq, msa)
                    count_seq += 1

        except IOError as io_exce:
            raise ExecutionException(
                "GenerateMSAProcessor.generateMSA : Unable to save/read random sequences file. From:\n\t---> "
                + str(io_exce))