def parseClustalWResult(self, file_path, desired_species_list):
    """Parse a ClustalW result file into a SequenceAlignment.

    Every line that splits into exactly two whitespace-separated tokens is
    read as "<species> <alignment chunk>". Chunks belonging to the same
    species are concatenated in the order they appear in the file.

    Args:
        file_path: path of the ClustalW result file to read.
        desired_species_list: species names to keep; None or an empty
            collection keeps every species found in the file.

    Returns:
        A SequenceAlignment holding one sequence per retained species, built
        from the columns returned by self.removeFirstAndLastNoInfoColumns.

    Raises:
        ExecutionException: if an IOError occurs while the file is opened or
            processed.
    """
    try:
        result = {}
        length = 0

        # The context manager guarantees the file is closed even when the
        # parsing fails half-way (the handle used to leak in that case).
        with open(file_path, "r") as clustal_file:
            for line in clustal_file:
                tokens = line.split()
                if len(tokens) != 2:
                    continue
                species, chunk = tokens
                if desired_species_list and species not in desired_species_list:
                    continue
                letters = result.setdefault(species, [])
                letters.extend(chunk)
                # Length of the sequence that was extended most recently
                length = len(letters)

        result = self.removeFirstAndLastNoInfoColumns(result, length)

        alignment = SequenceAlignment()
        for species in result:
            alignment.addSequence(species, result[species])
        return alignment
    except IOError as io_exce:
        raise ExecutionException(
            "MSAProcessor.parseClustalWResult : Unable to open the ClustalW result file : '"
            + file_path + "'. From:\n\t---> " + str(io_exce))
def getAlignment(align_node, bedseq, comm_struct):
    """Create an empty alignment for a BED sequence and register it.

    The alignment name is read from the ALIGNMENT_NAME_ATT attribute of
    `align_node`, and its reference species is taken from `bedseq.species`.

    Args:
        align_node: node carrying the alignment name attribute.
        bedseq: BED sequence the alignment is attached to.
        comm_struct: structure in which the new alignment is registered
            through addSequenceAlignment.

    Returns:
        The newly created SequenceAlignment.
    """
    alignment_name = CommStruct.getAttribute(
        align_node,
        BedSeqAlignmentStatsCommStruct.ALIGNMENT_NAME_ATT,
    )

    alignment = SequenceAlignment()
    alignment.name = alignment_name
    alignment.referenceSpecies = bedseq.species

    comm_struct.addSequenceAlignment(bedseq, alignment)
    return alignment
def generateTrivialMSA(self, msa_length, bedseq_number, output_commstruct):
    """Attach a dots-only alignment to every BED sequence of the structure.

    Args:
        msa_length: number of species (rows) put in each alignment.
        bedseq_number: not used by this method.
        output_commstruct: structure providing `baseSpecies` and
            `bedSequencesDict`, and receiving the alignments through
            addSequenceAlignment.
    """
    # Species names: the base species first, then "Species1", "Species2", ...
    species_names = [output_commstruct.baseSpecies]
    species_names.extend("Species" + str(rank) for rank in range(1, msa_length))

    # Build one alignment per BED sequence, chromosome by chromosome
    for chrom in output_commstruct.bedSequencesDict.keys():
        for bedseq in output_commstruct.bedSequencesDict[chrom]:
            alignment = SequenceAlignment()
            alignment.name = bedseq.name + "_1"

            span = bedseq.indexEnd - bedseq.indexStart
            # The very same list object is handed over for every species
            dots = ['.'] * span
            for species in species_names[:max(msa_length, 0)]:
                alignment.addSequence(species, dots)

            alignment.finalizeSequences()
            output_commstruct.addSequenceAlignment(bedseq, alignment)
def composeSequenceAlignment(self, bed_sequence, keep_gaps):
    """Assemble the alignment of a BED sequence from its MAF blocks.

    The alignment starts out filled by initializeWithDots; then, for each MAF
    block registered for `bed_sequence` in self.mafBlockDic, the part of the
    block that intersects the BED sequence is inserted for every species of
    the block.

    Args:
        bed_sequence: BED sequence whose alignment is composed.
        keep_gaps: forwarded as-is to finalizeSequences.

    Returns:
        The finalized SequenceAlignment.
    """
    ref_species = self.referenceSpecies
    gap_char = Constants.SEQUENCE_INSERTION_CHAR

    alignment = SequenceAlignment()
    alignment.name = bed_sequence.name + "_1"
    alignment.initializeWithDots(bed_sequence.getLength(), ref_species,
                                 self.desiredSpeciesList)

    for maf_block in self.mafBlockDic[bed_sequence]:
        ref_dna = maf_block.sequences[ref_species]
        offset = bed_sequence.indexStart - ref_dna.indexStart

        # Intersection of the BED sequence with the reference DNA sequence,
        # expressed as indexes in the block text
        text_start = max(offset, 0)
        text_end = min(bed_sequence.indexEnd - ref_dna.indexStart,
                       ref_dna.textLength)

        # Position where the block must be placed in the final alignment
        target_start = max(-offset, 0)
        target_end = target_start + (text_end - text_start)

        # Shift the text indexes by the number of insertion characters met
        # in the reference text, so that all the required letters are caught
        ref_text = ref_dna.text
        cursor = 0
        while cursor <= text_start:
            if ref_text[cursor] == gap_char:
                text_start += 1
                text_end += 1
            cursor += 1
        while cursor < text_end:
            if ref_text[cursor] == gap_char:
                text_end += 1
            cursor += 1

        # Shift the target indexes by the number of insertion characters
        # already present in the reference row of the final alignment
        ref_row = alignment.sequences[ref_species]
        gaps_before = 0
        letters_seen = 0
        position = 0
        while letters_seen <= target_start:
            if ref_row[position] == gap_char:
                gaps_before += 1
            else:
                letters_seen += 1
            position += 1
        target_start += gaps_before
        target_end += gaps_before

        # Insert the block of each species in the final alignment
        for species in maf_block.sequences.keys():
            species_text = maf_block.sequences[species].text
            alignment.insertSequenceBlock(species, target_start, target_end,
                                          species_text[text_start:text_end])

    # Cross-check of the composed alignment (disabled):
    # self.testFinalMSA(alignment, self.mafBlockDic[bed_sequence], bed_sequence)

    alignment.finalizeSequences(keep_gaps)
    return alignment
def generateRandomMSA(self, msa_length, bedseq_number, max_length, output_commstruct):
    """Attach an alignment made of a random sequence to every BED sequence.

    The RSAT `random-seq` script is run to write `bedseq_number` random
    sequences of length int(max_length * 1.5) to "random_sequences.txt" in
    the component output directory. Each BED sequence then gets an alignment
    in which every species carries the same random sequence, truncated to the
    length of the BED sequence.

    Args:
        msa_length: number of species (rows) put in each alignment.
        bedseq_number: number of random sequences requested from RSAT.
        max_length: base length used to size the random sequences.
        output_commstruct: structure providing `baseSpecies` and
            `bedSequencesDict`, and receiving the alignments through
            addSequenceAlignment.

    Raises:
        ExecutionException: if the command returns a non-zero status, or if
            an IOError occurs while the sequences file is handled.
    """
    # Retrieve method required parameters
    RSAT_PATH = self.component.getParameter(Constants.RSAT_DIR_PARAM)
    dir_path = os.path.join(self.component.outputDir,
                            self.component.getComponentPrefix())
    file_path = os.path.join(dir_path, "random_sequences.txt")

    try:
        # Build the RSAT random-seq command
        cmd = os.path.join(RSAT_PATH, "perl-scripts/random-seq")
        cmd += " -l " + str(int(max_length * 1.5))
        cmd += " -n " + str(bedseq_number)
        cmd += " -a a:t 0.3 c:g 0.2"
        cmd += " -type DNA"
        cmd += " -format multi"
        cmd += " -o " + file_path

        Log.info("GenerateMSAProcessor.generateMSA : starting random sequence generation. Command used is : " + cmd)

        # Execute the command
        cmd_result = commands.getstatusoutput(cmd)
        if cmd_result[0] != 0:
            Log.log("GenerateMSAProcessor.generateMSA : status returned is :"
                    + str(cmd_result[0]) + " for command '" + cmd + "'")
            Log.log("GenerateMSAProcessor.generateMSA : command output is = \n"
                    + str(cmd_result[1]))
            raise ExecutionException(
                "GenerateMSAProcessor.generateMSA : Cannot execute random-seq commands. See logs for more details")

        # Read the output file to get the random sequences. The context
        # manager closes the file (it was never closed before), and blank
        # lines are skipped instead of failing on an empty token list.
        sequence_list = []
        with open(file_path, "r") as sequence_file:
            for line in sequence_file:
                tokens = line.split()
                if tokens:
                    sequence_list.append(tokens[0])

        # Generate the species list
        species_list = [output_commstruct.baseSpecies]
        for index in range(msa_length - 1):
            species_list.append("Species" + str(index + 1))

        # Create and fill the MSA for each BED sequence
        count_seq = 0
        for chrom in output_commstruct.bedSequencesDict.keys():
            for bedseq in output_commstruct.bedSequencesDict[chrom]:
                msa = SequenceAlignment()
                msa.name = bedseq.name + "_1"
                msa.referenceSpecies = output_commstruct.baseSpecies
                seq_length = bedseq.indexEnd - bedseq.indexStart
                # One random sequence per BED sequence, shared by all species
                sequence = list(sequence_list[count_seq][:seq_length])
                for index in range(msa_length):
                    msa.addSequence(species_list[index], sequence)
                msa.finalizeSequences()
                output_commstruct.addSequenceAlignment(bedseq, msa)
                count_seq += 1
    except IOError as io_exce:
        raise ExecutionException(
            "GenerateMSAProcessor.generateMSA : Unable to save/read random sequences file. From:\n\t---> "
            + str(io_exce))