def make_dictionary(self):

    Logger.get_instance().info(" Creation of a dictionary for novel genes of dataset 2\n"
                               " The dictionary structure is: {gene: [isoform1, isoform2, ..., isoformN]}")

    # Resolve the input FASTA file and the output location of the pickled dictionary
    self.path_home = Constants.PATH_HOME
    self.path_input_file = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.DOWNLOAD_DICTIONARY_INPUT_FILE_PROPERTY, True)
    self.dictionary_output_path = self.path_home + PropertyManager.get_instance().get_property(
        DataConstants.DOWNLOAD_DICTIONARY_OUTPUT_PATH_PROPERTY, True)
    self.output_file_path = self.dictionary_output_path + PropertyManager.get_instance().get_property(
        DataConstants.DOWNLOAD_DICTIONARY_FILE_OUTPUT_PROPERTY, True)

    # Build the {gene: [isoform1, ..., isoformN]} dictionary and pickle it to the output file
    dict_identifier = InfoFasta.make_dictionary(self.path_input_file)
    self.dict_file = FileUtils.open_text_w(self.output_file_path)
    pickle.dump(dict_identifier, self.dict_file)
    self.dict_file.close()

    Logger.get_instance().info(" The creation of a dictionary is completed \n\n")
def dictionary_identifier(self):

    Logger.get_instance().info(" Creation of a dictionary for novel genes of dataset 2\n"
                               " The dictionary structure is: {gene: [isoform1, isoform2, ..., isoformN]}")

    # Resolve the Ensembl sequence file of dataset 2 and the output location of the pickled dictionary
    self.ensembl_path_output = Constants.PATH_HOME + PropertyManager.get_instance().get_property(
        DataConstants.ENSEMBL_OUTPUT_PATH_SEQUENCE_PROPERTY, True)
    self.ensembl_output_dataset2 = self.ensembl_path_output + PropertyManager.get_instance().get_property(
        DataConstants.ENSEMBL_FILE_SEQUENCES_2_PROPERTY, True)
    self.dictionary_output = Constants.PATH_HOME + PropertyManager.get_instance().get_property(
        DataConstants.DICTIONARY_PATH_OUTPUT_PROPERTY, True)
    self.dictionary_namefile = self.dictionary_output + PropertyManager.get_instance().get_property(
        DataConstants.DICTIONARY_NAME_FILE_PROPERTY, True)

    # Build the {gene: [isoform1, ..., isoformN]} dictionary and pickle it to the output file
    dict_identifier = InfoFasta.make_dictionary(self.ensembl_output_dataset2)
    file_dict = FileUtils.open_text_w(self.dictionary_namefile)
    pickle.dump(dict_identifier, file_dict)
    file_dict.close()

    Logger.get_instance().info(" The creation of a dictionary for novel genes in dataset 2 is completed \n\n")
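# --------------------------------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): how the gene -> isoforms dictionary
# pickled by make_dictionary / dictionary_identifier could be loaded back and inspected.
# The file path is a hypothetical placeholder; the real location comes from the DICTIONARY_*
# properties resolved through PropertyManager, and the pickle is assumed to use the default
# (ASCII) protocol written above.
# --------------------------------------------------------------------------------------------

import pickle

def load_isoform_dictionary(dictionary_path):
    # The dictionary is expected to have the structure {gene: [isoform1, ..., isoformN]}
    with open(dictionary_path, 'rb') as handle:
        return pickle.load(handle)

if __name__ == '__main__':
    gene_to_isoforms = load_isoform_dictionary('/path/to/dictionary_dataset2.p')  # hypothetical path
    for gene, isoforms in sorted(gene_to_isoforms.items()):
        print(gene, len(isoforms), 'isoform(s)')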
def iupred_analysis(self, fastafile, prot):

    self.fastafile = fastafile
    self.prot = prot

    # Call the IUPred command in 'long' disorder mode and capture its standard output
    iupred_out = subp.check_output([self.iupred_path + "iupred", self.fastafile, "long"])

    # Prediction output file (IUPred_protname.txt)
    pred_file = FileUtils.open_text_w(self.path_output + 'IUPred_' + self.prot + '.txt')

    # Keep only the section from the 'Prediction output' header onwards
    index_prediction = iupred_out.index('Prediction output')
    iupred_out = iupred_out[index_prediction:]
    iupred_out_list = iupred_out.split('\n')

    # Keep the two header lines, then collapse the whitespace-separated data rows into tab-separated rows
    new_iupred_out = []
    new_iupred_out.append(iupred_out_list[0])
    new_iupred_out.append(iupred_out_list[1])
    for line in iupred_out_list[2:]:
        new_line = [item for item in line.split(' ') if item != '']
        new_iupred_out.append('\t'.join(new_line))

    final_out = '\n'.join(new_iupred_out)
    pred_file.write(final_out)
    pred_file.close()
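# --------------------------------------------------------------------------------------------
# Hedged sketch (not part of the original class): parse the IUPred_<prot>.txt file written by
# iupred_analysis. After the reformatting above, each data row is expected to be tab-separated
# as: residue number, amino acid, IUPred disorder score. The file path and the 0.5 disorder
# cutoff are illustrative assumptions, not project defaults.
# --------------------------------------------------------------------------------------------

def read_iupred_scores(prediction_file, cutoff=0.5):
    """Return [(residue_number, amino_acid, score)] for rows whose score exceeds cutoff."""
    disordered = []
    with open(prediction_file) as handle:
        for line in handle:
            fields = line.strip().split('\t')
            # Skip the 'Prediction output' header lines and anything malformed
            if len(fields) != 3:
                continue
            try:
                position, score = int(fields[0]), float(fields[2])
            except ValueError:
                continue
            if score > cutoff:
                disordered.append((position, fields[1], score))
    return disordered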
def split_seq(file_sequences_path, path_output, start_header, end_header):

    #
    # Through the subprocess module the grep unix command gets the headers of the fasta file
    #
    header_dataset = subp.check_output(['grep', '>', file_sequences_path])
    # Drop the trailing empty string produced by the final newline of the grep output
    header = [line for line in header_dataset.split('\n') if line.strip() != '']

    file_seq = FileUtils.open_text_r(file_sequences_path)
    seq_obj = file_seq.readlines()

    for i, term in enumerate(header):
        # The protein name is taken from the header slice [start_header:end_header]
        prot = term[start_header:end_header]
        Logger.get_instance().info(str(i + 1) + ' ' + prot)
        # Extraction of the sequence from the fasta file
        prot_seq = InfoFasta.get_seq(seq_obj, prot)
        # Writing of the sequence in a per-protein fasta file
        fasta_seq = SeqToFasta.give_fasta(term, prot_seq)
        file_out = FileUtils.open_text_w(path_output + prot + '.fasta')
        file_out.write(fasta_seq)
        file_out.close()
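# --------------------------------------------------------------------------------------------
# Hedged usage note (assumption, illustrative only): split_seq extracts the protein name from
# each FASTA header with the slice term[start_header:end_header]. The snippet below only shows
# how those indices select the identifier; the header value is hypothetical and not taken from
# the project data.
# --------------------------------------------------------------------------------------------

example_header = '>ENSP00000354587 pep:known chromosome:GRCh38'  # hypothetical header line
start_header, end_header = 1, 16
print(example_header[start_header:end_header])  # -> 'ENSP00000354587'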
def anchor_analysis(self, fastafile, motifslist, prot):

    # Call the anchor command; anchor_out contains the ANCHOR output in text format
    anchor_out = subp.check_output(["perl", self.anchor_path + 'anchor.pl', fastafile, "-m", motifslist])

    # Definition of the section indexes of the ANCHOR output in order to get its specific sections
    # Thereby in the next step it will be possible to write these sections to separate files
    index_bind_reg = anchor_out.index('Predicted binding regions')
    index_motifs = anchor_out.index('Motifs')
    index_pred_profile = anchor_out.index('Prediction profile output')

    # The ANCHOR output can lack the filtered regions section
    if 'Filtered regions' in anchor_out:
        index_filt_reg = anchor_out.index('Filtered regions')
    else:
        index_filt_reg = index_motifs

    # ===============================================================================
    # Files writing
    # ===============================================================================
    #
    # Predicted binding regions file (PBR_protname.txt)
    # This section selects the predicted binding regions of the ANCHOR output
    # The PBR section is split into lines and the '#\t' prefix is removed
    #
    pbr_out = anchor_out[index_bind_reg:index_filt_reg]
    pbr_out_list = pbr_out.split('\n')
    pbr = [line[2:] for line in pbr_out_list if line[0:2] == '#\t']
    #
    # When a protein lacks predicted binding regions, "None" appears in this section
    # and the file writing is skipped
    #
    if 'None' in pbr_out:
        Logger.get_instance().info("This protein doesn't contain predicted binding regions (" + prot + ')')
    else:
        new_pbr_out = [line.replace(' ', '') for line in pbr]
        pbr_file = FileUtils.open_text_w(self.path_output + 'PBR_' + prot + '.txt')
        pbr_file.write('\n'.join(new_pbr_out))
        pbr_file.close()
    #
    # Found motifs file (FMotifs_protname.txt)
    #
    fmotifs_out = anchor_out[index_motifs:index_pred_profile]
    fmotifs_out_list = fmotifs_out.split('\n')
    fmotifs = [line[2:] for line in fmotifs_out_list if line[0:2] == '#\t']
    #
    # When a protein lacks motifs, "None" appears in this section
    # and the file writing is skipped
    #
    if 'None' in fmotifs_out:
        Logger.get_instance().info("This protein doesn't contain any motifs (" + prot + ')')
    else:
        new_fmotifs = [line.replace(' ', '') for line in fmotifs]
        fmotifs_file = FileUtils.open_text_w(self.path_output + 'FMotifs_' + prot + '.txt')
        fmotifs_file.write('\n'.join(new_fmotifs))
        fmotifs_file.close()
    #
    # Prediction profile output (Pred_protname.txt)
    # This section is always present in the ANCHOR output
    #
    pred_file = FileUtils.open_text_w(self.path_output + 'Pred_' + prot + '.txt')
    pred_out = anchor_out[index_pred_profile:]
    # Remove the fixed column-description header before writing the profile
    string = ('# Columns:\n# 1 - Amino acid number\n# 2 - One letter code\n'
              '# 3 - ANCHOR probability value\n# 4 - ANCHOR output\n#')
    pred_out = pred_out.replace(string, '')
    pred_out_list = pred_out.split('\n')
    new_pred_out = [line.replace(' ', '') for line in pred_out_list]
    final_out = '\n'.join(new_pred_out)
    pred_file.write(final_out)
    pred_file.close()
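# --------------------------------------------------------------------------------------------
# Hedged sketch (not part of the original class): read the Pred_<prot>.txt profile written by
# anchor_analysis. After the whitespace clean-up above, each data row is expected to be
# tab-separated as: residue number, one-letter code, ANCHOR probability, ANCHOR binary output.
# The file path and the exact column layout are assumptions based on the code above.
# --------------------------------------------------------------------------------------------

def read_anchor_profile(profile_file):
    """Return [(residue_number, amino_acid, probability, binary_call)] for all data rows."""
    profile = []
    with open(profile_file) as handle:
        for line in handle:
            line = line.strip()
            # Skip residual ANCHOR comment/header lines and blank lines
            if not line or line.startswith('#'):
                continue
            fields = line.split('\t')
            if len(fields) < 4:
                continue
            profile.append((int(fields[0]), fields[1], float(fields[2]), int(fields[3])))
    return profile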