def make_iupred_file(input_path, output_path, th_1, th_2, num_aa, dataset_type):
    """Collect IUPred results for every file of a directory into three tables.

    Three tab-separated files are appended to (not truncated) in
    ``output_path``:

    * ``<dataset_type>_IupredTable_t1_<th_1>_t2_<th_2>.txt`` - one summary
      row per protein;
    * ``<dataset_type>_IupredRegion_<th_1>.txt`` - regions for ``th_1``;
    * ``<dataset_type>_IupredRegion_<th_2>.txt`` - regions for ``th_2``.

    The rows themselves are produced by ``Iupred.iupred_string_info``; this
    function only writes element 0, 1 and 2 of its result to the three files.

    :param input_path: directory holding the prediction files. It is
        concatenated directly with each file name, so it must end with a
        path separator.
    :param output_path: prefix of the output files (concatenated as is).
    :param th_1: first threshold, forwarded to ``Iupred.iupred_string_info``.
    :param th_2: second threshold, forwarded to ``Iupred.iupred_string_info``.
    :param num_aa: value forwarded to ``Iupred.iupred_string_info`` and
        shown in the table header as ``(<num_aa> AA)``.
    :param dataset_type: prefix of the output file names.
    :raises IndexError: if a file name of ``input_path`` has no ``_`` in the
        part before its first ``.`` (the protein id is the second
        ``_``-separated field of that part).
    """
    # initialization of file names
    # NOTE: when th_1 == th_2 the two region files share the same name, so
    # both region outputs are appended to a single file.
    file_names = [
        dataset_type + '_IupredTable' + '_t1_' + str(th_1) + '_t2_' + str(th_2) + '.txt',
        dataset_type + '_IupredRegion_' + str(th_1) + '.txt',
        dataset_type + '_IupredRegion_' + str(th_2) + '.txt',
    ]
    num_aa_string = '(' + str(num_aa) + ' AA)'

    # title strings
    header_string_table = '\t'.join([
        'Protein',
        'Fraction ' + str(th_1),
        'Fraction ' + str(th_2),
        'Region N.' + num_aa_string + 'th_' + str(th_1),
        'Region N.' + num_aa_string + 'th_' + str(th_2),
    ])
    header_string_region = '\t'.join(
        ['Protein', 'N', 'Start', 'End', 'Region length'])

    handles = []
    try:
        # Files opening and title string writing.  The handles are tracked
        # in a list so that every file opened so far is closed on any error.
        for file_name in file_names:
            handles.append(FileUtils.open_text_a(output_path + file_name))
        file_1, file_2, file_3 = handles

        file_1.write(header_string_table + '\n')
        file_2.write(header_string_region + '\n')
        file_3.write(header_string_region + '\n')

        # File names of the proteins to analyse.  universal_newlines=True
        # makes check_output return text (not bytes), so the split below
        # works on both Python 2 and Python 3.
        listing = subp.check_output(['ls', input_path], universal_newlines=True)
        list_file = [name for name in listing.split('\n') if name]

        # Run iupred_string_info on each prediction file and append the
        # three resulting rows to the corresponding output files.
        for i, pred_file in enumerate(list_file, 1):
            prot_id = pred_file.split('.')[0].split('_')[1]
            Logger.get_instance().info(str(i) + ' ' + prot_id)
            out_string = Iupred.iupred_string_info(
                input_path + pred_file, prot_id, th_1, th_2, num_aa)
            file_1.write(out_string[0] + '\n')
            file_2.write(out_string[1] + '\n')
            file_3.write(out_string[2] + '\n')
    finally:
        for handle in handles:
            handle.close()
def change_header(path_input_file, path_ouptut_file, source=1, type_id=1):
    """Rewrite the headers of a FASTA file so they hold a single identifier.

    Every record of ``path_input_file`` is appended to ``path_ouptut_file``
    with its header line replaced by ``>`` followed by one ``|``-separated
    field of the original header:

    * ``source == 1`` (labelled "Ensembl" in the original code): the third
      field of the whole header line;
    * ``source == 2`` (labelled "Uniprot"): only the text before the first
      space is considered; ``type_id == 1`` ("AC") keeps its second field,
      ``type_id == 2`` ("ID") its third field.

    Warning: records are found by splitting the text on ``>``, so the file
    must contain ``>`` only as the first character of header lines.

    :param path_input_file: FASTA file to read.
    :param path_ouptut_file: file the rewritten records are appended to.
    :param source: 1 or 2, see above.
    :param type_id: 1 or 2; only used when ``source == 2``.
    :raises ValueError: if ``source`` (or ``type_id`` when ``source == 2``)
        is not one of the supported values.  Previously this surfaced as an
        unbound-variable error on the first record.
    :raises IndexError: if a header has fewer ``|`` fields than required.
    """
    if source not in (1, 2):
        raise ValueError("source must be 1 or 2, got %r" % (source,))
    if source == 2 and type_id not in (1, 2):
        raise ValueError("type_id must be 1 or 2, got %r" % (type_id,))

    # Read the whole input and release the handle before writing anything.
    file_input = FileUtils.open_text_r(path_input_file)
    try:
        seq_file = file_input.read()
    finally:
        file_input.close()

    file_output = FileUtils.open_text_a(path_ouptut_file)
    try:
        # The text before the first '>' is not a record and is dropped.
        for seq in seq_file.split('>')[1:]:
            lines = seq.split('\n')
            header = lines[0]
            Logger.get_instance().info(header)

            if source == 1:
                # Ensembl
                identifier = header.split('|')[2]
            else:
                # Uniprot: AC is field 1, ID is field 2 of the first word
                field = 1 if type_id == 1 else 2
                identifier = header.split(' ')[0].split('|')[field]

            new_header = '>' + identifier + '\n'
            file_output.write(new_header + '\n'.join(lines[1:]))
    finally:
        file_output.close()
def isoform_sequences(self):
    """Pick one isoform at random for every gene of the isoform FASTA file.

    The paths are built from ``Constants.PATH_HOME`` and the properties
    ``LONGEST_PATH_OUTPUT_PROPERTY``, ``ISOFORM_FILE_PROPERTY`` and
    ``RANDOM_ISOFORM_SEQ_PROPERTY``.  For each gene found in the headers of
    the isoform file one header is drawn with ``random.choice`` and the
    matching record is appended to the selected-isoform file.

    Headers are sliced at fixed positions: ``header[1:16]`` is used as the
    gene key and ``header[33:48]`` as the identifier handed to
    ``InfoFasta.get_seq``.
    NOTE(review): this presumably matches ``>gene|transcript|protein``
    headers with 15-character ids - confirm against the input file.

    Fix with respect to the previous implementation: headers are now grouped
    by gene before drawing.  The old code walked a ``set`` of genes (arbitrary
    order) while advancing a running offset into ``self.headers``, so the
    slice it drew from generally belonged to a different gene.

    Side effects: sets ``self.path_output_longest``,
    ``self.path_file_isoform``, ``self.path_file_selected_isoform``,
    ``self.headers`` and ``self.file_random_seq`` (which is closed on return).
    """
    Logger.get_instance().info(
        " Starting the random selection of isoforms with same length \n")
    Logger.get_instance().info(
        " The following headers are the proteins randomly selected \n")

    self.path_output_longest = Constants.PATH_HOME + PropertyManager.get_instance(
    ).get_property(DataConstants.LONGEST_PATH_OUTPUT_PROPERTY, True)
    self.path_file_isoform = self.path_output_longest + PropertyManager.get_instance(
    ).get_property(DataConstants.ISOFORM_FILE_PROPERTY, True)
    self.path_file_selected_isoform = self.path_output_longest + PropertyManager.get_instance(
    ).get_property(DataConstants.RANDOM_ISOFORM_SEQ_PROPERTY, True)

    # The headers of the isoform fasta file are taken by the InfoFasta class.
    # text=False because the input object is a file and not a list.
    self.headers = InfoFasta.get_header(self.path_file_isoform, text=False)

    # Group the headers by gene, remembering the order in which genes first
    # appear so the output order is reproducible.
    headers_by_gene = {}
    gene_order = []
    for header in self.headers:
        gene = header[1:16]
        if gene not in headers_by_gene:
            headers_by_gene[gene] = []
            gene_order.append(gene)
        headers_by_gene[gene].append(header)

    # Select one isoform randomly for each gene.
    random_header = [random.choice(headers_by_gene[gene]) for gene in gene_order]

    # The sequences corresponding to the selected headers are extracted from
    # the isoform file and appended to the output file.
    self.file_random_seq = FileUtils.open_text_a(self.path_file_selected_isoform)
    try:
        for header in random_header:
            Logger.get_instance().info('Header selected : ' + header)
            identifier = header[33:48]
            sequence = InfoFasta.get_seq(self.path_file_isoform, identifier)
            self.file_random_seq.write(SeqToFasta.give_fasta(header, sequence))
    finally:
        # The handle was never closed before, so buffered records could be
        # lost; close it even when a lookup fails.
        self.file_random_seq.close()

    Logger.get_instance().info(" End of selection random sequences \n ")
def make_disordp_file(input_path, output_path, binding_partner, num_aa, dataset_type):
    """Turn a DisoRDPbind output file into a summary table and a region table.

    Two tab-separated files are appended to (not truncated) in
    ``output_path``: ``<dataset_type>_DisoRDPbindTable.txt`` and
    ``<dataset_type>_DisoRDPbindRegion.txt``.  Each entry returned by
    ``DisoRDPbind.output_reading(input_path)`` is converted with
    ``DisoRDPbind.disordp_string_info``; element 0 of the result goes to the
    table file and element 1 to the region file.  Entries containing
    ``'WARNING:'`` are only logged as warnings and produce no rows.

    :param input_path: DisoRDPbind output file, passed to
        ``DisoRDPbind.output_reading``.
    :param output_path: prefix of the output files (concatenated as is).
    :param binding_partner: forwarded to ``DisoRDPbind.disordp_string_info``.
    :param num_aa: forwarded to ``DisoRDPbind.disordp_string_info`` and
        shown in the table header as ``(<num_aa> AA)``.
    :param dataset_type: prefix of the output file names.
    """
    # initialization of file names
    file_name_1 = dataset_type + '_DisoRDPbindTable.txt'
    file_name_2 = dataset_type + '_DisoRDPbindRegion.txt'
    num_aa_string = '(' + str(num_aa) + ' AA)'

    # title strings
    header_string_table = '\t'.join(
        ['Protein', 'Fraction ', 'Region N.' + num_aa_string])
    header_string_region = '\t'.join(
        ['Protein', 'N', 'Start', 'End', 'Region length'])

    handles = []
    try:
        # Files opening and title string writing; every handle opened so far
        # is closed in the finally clause, whatever happens.
        for file_name in (file_name_1, file_name_2):
            handles.append(FileUtils.open_text_a(output_path + file_name))
        file_1, file_2 = handles

        file_1.write(header_string_table + '\n')
        file_2.write(header_string_region + '\n')

        # Reading of DisoRDPbind output file
        protein_output_list = DisoRDPbind.output_reading(input_path)
        for n, output in enumerate(protein_output_list):
            if 'WARNING:' in output:
                # The first line of the entry is logged to identify it.
                prot = output.split('\n')[0]
                # The message used to contain a stray backslash left over
                # from a line continuation inside the string literal.
                Logger.get_instance().warning(
                    str(n + 1)
                    + "\n This protein contains >=10000 residues"
                    + " (DisoRBDbind cannot predict the proteins with size >=10000) "
                    + prot)
            else:
                Logger.get_instance().info(str(n + 1))
                results = DisoRDPbind.disordp_string_info(
                    output, binding_partner, num_aa)
                file_1.write(results[0] + '\n')
                file_2.write(results[1] + '\n')
    finally:
        for handle in handles:
            handle.close()
def merge_file(namefile_1, namefile_2, new_namefile):
    """Append the content of two text files, in order, to a third file.

    The destination is opened with ``FileUtils.open_text_a``, so existing
    content of ``new_namefile`` is kept and the two texts are added after it.
    Both inputs are read completely before anything is written.

    :param namefile_1: path of the file whose content is written first.
    :param namefile_2: path of the file whose content is written second.
    :param new_namefile: path of the destination file.
    """
    # The input handles used to be left open; every handle is now closed,
    # also when a later open/read/write fails.
    file1 = FileUtils.open_text_r(namefile_1)
    try:
        file2 = FileUtils.open_text_r(namefile_2)
        try:
            new_file = FileUtils.open_text_a(new_namefile)
            try:
                text_1 = file1.read()
                text_2 = file2.read()
                new_file.write(text_1)
                new_file.write(text_2)
            finally:
                new_file.close()
        finally:
            file2.close()
    finally:
        file1.close()
def get_list_seq(path_input_list, path_output):
    """Append the sequence of every protein of a list file to an output file.

    The identifiers are read with ``FileParser.read_file`` and each one is
    passed to ``Uniprot.get_sequence(protein, format_out=True)``; whatever
    that call returns is written to the output file unchanged.
    NOTE(review): presumably the returned value is a FASTA-formatted string
    ending with a newline - confirm in ``Uniprot.get_sequence``.

    :param path_input_list: file listing the protein identifiers.
    :param path_output: file the sequences are appended to.
    """
    seq_file = FileUtils.open_text_a(path_output)
    try:
        for protein in FileParser.read_file(path_input_list):
            seq_file.write(Uniprot.get_sequence(protein, format_out=True))
    finally:
        # Close the output even if reading the list or a lookup fails.
        seq_file.close()
def longest_seq(seq_obj, dict_identifier, path_outfile_longest, path_outfile_isoform, type_obj='list'):
    """Split the proteins of each gene into a "longest" and an "isoform" file.

    ``dict_identifier`` is a pickled dictionary mapping each gene to its
    protein identifiers.  For every gene the length, sequence and header of
    each protein are fetched through ``InfoFasta`` and the proteins with the
    maximum length are examined:

    1) exactly one longest protein
       - its record is appended to the longest file;
    2) two proteins share the maximum length
       a) identical sequences - one record goes to the longest file;
       b) different sequences - both records go to the isoform file;
    3) more than two proteins share the maximum length
       a) all identical - one record goes to the longest file;
       b) otherwise - one record per distinct sequence goes to the isoform
          file (now in order of first appearance instead of set order).

    Changes with respect to the previous implementation: the pickle file is
    opened in binary mode and closed, both output files are closed on error,
    an unsupported ``type_obj`` is rejected up front, and bookkeeping
    counters/lists that were never read have been removed.

    :param seq_obj: sequence source handed to ``InfoFasta`` (a file or a
        list, according to ``type_obj``).
    :param dict_identifier: path of the pickled gene -> proteins dictionary.
    :param path_outfile_longest: file the "longest" records are appended to.
    :param path_outfile_isoform: file the "isoform" records are appended to.
    :param type_obj: ``'list'`` (``text=True`` is passed to ``InfoFasta``) or
        ``'file'`` (``text=False``).
    :raises ValueError: if ``type_obj`` is not ``'file'`` or ``'list'``, or
        if a gene has no proteins (``max`` of an empty list).
    """
    if type_obj == 'file':
        type_text = False
    elif type_obj == 'list':
        type_text = True
    else:
        raise ValueError(
            "type_obj must be 'file' or 'list', got %r" % (type_obj,))

    # SECURITY: pickle.load executes arbitrary code found in the file; only
    # load dictionaries produced by a trusted source.
    with open(dict_identifier, 'rb') as file_dict:
        dict_ids = pickle.load(file_dict)

    fileout_longest = FileUtils.open_text_a(path_outfile_longest)
    try:
        fileout_isoform = FileUtils.open_text_a(path_outfile_isoform)
        try:
            # This for loop flows on the keys of the dictionary
            for y, gene in enumerate(dict_ids, 1):
                Logger.get_instance().info(str(y) + ' Gene analysed : ' + gene)

                seqs = []     # sequence of each isoform of the gene
                lenseq = []   # length of each isoform sequence
                headers = []  # header of each isoform sequence
                for prot in dict_ids[gene]:
                    # NOTE(review): get_length is called without text=,
                    # unlike get_seq/get_header - confirm its default is
                    # right for both values of type_obj.
                    lenseq.append(InfoFasta.get_length(seq_obj, prot))
                    seqs.append(InfoFasta.get_seq(seq_obj, prot, text=type_text))
                    headers.append(InfoFasta.get_header(
                        seq_obj, header_identifier=prot, text=type_text))

                # index_max holds the positions of the max-length sequences
                len_max = max(lenseq)
                index_max = [
                    idx for idx, length in enumerate(lenseq) if length == len_max
                ]
                first = index_max[0]

                if len(index_max) == 1:
                    # Condition 1: a single longest protein
                    Logger.get_instance().info(' If condition 1')
                    fileout_longest.write(
                        SeqToFasta.give_fasta(headers[first], seqs[first]))

                elif len(index_max) == 2:
                    # Condition 2: two proteins with the same length
                    Logger.get_instance().info('If condition 2')
                    if seqs[first] == seqs[index_max[1]]:
                        # 2a: same sequence, keep one of them
                        Logger.get_instance().info('2a')
                        fileout_longest.write(
                            SeqToFasta.give_fasta(headers[first], seqs[first]))
                    else:
                        # 2b: different sequences, both are isoforms
                        Logger.get_instance().info('2b')
                        for idx in index_max:
                            fileout_isoform.write(
                                SeqToFasta.give_fasta(headers[idx], seqs[idx]))

                else:
                    # Condition 3: more than two proteins with the same length
                    Logger.get_instance().info(' If condition 3')
                    if seqs.count(seqs[first]) == len(index_max):
                        # 3a: all of them are the same sequence
                        Logger.get_instance().info('3a')
                        fileout_longest.write(
                            SeqToFasta.give_fasta(headers[first], seqs[first]))
                    else:
                        # 3b: at least two different sequences; write each
                        # distinct one once, with the header of its first
                        # occurrence in the gene's isoform list.
                        Logger.get_instance().info('3b')
                        written = set()
                        for idx in index_max:
                            seq = seqs[idx]
                            if seq in written:
                                continue
                            written.add(seq)
                            index_seq = seqs.index(seq)
                            fileout_isoform.write(SeqToFasta.give_fasta(
                                headers[index_seq], seqs[index_seq]))
        finally:
            fileout_isoform.close()
    finally:
        fileout_longest.close()
def list_get_seq(path_input, type_query, path_protein_list=None, path_output=None):
    """Download the Ensembl sequences of every gene listed in a file.

    Each item read from ``path_input`` is passed to ``Ensembl.get_sequence``
    together with the query mode (``'all'`` for ``type_query == 1``, ``'one'``
    for ``type_query == 2``).  Answers equal to ``'<item> No available'`` or
    ``'<item> pseudogene'`` are skipped.

    When ``path_protein_list`` is given, the answer is split on ``>`` and
    only the records whose characters 32-46 (``record[32:47]``) appear in
    that list are kept.
    NOTE(review): presumably that slice is the protein id of a
    ``gene|transcript|protein`` header - confirm against Ensembl output.

    Results:

    * ``path_output`` given - records are appended to that file and ``None``
      is returned.  With a protein list only the first matching record of
      each gene is written; genes without a match are logged as
      "duplicated".
    * ``path_output`` omitted - the accumulated result is returned: one
      string with all sequences, or (with a protein list) a flat list of all
      matching records.

    Changes with respect to the previous implementation: the accumulated
    result is returned instead of the data of a single gene (identical for a
    single-gene input), an input without any usable sequence no longer fails
    with an unbound-variable error, the output file is opened once and
    always closed, and the protein list is read once instead of once per
    gene.

    :param path_input: file listing the gene identifiers.
    :param type_query: 1 or 2, see above.
    :param path_protein_list: optional file listing the proteins to keep.
    :param path_output: optional file the sequences are appended to.
    :raises KeyError: if ``type_query`` is not 1 or 2.
    """
    # the input file is read
    list_item = FileParser.read_file(path_input)
    dict_query = {1: 'all', 2: 'one'}
    query = dict_query[type_query]

    filter_proteins = path_protein_list is not None
    list_protein = FileParser.read_file(path_protein_list) if filter_proteins else None

    count_duplicate_genes = 0
    all_seqs = []   # sequence blocks, joined on return
    prot_seq = []   # matching protein records of every gene

    file_out = None if path_output is None else FileUtils.open_text_a(path_output)
    try:
        # For each gene in list the sequences are downloaded
        for i, item in enumerate(list_item):
            Logger.get_instance().info(
                str(i + 1) + ' Extraction of gene sequence(s) : ' + item)
            fasta_seq = Ensembl.get_sequence(item, query)
            if fasta_seq in (item + ' No available', item + ' pseudogene'):
                continue

            seqs = fasta_seq + '\n'

            if not filter_proteins:
                if file_out is None:
                    all_seqs.append(seqs)
                else:
                    file_out.write(seqs)
                continue

            # Among the isoforms of the gene keep only those in list_protein
            protein_seq = [
                '>' + fasta for fasta in seqs.split('>')
                if fasta[32:47] in list_protein
            ]

            if file_out is None:
                prot_seq.extend(protein_seq)
            elif protein_seq:
                file_out.write(protein_seq[0])
            else:
                count_duplicate_genes += 1
                Logger.get_instance().info(
                    " Number of duplicated genes: " + str(count_duplicate_genes))
                # NOTE(review): indexing list_protein with the gene index
                # assumes both input lists are parallel - confirm.
                Logger.get_instance().info(
                    " The gene duplicated is: " + str(item) + '|' + str(list_protein[i]))
    finally:
        if file_out is not None:
            file_out.close()

    # return information like string or list
    if path_output is None:
        return prot_seq if filter_proteins else ''.join(all_seqs)
    return None
def rna_target(self):
    """Write one protein file per putative RNA target type.

    Steps, all driven by properties read through ``PropertyManager``:

    1. read the info table (tab separated, ``skip=1``) and take its columns
       1 and 3 as protein ids and putative RNA targets;
    2. read the headers of the sequence file and take the first and third
       ``|``-separated fields after ``>`` as gene id and protein id;
    3. write a tab-separated table of ``[gene, protein, rna target]`` rows to
       ``self.file_all_rna_target``;
    4. re-read that table and, for each distinct RNA target, append the
       matching protein names to
       ``<output prefix><target><dataset name property>``.

    Side effects: sets ``self.path_home``, ``self.file_seq_natrevgenetics``,
    ``self.natrevgenetics_info``, ``self.header`` and
    ``self.file_all_rna_target``.

    Raises ``ValueError`` (from ``list.index``) if a protein of the sequence
    file is missing from the info table.
    """
    self.path_home = Constants.PATH_HOME
    self.file_seq_natrevgenetics = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.PUTATIVERNA_NATREVGENETICS_SEQ_PROPERTY, True)
    self.natrevgenetics_info = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.PUTATIVE_RNA_NATREVGENETICS_INFO_PROPERTY, True)

    # Info table: protein ids (column 1) and their putative RNA (column 3)
    table = FileParser.make_table(self.natrevgenetics_info, '\t', skip=1)
    known_proteins = TableWrapper.get_column(table, 1)
    known_targets = TableWrapper.get_column(table, 3)

    # Gene and protein ids are the 1st and 3rd '|' fields of each header
    self.header = InfoFasta.get_header(self.file_seq_natrevgenetics, text=False)
    header_fields = [entry.split('>')[1].split('|') for entry in self.header]
    genes = [fields[0] for fields in header_fields]
    proteins = [fields[2] for fields in header_fields]

    # Creation of the table containing gene id, prot id and rna target
    rows = []
    targets = []
    for gene_id, protein_id in zip(genes, proteins):
        Logger.get_instance().info(protein_id)
        target = known_targets[known_proteins.index(protein_id)]
        targets.append(target)
        rows.append([gene_id, protein_id, target])
        Logger.get_instance().info(" The putative rna target is " + target)

    self.file_all_rna_target = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.PUTATIVE_ALL_RNA_TARGET_PROPERTY, True)
    FileWriter.write_table(self.file_all_rna_target, rows, symbol='\t')

    # Distinct RNA target types: one output list is created for each
    distinct_targets = set(targets)

    # The table just written is read back and its columns extracted
    written_table = FileParser.make_table(self.file_all_rna_target, '\t')
    written_proteins = TableWrapper.get_column(written_table, 1)
    written_targets = TableWrapper.get_column(written_table, 2)

    output_prefix = self.path_home + PropertyManager.get_instance(
    ).get_property(DataConstants.PUTATIVE_RNA_OUTPUT_PROPERTY, True)

    # One protein file per RNA target type
    for target in distinct_targets:
        target_path = output_prefix + target + PropertyManager.get_instance(
        ).get_property(
            DataConstants.PUTATIVE_RNA_TARGET_DATASET_NAME_PROPERTY, True)
        target_file = FileUtils.open_text_a(target_path)
        positions = [
            n for n, written in enumerate(written_targets) if written == target
        ]
        for n in positions:
            # NOTE(review): no separator is added between names; confirm
            # that the values of this column already end with a newline.
            target_file.write(written_proteins[n])
        target_file.close()