Python FileUtils.open_text_a示例，core.util.file.FileUtils.FileUtils.open_text_a Python示例

示例#1

0

显示文件

    def make_iupred_file(input_path, output_path, th_1, th_2, num_aa, dataset_type):
        """Build the IUPred table file and the two IUPred region files.

        Three output files are opened with FileUtils.open_text_a, a
        tab-separated header row is written to each of them, and then every
        file listed in input_path is passed to Iupred.iupred_string_info. The
        three strings it returns are written, one per output file, each
        followed by a newline.

        Args:
            input_path: directory listed with `ls`; it is concatenated
                directly with each file name, so it must end with a path
                separator.
            output_path: prefix concatenated directly with the output file
                names.
            th_1: first threshold; appears in file names and column titles
                and is forwarded to Iupred.iupred_string_info.
            th_2: second threshold, used the same way as th_1.
            num_aa: value shown in the 'Region N.' column titles and
                forwarded to Iupred.iupred_string_info.
            dataset_type: prefix of the three output file names.

        Raises:
            IndexError: if an input file name has no '_' before its first '.'
                (the protein id is taken as the second '_'-separated token).
        """
        threshold_1 = str(th_1)
        threshold_2 = str(th_2)

        # initialization of file names
        file_name_1 = dataset_type + '_IupredTable' + '_t1_' + threshold_1 + '_t2_' + threshold_2 + '.txt'
        file_name_2 = dataset_type + '_IupredRegion_' + threshold_1 + '.txt'
        file_name_3 = dataset_type + '_IupredRegion_' + threshold_2 + '.txt'

        num_aa_string = '(' + str(num_aa) + ' AA)'

        header_file_table = [
            'Protein',
            'Fraction ' + threshold_1,
            'Fraction ' + threshold_2,
            'Region N.' + num_aa_string + 'th_' + threshold_1,
            'Region N.' + num_aa_string + 'th_' + threshold_2,
        ]
        header_file_region = ['Protein', 'N', 'Start', 'End', 'Region length']

        header_string_table = '\t'.join(header_file_table)
        header_string_region = '\t'.join(header_file_region)

        # Every handle that was successfully opened is tracked here so that
        # all of them are closed even when a later step raises.
        opened_files = []
        try:
            for file_name in (file_name_1, file_name_2, file_name_3):
                opened_files.append(FileUtils.open_text_a(output_path + file_name))
            file_1, file_2, file_3 = opened_files

            file_1.write(header_string_table + '\n')
            file_2.write(header_string_region + '\n')
            file_3.write(header_string_region + '\n')

            # List the prediction files that have to be analysed
            raw_listing = subp.check_output(['ls', input_path])
            if not isinstance(raw_listing, str):
                # check_output returns bytes on Python 3
                raw_listing = raw_listing.decode()
            list_file = [name for name in raw_listing.split('\n') if name]

            # Run iupred_string_info on each protein file and append the
            # results to the three output files
            for i, pred_file in enumerate(list_file, 1):
                prot_id = pred_file.split('.')[0].split('_')[1]
                Logger.get_instance().info(str(i) + ' ' + prot_id)
                namefile = input_path + pred_file
                out_string = Iupred.iupred_string_info(namefile, prot_id, th_1, th_2, num_aa)

                file_1.write(out_string[0] + '\n')
                file_2.write(out_string[1] + '\n')
                file_3.write(out_string[2] + '\n')
        finally:
            for handle in opened_files:
                handle.close()

示例#2

0

显示文件

    def change_header(path_input_file, path_ouptut_file, source=1, type_id=1):
        """Copy a FASTA file, replacing every header with a single identifier.

        The input text is split on '>' and, for each record, the first line
        (the header) is replaced by '>' plus one '|'-separated field of the
        original header. The remaining lines of the record are written
        unchanged.

        Warning: the '>' character must appear only at the beginning of the
        header lines, otherwise the records are split incorrectly.

        Args:
            path_input_file: path of the FASTA file, opened with
                FileUtils.open_text_r.
            path_ouptut_file: path of the output file, opened with
                FileUtils.open_text_a.
            source: 1 (Ensembl) keeps the third '|' field of the whole
                header; 2 (Uniprot) selects a field from the header text
                that precedes the first space.
            type_id: only used when source is 2; 1 (AC) keeps the second
                '|' field, 2 (ID) keeps the third one.

        Raises:
            ValueError: if source or type_id is not a supported value while
                a record is being processed.
            IndexError: if a header has fewer '|' fields than required.
        """
        file_input = FileUtils.open_text_r(path_input_file)
        try:
            seq_file = file_input.read()
        finally:
            file_input.close()

        file_output = FileUtils.open_text_a(path_ouptut_file)
        try:
            # The text before the first '>' is not a record and is dropped
            for seq in seq_file.split('>')[1:]:
                lines = seq.split('\n')
                header = lines[0]
                Logger.get_instance().info(header)

                if source == 1:
                    # Ensembl
                    identifier = header.split('|')[2]
                elif source == 2:
                    # Uniprot
                    diff_header = header.split(' ')[0]
                    if type_id == 1:
                        # AC
                        identifier = diff_header.split('|')[1]
                    elif type_id == 2:
                        # ID
                        identifier = diff_header.split('|')[2]
                    else:
                        raise ValueError(
                            'type_id must be 1 (AC) or 2 (ID), got ' + repr(type_id))
                else:
                    raise ValueError(
                        'source must be 1 (Ensembl) or 2 (Uniprot), got ' + repr(source))

                file_output.write('>' + identifier + '\n' + '\n'.join(lines[1:]))
        finally:
            file_output.close()

示例#3

0

显示文件

    def isoform_sequences(self):
        """Pick one isoform at random for every gene and write it to a file.

        The headers of the isoform FASTA file are grouped by gene, where the
        gene is the header slice [1:16]. One header per gene is chosen with
        random.choice, and the matching sequence is fetched with
        InfoFasta.get_seq using the header slice [33:48] as identifier. Each
        selected record is formatted by SeqToFasta.give_fasta and written to
        the output file.

        Headers are grouped explicitly, so the choice for a gene is always
        made among that gene's own headers, whatever their order in the file.

        Sets self.path_output_longest, self.path_file_isoform,
        self.path_file_selected_isoform, self.headers and
        self.file_random_seq. The last one is closed before returning.
        """
        Logger.get_instance().info(
            " Starting the random selection of isoforms with same length \n")
        Logger.get_instance().info(
            " The following headers are the proteins randomly selected \n")

        self.path_output_longest = Constants.PATH_HOME + PropertyManager.get_instance(
        ).get_property(DataConstants.LONGEST_PATH_OUTPUT_PROPERTY, True)

        self.path_file_isoform = self.path_output_longest + PropertyManager.get_instance(
        ).get_property(DataConstants.ISOFORM_FILE_PROPERTY, True)
        self.path_file_selected_isoform = self.path_output_longest + PropertyManager.get_instance(
        ).get_property(DataConstants.RANDOM_ISOFORM_SEQ_PROPERTY, True)

        # text=False because the input object is a file path and not a list
        self.headers = InfoFasta.get_header(self.path_file_isoform, text=False)

        # Group the headers by gene, remembering the order in which the genes
        # first appear so that the output order is deterministic.
        genes_in_order = []
        headers_by_gene = {}
        for header in self.headers:
            gene = header[1:16]
            if gene not in headers_by_gene:
                headers_by_gene[gene] = []
                genes_in_order.append(gene)
            headers_by_gene[gene].append(header)

        # Select one isoform header randomly for each gene
        random_header = [
            random.choice(headers_by_gene[gene]) for gene in genes_in_order
        ]

        self.file_random_seq = FileUtils.open_text_a(
            self.path_file_selected_isoform)
        try:
            # The sequences of the selected headers are extracted from the
            # isoform file
            for header in random_header:
                Logger.get_instance().info('Header selected : ' + header)
                identifier = header[33:48]
                sequence = InfoFasta.get_seq(self.path_file_isoform, identifier)
                fasta_seq = SeqToFasta.give_fasta(header, sequence)
                self.file_random_seq.write(fasta_seq)
        finally:
            # Closing guarantees the buffered records reach the disk
            self.file_random_seq.close()

        Logger.get_instance().info(" End of selection random sequences \n ")

示例#4

0

显示文件

    def make_disordp_file(input_path, output_path, binding_partner, num_aa,
                          dataset_type):
        """Build the DisoRDPbind table file and region file for a dataset.

        Both output files are opened with FileUtils.open_text_a and receive a
        tab-separated header row. The entries returned by
        DisoRDPbind.output_reading(input_path) are then processed one by one:
        an entry containing 'WARNING:' is only logged as a warning, any other
        entry is passed to DisoRDPbind.disordp_string_info and the two
        strings it returns are written to the table file and to the region
        file respectively, each followed by a newline.

        Args:
            input_path: passed to DisoRDPbind.output_reading.
            output_path: prefix concatenated directly with the output file
                names.
            binding_partner: forwarded to DisoRDPbind.disordp_string_info.
            num_aa: shown in the 'Region N.' column title and forwarded to
                DisoRDPbind.disordp_string_info.
            dataset_type: prefix of the two output file names.
        """
        # initialization of file names
        file_name_1 = dataset_type + '_DisoRDPbindTable.txt'
        file_name_2 = dataset_type + '_DisoRDPbindRegion.txt'

        num_aa_string = '(' + str(num_aa) + ' AA)'

        header_file_table = [
            'Protein', 'Fraction ', 'Region N.' + num_aa_string
        ]
        header_file_region = ['Protein', 'N', 'Start', 'End', 'Region length']

        header_string_table = '\t'.join(header_file_table)
        header_string_region = '\t'.join(header_file_region)

        # Opened handles are tracked so that they are all closed even when a
        # later step raises.
        opened_files = []
        try:
            for file_name in (file_name_1, file_name_2):
                opened_files.append(FileUtils.open_text_a(output_path + file_name))
            file_1, file_2 = opened_files

            file_1.write(header_string_table + '\n')
            file_2.write(header_string_region + '\n')

            # Reading of DisoRDPbind output file
            protein_output_list = DisoRDPbind.output_reading(input_path)

            for n, output in enumerate(protein_output_list):
                if 'WARNING:' in output:
                    # The first line of the entry is reported in the warning
                    prot = output.split('\n')[0]
                    Logger.get_instance().warning(
                        str(n + 1) + "\n This protein contains >=10000 residues"
                        " (DisoRBDbind cannot predict the proteins with size >=10000) "
                        + prot)
                    continue

                Logger.get_instance().info(str(n + 1))
                results = DisoRDPbind.disordp_string_info(
                    output, binding_partner, num_aa)

                file_1.write(results[0] + '\n')
                file_2.write(results[1] + '\n')
        finally:
            for handle in opened_files:
                handle.close()

示例#5

0

显示文件

 def merge_file(namefile_1, namefile_2, new_namefile):
     """Write the content of two files, one after the other, into a third.

     Both inputs are opened with FileUtils.open_text_r and read completely
     before anything is written. The output is opened with
     FileUtils.open_text_a. Every handle is closed before returning, even
     when a read or a write raises.

     Args:
         namefile_1: path of the file whose content is written first.
         namefile_2: path of the file whose content is written second.
         new_namefile: path of the file that receives both contents.
     """
     handles = []
     try:
         handles.append(FileUtils.open_text_r(namefile_1))
         handles.append(FileUtils.open_text_r(namefile_2))
         handles.append(FileUtils.open_text_a(new_namefile))
         file1, file2, new_file = handles

         text_1 = file1.read()
         text_2 = file2.read()
         new_file.write(text_1)
         new_file.write(text_2)
     finally:
         for handle in handles:
             handle.close()

示例#6

0

显示文件

    def get_list_seq(path_input_list, path_output):
        """Fetch the sequence of every listed protein and write it to a file.

        The output file is opened with FileUtils.open_text_a. Each entry
        returned by FileParser.read_file(path_input_list) is passed to
        Uniprot.get_sequence(..., format_out=True), and the returned value is
        written as it is. The output handle is closed even when reading the
        list or fetching a sequence raises.

        Args:
            path_input_list: path of the file listing the proteins.
            path_output: path of the file that receives the sequences.
        """
        seq_file = FileUtils.open_text_a(path_output)
        try:
            for protein in FileParser.read_file(path_input_list):
                seq_file.write(Uniprot.get_sequence(protein, format_out=True))
        finally:
            seq_file.close()

示例#7

0

显示文件

    def longest_seq(seq_obj,
                    dict_identifier,
                    path_outfile_longest,
                    path_outfile_isoform,
                    type_obj='list'):
        """Split the proteins of each gene into 'longest' and 'isoform' files.

        For every gene of the pickled dictionary (gene -> list of protein
        identifiers) the length, sequence and header of each protein are read
        through InfoFasta, and the proteins with the maximum length are
        handled as follows:

        1) one protein has the maximum length: it is written to the longest
           file;
        2) two proteins share the maximum length:
           a) identical sequences: one of them is written to the longest
              file;
           b) different sequences: both are written to the isoform file;
        3) more than two proteins share the maximum length:
           a) all their sequences are identical: one of them is written to
              the longest file;
           b) otherwise: each distinct sequence is written once to the
              isoform file, in order of first appearance.

        Args:
            seq_obj: sequence source forwarded to the InfoFasta methods.
            dict_identifier: path of the pickle file holding the dictionary.
            path_outfile_longest: path of the longest-sequence output file.
            path_outfile_isoform: path of the isoform output file.
            type_obj: 'file' (InfoFasta is called with text=False) or 'list'
                (text=True).

        Raises:
            ValueError: if type_obj is neither 'file' nor 'list', or if a
                gene has an empty protein list (max() of an empty list).
        """
        if type_obj == 'file':
            type_text = False
        elif type_obj == 'list':
            type_text = True
        else:
            raise ValueError(
                "type_obj must be 'file' or 'list', got " + repr(type_obj))

        # NOTE: pickle.load can execute arbitrary code; dict_identifier must
        # come from a trusted source. Pickle data is binary, hence 'rb'.
        with open(dict_identifier, 'rb') as file_dict:
            dict_ids = pickle.load(file_dict)

        # Opened handles are tracked so that they are all closed even when a
        # later step raises.
        opened_files = []
        try:
            opened_files.append(FileUtils.open_text_a(path_outfile_longest))
            opened_files.append(FileUtils.open_text_a(path_outfile_isoform))
            fileout_longest, fileout_isoform = opened_files

            # This for loop flows on the keys of dictionary
            for y, gene in enumerate(dict_ids, 1):
                Logger.get_instance().info(str(y) + ' Gene analysed : ' + gene)
                seqs = []  # sequences of the isoforms of the gene
                lenseq = []  # length of each isoform sequence
                headers = []  # header of each isoform sequence

                for prot in dict_ids[gene]:
                    lenseq.append(InfoFasta.get_length(seq_obj, prot))
                    seqs.append(InfoFasta.get_seq(seq_obj, prot, text=type_text))
                    headers.append(
                        InfoFasta.get_header(seq_obj,
                                             header_identifier=prot,
                                             text=type_text))

                # index_max holds the positions of the sequences that have
                # the maximum length
                len_max = max(lenseq)
                index_max = [
                    position for position, length in enumerate(lenseq)
                    if length == len_max
                ]
                first = index_max[0]

                # Condition 1): just one longest protein
                if len(index_max) == 1:
                    Logger.get_instance().info(' If condition 1')
                    fileout_longest.write(
                        SeqToFasta.give_fasta(headers[first], seqs[first]))

                # Condition 2): two proteins with the same length
                elif len(index_max) == 2:
                    Logger.get_instance().info('If condition 2')
                    if seqs[first] == seqs[index_max[1]]:
                        # Condition 2a: same sequence, one copy is kept
                        Logger.get_instance().info('2a')
                        fileout_longest.write(
                            SeqToFasta.give_fasta(headers[first], seqs[first]))
                    else:
                        # Condition 2b: different sequences, both are isoforms
                        Logger.get_instance().info('2b')
                        for position in index_max:
                            fileout_isoform.write(
                                SeqToFasta.give_fasta(headers[position],
                                                      seqs[position]))

                # Condition 3): more than two proteins with the same length
                else:
                    Logger.get_instance().info(' If condition 3')
                    if all(seqs[position] == seqs[first]
                           for position in index_max):
                        # Condition 3a: all the sequences are identical
                        Logger.get_instance().info('3a')
                        fileout_longest.write(
                            SeqToFasta.give_fasta(headers[first], seqs[first]))
                    else:
                        # Condition 3b: at least two different sequences; each
                        # distinct sequence is written once, with the header
                        # of its first occurrence
                        Logger.get_instance().info('3b')
                        already_written = set()
                        for position in index_max:
                            sequence = seqs[position]
                            if sequence in already_written:
                                continue
                            already_written.add(sequence)
                            fileout_isoform.write(
                                SeqToFasta.give_fasta(headers[position],
                                                      sequence))
        finally:
            for handle in opened_files:
                handle.close()
示例#8

0

显示文件

    def list_get_seq(path_input,
                     type_query,
                     path_protein_list=None,
                     path_output=None):
        """Download the sequences of a list of genes and return or save them.

        Every item read from path_input is passed to Ensembl.get_sequence.
        Items whose answer is '<item> No available' or '<item> pseudogene'
        are skipped. For the others a newline is appended to the answer and:

        - without path_protein_list the whole answer is kept;
        - with path_protein_list the answer is split on '>' and only the
          records whose slice [32:47] is in the protein list are kept.

        Args:
            path_input: path of the file listing the genes.
            type_query: 1 queries Ensembl with 'all', 2 with 'one'.
            path_protein_list: optional path of the file listing the
                proteins used to filter the records of each gene.
            path_output: optional path of the output file, opened once with
                FileUtils.open_text_a when the first gene with a sequence is
                found. Without a protein list the whole answer is written;
                with one, only the first kept record of each gene is
                written, and a gene with no kept record is logged as
                duplicated.

        Returns:
            None when path_output is given. Otherwise the concatenation of
            all the answers (str) when no protein list is given, or the flat
            list of all the kept records. Both are empty when no gene has a
            sequence.

        Raises:
            KeyError: if type_query is neither 1 nor 2.
        """
        # the input file is read
        list_item = FileParser.read_file(path_input)
        dict_query = {1: 'all', 2: 'one'}
        query = dict_query[type_query]

        # The protein list does not change between genes: read it only once
        list_protein = None
        if path_protein_list is not None:
            list_protein = FileParser.read_file(path_protein_list)

        count_duplicate_genes = 0
        all_seqs = []
        prot_seq = []
        file_out = None
        try:
            # For each gene in list the sequences are downloaded
            for i, item in enumerate(list_item):
                Logger.get_instance().info(
                    str(i + 1) + ' Extraction of gene sequence(s) : ' + item)
                fasta_seq = Ensembl.get_sequence(item, query)
                if fasta_seq in (item + ' No available', item + ' pseudogene'):
                    continue

                seqs = fasta_seq + '\n'

                # Among the isoforms of the gene only those present in
                # list_protein are kept; the empty chunk that precedes the
                # first '>' is ignored
                protein_seq = None
                if list_protein is not None:
                    protein_seq = [
                        '>' + fasta for fasta in seqs.split('>')
                        if fasta and fasta[32:47] in list_protein
                    ]

                # Without an output path the information is accumulated
                if path_output is None:
                    if protein_seq is None:
                        all_seqs.append(seqs)
                    else:
                        prot_seq.extend(protein_seq)
                    continue

                # With an output path the information is appended to the
                # file, which is opened a single time
                if file_out is None:
                    file_out = FileUtils.open_text_a(path_output)

                if protein_seq is None:
                    file_out.write(seqs)
                elif protein_seq:
                    file_out.write(protein_seq[0])
                else:
                    count_duplicate_genes += 1
                    Logger.get_instance().info(
                        " Number of duplicated genes: " +
                        str(count_duplicate_genes))
                    # The protein list is read as parallel to the gene list;
                    # guard against a shorter protein list
                    paired_protein = list_protein[i] if i < len(list_protein) else ''
                    Logger.get_instance().info(
                        " The gene duplicated is: " + str(item) + '|' +
                        str(paired_protein))
        finally:
            if file_out is not None:
                file_out.close()

        # return information like string or list
        if path_output is not None:
            return None
        if list_protein is None:
            return ''.join(all_seqs)
        return prot_seq

示例#9

0

显示文件

    def rna_target(self):
        """Write the gene/protein/RNA-target table and one file per target.

        The info table is read with FileParser.make_table (tab separated,
        skip=1); its column 1 is used as protein identifier and its column 3
        as putative RNA target. The gene and protein identifiers are taken
        from the first and third '|' fields of each FASTA header (after the
        leading '>'). For each header the RNA target of the matching protein
        is looked up, and the rows [gene, protein, rna target] are written
        with FileWriter.write_table. That table is then read back and, for
        each distinct RNA target, the proteins having it are written to a
        dedicated file.

        Sets self.path_home, self.file_seq_natrevgenetics,
        self.natrevgenetics_info, self.header and self.file_all_rna_target.

        Raises:
            ValueError: if a protein of the FASTA file is not in column 1 of
                the info table (list.index fails).
        """
        self.path_home = Constants.PATH_HOME
        self.file_seq_natrevgenetics = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.PUTATIVERNA_NATREVGENETICS_SEQ_PROPERTY,
                       True)
        self.natrevgenetics_info = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.PUTATIVE_RNA_NATREVGENETICS_INFO_PROPERTY,
                       True)

        info_table = FileParser.make_table(self.natrevgenetics_info,
                                           '\t',
                                           skip=1)

        prot_info = TableWrapper.get_column(info_table, 1)
        putative_rna = TableWrapper.get_column(info_table, 3)

        self.header = InfoFasta.get_header(self.file_seq_natrevgenetics,
                                           text=False)

        gene_seq = [item.split('>')[1].split('|')[0] for item in self.header]
        prot_seq = [item.split('>')[1].split('|')[2] for item in self.header]

        # Creation of Table containing gene id, prot id and rna target

        putative_rna_target = []
        type_rna_target = []
        for n, prot in enumerate(prot_seq):
            Logger.get_instance().info(prot)
            index_prot = prot_info.index(prot)
            rna_target = putative_rna[index_prot]
            row = [gene_seq[n], prot_seq[n], rna_target]
            type_rna_target.append(rna_target)
            putative_rna_target.append(row)
            Logger.get_instance().info(" The putative rna target is " +
                                       rna_target)

        self.file_all_rna_target = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.PUTATIVE_ALL_RNA_TARGET_PROPERTY, True)

        FileWriter.write_table(self.file_all_rna_target,
                               putative_rna_target,
                               symbol='\t')

        # set of RNA target type in order to create different list
        unique_rna_target = set(type_rna_target)

        info_new_table = FileParser.make_table(self.file_all_rna_target, '\t')

        # Columns extraction
        prot_name = TableWrapper.get_column(info_new_table, 1)
        type_rnatarget = TableWrapper.get_column(info_new_table, 2)

        file_output = self.path_home + PropertyManager.get_instance(
        ).get_property(DataConstants.PUTATIVE_RNA_OUTPUT_PROPERTY, True)

        # this for loop creates a protein file for each RNA target type
        for item in unique_rna_target:
            file_name = file_output + item + PropertyManager.get_instance(
            ).get_property(
                DataConstants.PUTATIVE_RNA_TARGET_DATASET_NAME_PROPERTY, True)
            file_rna = FileUtils.open_text_a(file_name)
            try:
                for n, type_rna in enumerate(type_rnatarget):
                    if type_rna == item:
                        # NOTE(review): no separator is written between the
                        # protein names - confirm whether the column values
                        # already end with a newline
                        file_rna.write(prot_name[n])
            finally:
                # The handle is released even if a write or a lookup fails
                file_rna.close()