예제 #1
0
    def make_dictionary(self):
        """Build the identifier dictionary from a FASTA file and pickle it.

        The input file and the output location are looked up through
        ``PropertyManager`` using the ``DataConstants.DOWNLOAD_DICTIONARY_*``
        property names and are resolved relative to ``Constants.PATH_HOME``.
        The dictionary is produced by ``InfoFasta.make_dictionary`` (per the
        log message its layout is ``{gene: [isoform1, ..., isoformN]}``) and
        serialized with ``pickle.dump``.

        Side effects: sets ``self.path_home``, ``self.path_input_file``,
        ``self.dictionary_output_path``, ``self.output_file_path`` and
        ``self.dict_file``. The output handle is closed before returning so
        the pickled data is flushed to disk.
        """
        # Message text is kept byte-identical to the original literal.
        Logger.get_instance().info(
            " Creation of a dictionary for novel gene of dataset 2"
            "The dictionary structure is : \n "
            "{gene = [ isoform1, isoform2,...isoformN]}")

        # Resolve input and output locations from the configured properties.
        self.path_home = Constants.PATH_HOME
        self.path_input_file = (
            self.path_home + PropertyManager.get_instance().get_property(
                DataConstants.DOWNLOAD_DICTIONARY_INPUT_FILE_PROPERTY, True))
        self.dictionary_output_path = (
            self.path_home + PropertyManager.get_instance().get_property(
                DataConstants.DOWNLOAD_DICTIONARY_OUTPUT_PATH_PROPERTY, True))
        self.output_file_path = (
            self.dictionary_output_path +
            PropertyManager.get_instance().get_property(
                DataConstants.DOWNLOAD_DICTIONARY_FILE_OUTPUT_PROPERTY, True))

        dict_identifier = InfoFasta.make_dictionary(self.path_input_file)

        # NOTE(review): the handle comes from open_text_w; pickle on Python 3
        # needs a binary-mode file -- confirm the helper's mode before porting.
        self.dict_file = FileUtils.open_text_w(self.output_file_path)
        try:
            pickle.dump(dict_identifier, self.dict_file)
        finally:
            # Always release the handle, even if serialization fails.
            self.dict_file.close()

        Logger.get_instance().info(
            " The creation of a dictionary is completed \n\n")
예제 #2
0
    def dictionary_identifier(self):
        """Build the identifier dictionary for dataset 2 and pickle it.

        The input sequence file is located through the
        ``ENSEMBL_OUTPUT_PATH_SEQUENCE_PROPERTY`` and
        ``ENSEMBL_FILE_SEQUENCES_2_PROPERTY`` properties, the output file
        through ``DICTIONARY_PATH_OUTPUT_PROPERTY`` and
        ``DICTIONARY_NAME_FILE_PROPERTY``; both are resolved relative to
        ``Constants.PATH_HOME``. The dictionary is produced by
        ``InfoFasta.make_dictionary`` (per the log message its layout is
        ``{gene: [isoform1, ..., isoformN]}``) and written with
        ``pickle.dump``.

        Side effects: sets ``self.ensembl_path_output``,
        ``self.ensembl_output_dataset2``, ``self.dictionary_output`` and
        ``self.dictionary_namefile``. The output handle is closed before
        returning so the pickled data is flushed to disk.
        """
        # Message text is kept byte-identical to the original literal.
        Logger.get_instance().info(
            " Creation of a dictionary for novel gene of dataset 2"
            "The dictionary structure is : \n "
            "{gene = [ isoform1, isoform2,...isoformN]}")

        # Input: sequence file of dataset 2.
        self.ensembl_path_output = (
            Constants.PATH_HOME + PropertyManager.get_instance().get_property(
                DataConstants.ENSEMBL_OUTPUT_PATH_SEQUENCE_PROPERTY, True))
        self.ensembl_output_dataset2 = (
            self.ensembl_path_output +
            PropertyManager.get_instance().get_property(
                DataConstants.ENSEMBL_FILE_SEQUENCES_2_PROPERTY, True))

        # Output: pickle file holding the dictionary.
        self.dictionary_output = (
            Constants.PATH_HOME + PropertyManager.get_instance().get_property(
                DataConstants.DICTIONARY_PATH_OUTPUT_PROPERTY, True))
        self.dictionary_namefile = (
            self.dictionary_output +
            PropertyManager.get_instance().get_property(
                DataConstants.DICTIONARY_NAME_FILE_PROPERTY, True))

        dict_identifier = InfoFasta.make_dictionary(
            self.ensembl_output_dataset2)

        # NOTE(review): the handle comes from open_text_w; pickle on Python 3
        # needs a binary-mode file -- confirm the helper's mode before porting.
        file_dict = FileUtils.open_text_w(self.dictionary_namefile)
        try:
            pickle.dump(dict_identifier, file_dict)
        finally:
            # Always release the handle, even if serialization fails.
            file_dict.close()

        Logger.get_instance().info(
            " The creation of a dictionary for novel gene in dataset 2 is completed \n\n"
        )
예제 #3
0
 def iupred_analysis(self, fastafile, prot):
     """Run IUPred in "long" mode on a FASTA file and save the prediction.

     The ``iupred`` executable under ``self.iupred_path`` is invoked on
     ``fastafile``. Everything from the 'Prediction output' marker onwards
     is kept: the first two lines are copied verbatim, and on every
     following line runs of spaces are collapsed into single tabs. The
     result is written to ``self.path_output + 'IUPred_' + prot + '.txt'``.

     Side effects: sets ``self.fastafile`` and ``self.prot``.

     Raises:
         subprocess.CalledProcessError: if ``iupred`` exits non-zero.
         ValueError: if the output lacks the 'Prediction output' marker.
             No output file is created in that case.
     """
     self.fastafile = fastafile
     self.prot = prot

     # Assumes check_output returns text (str), as it does on Python 2.
     iupred_out = subp.check_output(
         [self.iupred_path + "iupred", self.fastafile, "long"])

     # Drop everything that precedes the prediction section.
     prediction = iupred_out[iupred_out.index('Prediction output'):]
     lines = prediction.split('\n')

     # The two leading lines stay untouched; data rows become
     # tab-separated. Splitting on ' ' (not on any whitespace) and dropping
     # the empty pieces collapses runs of spaces only.
     reformatted = lines[:2]
     reformatted.extend(
         '\t'.join([item for item in line.split(' ') if item != ''])
         for line in lines[2:])
     final_out = '\n'.join(reformatted)

     # Open the file only once the content is ready, so a parsing failure
     # does not leave an empty file or an unclosed handle behind.
     pred_file = FileUtils.open_text_w(
         self.path_output + 'IUPred_' + self.prot + '.txt')
     try:
         pred_file.write(final_out)
     finally:
         pred_file.close()
예제 #4
0
    def split_seq(file_sequences_path, path_output, start_header, end_header):
        """Split a multi-sequence FASTA file into one FASTA file per entry.

        The header lines are collected with ``grep '>'``. For each header,
        the protein name is the slice ``header[start_header:end_header]``,
        its sequence is fetched with ``InfoFasta.get_seq``, and the record
        built by ``SeqToFasta.give_fasta`` is written to
        ``path_output + <name> + '.fasta'``.

        Args:
            file_sequences_path: path of the input FASTA file.
            path_output: prefix (directory path) for the output files.
            start_header: start index of the protein name within a header.
            end_header: end index of the protein name within a header.

        Raises:
            subprocess.CalledProcessError: if ``grep`` exits non-zero, which
                includes the case where no header line is found.
        """
        # Assumes check_output returns text (str), as it does on Python 2.
        header_dataset = subp.check_output(['grep', '>', file_sequences_path])

        # grep terminates its output with a newline, so a bare split('\n')
        # ends with an empty string. Skip blanks, otherwise a bogus '.fasta'
        # file would be written for an empty protein name.
        headers = [line for line in header_dataset.split('\n') if line]

        # Read the whole FASTA once and release the handle straight away.
        file_seq = FileUtils.open_text_r(file_sequences_path)
        try:
            seq_obj = file_seq.readlines()
        finally:
            file_seq.close()

        for i, term in enumerate(headers):
            prot = term[start_header:end_header]
            Logger.get_instance().info(str(i + 1) + ' ' + prot)

            # Extract the sequence and format it as a FASTA record.
            prot_seq = InfoFasta.get_seq(seq_obj, prot)
            fasta_seq = SeqToFasta.give_fasta(term, prot_seq)

            # Close every output file inside the loop; closing after the
            # loop would release only the last handle.
            file_out = FileUtils.open_text_w(path_output + prot + '.fasta')
            try:
                file_out.write(fasta_seq)
            finally:
                file_out.close()
예제 #5
0
    def anchor_analysis(self, fastafile, motifslist, prot):
        """Run ANCHOR on a FASTA file and write each output section to a file.

        ``anchor.pl`` (under ``self.anchor_path``) is run through ``perl``
        with ``-m motifslist``. Its text output is cut into sections by
        their titles and written under ``self.path_output``:

        * ``PBR_<prot>.txt``     - 'Predicted binding regions' section,
          skipped (and logged) when the section contains 'None';
        * ``FMotifs_<prot>.txt`` - 'Motifs' section, skipped (and logged)
          when the section contains 'None';
        * ``Pred_<prot>.txt``    - 'Prediction profile output' section,
          always written.

        Args:
            fastafile: path of the FASTA file passed to ANCHOR.
            motifslist: value passed to ANCHOR's ``-m`` option.
            prot: protein name used in log messages and output file names.

        Raises:
            subprocess.CalledProcessError: if the ANCHOR command exits
                non-zero.
            ValueError: if a mandatory section title is missing from the
                output.
        """

        def tagged_lines(section):
            # Keep the lines starting with '#' + TAB, drop that two-character
            # prefix and remove every space.
            return [line[2:].replace(' ', '')
                    for line in section.split('\n') if line[0:2] == '#\t']

        def write_output(prefix, text):
            # Write `text` to <path_output><prefix><prot>.txt and always
            # release the handle.
            out_file = FileUtils.open_text_w(
                self.path_output + prefix + prot + '.txt')
            try:
                out_file.write(text)
            finally:
                out_file.close()

        # anchor_out holds the ANCHOR output in text format.
        # Assumes check_output returns text (str), as it does on Python 2.
        anchor_out = subp.check_output([
            "perl", self.anchor_path + 'anchor.pl', fastafile, "-m", motifslist
        ])

        # Section offsets, used below to slice the output into sections.
        index_bind_reg = anchor_out.index('Predicted binding regions')
        index_motifs = anchor_out.index('Motifs')
        index_pred_profile = anchor_out.index('Prediction profile output')

        # The 'Filtered regions' section is optional; without it the binding
        # regions section runs up to the 'Motifs' section.
        if 'Filtered regions' in anchor_out:
            index_filt_reg = anchor_out.index('Filtered regions')
        else:
            index_filt_reg = index_motifs

        # Predicted binding regions file (PBR_<prot>.txt).
        # ANCHOR writes "None" when the protein has no such region.
        pbr_out = anchor_out[index_bind_reg:index_filt_reg]
        if 'None' in pbr_out:
            Logger.get_instance().info(
                "This protein doesn't contain predicted binding region  (" +
                prot + ')')
        else:
            write_output('PBR_', '\n'.join(tagged_lines(pbr_out)))

        # Found motifs file (FMotifs_<prot>.txt).
        # The decision depends on the motifs section itself; testing the
        # binding-regions section here would drop motifs for every protein
        # that has no predicted binding region.
        fmotifs_out = anchor_out[index_motifs:index_pred_profile]
        if 'None' in fmotifs_out:
            Logger.get_instance().info(
                "This protein doesn't contain any motifs (" + prot + ')')
        else:
            write_output('FMotifs_', '\n'.join(tagged_lines(fmotifs_out)))

        # Prediction profile file (Pred_<prot>.txt).
        # The column legend is removed, then every space is stripped.
        # Removing spaces from the whole text equals doing it line by line.
        legend = ('#   Columns:\n'
                  '#   1 - Amino acid number\n'
                  '#   2 - One letter code\n'
                  '#   3 - ANCHOR probability value\n'
                  '#   4 - ANCHOR output\n'
                  '#')
        pred_out = anchor_out[index_pred_profile:].replace(legend, '')
        write_output('Pred_', pred_out.replace(' ', ''))