示例#1
0
    def _check_presence_of_digits(self, fasta_string, fasta_path=''):
        """
        Checks the presence of digits in the FASTA string.
        
        If found user is prompt if to abort or parse them out.
        
        Parameters:
            - fasta_string (str): the fasta string
            - fasta_path (opt, str): path to fasta file, serves log purposes
        
        Returns:
            FASTA without digits (user choice)
        """
        self.logger.debug('Entered check digits')

        if ''.join(c for c in fasta_string if c.isdigit()):
            msg = \
'We found digits in your FASTA string coming from file {}. Be aware of \
mistakes resulting from wrong FASTA file. You may wish to abort \
and correct the file. \
If you choose continue, Farseer-NMR will parse out the digits.'                                                               .\
                format(self.fasta_path)

            wet22 = fsw(msg_title='WARNING', msg=msg, wet_num=22)
            self.logger.info(wet22.wet)
            wet22.continue_abort()
            fasta_string_no_digit = \
                ''.join(c for c in fasta_string if not c.isdigit())

            self.logger.debug('Digits parsed out')
            return fasta_string_no_digit

        else:
            self.logger.debug('no digits found')
            return fasta_string
示例#2
0
    def __init__(self, fasta_file_path, fasta_start_num):
        """
        Sets up logging and stores the FASTA file path and start number.

        Parameters:
            - fasta_file_path (str): path to the FASTA file. If the path
                does not exist an error message is logged and
                self.fasta_path is set to None.
            - fasta_start_num (int): number assigned to the first residue
                of the FASTA sequence.

        Initiates self.fasta_string and self.fasta_df to None.
        """

        # activates logging
        self.logger = Logger.FarseerLogger(__name__).setup_log()
        self.logger.debug('FastaHandler initiated')

        # FASTA file path
        if os.path.exists(fasta_file_path):
            self.fasta_path = fasta_file_path
            self.logger.debug("FASTA file path OK: {}".format(self.fasta_path))
        else:
            # The two sentences were previously joined without a space
            # ("...does not exist.None assigned").
            msg = (
                "The path provided for the FASTA file does not exist. "
                "None assigned"
            )

            wet37 = fsw(msg_title='ERROR', msg=msg, wet_num=37)
            self.logger.info(wet37.wet)
            self.fasta_path = None

        self.fasta_start_num = fasta_start_num
        self.logger.debug('FASTA start number: {}'.format(
            self.fasta_start_num))

        # populated later by the reading methods
        self.fasta_string = None
        self.fasta_df = None
示例#3
0
    def reads_fasta_to_dataframe(self,
                                 fasta_string='',
                                 atom1='H',
                                 atom2='N',
                                 details='None',
                                 reads_from_file=False,
                                 fasta_path=''):
        """
        Reads a FASTA string or a FASTA file to a structured pd.DataFrame.

        Parameters:
            - fasta_string (opt, str): the FASTA string, if not provided,
                reads from self.fasta_string
            - atom1 (opt, str): the atom type for assign F1 column (def: 'H').
            - atom2 (opt, str): the atom type for assign F2 column (def: 'N').
            - details (opt,str): fill Details column (def: 'None').
            - reads_from_file (opt, bool): if should read from file instead
                of from <fasta_string>.
            - fasta_path (opt, str): path to fasta file, used when
                reads_from_file is True; defaults to self.fasta_path.

        The FASTA string is chosen with the following priority:
            1. <fasta_string>, when given (<reads_from_file> is then
                ignored and a note is logged if it was True);
            2. the file at <fasta_path>, when <reads_from_file> is True;
            3. self.fasta_string, when already defined.
        If none applies a warning is logged and nothing is assigned.

        Generates a 6 column DataFrame (ResNo, 3-letter, 1-letter,
        Assign F1, Assign F2, Details) with the information ready to be
        incorporated in the peaklists dataframes of FarseerSeries object.

        Assigns self.fasta_df

        Returns:
            None
        """

        # assigns and validates arguments ###
        fasta_path = fasta_path or self.fasta_path

        if fasta_string:
            # An explicitly passed string always wins. Previously a string
            # passed with reads_from_file=False fell through to the error
            # branch and no dataframe was generated.
            if reads_from_file:
                msg = (
                    "A FASTA string was passed to the fasta_string "
                    "parameter and reads_from_file is True. "
                    "This is inconsistent because if a FASTA string was "
                    "given there is no need to read from a file. "
                    "<reads_from_file> will not be considered and "
                    "the FASTA string will be used."
                )

                wet = fsw(msg_title='NOTE', msg=msg, wet_num=37)
                self.logger.info(wet.wet)

        elif reads_from_file:
            self.reads_fasta_from_file(fasta_path)
            fasta_string = self.fasta_string

        elif self.fasta_string:
            fasta_string = self.fasta_string

        else:
            msg = (
                "It was not possible to assign a FASTA string. Either it "
                "was not passed as argument (<fasta_string>) or is not "
                "defined as an attribute. "
                "Please ensure a <fasta_string> is passed or "
                ".reads_fasta_from_file() method is executed previously."
            )

            wet = fsw(msg_title="WARNING", msg=msg, wet_num=37)
            self.logger.warning(wet.wet)
            return None

        self.logger.debug(fasta_string)

        # arguments assigned and validated ###

        # Generates FASTA reference dataframe
        first_res = self.fasta_start_num
        last_res = first_res + len(fasta_string)

        dd = {}
        # ResNo is kept as str() to allow reindexing
        # later on the finds_missing function.
        dd["ResNo"] = [str(i) for i in range(first_res, last_res)]
        dd["1-letter"] = list(fasta_string)
        dd["3-letter"] = [aal1tol3[i] for i in fasta_string]
        # Assign F1 is generated here because it will serve in future functions.
        dd["Assign F1"] = \
            [str(i+j+atom1) for i, j in zip(dd["ResNo"], dd["3-letter"])]
        dd["Assign F2"] = \
            [str(i+j+atom2) for i, j in zip(dd["ResNo"], dd["3-letter"])]
        # Details set to 'None' as it is by default in CCPNMRv2 peaklists
        dd['Details'] = [details] * len(fasta_string)

        self.fasta_df = pd.DataFrame(
            dd,
            columns=[
                'ResNo', '3-letter', '1-letter',
                'Assign F1', 'Assign F2', 'Details',
            ],
        )

        logs = '  * {}-{}-{}'.format(
            self.fasta_start_num, fasta_string, dd['ResNo'][-1])
        self.logger.info(logs)

        return None
示例#4
0
def parse_ansig_peaklist(peaklist_file):
    """Parse a 2D peaklist in ANSIG format.

    From ANSIG Manual:
    For 2D crosspeaks files the record has the format:
        FORMAT (3E13.6, A12, 7I6, 6A4)

        The values for each crosspeak are given in the following order:
        E13.6   Coordinates (F1, F2, ...)
        E13.6   Intensity
        A12     Spectrum name
        I6      Symmetry connection
        2I6     F1 connections (prev, next)
        2I6     F2 connections (prev, next)
            ... (further Fdim connections)
        2I6     Corresponding connections
        A4      Sequence assignments; F1, F2, ...
        A4      Residue assignments; F1, F2, ...
        A4      Nucleus assignments; F1, F2, ...

    Example file:

ANSIG v3.3 export crosspeaks file
   190     2
 1.307676E+02 8.772405E+00 8.272293E+05Trosy_highCo     0     0     0     0     0     0     023  23  Leu Leu N   HN
 1.301636E+02 8.656933E+00 4.936973E+05Trosy_highCo     0     0     0     0     0     0     0183 183 Ala Ala N   HN
 1.298941E+02 8.845919E+00 6.773006E+05Trosy_highCo     0     0     0     0     0     0     0282 282 Ala Ala N   HN

    Only crosspeaks whose nucleus assignments are N (F1) and HN (F2)
    are kept; every other record is skipped.

    Parameters:
        - peaklist_file (str): path to the ANSIG crosspeaks file.

    Returns:
        A list of Peak objects, or None if the second line of the file
        does not declare a 2D spectrum (including files that are too
        short to have such a line).
    """
    with open(peaklist_file, 'r') as fin:
        lines = fin.readlines()

    # The last field of the second line holds the number of dimensions.
    # FarSeer-NMR only supports 2D peaklists.
    header = lines[1].split() if len(lines) > 1 else []
    if not header or header[-1] != '2':
        print("Peak list is not from a 2D spectrum")
        return None

    peakList = []
    counter = 1

    # Records start at the third line of the file; line_number is the
    # 1-based line in the file so that error messages point to the
    # offending line even when previous lines were skipped.
    for line_number, line in enumerate(lines[2:], start=3):
        stripped = line.strip()
        ls = stripped.split()

        if stripped.startswith(('!', 'ANSIG')):
            continue

        if len(ls) < 15 and '-' in line:
            msg = "Line {} of peaklist {} can't be parsed".format(
                line_number, peaklist_file)
            wet31 = fsw(msg_title='ERROR', msg=msg, wet_num=31)
            print(wet31.wet)
            wet31.abort(m="Aborting...")

        elif len(ls) < 15:
            continue

        # only backbone amide N/HN crosspeaks are considered
        if ls[-2] == 'N' and ls[-1] == 'HN':
            atoms = ['H', 'N']
        else:
            continue

        # File order is F1 (N), F2 (HN); positions are stored as [H, N].
        positions = [ls[1], ls[0]]
        # In the whitespace-split record the spectrum name is fused to the
        # intensity (3rd token), and the last connection integer is fused
        # to the first sequence assignment (10th token), hence the indexes.
        residue_number = ls[10]
        residue_type = ls[11]
        height = ls[2].rstrip(string.ascii_letters + string.punctuation)
        volume = height
        linewidths = [0, 0]

        peak = Peak(peak_number=counter,
                    positions=positions,
                    volume=volume,
                    height=height,
                    residue_number=residue_number,
                    residue_type=residue_type,
                    linewidths=linewidths,
                    atoms=atoms,
                    format_="ansig")
        peakList.append(peak)
        counter += 1

    return peakList
def parse_user_peaklist_1(peaklist_file):
    """
    Parses a user defined CARA-derived peaklist.

    File extension: *.prot
    Peaklist format:

        1  10.494 0.000 H     238
        2 130.175 0.000 N     238
        4   9.965 0.000 H     216
        5 125.165 0.000 N     216

    In the current version, only H and N atoms are considered.

    Each non-empty line must have exactly five fields. A malformed line
    triggers an error message (reporting its 1-based line number) and
    an abort. The first H/N line of a residue opens a record and the
    following H/N line with the same residue number completes it,
    generating a Peak.

    Parameters:
        - peaklist_file (str): path to the peaklist file.

    Returns:
        a list of Peak objects.
    """
    peakList = []

    current_residue = None
    count_residue = 0

    # one validator per column, applied in order
    eval_elements = [
        str.isdigit,
        eval_str_to_float,
        eval_str_to_float,
        str.isalpha,
        str.isdigit
        ]

    # The context manager guarantees the file is closed even if the
    # abort below raises or exits.
    with open(peaklist_file, 'r') as fin:
        for line_number, line in enumerate(fin, start=1):
            ls = line.strip().split()

            if not ls:
                continue

            line_is_valid = len(ls) == 5 \
                and all(f(e) for e, f in zip(ls, eval_elements))

            if not line_is_valid:
                msg = \
                    "The peaklist {} contains a wrong line format in line {}."\
                    .format(peaklist_file, line_number)
                wet29 = fsw(msg_title='ERROR', msg=msg, wet_num=29)
                print(wet29.wet)
                wet29.abort()

            if ls[3] not in ('N', 'H'):
                continue

            if ls[-1] != current_residue:
                # first atom of a new residue: start collecting
                current_residue = ls[-1]
                position = [ls[1]]
                atom = [ls[3]]
                count_residue += 1

            else:
                # another atom of the same residue: complete the peak
                position.append(ls[1])
                atom.append(ls[3])

                peak = Peak(
                    peak_number=count_residue,
                    positions=position,
                    residue_number=current_residue,
                    residue_type=None,
                    atoms=atom,
                    linewidths=[0, 0],
                    volume=0,
                    height=0,
                    format_='user_pkl_1'
                    )

                peakList.append(peak)

    return peakList
示例#6
0
def get_peaklist_format(file_path):
    """
    Identifies the format of a peaklist file.

    The file suffix is validated first; the file is then scanned line by
    line and the first line matching a known format signature decides
    the result.

    Parameters:
        - file_path (str): path to the peaklist file.

    Returns:
        (str) one of "SPARKY", "ANSIG", "NMRDRAW", "NMRVIEW", "CCPNMRV2",
        "USER_PKL_1" ... "USER_PKL_5" when a format is recognised;
        "Not accepted suffix" when the path has no suffix or the suffix
        is not in <file_extensions>; "Bad peaklist format" when no line
        of the file matches any known signature.
    """
    # The file is opened before the suffix checks so that a missing file
    # still raises; the context manager closes it on every return path.
    with open(file_path, 'r') as fin:

        if len(file_path.split('.')) < 2:
            print('Invalid File Extension')
            return "Not accepted suffix"

        file_ext = file_path.split('.')[-1]
        if file_ext not in file_extensions:
            msg = (
                "*** The following file was not recognised "
                "as a valid peaklist\n"
                "*** {}\n"
                "*** suffix not in accepted formats. "
                "Accepted formats are:\n"
                "*** *.peaks *.xpk *.out and *.csv (CCPNMR2)\n"
                "*** visit folder Documentation/Accepted_Peaklists_Formats "
                "for more information.\n"
                "*** If this file is not a peaklists, "
                "simply IGNORE this message.\n"
            ).format(file_path)
            print(msg)
            return "Not accepted suffix"

        for line in fin:
            stripped = line.strip()

            if not stripped:
                continue

            ls = stripped.split()
            csv_fields = set(stripped.split(','))

            # NOTE(review): the "<sparky save file>" marker is accepted for
            # any suffix, as in the original operator precedence -- confirm
            # whether it should also require the 'peaks' suffix.
            if (file_ext == 'peaks'
                    and line.lstrip().startswith("Assignment")
                    and "w1" in line) \
                    or line.startswith("<sparky save file>"):
                return "SPARKY"

            if file_ext == 'peaks' \
                    and line.lstrip().startswith("ANSIG") \
                    and "crosspeak" in line:
                return "ANSIG"

            if file_ext == 'peaks' \
                    and line.startswith("DATA") and "X_AXIS" in line:
                return "NMRDRAW"

            # the length guard avoids an IndexError on one-token lines
            if file_ext == 'xpk' \
                    and len(ls) > 1 \
                    and ls[0].isdigit() and ls[1].startswith('{'):
                return "NMRVIEW"

            # because columns in ccpnmr peaklists may be swapped
            if file_ext == 'csv' and csv_fields == ccpnmr_headers:
                return "CCPNMRV2"

            if file_path.endswith('.prot') \
                    and ls[0].isdigit() \
                    and ls[-1].isdigit() \
                    and len(ls) == 5 \
                    and all(f(e) for e, f in
                            zip(ls, eval_elements_usr_pkl_1)):
                return "USER_PKL_1"

            if file_path.endswith('.str') \
                    and stripped in ('loop_', '_Atom_shift_assign_ID'):
                return "USER_PKL_2"

            if file_ext == 'csv' \
                    and set(stripped.rstrip(',').split(',')) == user3_headers:
                return "USER_PKL_3"

            if file_ext == 'list' and line == user4_header:
                return "USER_PKL_4"

            if file_ext == 'csv' and csv_fields.issubset(ccpnmr_headers):
                return "USER_PKL_5"

            # INSERT YOUR VALIDATION CODE HERE
            # SO THAT YOU PEAKLIST FORMAT IS RECOGNIZED
            # if ****:
            #     return "YOUR_FORMAT"

    # no line matched any known format signature
    msg = (
        "We could not read peaklist file: {}.\n"
        "Mostly likely due to a bad peaklist formatting syntax.\n"
    ).format(file_path)
    print(fsw(msg_title="ERROR", msg=msg, wet_num=30).wet)
    return "Bad peaklist format"