def _check_presence_of_digits(self, fasta_string, fasta_path=''):
    """
    Check a FASTA string for digits and strip them if the user continues.

    When at least one digit is found a warning (WET 22) is logged and
    ``continue_abort()`` is called on it, which is expected to let the
    user choose between aborting and continuing. If execution continues,
    every digit is removed from the string.

    Parameters:
        - fasta_string (str): the fasta string
        - fasta_path (opt, str): path to fasta file, used only in the
            warning message; falls back to self.fasta_path when empty.

    Returns:
        The FASTA string without digits (unchanged if it had none).
    """
    self.logger.debug('Entered check digits')

    # Guard clause: nothing to clean, hand the input back untouched.
    if not any(c.isdigit() for c in fasta_string):
        self.logger.debug('no digits found')
        return fasta_string

    # Prefer the path explicitly given by the caller; the parameter was
    # previously accepted but ignored.
    reported_path = fasta_path or self.fasta_path
    msg = (
        'We found digits in your FASTA string coming from file {}. '
        'Be aware of mistakes resulting from wrong FASTA file. '
        'You may wish to abort and correct the file. '
        'If you choose continue, Farseer-NMR will parse out the digits.'
    ).format(reported_path)

    wet22 = fsw(msg_title='WARNING', msg=msg, wet_num=22)
    self.logger.info(wet22.wet)
    wet22.continue_abort()

    fasta_string_no_digit = ''.join(
        c for c in fasta_string if not c.isdigit()
    )
    self.logger.debug('Digits parsed out')
    return fasta_string_no_digit
def __init__(self, fasta_file_path, fasta_start_num):
    """
    Set up logging and store the FASTA file path and start number.

    Parameters:
        - fasta_file_path (str): path to the FASTA file. If it is empty,
            None, or does not exist on disk, an error message (WET 37)
            is logged and self.fasta_path is set to None.
        - fasta_start_num (int): residue number of the first residue in
            the FASTA sequence.

    Attributes initialised here:
        - self.logger
        - self.fasta_path
        - self.fasta_start_num
        - self.fasta_string (None until a FASTA is read)
        - self.fasta_df (None until the dataframe is generated)
    """
    # activates logging
    self.logger = Logger.FarseerLogger(__name__).setup_log()
    self.logger.debug('FastaHandler initiated')

    # FASTA file path
    # The truthiness test keeps None/'' away from os.path.exists, which
    # raises TypeError on None.
    if fasta_file_path and os.path.exists(fasta_file_path):
        self.fasta_path = fasta_file_path
        self.logger.debug("FASTA file path OK: {}".format(self.fasta_path))
    else:
        msg = (
            "The path provided for the FASTA file does not exist. "
            "None assigned"
        )
        wet37 = fsw(msg_title='ERROR', msg=msg, wet_num=37)
        self.logger.info(wet37.wet)
        self.fasta_path = None

    self.fasta_start_num = fasta_start_num
    self.logger.debug(
        'FASTA start number: {}'.format(self.fasta_start_num)
    )

    self.fasta_string = None
    self.fasta_df = None
def reads_fasta_to_dataframe(
        self,
        fasta_string='',
        atom1='H',
        atom2='N',
        details='None',
        reads_from_file=False,
        fasta_path=''):
    """
    Reads a FASTA string or a FASTA file to a structured pd.DataFrame.

    The FASTA string is resolved in this order of priority:
        1. the <fasta_string> argument, when given (if <reads_from_file>
           is also True a note is logged and the file is ignored);
        2. the file at <fasta_path>, when <reads_from_file> is True;
        3. the previously stored self.fasta_string.
    If none of these yields a non-empty string a warning is logged and
    the method returns without touching self.fasta_df.

    Parameters:
        - fasta_string (opt, str): the FASTA string, if not provided,
            reads from self.fasta_string
        - atom1 (opt, str): the atom type for assign F1 column
            (def: 'H').
        - atom2 (opt, str): the atom type for assign F2 column
            (def: 'N').
        - details (opt, str): fill Details column (def: 'None').
        - reads_from_file (opt, bool): if should read from file instead
            of from <fasta_string>.
        - fasta_path (opt, str): path to fasta file, used when
            reads_from_file is True; defaults to self.fasta_path.

    Generates a 6 column DataFrame (ResNo, 3-letter, 1-letter,
    Assign F1, Assign F2, Details) with the information ready to be
    incorporated in the peaklists dataframes of FarseerSeries object.

    Assigns self.fasta_df

    Returns:
        None
    """
    # assigns and validates arguments ###
    fasta_path = fasta_path or self.fasta_path

    if fasta_string:
        # An explicit string always wins. Previously a string passed
        # without reads_from_file fell through to the error branch.
        if reads_from_file:
            msg = (
                "A FASTA string was passed to the fasta_string parameter "
                "and reads_from_file is True. This is inconsistent because "
                "if a FASTA string was given there is no need to read from "
                "a file. <reads_from_file> will not be considered and the "
                "FASTA string will be used."
            )
            wet = fsw(msg_title='NOTE', msg=msg, wet_num=37)
            self.logger.info(wet.wet)

    elif reads_from_file:
        self.reads_fasta_from_file(fasta_path)
        fasta_string = self.fasta_string

    else:
        fasta_string = self.fasta_string

    if not fasta_string:
        # Covers: nothing passed and nothing stored, or the file read
        # left self.fasta_string empty/None.
        msg = (
            "It was not possible to assign a FASTA string. Either it "
            "was not passed as argument (<fasta_string>) or is not "
            "defined as an attribute. Please ensure a <fasta_string> is "
            "passed or .reads_fasta_from_file() method is executed "
            "previously."
        )
        wet = fsw(msg_title="WARNING", msg=msg, wet_num=37)
        self.logger.warning(wet.wet)
        return None

    self.logger.debug(fasta_string)
    # arguments assigned and validated ###

    # Generates FASTA reference dataframe
    first_res = self.fasta_start_num
    dd = {}

    # ResNo is kept as str() to allow reindexing
    # later on the finds_missing function.
    dd["ResNo"] = [
        str(i) for i in range(first_res, first_res + len(fasta_string))
    ]
    dd["1-letter"] = list(fasta_string)
    dd["3-letter"] = [aal1tol3[i] for i in fasta_string]

    # Assign F1 is generated here because it will serve in future functions.
    res_labels = [
        num + name for num, name in zip(dd["ResNo"], dd["3-letter"])
    ]
    dd["Assign F1"] = [label + atom1 for label in res_labels]
    dd["Assign F2"] = [label + atom2 for label in res_labels]

    # Details set to 'None' as it is by default in CCPNMRv2 peaklists
    dd['Details'] = [details] * len(fasta_string)

    self.fasta_df = pd.DataFrame(
        dd,
        columns=[
            'ResNo',
            '3-letter',
            '1-letter',
            'Assign F1',
            'Assign F2',
            'Details',
        ]
    )

    logs = ' * {}-{}-{}'.format(
        self.fasta_start_num,
        fasta_string,
        dd['ResNo'][-1]
    )
    self.logger.info(logs)

    return None
def parse_ansig_peaklist(peaklist_file):
    """Parse a 2D peaklist in ANSIG format.

    From ANSIG Manual:
    For 2D crosspeaks files the record has the format:
    FORMAT (3E13.6, A12, 7I6, 6A4)

    The values for each crosspeak are given in the following order:
        E13.6   Coordinates (F1, F2, ...)
        E13.6   Intensity
        A12     Spectrum name
        I6      Symmetry connection
        2I6     F1 connections (prev, next)
        2I6     F2 connections (prev, next)
        ...     (further Fdim connections)
        2I6     Corresponding connections
        A4      Sequence assignments; F1, F2, ...
        A4      Residue assignments; F1, F2, ...
        A4      Nucleus assignments; F1, F2, ...

    Example (column alignment not reproduced here):
        ANSIG v3.3 export crosspeaks file
        190 2
        1.307676E+02 8.772405E+00 8.272293E+05Trosy_highCo 0 0 0 0 0 0 023 23 Leu Leu N HN

    The second line of the file must end with the dimension count '2'.
    Only crosspeaks whose last two columns are 'N' and 'HN' are kept;
    each is stored with positions [F2, F1] and atoms ['H', 'N'].

    Parameters:
        - peaklist_file (str): path to the ANSIG crosspeaks file.

    Returns:
        A list of Peak objects, or None when the file is not a 2D
        peaklist (including files too short to carry the header).
    """
    with open(peaklist_file, 'r') as fin:
        lines = fin.readlines()

    # FarSeer-NMR only supports 2D peaklists. The dimension count is the
    # last field of the second header line; a missing or blank header
    # line is treated the same as a wrong dimension count.
    header = lines[1].split() if len(lines) > 1 else []
    if not header or header[-1] != '2':
        print("Peak list is not from a 2D spectrum")
        return None

    peakList = []
    counter = 1  # running peak number, counts accepted peaks only

    # start=3: data begins on the third line of the file (1-indexed).
    for line_number, line in enumerate(lines[2:], start=3):
        stripped = line.strip()

        if stripped.startswith(('!', 'ANSIG')):
            continue

        ls = stripped.split()

        if len(ls) < 15:
            if len(line.split('-')) > 1:
                # Short record that contains a '-': reported as
                # unparsable, using the real line number in the file.
                msg = "Line {} of peaklist {} can't be parsed".format(
                    line_number,
                    peaklist_file
                )
                wet31 = fsw(msg_title='ERROR', msg=msg, wet_num=31)
                print(wet31.wet)
                wet31.abort(m="Aborting...")
            continue

        # Only N/HN crosspeaks are supported.
        if not (ls[-2] == 'N' and ls[-1] == 'HN'):
            continue

        # The intensity field is fused with the spectrum name (no
        # separator in the fixed-width format): strip the trailing name.
        height = ls[2].rstrip(string.ascii_letters + string.punctuation)

        peak = Peak(
            peak_number=counter,
            positions=[ls[1], ls[0]],
            volume=height,
            height=height,
            residue_number=ls[10],
            residue_type=ls[11],
            linewidths=[0, 0],
            atoms=['H', 'N'],
            format_="ansig"
        )
        peakList.append(peak)
        counter += 1

    return peakList
def parse_user_peaklist_1(peaklist_file):
    """
    Parses a user defined CARA-derived peaklist.

    File extension: *.prot

    Peaklist format:
        1 10.494 0.000 H 238
        2 130.175 0.000 N 238
        4 9.965 0.000 H 216
        5 125.165 0.000 N 216

    Every non-blank line must have exactly five fields that satisfy, in
    order: str.isdigit, eval_str_to_float, eval_str_to_float,
    str.isalpha, str.isdigit. Otherwise an error (WET 29) is printed and
    abort() is called on it.

    In the current version, only H and N atoms are considered. The first
    H/N line of a residue opens a new entry; a following H/N line with
    the same residue number completes it and a Peak is created.

    Parameters:
        - peaklist_file (str): path to the *.prot file.

    Returns:
        a list of Peak objects.
    """
    eval_elements = [
        str.isdigit,
        eval_str_to_float,
        eval_str_to_float,
        str.isalpha,
        str.isdigit
    ]

    peakList = []
    current_residue = None
    count_residue = 0

    # Context manager: the handle is released even if abort() exits or
    # raises in the middle of the loop.
    with open(peaklist_file, 'r') as fin:
        # NOTE(review): the line index reported on error is 0-based, as
        # in the previous implementation.
        for counter, line in enumerate(fin):
            ls = line.strip().split()

            if not ls:
                continue

            well_formed = (
                len(ls) == 5
                and all(f(e) for e, f in zip(ls, eval_elements))
            )
            if not well_formed:
                msg = (
                    "The peaklist {} contains a wrong line format in "
                    "line {}."
                ).format(peaklist_file, counter)
                wet29 = fsw(msg_title='ERROR', msg=msg, wet_num=29)
                print(wet29.wet)
                wet29.abort()

            if ls[3] not in ('N', 'H'):
                continue

            if ls[-1] != current_residue:
                # First atom of a new residue: start collecting.
                current_residue = ls[-1]
                position = [ls[1]]
                atom = [ls[3]]
                count_residue += 1

            else:
                # Further atom of the same residue: the peak now has
                # its positions and can be stored.
                # NOTE(review): Peak creation is read as part of this
                # branch; confirm against the upstream layout.
                position.append(ls[1])
                atom.append(ls[3])

                peak = Peak(
                    peak_number=count_residue,
                    positions=position,
                    residue_number=current_residue,
                    residue_type=None,
                    atoms=atom,
                    linewidths=[0, 0],
                    volume=0,
                    height=0,
                    format_='user_pkl_1'
                )
                peakList.append(peak)

    return peakList
def get_peaklist_format(file_path):
    """
    Identify the peaklist format of a file from its suffix and content.

    The suffix is validated first, before the file is opened. The file
    is then scanned line by line and the first line matching one of the
    known signatures decides the format.

    Parameters:
        - file_path (str): path to the candidate peaklist file.

    Returns:
        str: one of "SPARKY", "ANSIG", "NMRDRAW", "NMRVIEW", "CCPNMRV2",
        "USER_PKL_1", "USER_PKL_2", "USER_PKL_3", "USER_PKL_4",
        "USER_PKL_5"; "Not accepted suffix" when the path has no suffix
        or the suffix is not in file_extensions; "Bad peaklist format"
        when no line matches any signature.
    """
    # Suffix checks come before open() so that rejected paths neither
    # need to exist nor leave an open file handle behind.
    if len(file_path.split('.')) < 2:
        print('Invalid File Extension')
        return "Not accepted suffix"

    file_ext = file_path.split('.')[-1]

    if file_ext not in file_extensions:
        msg = (
            "*** The following file was not recognised as a valid "
            "peaklist\n"
            "*** {}\n"
            "*** suffix not in accepted formats. Accepted formats are:\n"
            "*** *.peaks *.xpk *.out and *.csv (CCPNMR2)\n"
            "*** visit folder Documentation/Accepted_Peaklists_Formats "
            "for more information.\n"
            "*** If this file is not a peaklists, simply IGNORE this "
            "message.\n"
        ).format(file_path)
        print(msg)
        return "Not accepted suffix"

    with open(file_path, 'r') as fin:
        for line in fin:
            stripped = line.strip()
            if not stripped:
                continue

            ls = stripped.split()
            csv_fields = set(stripped.split(','))

            # NOTE(review): "and" binds tighter than "or", so a line
            # starting with "<sparky save file>" matches for any accepted
            # suffix. Precedence is made explicit, behaviour is unchanged.
            if (file_ext == 'peaks'
                    and line.lstrip().startswith("Assignment")
                    and "w1" in line) \
                    or line.startswith("<sparky save file>"):
                return "SPARKY"

            if file_ext == 'peaks' \
                    and line.lstrip().startswith("ANSIG") \
                    and "crosspeak" in line:
                return "ANSIG"

            if file_ext == 'peaks' \
                    and line.startswith("DATA") and "X_AXIS" in line:
                return "NMRDRAW"

            # len(ls) > 1 protects ls[1] on single-token lines.
            if file_ext == 'xpk' \
                    and len(ls) > 1 \
                    and ls[0].isdigit() and ls[1].startswith('{'):
                return "NMRVIEW"

            # because columns in ccpnmr peaklists may be swapped
            if file_ext == 'csv' and csv_fields == ccpnmr_headers:
                return "CCPNMRV2"

            if file_path.endswith('.prot') \
                    and ls[0].isdigit() \
                    and ls[-1].isdigit() \
                    and len(ls) == 5 \
                    and all(
                        f(e) for e, f in zip(ls, eval_elements_usr_pkl_1)
                    ):
                return "USER_PKL_1"

            if file_path.endswith('.str') \
                    and stripped in ('loop_', '_Atom_shift_assign_ID'):
                return "USER_PKL_2"

            if file_ext == 'csv' \
                    and set(stripped.rstrip(',').split(',')) \
                    == user3_headers:
                return "USER_PKL_3"

            # Compared against the raw line (line ending included).
            if file_ext == 'list' and line == user4_header:
                return "USER_PKL_4"

            if file_ext == 'csv' and csv_fields.issubset(ccpnmr_headers):
                return "USER_PKL_5"

            # INSERT YOUR VALIDATION CODE HERE
            # SO THAT YOU PEAKLIST FORMAT IS RECOGNIZED
            # if ****:
            #     return "YOUR_FORMAT"

    # No line matched any known signature.
    msg = (
        "We could not read peaklist file: {}.\n"
        "Mostly likely due to a bad peaklist formatting syntax.\n"
    ).format(file_path)
    print(fsw(msg_title="ERROR", msg=msg, wet_num=30).wet)
    return "Bad peaklist format"