def test_batch_fit_delta(self, tmp_path):
    """Batch-fit the states from ``data_states_deltas.yaml`` and check the result.

    The measurement set is built from the YAML state specification, fitted
    with ``fit_gibbs_global_batch`` for 200 epochs, and the fitted ``dG``
    column of every state is compared (``rtol=0.1``) against the stored
    reference in ``ecsecb_delta_batch/fit_result.csv``. The squared-error
    array is then checked for its shape and for the absence of NaN values.

    ``tmp_path`` is accepted but not used by this test.
    """
    # Imported once up front instead of on every loop iteration.
    from pandas.testing import assert_series_equal

    yaml_file = input_dir / 'data_states_deltas.yaml'
    yaml_dict = yaml.safe_load(yaml_file.read_text())
    hdxm_set = yaml_to_hdxmset(yaml_dict, data_dir=input_dir)

    guess_output = csv_to_dataframe(output_dir / 'ecSecB_guess.csv')
    # Guess derived from the first measurement only:
    # broadcast single guess over samples
    gibbs_guess = hdxm_set[0].guess_deltaG(guess_output['rate'])

    fr_global = fit_gibbs_global_batch(hdxm_set, gibbs_guess, epochs=200)
    output = fr_global.output

    check = csv_to_dataframe(output_dir / 'ecsecb_delta_batch' / 'fit_result.csv')
    # The top level of the reference columns enumerates the states to compare.
    for state in check.columns.unique(level=0):
        assert_series_equal(output[state]['dG'], check[state]['dG'], rtol=0.1)

    errors = fr_global.get_squared_errors()
    assert errors.shape == (hdxm_set.Ns, hdxm_set.Np, hdxm_set.Nt)
    assert not np.any(np.isnan(errors))
def test_batch_fit(self, tmp_path):
    """Batch-fit the apo and dimer measurements, with and without alignment.

    First part: fit with ``fit_gibbs_global_batch``, write the fit result to
    ``tmp_path`` and read it back to check that the metadata survives the
    round trip, compare ``dG`` per state against ``ecSecB_batch.csv`` and
    check the shape of the squared-error array.

    Second part: add a mock sequence alignment to the measurement set, fit
    with ``fit_gibbs_global_batch_aligned`` and compare ``dG`` per state
    against ``ecSecB_batch_aligned.csv``.
    """
    # Imported once instead of inside each comparison loop.
    from pandas.testing import assert_series_equal

    states = ['SecB WT apo', 'SecB his dimer apo']

    def assert_dg_matches(output, reference):
        # Compare the fitted dG of every state to the stored reference values.
        for state in states:
            assert_series_equal(output[state]['dG'], reference[state]['dG'], rtol=0.1)

    hdx_set = HDXMeasurementSet([self.hdxm_apo, self.hdxm_dimer])
    guess = csv_to_dataframe(output_dir / 'ecSecB_guess.csv')

    # Create rates dataframe
    rates_df = pd.DataFrame({name: guess['rate'] for name in hdx_set.names})

    gibbs_guess = hdx_set.guess_deltaG(rates_df)
    fr_global = fit_gibbs_global_batch(hdx_set, gibbs_guess, epochs=1000)

    # Round trip through a file: the metadata read back must equal the original.
    fpath = Path(tmp_path) / 'fit_result_batch.csv'
    fr_global.to_file(fpath)
    df = csv_to_dataframe(fpath)
    assert df.attrs['metadata'] == fr_global.metadata

    assert_dg_matches(fr_global.output, csv_to_protein(output_dir / 'ecSecB_batch.csv'))

    errors = fr_global.get_squared_errors()
    assert errors.shape == (hdx_set.Ns, hdx_set.Np, hdx_set.Nt)

    # Only the values are passed on; their order follows the dict's insertion order.
    mock_alignment = {
        'apo': 'MSEQNNTEMTFQIQRIYTKDI------------SFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTVTASLG-------------------EETAFLCEVQQGGIFSIAGIEGTQMAHCLGAYCPNILFPYARECITSMVSRG----TFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA',
        'dimer': 'MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVY--------------EVVLRVTVTASLGEETAFLCEVQQGGIFSIAGIEGTQMAHCLGA----YCPNILFPAARECIASMVARGTFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA-----------------',
    }
    hdx_set.add_alignment(list(mock_alignment.values()))

    gibbs_guess = hdx_set[0].guess_deltaG(guess['rate'])  # Guesses from first measurement
    aligned_result = fit_gibbs_global_batch_aligned(hdx_set, gibbs_guess, r1=2, r2=5, epochs=1000)

    assert_dg_matches(aligned_result.output, csv_to_protein(output_dir / 'ecSecB_batch_aligned.csv'))
def test_batch_fit(self):
    """Batch-fit the apo and dimer series, with and without alignment.

    The set is fitted with ``fit_gibbs_global_batch`` and the ``deltaG``
    column of each state is compared (``rtol=0.1``) against
    ``ecSecB_batch.csv``. A mock sequence alignment is then added, the set
    is fitted with ``fit_gibbs_global_batch_aligned`` and the result is
    compared against ``ecSecB_batch_aligned.csv``.
    """
    # Imported once instead of inside each comparison loop.
    from pandas.testing import assert_series_equal

    states = ['SecB WT apo', 'SecB his dimer apo']

    def assert_deltag_matches(output, reference):
        # Compare the fitted deltaG of every state to the stored reference values.
        for state in states:
            assert_series_equal(output[state]['deltaG'], reference[state]['deltaG'], rtol=0.1)

    hdx_set = HDXMeasurementSet([self.series_apo, self.series_dimer])
    guess = csv_to_protein(os.path.join(directory, 'test_data', 'ecSecB_guess.txt'))

    # The same rate guess is supplied for both measurements.
    gibbs_guess = hdx_set.guess_deltaG([guess['rate'], guess['rate']])
    # Named distinctly so the fit result is not shadowed by per-state values.
    batch_result = fit_gibbs_global_batch(hdx_set, gibbs_guess, epochs=1000)

    check_protein = csv_to_protein(
        os.path.join(directory, 'test_data', 'ecSecB_batch.csv'), column_depth=2)
    assert_deltag_matches(batch_result.output, check_protein)

    # Only the values are passed on; their order follows the dict's insertion order.
    mock_alignment = {
        'apo': 'MSEQNNTEMTFQIQRIYTKDI------------SFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTVTASLG-------------------EETAFLCEVQQGGIFSIAGIEGTQMAHCLGAYCPNILFPYARECITSMVSRG----TFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA',
        'dimer': 'MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVY--------------EVVLRVTVTASLGEETAFLCEVQQGGIFSIAGIEGTQMAHCLGA----YCPNILFPAARECIASMVARGTFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA-----------------',
    }
    hdx_set.add_alignment(list(mock_alignment.values()))

    gibbs_guess = hdx_set.guess_deltaG([guess['rate'], guess['rate']])
    aligned_result = fit_gibbs_global_batch_aligned(hdx_set, gibbs_guess, r1=2, r2=5, epochs=1000)

    check_protein = csv_to_protein(
        os.path.join(directory, 'test_data', 'ecSecB_batch_aligned.csv'), column_depth=2)
    assert_deltag_matches(aligned_result.output, check_protein)
# Build the peptide table and set the control used for the measurements.
pmt = PeptideMasterTable(data)
pmt.set_control(('Full deuteration control', 0.167*60))

# Both states are created with the same pH and temperature arguments.
_measurement_kwargs = dict(pH=8, temperature=273.15 + 30)
st1 = HDXMeasurement(pmt.get_state('SecB his dimer apo'), **_measurement_kwargs)
st2 = HDXMeasurement(pmt.get_state('SecB WT apo'), **_measurement_kwargs)
hdx_set = HDXMeasurementSet([st1, st2])

# The initial guess is computed from the first measurement in the set.
guess = csv_to_protein(data_dir / 'output' / 'ecSecB_guess.csv')
gibbs_guess = hdx_set[0].guess_deltaG(guess['rate'])

# Example fit with only 5000 epochs and high learning rate
# Checkpoint stores model history every `epoch_step` epochs
checkpoint = CheckPoint(epoch_step=250)
result = fit_gibbs_global_batch(
    hdx_set,
    gibbs_guess,
    r1=0.5,
    r2=0.1,
    epochs=5000,
    lr=1e5,
    callbacks=[checkpoint],
)
loss_summary = (
    f"MSE loss: {result.mse_loss:.2f}, "
    f"Reg loss: {result.reg_loss:.2f}, "
    f"Reg percent: {result.regularization_percentage:.0f}%"
)
print(loss_summary)

# Write the checkpoint history as csv and as pretty-printed text.
df = checkpoint.to_dataframe(hdx_set.names)
dataframe_to_file(output_dir / 'model_history.csv', df)
dataframe_to_file(output_dir / 'model_history.txt', df, fmt='pprint')

# Checkpoint history scatter plot
# Note that these are raw dG values including interpolated values in regions of no coverage
history = checkpoint.model_history
num = len(history)
cmap = mpl.cm.get_cmap('winter')
hdx_set = HDXMeasurementSet(hdxm_list) gibbs_guess = hdx_set.guess_deltaG(rates_list) log_file = output_dir / f"fitting_log.txt" now = datetime.now() date = f'# {now.strftime("%Y/%m/%d %H:%M:%S")} ({int(now.timestamp())})' lines = [VERSION_STRING, date] r2 = 0.5 for r1 in [0, 0.01, 0.25, 0.5, 1]: t0 = time.time() result = fit_gibbs_global_batch(hdx_set, gibbs_guess, epochs=1000, r1=r1, r2=r2) t1 = time.time() block = '--------------------------' regularizers = f'Regualizer 1: {r1} Regualizer 2: {r2}' loss = f'Total_loss {result.total_loss:.2f}, mse_loss {result.mse_loss:.2f}, reg_loss {result.reg_loss:.2f}' \ f'({result.regularization_percentage:.2f}%)' time_elapsed = f"Time elapsed: {(t1 - t0):.2f} s" epochs = f"Number of epochs: {len(result.metadata['total_loss'])}" result.output.to_csv(output_dir / f"fit_output_r1_{r1}_r2_{r2}.csv") #, na_rep='NaN') result.output.to_file(output_dir / f"fit_output_r1_{r1}_r2_{r2}.txt", fmt='pprint',
output = wt_avg_result.output output.to_file(directory / 'test_data' / 'ecSecB_guess.txt') else: output = csv_to_protein(directory / 'test_data' / 'ecSecB_guess.txt') gibbs_guess = hdxm.guess_deltaG(output['rate']) fr_torch = fit_gibbs_global(hdxm, gibbs_guess, epochs=epochs, r1=2) fr_torch.output.to_file(directory / 'test_data' / 'ecSecB_torch_fit.txt') hdxm_dimer = HDXMeasurement(pmt.get_state('SecB his dimer apo'), sequence=sequence_dimer, temperature=temperature, pH=pH) hdx_set = HDXMeasurementSet([hdxm_dimer, hdxm]) gibbs_guess = hdx_set.guess_deltaG([output['rate'], output['rate']]) batch_result = fit_gibbs_global_batch(hdx_set, gibbs_guess, epochs=epochs) batch_result.output.to_file(directory / 'test_data' / 'ecSecB_batch.csv') batch_result.output.to_file(directory / 'test_data' / 'ecSecB_batch.txt', fmt='pprint') # Order is inverted compared to test! mock_alignment = { 'dimer': 'MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVY--------------EVVLRVTVTASLGEETAFLCEVQQGGIFSIAGIEGTQMAHCLGA----YCPNILFPAARECIASMVARGTFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA-----------------', 'apo': 'MSEQNNTEMTFQIQRIYTKDI------------SFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTVTASLG-------------------EETAFLCEVQQGGIFSIAGIEGTQMAHCLGAYCPNILFPYARECITSMVSRG----TFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA', } hdx_set.add_alignment(list(mock_alignment.values())) aligned_result = fit_gibbs_global_batch_aligned(hdx_set, gibbs_guess, r1=2, r2=5, epochs=1000)
from pyhdx.fileIO import csv_to_protein

# Test data is located relative to this file: ../tests/test_data
current_dir = Path(__file__).parent
data_dir = current_dir.parent / 'tests' / 'test_data'
data = read_dynamx(
    data_dir / 'ecSecB_apo.csv',
    data_dir / 'ecSecB_dimer.csv',
)

pmt = PeptideMasterTable(data)
pmt.set_control(('Full deuteration control', 0.167))

# Both states are created with the same pH and temperature arguments.
_measurement_kwargs = dict(pH=8, temperature=273.15 + 30)
st1 = HDXMeasurement(pmt.get_state('SecB his dimer apo'), **_measurement_kwargs)
st2 = HDXMeasurement(pmt.get_state('SecB WT apo'), **_measurement_kwargs)
hdx_set = HDXMeasurementSet([st1, st2])

# The same rate guess is supplied for both measurements.
guess = csv_to_protein(data_dir / 'ecSecB_guess.txt')
gibbs_guess = hdx_set.guess_deltaG([guess['rate'], guess['rate']])

# Example fit with only 1000 epochs and high regularizers
# For real data start with parameters r1=0.05, r2=0.5, epochs=100000
result = fit_gibbs_global_batch(
    hdx_set,
    gibbs_guess,
    r1=2,
    r2=5,
    epochs=1000,
)

# Human readable output
result.output.to_file('Batch_fit_result.txt', fmt='pprint')

# Machine readable output
result.output.to_file('Batch_fit_result.csv', fmt='csv')
fr_torch.output, fmt='pprint') # ---------- # Batch fits # ---------- hdxm_dimer = HDXMeasurement(pmt.get_state('SecB his dimer apo'), sequence=sequence_dimer, temperature=temperature, pH=pH) hdx_set = HDXMeasurementSet([hdxm_dimer, hdxm]) gibbs_guess = hdx_set[0].guess_deltaG(guess_output['rate']) batch_result = fit_gibbs_global_batch(hdx_set, gibbs_guess, epochs=epochs) dataframe_to_file(output_dir / 'ecSecB_batch.csv', batch_result.output) dataframe_to_file(output_dir / 'ecSecB_batch.txt', batch_result.output, fmt='pprint') # Order is inverted compared to test! mock_alignment = { 'dimer': 'MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVY--------------EVVLRVTVTASLGEETAFLCEVQQGGIFSIAGIEGTQMAHCLGA----YCPNILFPAARECIASMVARGTFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA-----------------', 'apo': 'MSEQNNTEMTFQIQRIYTKDI------------SFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTVTASLG-------------------EETAFLCEVQQGGIFSIAGIEGTQMAHCLGAYCPNILFPYARECITSMVSRG----TFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA', } hdx_set.add_alignment(list(mock_alignment.values()))
"""Obtain ΔG for ecSecB tetramer and dimer""" from pathlib import Path from pyhdx.batch_processing import yaml_to_hdxmset from pyhdx.fileIO import csv_to_dataframe, save_fitresult from pyhdx.fitting import fit_gibbs_global_batch import yaml cwd = Path(__file__).parent data_dir = cwd / 'test_data' / 'input' output_dir = cwd / 'test_data' / 'output' yaml_dict = yaml.safe_load(Path(data_dir / 'data_states.yaml').read_text()) hdx_set = yaml_to_hdxmset(yaml_dict, data_dir=data_dir) initial_guess_rates = csv_to_dataframe(output_dir / 'ecSecB_guess.csv') guesses = hdx_set[0].guess_deltaG(initial_guess_rates['rate']) fit_kwargs = yaml.safe_load(Path(data_dir / 'fit_settings.yaml').read_text()) fr = fit_gibbs_global_batch(hdx_set, guesses, **fit_kwargs) save_fitresult(output_dir / 'ecsecb_tetramer_dimer', fr)