def test_batch_fit(self, tmp_path):
    """Check batch and aligned-batch Gibbs fits against stored reference output.

    Both measurements are fitted with ``fit_gibbs_global_batch``; the result
    is written to a CSV file in ``tmp_path`` and read back to verify that the
    metadata survives the round trip, and the fitted ``dG`` per state is
    compared with ``ecSecB_batch.csv``. A mock alignment is then added to the
    set and the comparison is repeated for ``fit_gibbs_global_batch_aligned``
    against ``ecSecB_batch_aligned.csv``.
    """
    # Imported once up front instead of on every loop iteration.
    from pandas.testing import assert_series_equal

    states = ['SecB WT apo', 'SecB his dimer apo']

    def check_dg(fit_output, reference):
        # Fitted dG must match the reference within a relative tolerance of 0.1.
        for state in states:
            assert_series_equal(
                fit_output[state]['dG'], reference[state]['dG'], rtol=0.1)

    hdx_set = HDXMeasurementSet([self.hdxm_apo, self.hdxm_dimer])
    guess = csv_to_dataframe(output_dir / 'ecSecB_guess.csv')

    # Create rates dataframe: the same rate guesses are used for every state
    rates_df = pd.DataFrame({name: guess['rate'] for name in hdx_set.names})
    gibbs_guess = hdx_set.guess_deltaG(rates_df)
    fr_global = fit_gibbs_global_batch(hdx_set, gibbs_guess, epochs=1000)

    # Metadata must survive a write/read round trip through file
    fpath = Path(tmp_path) / 'fit_result_batch.csv'
    fr_global.to_file(fpath)
    df = csv_to_dataframe(fpath)
    assert df.attrs['metadata'] == fr_global.metadata

    check_dg(fr_global.output, csv_to_protein(output_dir / 'ecSecB_batch.csv'))

    errors = fr_global.get_squared_errors()
    assert errors.shape == (hdx_set.Ns, hdx_set.Np, hdx_set.Nt)

    # Alignment strings are listed in the same order as the measurements in hdx_set
    mock_alignment = {
        'apo': 'MSEQNNTEMTFQIQRIYTKDI------------SFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTVTASLG-------------------EETAFLCEVQQGGIFSIAGIEGTQMAHCLGAYCPNILFPYARECITSMVSRG----TFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA',
        'dimer': 'MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVY--------------EVVLRVTVTASLGEETAFLCEVQQGGIFSIAGIEGTQMAHCLGA----YCPNILFPAARECIASMVARGTFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA-----------------',
    }
    hdx_set.add_alignment(list(mock_alignment.values()))

    gibbs_guess = hdx_set[0].guess_deltaG(guess['rate'])  # Guesses from first measurement
    aligned_result = fit_gibbs_global_batch_aligned(
        hdx_set, gibbs_guess, r1=2, r2=5, epochs=1000)

    check_dg(aligned_result.output,
             csv_to_protein(output_dir / 'ecSecB_batch_aligned.csv'))
def yaml_to_hdxmset(yaml_dict, data_dir=None, **kwargs):
    """Read the files described by `yaml_dict` from `data_dir` into an HDXMeasurementSet.

    Every key of `yaml_dict` is used as the measurement name and its value is
    handed to `yaml_to_hdxm` as the measurement spec. Additional keyword
    arguments are accepted but currently not used.
    """
    measurements = [
        yaml_to_hdxm(spec, data_dir=data_dir, name=name)
        for name, spec in yaml_dict.items()
    ]
    return HDXMeasurementSet(measurements)
def test_batch_fit(self):
    """Check batch and aligned-batch Gibbs fits against stored reference output.

    Both series are fitted with ``fit_gibbs_global_batch`` and the fitted
    ``deltaG`` per state is compared with ``ecSecB_batch.csv``. A mock
    alignment is then added to the set and the comparison is repeated for
    ``fit_gibbs_global_batch_aligned`` against ``ecSecB_batch_aligned.csv``.
    """
    # Imported once up front instead of on every loop iteration.
    from pandas.testing import assert_series_equal

    states = ['SecB WT apo', 'SecB his dimer apo']

    def check_delta_g(fit_output, reference):
        # Fitted deltaG must match the reference within a relative tolerance of 0.1.
        for state in states:
            assert_series_equal(
                fit_output[state]['deltaG'], reference[state]['deltaG'], rtol=0.1)

    hdx_set = HDXMeasurementSet([self.series_apo, self.series_dimer])
    guess = csv_to_protein(
        os.path.join(directory, 'test_data', 'ecSecB_guess.txt'))

    # The same rate guesses are supplied for both measurements
    gibbs_guess = hdx_set.guess_deltaG([guess['rate'], guess['rate']])
    batch_result = fit_gibbs_global_batch(hdx_set, gibbs_guess, epochs=1000)

    check_delta_g(
        batch_result.output,
        csv_to_protein(os.path.join(directory, 'test_data', 'ecSecB_batch.csv'),
                       column_depth=2))

    # Alignment strings are listed in the same order as the measurements in hdx_set
    mock_alignment = {
        'apo': 'MSEQNNTEMTFQIQRIYTKDI------------SFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTVTASLG-------------------EETAFLCEVQQGGIFSIAGIEGTQMAHCLGAYCPNILFPYARECITSMVSRG----TFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA',
        'dimer': 'MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVY--------------EVVLRVTVTASLGEETAFLCEVQQGGIFSIAGIEGTQMAHCLGA----YCPNILFPAARECIASMVARGTFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA-----------------',
    }
    hdx_set.add_alignment(list(mock_alignment.values()))

    gibbs_guess = hdx_set.guess_deltaG([guess['rate'], guess['rate']])
    aligned_result = fit_gibbs_global_batch_aligned(
        hdx_set, gibbs_guess, r1=2, r2=5, epochs=1000)

    check_delta_g(
        aligned_result.output,
        csv_to_protein(os.path.join(directory, 'test_data', 'ecSecB_batch_aligned.csv'),
                       column_depth=2))
current_dir = Path(__file__).parent #current_dir = Path().cwd() / 'templates' # pycharm scientific compat output_dir = current_dir / 'output' output_dir.mkdir(exist_ok=True) data_dir = current_dir.parent / 'tests' / 'test_data' data = read_dynamx(data_dir / 'input' / 'ecSecB_apo.csv', data_dir / 'input' / 'ecSecB_dimer.csv') pmt = PeptideMasterTable(data) pmt.set_control(('Full deuteration control', 0.167*60)) st1 = HDXMeasurement(pmt.get_state('SecB his dimer apo'), pH=8, temperature=273.15 + 30) st2 = HDXMeasurement(pmt.get_state('SecB WT apo'), pH=8, temperature=273.15 + 30) hdx_set = HDXMeasurementSet([st1, st2]) guess = csv_to_protein(data_dir / 'output' / 'ecSecB_guess.csv') gibbs_guess = hdx_set[0].guess_deltaG(guess['rate']) # Example fit with only 5000 epochs and high learning rate # Checkpoint stores model history every `epoch_step` epochs checkpoint = CheckPoint(epoch_step=250) result = fit_gibbs_global_batch(hdx_set, gibbs_guess, r1=0.5, r2=0.1, epochs=5000, lr=1e5, callbacks=[checkpoint]) print(f"MSE loss: {result.mse_loss:.2f}, " f"Reg loss: {result.reg_loss:.2f}, " f"Reg percent: {result.regularization_percentage:.0f}%") df = checkpoint.to_dataframe(hdx_set.names) dataframe_to_file(output_dir / 'model_history.csv', df)
data_dir = current_dir.parent / 'tests' / 'test_data' yaml_stream = Path(current_dir / 'yaml_files' / 'SecB.yaml').read_text() data_dict = yaml.safe_load(yaml_stream) output_dir = current_dir / 'fit' output_dir.mkdir(exist_ok=True) hdxm_list = [ load_from_yaml(dic, data_dir=data_dir, name=name) for name, dic in data_dict.items() ] rates_list = [ csv_to_protein(current_dir / 'guesses' / f'{name}_rates_guess.txt')['rate'] for name in data_dict.keys() ] hdx_set = HDXMeasurementSet(hdxm_list) gibbs_guess = hdx_set.guess_deltaG(rates_list) log_file = output_dir / f"fitting_log.txt" now = datetime.now() date = f'# {now.strftime("%Y/%m/%d %H:%M:%S")} ({int(now.timestamp())})' lines = [VERSION_STRING, date] r2 = 0.5 for r1 in [0, 0.01, 0.25, 0.5, 1]: t0 = time.time() result = fit_gibbs_global_batch(hdx_set, gibbs_guess, epochs=1000,
'MSEQNNTEMTFQIQRIYTKDI------------SFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTVTASLG-------------------EETAFLCEVQQGGIFSIAGIEGTQMAHCLGAYCPNILFPYARECITSMVSRG----TFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA', } current_dir = Path(__file__).parent data_dir = current_dir.parent / 'tests' / 'test_data' data = read_dynamx(data_dir / 'ecSecB_apo.csv', data_dir / 'ecSecB_dimer.csv') pmt = PeptideMasterTable(data) pmt.set_control(('Full deuteration control', 0.167)) st1 = HDXMeasurement(pmt.get_state('SecB his dimer apo'), pH=8, temperature=273.15 + 30) st2 = HDXMeasurement(pmt.get_state('SecB WT apo'), pH=8, temperature=273.15 + 30) guess = csv_to_protein(data_dir / 'ecSecB_guess.txt') hdx_set = HDXMeasurementSet([st1, st2]) gibbs_guess = hdx_set.guess_deltaG([guess['rate'], guess['rate']]) hdx_set.add_alignment(list(mock_alignment.values())) result = fit_gibbs_global_batch_aligned(hdx_set, gibbs_guess, r1=2, r2=5, epochs=1000) print(result.output)
# Either recompute the rate guesses and store them, or load the stored ones.
if guess:
    # NOTE(review): `client` is not used below; presumably default_client() is
    # called to make sure a client is available for the fit - confirm
    client = default_client()
    wt_avg_result = fit_rates_weighted_average(hdxm, bounds=(1e-2, 800))
    output = wt_avg_result.output
    output.to_file(directory / 'test_data' / 'ecSecB_guess.txt')
else:
    output = csv_to_protein(directory / 'test_data' / 'ecSecB_guess.txt')

# Single-measurement fit, written to ecSecB_torch_fit.txt
gibbs_guess = hdxm.guess_deltaG(output['rate'])
fr_torch = fit_gibbs_global(hdxm, gibbs_guess, epochs=epochs, r1=2)
fr_torch.output.to_file(directory / 'test_data' / 'ecSecB_torch_fit.txt')

# Batch fit of both states; the same rate guesses are supplied for each
hdxm_dimer = HDXMeasurement(pmt.get_state('SecB his dimer apo'), sequence=sequence_dimer,
                            temperature=temperature, pH=pH)

hdx_set = HDXMeasurementSet([hdxm_dimer, hdxm])
gibbs_guess = hdx_set.guess_deltaG([output['rate'], output['rate']])
batch_result = fit_gibbs_global_batch(hdx_set, gibbs_guess, epochs=epochs)

# Result is stored twice: default format (.csv) and pretty-printed (.txt)
batch_result.output.to_file(directory / 'test_data' / 'ecSecB_batch.csv')
batch_result.output.to_file(directory / 'test_data' / 'ecSecB_batch.txt', fmt='pprint')

# Order is inverted compared to test!
mock_alignment = {
    'dimer': 'MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVY--------------EVVLRVTVTASLGEETAFLCEVQQGGIFSIAGIEGTQMAHCLGA----YCPNILFPAARECIASMVARGTFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA-----------------',
    'apo': 'MSEQNNTEMTFQIQRIYTKDI------------SFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTVTASLG-------------------EETAFLCEVQQGGIFSIAGIEGTQMAHCLGAYCPNILFPYARECITSMVSRG----TFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA',
}

# Alignment strings are passed in dict order (dimer, apo), matching hdx_set
hdx_set.add_alignment(list(mock_alignment.values()))
def hdx_set(self):
    """Return an HDXMeasurementSet built from the values of ``self.hdxm_objects``."""
    measurements = list(self.hdxm_objects.values())
    return HDXMeasurementSet(measurements)
def fit_gibbs_global(
    hdxm,
    initial_guess,
    r1=R1,
    epochs=EPOCHS,
    patience=PATIENCE,
    stop_loss=STOP_LOSS,
    optimizer="SGD",
    callbacks=None,
    **optimizer_kwargs,
):
    """
    Fit Gibbs free energies globally to all D-uptake data in the supplied hdxm

    Parameters
    ----------
    hdxm : :class:`~pyhdx.models.HDXMeasurement`
        Input HDX measurement
    initial_guess : :class:`~pandas.Series` or :class:`~numpy.ndarray`
        Gibbs free energy initial guesses (shape Nr, units J/mol). A Series
        must have an integer index; it is reindexed to the covered residue
        numbers and gaps are filled by interpolation.
    r1 : :obj:`float`
        Regularizer value r1 (along residues)
    epochs: :obj:`int`
        Maximum number of fitting iterations
    patience: :obj:`int`
        Number of epochs to wait until termination when progress between epochs is below `stop_loss`
    stop_loss: :obj:`float`
        Threshold for difference in loss between epochs when an epoch is considered to make no more progress.
    optimizer : :obj:`str`
        Which optimizer to use. Default is Stochastic Gradient Descent. See PyTorch documentation for information.
    callbacks: :obj:`list` or None
        List of callback objects. Call signature is callback(epoch, model, optimizer)
    **optimizer_kwargs
        Additional keyword arguments passed to the optimizer.

    Returns
    -------
    result: :class:`~pyhdx.fitting_torch.TorchFitResult`

    Raises
    ------
    AssertionError
        If a Series guess has a non-integer index, if the number of guesses
        does not equal ``hdxm.Nr``, or if the guesses contain NaN values.

    """

    # Fit settings recorded on the result; listed explicitly instead of being
    # pulled out of ``locals()``.
    fit_kwargs = {
        "r1": r1,
        "epochs": epochs,
        "patience": patience,
        "stop_loss": stop_loss,
        "optimizer": optimizer,
    }

    tensors = hdxm.get_tensors()
    inputs = [tensors[key] for key in ["temperature", "X", "k_int", "timepoints"]]
    output_data = tensors["d_exp"]

    if isinstance(initial_guess, pd.Series):
        assert (
            initial_guess.index.inferred_type == "integer"
        ), "Invalid dtype for initial guess index, must be 'integer'"
        # Map guesses to covered residue range and fill NaN gaps
        initial_guess = initial_guess.reindex(hdxm.coverage.r_number).interpolate(
            limit_direction="both"
        )
        initial_guess = initial_guess.to_numpy()

    assert len(initial_guess) == hdxm.Nr, "Invalid length of initial guesses"
    assert not np.any(np.isnan(initial_guess)), "Initial guess has NaN entries"

    # dtype and device come from the config object
    dG_par = torch.nn.Parameter(
        torch.tensor(
            initial_guess, dtype=cfg.TORCH_DTYPE, device=cfg.TORCH_DEVICE
        ).unsqueeze(-1)
    )  # reshape (nr, 1)

    model = DeltaGFit(dG_par)
    criterion = torch.nn.MSELoss(reduction="mean")

    # Take default optimizer kwargs and override them with user-specified kwargs
    optimizer_kwargs = {
        **optimizer_defaults.get(optimizer, {}),
        **optimizer_kwargs,
    }
    optimizer_klass = getattr(torch.optim, optimizer)

    reg_func = partial(regularizer_1d, r1)

    # The returned model is the same object as `model`, so it is discarded
    losses_array, _ = run_optimizer(
        inputs,
        output_data,
        optimizer_klass,
        optimizer_kwargs,
        model,
        criterion,
        reg_func,
        epochs=epochs,
        patience=patience,
        stop_loss=stop_loss,
        callbacks=callbacks,
    )
    losses = _loss_df(losses_array)
    fit_kwargs.update(optimizer_kwargs)

    # Result objects take a measurement set, so wrap the single measurement
    hdxm_set = HDXMeasurementSet([hdxm])
    result = TorchFitResult(hdxm_set, model, losses=losses, **fit_kwargs)

    return result
from pyhdx.fileIO import csv_to_protein

# Batch fit of two SecB states with stored rate guesses; results are written
# to 'Batch_fit_result.txt' and 'Batch_fit_result.csv' (relative paths).
current_dir = Path(__file__).parent
data_dir = current_dir.parent / 'tests' / 'test_data'
data = read_dynamx(data_dir / 'ecSecB_apo.csv', data_dir / 'ecSecB_dimer.csv')

pmt = PeptideMasterTable(data)
pmt.set_control(('Full deuteration control', 0.167))

# NOTE(review): temperature is presumably in Kelvin (273.15 + 30) - confirm
st1 = HDXMeasurement(pmt.get_state('SecB his dimer apo'), pH=8, temperature=273.15 + 30)
st2 = HDXMeasurement(pmt.get_state('SecB WT apo'), pH=8, temperature=273.15 + 30)

hdx_set = HDXMeasurementSet([st1, st2])
guess = csv_to_protein(data_dir / 'ecSecB_guess.txt')

# The same rate guesses are supplied for both measurements
gibbs_guess = hdx_set.guess_deltaG([guess['rate'], guess['rate']])

# Example fit with only 1000 epochs and high regularizers
# For real data start with parameters r1=0.05, r2=0.5, epochs=100000
result = fit_gibbs_global_batch(hdx_set, gibbs_guess, r1=2, r2=5, epochs=1000)

# Human readable output
result.output.to_file('Batch_fit_result.txt', fmt='pprint')

# Machine readable output
result.output.to_file('Batch_fit_result.csv', fmt='csv')
dataframe_to_file(output_dir / f'ecSecB_torch_fit_epochs_{epochs_long}.csv', fr_torch.output) dataframe_to_file(output_dir / f'ecSecB_torch_fit_epochs_{epochs_long}.txt', fr_torch.output, fmt='pprint') # ---------- # Batch fits # ---------- hdxm_dimer = HDXMeasurement(pmt.get_state('SecB his dimer apo'), sequence=sequence_dimer, temperature=temperature, pH=pH) hdx_set = HDXMeasurementSet([hdxm_dimer, hdxm]) gibbs_guess = hdx_set[0].guess_deltaG(guess_output['rate']) batch_result = fit_gibbs_global_batch(hdx_set, gibbs_guess, epochs=epochs) dataframe_to_file(output_dir / 'ecSecB_batch.csv', batch_result.output) dataframe_to_file(output_dir / 'ecSecB_batch.txt', batch_result.output, fmt='pprint') # Order is inverted compared to test! mock_alignment = { 'dimer': 'MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVY--------------EVVLRVTVTASLGEETAFLCEVQQGGIFSIAGIEGTQMAHCLGA----YCPNILFPAARECIASMVARGTFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA-----------------', 'apo': 'MSEQNNTEMTFQIQRIYTKDI------------SFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTVTASLG-------------------EETAFLCEVQQGGIFSIAGIEGTQMAHCLGAYCPNILFPYARECITSMVSRG----TFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA',
def hdx_set(self):
    """Combine all currently added data objects into one HDXMeasurementSet."""
    # TODO: once alignments are added in, turn this into a (fixed) attribute
    data_objects = list(self.data_objects.values())
    return HDXMeasurementSet(data_objects)