import json
import pickle
import sys

import numpy as np


def train(self, X, Y, use_attention, att_context, bidirectional, cv=True,
          folds=5, crf=False):
    if cv:
        cv_folds = make_folds(X, Y, folds)
        accuracies = []
        fscores = []
        for fold_num, ((train_fold_X, train_fold_Y),
                       (test_fold_X, test_fold_Y)) in enumerate(cv_folds):
            self.tagger = self.fit_model(train_fold_X, train_fold_Y,
                                         use_attention, att_context,
                                         bidirectional, crf)
            pred_probs, pred_label_seqs, x_lens = self.predict(
                test_fold_X, bidirectional, tagger=self.tagger)
            pred_inds = np.argmax(pred_probs, axis=2)
            flattened_preds = []
            flattened_targets = []
            for x_len, pred_ind, test_target in zip(x_lens, pred_inds,
                                                    test_fold_Y):
                # Sequences are pre-padded, so only the last x_len positions
                # hold real tokens; targets are one-hot vectors.
                flattened_preds.extend(pred_ind[-x_len:])
                flattened_targets.extend(
                    [list(tt).index(1) for tt in test_target[-x_len:]])
            assert len(flattened_preds) == len(flattened_targets)
            accuracy, weighted_fscore, all_fscores = evaluate(
                flattened_targets, flattened_preds)
            print("Finished fold %d. Accuracy: %f, Weighted F-score: %f"
                  % (fold_num, accuracy, weighted_fscore))
            print("Individual f-scores:")
            for cat in all_fscores:
                print("%s: %f" % (self.rev_label_ind[cat], all_fscores[cat]))
            accuracies.append(accuracy)
            fscores.append(weighted_fscore)
        accuracies = np.asarray(accuracies)
        fscores = np.asarray(fscores)
        print("Accuracies:", accuracies)
        print("Average: %0.4f (+/- %0.4f)" % (accuracies.mean(),
                                              accuracies.std() * 2))
        print("Fscores:", fscores, file=sys.stderr)
        print("Average: %0.4f (+/- %0.4f)" % (fscores.mean(),
                                              fscores.std() * 2),
              file=sys.stderr)
    else:
        self.tagger = self.fit_model(X, Y, use_attention, att_context,
                                     bidirectional, crf)
        model_ext = "att=%s_cont=%s_bi=%s" % (str(use_attention), att_context,
                                              str(bidirectional))
        model_config_file = open("model_%s_config.json" % model_ext, "w")
        model_weights_file_name = "model_%s_weights" % model_ext
        model_label_ind = "model_%s_label_ind.json" % model_ext
        model_rep_reader = "model_%s_rep_reader.pkl" % model_ext
        # Serialize the architecture, weights, label index, and rep reader.
        print(self.tagger.to_json(), file=model_config_file)
        model_config_file.close()
        self.tagger.save_weights(model_weights_file_name, overwrite=True)
        json.dump(self.label_ind, open(model_label_ind, "w"))
        pickle.dump(self.rep_reader, open(model_rep_reader, "wb"))
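# NOTE: make_folds(X, Y, folds) is called above but not defined in this file.
# Below is a minimal sketch of a compatible helper, assuming X and Y are
# parallel sequences and each fold is returned as
# ((train_X, train_Y), (test_X, test_Y)); it is an illustrative guess, not
# necessarily the repo's actual implementation. (The BAC fitting code further
# down uses a different make_folds with signature make_folds(folds, *arrays).)
def make_folds(X, Y, num_folds):
    folds = []
    for i in range(num_folds):
        # Every num_folds-th example is held out for testing in fold i.
        test_X = X[i::num_folds]
        test_Y = Y[i::num_folds]
        train_X = [x for j, x in enumerate(X) if j % num_folds != i]
        train_Y = [y for j, y in enumerate(Y) if j % num_folds != i]
        folds.append(((train_X, train_Y), (test_X, test_Y)))
    return folds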
def train(self, X, Y, use_attention, att_context, bidirectional, cv=True,
          folds=5):
    if cv:
        cv_folds = make_folds(X, Y, folds)
        accuracies = []
        fscores = []
        for fold_num, ((train_fold_X, train_fold_Y),
                       (test_fold_X, test_fold_Y)) in enumerate(cv_folds):
            tagger = self.fit_model(train_fold_X, train_fold_Y,
                                    use_attention, att_context, bidirectional)
            pred_probs, pred_label_seqs, x_lens = self.predict(
                test_fold_X, bidirectional, tagger=tagger)
            pred_inds = np.argmax(pred_probs, axis=2)
            flattened_preds = []
            flattened_targets = []
            for x_len, pred_ind, test_target in zip(x_lens, pred_inds,
                                                    test_fold_Y):
                flattened_preds.extend(pred_ind[-x_len:])
                flattened_targets.extend(
                    [list(tt).index(1) for tt in test_target[-x_len:]])
            assert len(flattened_preds) == len(flattened_targets)
            accuracy, weighted_fscore, all_fscores = evaluate(
                flattened_targets, flattened_preds)
            print("Finished fold %d. Accuracy: %f, Weighted F-score: %f"
                  % (fold_num, accuracy, weighted_fscore), file=sys.stderr)
            print("Individual f-scores:", file=sys.stderr)
            for cat in all_fscores:
                print("%s: %f" % (self.rev_label_ind[cat], all_fscores[cat]),
                      file=sys.stderr)
            accuracies.append(accuracy)
            fscores.append(weighted_fscore)
        accuracies = np.asarray(accuracies)
        fscores = np.asarray(fscores)
        print("Accuracies:", accuracies, file=sys.stderr)
        print("Average: %0.4f (+/- %0.4f)" % (accuracies.mean(),
                                              accuracies.std() * 2),
              file=sys.stderr)
        print("Fscores:", fscores, file=sys.stderr)
        print("Average: %0.4f (+/- %0.4f)" % (fscores.mean(),
                                              fscores.std() * 2),
              file=sys.stderr)
    # Retrain on the full data set and serialize the model.
    self.tagger = self.fit_model(X, Y, use_attention, att_context,
                                 bidirectional)
    model_ext = "att=%s_cont=%s_bi=%s" % (str(use_attention), att_context,
                                          str(bidirectional))
    model_config_file = open("model_%s_config.json" % model_ext, "w")
    model_weights_file_name = "model_%s_weights" % model_ext
    model_label_ind = "model_%s_label_ind.json" % model_ext
    model_rep_reader = "model_%s_rep_reader.pkl" % model_ext
    print(self.tagger.to_json(), file=model_config_file)
    model_config_file.close()
    self.tagger.save_weights(model_weights_file_name, overwrite=True)
    json.dump(self.label_ind, open(model_label_ind, "w"))
    pickle.dump(self.rep_reader, open(model_rep_reader, "wb"))
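# NOTE: evaluate(targets, predictions) is called above but not defined here.
# A minimal sketch built on scikit-learn; the return shape
# (accuracy, weighted F-score, per-class F-score dict) matches the call
# sites above, but the repo's actual implementation may differ.
from sklearn.metrics import accuracy_score, f1_score

def evaluate(targets, predictions):
    accuracy = accuracy_score(targets, predictions)
    weighted_fscore = f1_score(targets, predictions, average="weighted")
    labels = sorted(set(targets) | set(predictions))
    per_class = f1_score(targets, predictions, labels=labels, average=None)
    # Map each label index to its F-score, as the callers expect.
    all_fscores = dict(zip(labels, per_class))
    return accuracy, weighted_fscore, all_fscores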
def train(self, trainfile_name):
    train_X, train_Y, num_classes = self.make_data(trainfile_name)
    accuracies = []
    fscores = []
    if self.cv:
        cv_folds = make_folds(train_X, train_Y, self.folds)
        for i, ((train_fold_X, train_fold_Y),
                (test_fold_X, test_fold_Y)) in enumerate(cv_folds):
            classifier = self.fit_model(train_fold_X, train_fold_Y,
                                        num_classes)
            predictions = self.classify(classifier, test_fold_X)
            accuracy, weighted_fscore, _ = evaluate(test_fold_Y, predictions)
            print("Finished fold %d. Accuracy: %f, F-score: %f"
                  % (i, accuracy, weighted_fscore), file=sys.stderr)
            accuracies.append(accuracy)
            fscores.append(weighted_fscore)
        accuracies = np.asarray(accuracies)
        fscores = np.asarray(fscores)
        print("Accuracies:", accuracies, file=sys.stderr)
        print("Average: %0.4f (+/- %0.4f)" % (accuracies.mean(),
                                              accuracies.std() * 2),
              file=sys.stderr)
        print("Fscores:", fscores, file=sys.stderr)
        print("Average: %0.4f (+/- %0.4f)" % (fscores.mean(),
                                              fscores.std() * 2),
              file=sys.stderr)
    #self.classifier = self.fit_model(train_X, train_Y, num_classes)
    #cPickle.dump(classifier, open(self.trained_model_name, "wb"))
    #pickle.dump(tagset, open(self.stored_tagset, "wb"))
    print("Done", file=sys.stderr)
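# NOTE: a hypothetical usage sketch for the classifier above; the class name
# (SimpleClassifier) and the constructor arguments are illustrative
# assumptions, not names defined in this file.
#
#     clf = SimpleClassifier(cv=True, folds=5)
#     clf.train("train_data.txt")  # prints per-fold and average metrics
#
# The "+/-" figures printed above are two standard deviations across folds,
# i.e. roughly a 95% interval under a normality assumption.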
import collections
import json
import os

import numpy as np
import scipy.optimize


def fit_bac(cdata, edata, out_dir, uncertainties=None, geos=None,
            geo_exceptions=None, mults=None, val_split=0.0, folds=1,
            use_atom_features=False, global_min=False, global_min_iter=10,
            lam=0.0):
    """
    cdata: Dictionary of calculated data
    edata: Dictionary of experimental data
    out_dir: Output directory
    uncertainties: Dictionary of uncertainties
    geos: Dictionary of geometries; if not provided, the BAC form with
          atom/bond types is used
    geo_exceptions: Override the geometry check for these identifiers
    mults: Dictionary of multiplicities
    val_split: Fraction of data to use as validation set
    folds: Number of folds for cross-validation (overrides val_split if >1)
    use_atom_features: Use atom features instead of bond features
    global_min: Use the basin hopping algorithm for global minimization
    lam: Regularization parameter

    Return dictionary of new calculated data
    """
    bond_types = True if geos is None else False

    # Only use calculated molecules that are also in the experimental ones.
    ids, mols, hexpt, hcalc, weights = [], [], [], [], []
    for ident, h in edata.items():
        if ident in cdata:
            ids.append(ident)
            if bond_types:
                mols.append(str_to_mol(ident))
            else:
                mol = geo_to_mol(geos[ident])
                if geo_exceptions is None or ident not in geo_exceptions:
                    mol_check = str_to_mol(ident, single_bonds=True)
                    if not mol_check.isIsomorphic(mol):
                        raise Exception(
                            'Geometry does not match identifier {}'.format(
                                ident))
                if mults:
                    mol.multiplicity = mults[ident]
                mols.append(mol)
            hexpt.append(h)
            hcalc.append(cdata[ident])
            if uncertainties:
                weights.append(1.0 / uncertainties[ident]**2.0)
    hexpt = np.array(hexpt)
    hcalc = np.array(hcalc)
    weights = np.array(weights)

    rmse_prev = calc_rmse(hexpt, hcalc)
    mae_prev = calc_mae(hexpt, hcalc)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Display the 10 worst ones before fitting
    diff = hexpt - hcalc
    print('Worst ones before fitting:')
    large_diff = [(ids[i], abs(d), hexpt[i], hcalc[i])
                  for i, d in enumerate(diff)]
    large_diff.sort(key=lambda _x: _x[1], reverse=True)
    for ident, d, hexpti, hcalci in large_diff[:10][::-1]:
        print('{} {: .2f} {: .2f} {: .2f}'.format(ident, d, hexpti, hcalci))
    print()

    with open(os.path.join(out_dir, 'worst_errors.txt'), 'w') as f:
        f.write('Worst ones before fitting:\n')
        for ident, d, hexpti, hcalci in large_diff[:10][::-1]:
            f.write('{} {: .2f} {: .2f} {: .2f}\n'.format(
                ident, d, hexpti, hcalci))
        f.write('\n')

    # Shuffle data and set up some arrays
    if folds > 1:
        shuffle_arrays(ids, mols, hexpt, hcalc)
    rmses, rmses_train, rmses_val = [], [], []
    maes, maes_train, maes_val = [], [], []
    hbacs, output_strs, param_dicts = [], [], []

    def _postprocess(_hexpt, _hbac, _hexpt_train, _hbac_train, _hexpt_val,
                     _hbac_val, _output_str, _param_dict):
        rmses.append(calc_rmse(_hexpt, _hbac))
        maes.append(calc_mae(_hexpt, _hbac))
        _rmse_train = calc_rmse(_hexpt_train, _hbac_train)
        rmses_train.append(_rmse_train)
        maes_train.append(calc_mae(_hexpt_train, _hbac_train))
        _rmse_val = calc_rmse(_hexpt_val, _hbac_val)
        rmses_val.append(_rmse_val)
        maes_val.append(calc_mae(_hexpt_val, _hbac_val))
        hbacs.append(_hbac)
        output_strs.append(_output_str)
        param_dicts.append(_param_dict)
        print('RMSE train/val: {:.2f}/{:.2f}'.format(_rmse_train, _rmse_val))
        print('Parameters:')
        print(_output_str)

    if bond_types:
        # Get number of atoms or bonds of each type as features
        if use_atom_features:
            features = [get_features(mol, atom_features=True,
                                     bond_features=False) for mol in mols]
        else:
            features = [get_features(mol, atom_features=False,
                                     bond_features=True) for mol in mols]
        feature_keys = list({k for f in features for k in f})
        feature_keys.sort()
        x, nocc = make_feature_mat(features, feature_keys)
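        # In this branch the BACs are a linear model: the correction
        # y = hexpt - hcalc is regressed on the per-molecule atom/bond-type
        # counts, so hbac = hcalc + X.w with w from weighted least squares
        # (see lin_reg below).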
        # for idx in np.where(nocc <= 1)[0][::-1]:  # Remove features if they only occur once
        #     del feature_keys[idx]
        # x, nocc = make_feature_mat(features, feature_keys)

        data = (features, hexpt, hcalc, weights)
        folded_data = make_folds(folds, *data)
        for fold_num in range(folds):
            print('Fold {}'.format(fold_num + 1))
            if folds > 1:
                split_data = concat_folds(fold_num, *folded_data)
            else:
                # Split off validation data
                split_data = split_arrays(1.0 - val_split, *data)
            (features_train, features_val, hexpt_train, hexpt_val,
             hcalc_train, hcalc_val, weights_train, _) = split_data

            y_train = hexpt_train - hcalc_train
            x_train, nocc = make_feature_mat(features_train, feature_keys)
            weight_mat = (np.diag(weights_train)
                          if np.size(weights_train) > 0
                          else np.eye(len(x_train)))
            w, ypred = lin_reg(x_train, y_train, weight_mat)
            hbac_train = hcalc_train + ypred

            xval, _ = make_feature_mat(features_val, feature_keys)
            hbac = hcalc + np.dot(x, w)
            hbac_val = hcalc_val + np.dot(xval, w)

            output_str = ''
            for fk, wi, n in zip(feature_keys, w, nocc):
                output_str += '{:<5} {: .4f} {}\n'.format(fk, wi, n)
            param_dict = {fk: wi for fk, wi in zip(feature_keys, w)}

            _postprocess(hexpt, hbac, hexpt_train, hbac_train, hexpt_val,
                         hbac_val, output_str, param_dict)
    else:
        # Technically, it's possible that some atom type is not present in
        # mols_train, but that's very unlikely.
        all_atom_symbols = list(
            {atom.element.symbol for mol in mols for atom in mol.atoms})
        all_atom_symbols.sort()
        nelements = len(all_atom_symbols)

        # Arbitrarily large bounds, just so that we can use bounds in
        # global minimization.
        low, high = -1e6, 1e6
        if mults:
            w0 = np.zeros(3 * nelements + 1) + 1e-6  # Order is a, aii, b, k
            wmin = ([low] * nelements + [0] * nelements
                    + [low] * nelements + [low])
            wmax = [high] * (3 * nelements + 1)
        else:
            w0 = np.zeros(3 * nelements) + 1e-6  # Order is a, aii, b
            wmin = [low] * nelements + [0] * nelements + [low] * nelements
            wmax = [high] * 3 * nelements
        bounds = [(l, h) for l, h in zip(wmin, wmax)]

        data = (mols, hexpt, hcalc, weights)
        folded_data = make_folds(folds, *data)
        for fold_num in range(folds):
            print('Fold {}'.format(fold_num + 1))
            if folds > 1:
                split_data = concat_folds(fold_num, *folded_data)
            else:
                # Split off validation data
                split_data = split_arrays(1.0 - val_split, *data)
            (mols_train, mols_val, hexpt_train, hexpt_val,
             hcalc_train, hcalc_val, weights_train, _) = split_data

            weight_mat = (np.diag(weights_train)
                          if np.size(weights_train) > 0
                          else np.eye(len(mols_train)))
            minimizer_kwargs = dict(
                method='SLSQP',  # Gradient-free minimization is a lot faster
                args=(all_atom_symbols, mols_train, hexpt_train, hcalc_train,
                      weight_mat, lam),
                bounds=bounds)
            if global_min:
                take_step = RandomDisplacementBounds(wmin, wmax)
                res = scipy.optimize.basinhopping(
                    objfun, w0, niter=global_min_iter,
                    minimizer_kwargs=minimizer_kwargs,
                    take_step=take_step, disp=True)
            else:
                res = scipy.optimize.minimize(objfun, w0, **minimizer_kwargs)
            w = res.x
            print(res.fun)

            hbac = get_hbac(w, all_atom_symbols, mols, hcalc)
            hbac_train = get_hbac(w, all_atom_symbols, mols_train,
                                  hcalc_train)
            hbac_val = get_hbac(w, all_atom_symbols, mols_val, hcalc_val)

            a, aii, b, k = get_params(w, all_atom_symbols)
            param_dict = {'a': a, 'aii': aii, 'b': b, 'k': k}
            output_str = 'Atom   A        B        Aii\n'
            for s, wi in a.items():
                output_str += '  {:<3} {: .4f} {: .4f} {: .4f}\n'.format(
                    s, wi, b[s], aii[s])
            output_str += 'K = {:.4f}\n'.format(k)

            _postprocess(hexpt, hbac, hexpt_train, hbac_train, hexpt_val,
                         hbac_val, output_str, param_dict)

    # Display the 10 worst ones after fitting (averaged across models)
    hbac_mean = np.mean(hbacs, axis=0)
    diff_new = hexpt - hbac_mean
    print('Worst ones after fitting:')
    large_diff = [(ids[i], abs(d), hexpt[i], hbac_mean[i])
                  for i, d in enumerate(diff_new)]
    large_diff.sort(key=lambda _x: _x[1], reverse=True)
    for ident, d, hexpti, hbaci in large_diff[:10][::-1]:
        print('{} {: .2f} {: .2f} {: .2f}'.format(ident, d, hexpti, hbaci))

    with open(os.path.join(out_dir, 'worst_errors.txt'), 'a') as f:
        f.write('Worst ones after fitting:\n')
        for ident, d, hexpti, hbaci in large_diff[:10][::-1]:
            f.write('{} {: .2f} {: .2f} {: .2f}\n'.format(
                ident, d, hexpti, hbaci))

    # Average the error metrics across folds/models.
    rmse = np.mean(rmses)
    mae = np.mean(maes)
    rmse_train = np.mean(rmses_train)
    mae_train = np.mean(maes_train)
    rmse_val = np.mean(rmses_val)
    mae_val = np.mean(maes_val)
    print('\nAverages:')
    print('RMSE train/val: {:.2f}/{:.2f}'.format(rmse_train, rmse_val))
    print('MAE train/val: {:.2f}/{:.2f}'.format(mae_train, mae_val))
    print('Total RMSE before/after fitting: {:.2f}/{:.2f}'.format(
        rmse_prev, rmse))
    print('Total MAE before/after fitting: {:.2f}/{:.2f}'.format(
        mae_prev, mae))

    error_path = os.path.join(out_dir, 'errors.txt')
    bac_path = os.path.join(out_dir, 'bacs.txt')
    json_path = os.path.join(out_dir, 'bacs.json')
    with open(error_path, 'w') as f:
        f.write('RMSE before/after: {:.2f}/{:.2f}\n'.format(rmse_prev, rmse))
        f.write('MAE before/after: {:.2f}/{:.2f}\n'.format(mae_prev, mae))
    with open(bac_path, 'w') as f:
        for output_str in output_strs:
            f.write(output_str + '\n')
    with open(json_path, 'w') as f:
        if len(param_dicts) > 1:
            json.dump(param_dicts, f, indent=4, separators=(',', ': '))
        else:
            json.dump(param_dicts[0], f, indent=4, separators=(',', ': '))

    return collections.OrderedDict(zip(ids, hbac_mean))
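# NOTE: lin_reg, calc_rmse, and calc_mae are called in fit_bac but not
# defined in this file. Minimal sketches consistent with the call sites
# above; the repo's actual lin_reg (including any use of the lam
# regularization parameter) may differ.
def lin_reg(x, y, weight_mat):
    # Weighted least squares: solve (X^T W X) w = X^T W y.
    xtw = np.dot(x.T, weight_mat)
    w = np.linalg.solve(np.dot(xtw, x), np.dot(xtw, y))
    ypred = np.dot(x, w)
    return w, ypred

def calc_rmse(h1, h2):
    return np.sqrt(np.mean((np.asarray(h1) - np.asarray(h2)) ** 2))

def calc_mae(h1, h2):
    return np.mean(np.abs(np.asarray(h1) - np.asarray(h2)))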