# Load train/val/test splits for the configured dataset.
# Non-IMDB datasets ship as pre-split CSVs; both IMDB variants load from a
# single raw text file via DataLoader and differ only in that file's path,
# so the two original duplicate branches are merged into one.
if config.dataset_path != 'imdb' and config.dataset_path != 'imdb_small':
    # Generic CSV dataset: expects 6_train/6_test/6_val CSVs under dataset_path.
    train = pd.read_csv(config.dataset_path + '/6_train.csv')
    test = pd.read_csv(config.dataset_path + '/6_test.csv')
    val = pd.read_csv(config.dataset_path + '/6_val.csv')
    X_train, meta_train, Y_train, label_encoder_train = splitFeatures(train)
    X_test, meta_test, Y_test, label_encoder_test = splitFeatures(test)
    X_val, meta_val, Y_val, label_encoder_val = splitFeatures(val)
    # Take the max class count across splits in case one split is missing a class.
    n_classes = max(len(label_encoder_val.classes_),
                    len(label_encoder_test.classes_),
                    len(label_encoder_train.classes_))
else:
    # IMDB variants: identical loading pipeline, only the data file differs.
    data_path = ('../imdb_small/budgetandactors2.txt'
                 if config.dataset_path == 'imdb_small'
                 else './data/imdb/budgetandactors.txt')
    dl = DataLoader()
    X_train, X_val, X_test, Y_train, Y_val, Y_test, _, _, _ = dl.load_data(
        data_path=data_path)
    # Binarize labels: anything other than 1 becomes 0.
    Y_val = [1 if y == 1 else 0 for y in Y_val]
    Y_test = [1 if y == 1 else 0 for y in Y_test]
    n_classes = 2
# print("X_val", X_val)
# print("X_train", X_train)
# print("Y_val", Y_val)
def main():
    """Run the iterative synthesize-prune-verify labeling loop on IMDB.

    Loads the IMDB primitive matrices, generates and prunes heuristics,
    trains a generative verifier, then iterates feedback rounds (up to 23),
    plotting the training-marginal distribution for the first six rounds
    and printing final accuracy/coverage metrics.
    """
    warnings.filterwarnings("ignore")
    dataset = 'imdb'
    from data.loader import DataLoader
    dl = DataLoader()
    train_primitive_matrix, val_primitive_matrix, test_primitive_matrix, \
        train_ground, val_ground, test_ground, _, _, _ = dl.load_data(dataset=dataset)

    # Initial heuristic generation pass (decision-tree model, keep top 3).
    hg = HeuristicGenerator(train_primitive_matrix, val_primitive_matrix,
                            val_ground, train_ground, b=0.5)
    hg.run_synthesizer(max_cardinality=1, idx=None, keep=3, model='dt')

    syn = Synthesizer(val_primitive_matrix, val_ground, b=0.5)
    heuristics, feature_inputs = syn.generate_heuristics('nn', 1)
    print("Total Heuristics Generated: ", np.shape(heuristics)[1])

    optimal_betas = syn.find_optimal_beta(heuristics[0], val_primitive_matrix,
                                          feature_inputs[0], val_ground)
    top_idx = hg.prune_heuristics(heuristics, feature_inputs, keep=3)
    print('Features chosen heuristics are based on: ', top_idx)

    # Train the generative model and locate low-confidence validation points.
    verifier = Verifier(hg.L_train, hg.L_val, val_ground, has_snorkel=False)
    verifier.train_gen_model()
    verifier.assign_marginals()
    feedback_idx = verifier.find_vague_points(gamma=0.1, b=0.5)
    print('Percentage of Low Confidence Points: ',
          np.shape(feedback_idx)[0] / float(np.shape(val_ground)[0]))

    validation_accuracy = []
    training_accuracy = []
    validation_coverage = []
    training_coverage = []
    training_marginals = []
    idx = None

    # Fresh generator for the iterative loop.
    hg = HeuristicGenerator(train_primitive_matrix, val_primitive_matrix,
                            val_ground, train_ground, b=0.5)
    plt.figure(figsize=(12, 6))
    for i in range(3, 26):
        if (i - 2) % 5 == 0:
            # FIX: was a Python 2 print statement (SyntaxError under Python 3).
            print("Running iteration: ", str(i - 2))

        # Repeat synthesize-prune-verify at each iteration; the first round
        # keeps 3 heuristics, later rounds add 1 at a time.
        if i == 3:
            hg.run_synthesizer(max_cardinality=1, idx=idx, keep=3, model='dt')
        else:
            hg.run_synthesizer(max_cardinality=1, idx=idx, keep=1, model='dt')
        hg.run_verifier()

        # Save evaluation metrics.
        va, ta, vc, tc = hg.evaluate()
        validation_accuracy.append(va)
        training_accuracy.append(ta)
        training_marginals.append(hg.vf.train_marginals)
        validation_coverage.append(vc)
        training_coverage.append(tc)

        # Plot training-set label distribution for the first six iterations.
        if i <= 8:
            plt.subplot(2, 3, i - 2)
            plt.hist(training_marginals[-1], bins=10, range=(0.0, 1.0))
            plt.title('Iteration ' + str(i - 2))
            plt.xlim([0.0, 1.0])
            plt.ylim([0, 825])

        # Find low-confidence datapoints in the labeled set.
        hg.find_feedback()
        idx = hg.feedback_idx

        # Stop the iterative process when there are no low-confidence labels.
        # NOTE(review): `== []` kept as-is; feedback_idx may be an ndarray,
        # where `not idx` would behave differently — confirm its type.
        if idx == []:
            break

    plt.tight_layout()
    plt.hist(training_marginals[-1], bins=10, range=(0.0, 1.0))
    plt.title('Final Distribution')
    print("Program Synthesis Train Accuracy: ", training_accuracy[-1])
    print("Program Synthesis Train Coverage: ", training_coverage[-1])
    print("Program Synthesis Validation Accuracy: ", validation_accuracy[-1])
# Grid search over LSTM batch sizes and epoch counts on the IMDB dataset,
# training on Reef-generated labels and evaluating against ground truth.
# NOTE(review): this chunk is truncated — the lstm_simple(...) call at the
# end is cut off mid-argument-list; the remainder lies outside this view.
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from sklearn import *
from lstm.imdb_lstm import *
import matplotlib.pyplot as plt

dataset = 'imdb'
from data.loader import DataLoader
dl = DataLoader()
# Only the ground labels and raw text are used here; primitive matrices are discarded.
_, _, _, train_ground, val_ground, test_ground, train_text, val_text, test_text = dl.load_data(
    dataset=dataset)

# Reef-produced probabilistic training labels, precomputed and saved to disk.
train_reef = np.load('./data/imdb_reef.npy')

# Per-configuration metric accumulators.
f1_all = []
pr_all = []
re_all = []
val_acc_all = []

# Hyperparameter grid: batch size x number of epochs.
bs_arr = [64, 128, 256]
n_epochs_arr = [5, 10, 25]
for bs in bs_arr:
    for n in n_epochs_arr:
        y_pred = lstm_simple(train_text, train_reef, val_text, val_ground,
# Setup script: load the IMDB primitive matrices and ground labels, then
# configure the heuristic-generation bucket size b = 1 / class_count.
# NOTE(review): this chunk is truncated — the trailing '''-quoted block of
# disabled code is never closed within this view.
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

dataset = 'imdb'
from data.loader import DataLoader
dl = DataLoader()
# Alternative loaders kept for reference (spreadsheet / synthetic datasets):
#train_primitive_matrix, val_primitive_matrix, test_primitive_matrix, \
#train_ground, val_ground, test_ground, class_count = dl.load_data_sheet()
train_primitive_matrix, val_primitive_matrix, test_primitive_matrix, \
train_ground, val_ground, test_ground, _, _, _ = dl.load_data(dataset=dataset)
# IMDB sentiment is binary.
class_count = 2
#train_primitive_matrix, val_primitive_matrix, test_primitive_matrix, \
#rain_ground, val_ground, test_ground, class_count = dl.load_data_synt(4)

# NOTE(review): with class_count = 2 this is integer-ambiguous — under
# Python 2 semantics 1 / 2 == 0; under Python 3 it is 0.5. Confirm which
# interpreter this file targets.
b = 1 / class_count

from program_synthesis.heuristic_generator import HeuristicGenerator
'''hg = HeuristicGenerator(train_primitive_matrix, val_primitive_matrix,
val_ground, train_ground, b=b, class_count = class_count)
hg.run_synthesizer(max_cardinality=1, idx=None, keep=3, model='lr')
from program_synthesis.synthesizer import Synthesizer
syn = Synthesizer(val_primitive_matrix, val_ground, b=b)
heuristics, feature_inputs = syn.generate_heuristics('lr', 1)