# Load a pretrained encoder/decoder pair and switch both nets to eval mode,
# then load precomputed embeddings for the reference and structure models.
models, optimizers, _, _, opt = model_utils.load_model(opt.model_name, opt)
enc = models['enc']
dec = models['dec']
enc.train(False)  # eval mode: freeze dropout/batchnorm behavior
dec.train(False)
# Drop the dicts so everything except enc/dec can be garbage-collected.
models = None
optimizers = None
print('Done loading model.')

# NOTE(review): both paths below point at model_struct_dir — the "ref"
# embeddings presumably should come from a reference-model directory;
# confirm this is not a copy-paste slip.
embeddings_ref_path = model_struct_dir + os.sep + 'embeddings.pkl'
embeddings_ref = model_utils.load_embeddings(embeddings_ref_path)

embeddings_struct_path = model_struct_dir + os.sep + 'embeddings.pkl'
embeddings_struct = model_utils.load_embeddings(embeddings_struct_path)

opt.batch_size = args.batch_size
gpu_id = opt.gpu_ids[0]  # primary GPU for inference

ndat = dp.get_n_dat('train')    # number of training examples in the data provider
nlabels = dp.get_n_classes()    # number of structure classes

# Accumulators presumably filled by a downstream per-image loop (not shown here).
img_paths_all = list()
err_save_paths = list()
label_names_all = list()
class_list = np.arange(0, nlabels)
# Pull the structure model's encoder/decoder out of the loaded dict and put
# them in eval mode; release the remaining references.
enc = models['enc']
dec = models['dec']
enc.train(False)  # eval mode: freeze dropout/batchnorm behavior
dec.train(False)
models = None
optimizers = None
print('Done loading model.')

# Get the embeddings for the structure localization
opt.batch_size = 100
embeddings_path = opt.save_dir + os.sep + 'embeddings_struct.pyt'
embeddings = model_utils.load_embeddings(embeddings_path, enc, dp, opt)
print('Done loading embeddings.')

#######
### Main Loop
#######

# NOTE(review): mid-script imports; `pdb` looks like a leftover debugging aid,
# kept as-is since only part of the file is visible here.
import pdb
from aicsimage.io import omeTifWriter
from imgToProjection import imgtoprojection
from IPython.core.display import display
import PIL.Image
import matplotlib.pyplot as plt
import scipy.misc
# Tail of the reference-model training loop: record this iteration's errors,
# and when maybe_save decides to checkpoint, flush the accumulated z samples.
# NOTE(review): these two statements are almost certainly inside a training
# loop whose header lies outside this chunk — indentation reconstructed.
logger.add((epoch, this_iter) + errors + (deltaT, ))
if model_utils.maybe_save(epoch, epoch_next, models, optimizers, logger,
                          zAll, dp, opt):
    zAll = list()

#######
### DONE TRAINING REFERENCE MODEL
#######

#######
### TRAIN STRUCTURE MODEL
#######

# Compute/load reference embeddings with the trained encoder, then drop the
# reference model so its memory can be reclaimed before structure training.
embeddings_path = opt.save_dir + os.sep + 'embeddings.pkl'
embeddings = model_utils.load_embeddings(embeddings_path, models['enc'], dp, opt)

models = None
optimizers = None


def get_ref(self, inds, train_or_test='train'):
    # Look up precomputed reference embeddings by row index for the given
    # split ('train' or 'test'); returns a tensor of embeddings.
    inds = torch.LongTensor(inds)
    return self.embeddings[train_or_test][inds]


dp.embeddings = embeddings

# do this thing to bind the get_ref method to the dataprovider object
import types
dp.get_ref = types.MethodType(get_ref, dp)
# Join mitosis labels onto the data provider's csv table, then pick one
# representative embedding per mitosis label class.
df_data = dp.csv_data
df_data = df_data.merge(
    df_mito,
    on=['inputFolder', 'inputFilename', 'outputThisCellIndex'],
    how='left')
# The merge suffixes the overlapping label column; restore its plain name.
df_data = df_data.rename(columns={'MitosisLabel_y': 'MitosisLabel'})

# Rows without a mitosis annotation come back NaN from the left join.
df_data_labeled = df_data[~np.isnan(df_data['MitosisLabel'])]
labels = df_data_labeled['MitosisLabel']
# labels[labels > 2] = 3
ulabels = np.unique(labels)

embeddings_shape = model_utils.load_embeddings(ref_dir + os.sep + 'embeddings.pkl')

use_train_or_test = 'test'
# Subset the merged table to the rows of the chosen split, in split order,
# so positions line up with the embedding rows for that split.
df_train = df_data.iloc[dp.data[use_train_or_test]['inds']]
# df_train['MitosisLabel'][df_train['MitosisLabel']>2] = 3

positions = list()
for label in ulabels:
    # np.where with one argument returns an index tuple; used directly to
    # index the split's embedding tensor — presumably a torch tensor, hence
    # the .numpy() conversion below (TODO confirm).
    label_inds = np.where(label == df_train['MitosisLabel'])
    embeddings = embeddings_shape[use_train_or_test][label_inds].numpy()
    # Medoid: the member with minimum total cityblock distance to the rest.
    D = squareform(pdist(embeddings, metric='cityblock'))
    positions.append(embeddings[np.argmin(np.sum(D, axis=0))])
    # positions.append(np.mean(embeddings,axis=0))

positions = np.vstack(positions)
def main(args=None):
    """Train and evaluate a comment/code-token association model.

    Parses CLI args, loads embeddings, ontology files and the dataset,
    optionally augments training data with "deleted" associations for the
    more_data_* variants, preprocesses the data for the selected model
    family, trains the model, and prints train/valid/test classification
    results (optionally dumping per-example predictions to OUTPUT_FILE).

    NOTE(review): Python 2 source (print statements). Indentation was
    reconstructed from a whitespace-mangled original.
    """
    args = parser.parse_args(args)
    if args.model not in MODEL_TYPES:
        raise ValueError('Invalid model type')
    if not args.model_name:
        raise ValueError('Model name must be provided')

    embeddings = load_embeddings('../embeddings/')

    with open('../ontology/java_utils_main.json') as json_file:
        java_utils = json.load(json_file)
    with open('../ontology/java_utils_methods.json') as json_file:
        java_utils_methods = json.load(json_file)
    with open(FULL_DATASET) as json_file:
        data = json.load(json_file)

    train_associations = load_data_from_object(data['train'])
    test_associations = load_data_from_object(data['test'])
    valid_associations = load_data_from_object(data['valid'])
    deleted_associations = load_data_from_object(data['deleted'])

    # more_data_* variants extend the training set with examples from the
    # "deleted" pool; --delete_size bounds how many are pulled in.
    if args.model == 'more_data_feedforward' or args.model == 'more_data_crf':
        if not args.delete_size or int(
                args.delete_size) > len(deleted_associations):
            raise ValueError('Delete size must be provided. Max size is ' +
                             str(len(deleted_associations)))
        train_associations = train_associations + deleted_associations[
            0:int(args.delete_size)]

    with open(FULL_ANNOTATIONS) as json_file:
        annotations = json.load(json_file)

    if not args.oracle:
        # Use annotated data for evaluation.
        # Overwrite each test candidate's is_associated flag with the human
        # annotation, keyed by token + line index + position index.
        for association in test_associations:
            annotation_id = association.annotation_id
            for candidate in association.full_code_representation:
                token = candidate['token']
                line_number = candidate['line_idx']
                position = candidate['pos_idx']
                key = token + '-' + str(line_number) + '-' + str(position)
                if key in annotations[str(annotation_id)]:
                    if annotations[str(annotation_id)][key] == 'True':
                        candidate['is_associated'] = True
                    else:
                        candidate['is_associated'] = False

    # Feature preprocessing: CRF models get CRF-specific features; neural
    # models get the standard features; baselines need no preprocessing.
    if 'crf' in args.model:
        process_crf_dataset(train_associations, embeddings, java_utils,
                            java_utils_methods)
        process_crf_dataset(test_associations, embeddings, java_utils,
                            java_utils_methods)
        process_crf_dataset(valid_associations, embeddings, java_utils,
                            java_utils_methods)
    elif 'baseline' not in args.model:
        process_dataset(train_associations, embeddings, java_utils,
                        java_utils_methods)
        process_dataset(test_associations, embeddings, java_utils,
                        java_utils_methods)
        process_dataset(valid_associations, embeddings, java_utils,
                        java_utils_methods)

    print "Train: " + str(len(train_associations))
    print "Test: " + str(len(test_associations))
    print "Valid: " + str(len(valid_associations))

    # Instantiate the requested model; constructing it runs training for the
    # learned models (CRF / feedforward), while baselines are rule-based.
    if args.model == 'crf' or args.model == 'more_data_crf':
        model = CRFModel(train_associations, test_associations,
                         valid_associations, args)
    elif args.model == 'feedforward' or args.model == 'more_data_feedforward':
        model = FeedForwardNN(train_associations, test_associations,
                              valid_associations, args)
    elif args.model == 'subtoken_matching_baseline':
        model = SubtokenMatchingBaseline(train_associations,
                                         test_associations,
                                         valid_associations)
    elif args.model == 'return_line_baseline':
        model = ReturnLineBaseline(train_associations, test_associations,
                                   valid_associations)
    elif args.model == 'random_baseline':
        model = RandomBaseline(train_associations, test_associations,
                               valid_associations)
    elif args.model == 'majority_class_random_baseline':
        model = MajorityClassRandomBaseline(train_associations,
                                            test_associations,
                                            valid_associations)
    else:
        raise ValueError('Unable to identify model type')

    print("Evaluation:")
    print("------------------")
    print("Train:")
    model.classify(train_associations)
    print("------------------")
    print("Valid:")
    model.classify(valid_associations)
    print("------------------")
    print("Test:")
    model.classify(test_associations)
    print("------------------")
    sys.stdout.flush()

    if args.v:
        # Verbose mode: dump each test example's NP chunk, comment line,
        # full code, and the predicted vs. gold associated tokens.
        with open(OUTPUT_FILE, 'w+') as f:
            for association in test_associations:
                f.write("NP: " + association.np_chunks[0] + '\n')
                f.write("Comment line: " + association.comment_line + '\n\n')
                f.write(association.full_code.encode('utf-8') + '\n\n')
                predicted = [
                    str(c['token'])
                    for c in association.full_code_representation
                    if c['prediction'] == 1
                ]
                gold = [
                    str(c['token'])
                    for c in association.full_code_representation
                    if c['is_associated']
                ]
                f.write("Predicted: " + str(predicted) + '\n\n')
                f.write("Gold: " + str(gold) + '\n\n')
                f.write(
                    "Candidates: " +
                    str([str(c) for c in association.candidate_code_tokens]) +
                    '\n')
                f.write('***************************\n\n')