Пример #1
0
    def getTrainedModel1(self):
        """Report labeling-function quality, then fit a label model on the train split.

        Reads ``self.LF_set`` (its ``'comments'`` and ``'resolution'``
        entries), ``self.LFs``, ``self.LF_names`` and ``self.train``.
        In order, the method:

        1. builds the LF vote matrix for the LF set and displays the
           ``lf_summary`` table against the LF-set labels;
        2. prints the label coverage and a majority-vote classification
           report for the LF set;
        3. trains a two-class ``LabelModel`` on the train-split vote matrix;
        4. prints the scores returned by ``model.score``.

        Returns:
            tuple: ``(trainer, Y_train)`` -- the value returned by
            ``model.train_model`` and the label array computed below.
        """
        # Matrix of LF votes, one row per comment ticket in the LF set.
        LF_matrix = self.make_Ls_matrix(self.LF_set['comments'], self.LFs)

        # True labels for the LF set.
        Y_LF_set = np.array(self.LF_set['resolution'])

        display(
            lf_summary(sparse.csr_matrix(LF_matrix),
                       Y=Y_LF_set,
                       lf_names=self.LF_names.values()))

        # Formatted through an f-string: gluing the coverage value onto a str
        # with ``+`` raises TypeError whenever that value is numeric.
        print(f"label coverage: {label_coverage(LF_matrix)}")

        mv = MajorityLabelVoter()
        Y_train_majority_votes = mv.predict(LF_matrix)
        report = classification_report(Y_LF_set, Y_train_majority_votes)
        print(f"classification report:\n{report}")

        Ls_train = self.make_Ls_matrix(self.train, self.LFs)

        # You can tune the learning rate and class balance.
        model = LabelModel(k=2, seed=123)
        trainer = model.train_model(Ls_train,
                                    n_epochs=2000,
                                    print_every=1000,
                                    lr=0.0001,
                                    class_balance=np.array([0.2, 0.8]))

        # NOTE(review): ``+`` with a NumPy array is element-wise addition, so
        # this adds the LF-set gold labels to the train-split predictions.
        # The two arrays come from different splits -- confirm this is the
        # intended combination (e.g. rather than a concatenation).
        Y_train = model.predict(Ls_train) + Y_LF_set

        print('Trained Label Model Metrics:')
        # NOTE(review): only index 1 of each array is passed to ``score``;
        # confirm this is the intended evaluation subset.
        scores = model.score((Ls_train[1], Y_train[1]),
                             metric=['accuracy', 'precision', 'recall', 'f1'])
        print(scores)

        return trainer, Y_train
Пример #2
0
def train_model(args):
    """Fit a label model on LF votes, then train and score an LSTM end model.

    The label model is trained on the train-split votes, compared against a
    majority-vote baseline on the dev split, and its ``predict_proba`` output
    is used as the training target of the end model.

    Args:
        args: Parsed command-line namespace. ``batch_size``, ``num_workers``,
            ``weight_decay``, ``lr`` and ``n_epochs`` are read here; the
            namespace is also forwarded to ``load_labels`` and
            ``load_dataset``.

    Returns:
        None. The results of the ``score`` calls are not returned.
    """
    hidden_size = 128
    num_classes = 2
    encode_dim = 1000  # using get_frm_output_size()
    metrics = ['accuracy', 'precision', 'recall', 'f1']

    L, Y = load_labels(args)

    # ---- Label model ----
    # Labelling-function analysis on the dev split.
    print(lf_summary(L["dev"], Y=Y["dev"]))

    # Train the label model.
    label_model = LabelModel(k=num_classes, seed=123)
    label_model.train_model(L["train"],
                            Y["dev"],
                            n_epochs=500,
                            log_train_every=50)

    # Evaluate the label model on dev.
    print('Trained Label Model Metrics:')
    label_model.score((L["dev"], Y["dev"]), metric=metrics)

    # Majority vote over the LFs, for comparison.
    mv = MajorityLabelVoter(seed=123)
    print('Majority Label Voter Metrics:')
    mv.score((L["dev"], Y["dev"]), metric=metrics)

    # Label-model output used as the end model's training labels.
    Ytrain_p = label_model.predict_proba(L["train"])

    # ---- End model ----
    # Datasets and data loaders.
    train, dev, test = load_dataset(args, Ytrain_p, Y["dev"], Y["test"])
    data_loader = get_data_loader(train, dev, test, args.batch_size,
                                  args.num_workers)

    # Input encoder class handed to the LSTM module.
    cnn_encoder = FrameEncoderOC

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    lstm_module = LSTMModule(
        encode_dim,
        hidden_size,
        bidirectional=False,
        verbose=False,
        lstm_reduction="attention",
        encoder_class=cnn_encoder,
    )

    end_model = EndModel(
        input_module=lstm_module,
        layer_out_dims=[hidden_size, num_classes],
        optimizer="adam",
        batchnorm=True,
        seed=123,
        verbose=False,
        device=device,
    )

    dropout = 0.4
    end_model.train_model(
        train_data=data_loader["train"],
        valid_data=data_loader["dev"],
        l2=args.weight_decay,
        lr=args.lr,
        n_epochs=args.n_epochs,
        log_train_every=1,
        verbose=True,
        progress_bar=True,
        loss_weights=[0.45, 0.55],
        # Boolean, matching the constructor argument above; this used to be
        # the string 'True', which is truthy but not the same value.
        batchnorm=True,
        input_dropout=dropout,
        middle_dropout=dropout,
    )

    # Evaluate the end model on dev.
    end_model.score(data_loader["dev"], verbose=True, metric=metrics)
Пример #3
0
def train_model(args):
    """Compare label-model variants on the LF votes returned by ``load_labels``.

    Runs, in order:

    1. ``lf_summary`` and a ``MajorityLabelVoter`` baseline on the dev split;
    2. a ``LabelModel`` trained on the train-split votes;
    3. a "naive" ``DPLabelModel`` (``T=1``, no edges), scored on dev;
    4. ten ``DPLabelModel`` instances (seeds 0-9) over five subsampled frames,
       with edges from each labeling-function slot to the same slot in the
       next frame, each scored on dev.

    Args:
        args: Parsed command-line namespace; here it is only forwarded to
            ``load_labels``.

    Returns:
        None. The ``DPLabelModel`` scores are printed.
    """
    num_classes = 2
    metrics = ['accuracy', 'precision', 'recall', 'f1']

    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = 'cpu'

    L, Y = load_labels(args)

    # ---- Label model ----
    # Labelling-function analysis on the dev split.
    print(lf_summary(L["dev"], Y=Y["dev"]))

    # Majority vote over the LFs.
    mv = MajorityLabelVoter(seed=123)
    print('Majority Label Voter Metrics:')
    mv.score((L["dev"], Y["dev"]), metric=metrics)

    # Label model without temporal modelling.
    label_model = LabelModel(k=num_classes, seed=123)
    label_model.train_model(L["train"],
                            Y["dev"],
                            n_epochs=500,
                            log_train_every=50)

    print('Trained Label Model Metrics:')
    label_model.score((L["dev"], Y["dev"]), metric=metrics)

    # Densify each vote matrix once; both arrays are only read afterwards.
    L_train_dense = np.array(L["train"].todense())
    L_dev_dense = np.array(L["dev"].todense())

    # ---- Naive DP label model (no temporal structure) ----
    m_per_task = L_train_dense.shape[1]
    MRI_data_naive = {
        'Li_train': torch.FloatTensor(L_train_dense.astype('int_')),
        'Li_dev': torch.FloatTensor(L_dev_dense),
        'R_dev': Y["dev"],
    }
    MRI_data_naive['class_balance'] = torch.FloatTensor([0.5, 0.5]).to(device)

    naive_model = DPLabelModel(
        m=m_per_task,
        T=1,
        edges=[],
        coverage_sets=[[0]] * m_per_task,
        mu_sharing=[[i] for i in range(m_per_task)],
        phi_sharing=[],
        device=device,
        seed=0)

    optimize(naive_model,
             L_hat=MRI_data_naive['Li_train'],
             num_iter=300,
             lr=1e-3,
             momentum=0.8,
             clamp=True,
             seed=0)

    # ``.cpu()`` is a no-op for CPU tensors and is required before
    # ``.numpy()`` if the prediction lives on the GPU.
    R_pred = naive_model.predict(MRI_data_naive['Li_dev']).data.cpu().numpy()
    R_pred = 2 - R_pred

    for metric in ['accuracy', 'f1', 'recall', 'precision']:
        score = metric_score(MRI_data_naive['R_dev'], R_pred, metric)
        print(f"{metric.capitalize()}: {score:.3f}")

    # ---- DP label model with temporal modelling ----
    # Reshape the flat rows into (patient, frame, LF) blocks.
    num_frames = 50
    n_patients_train = round(L_train_dense.shape[0] / num_frames)
    n_patients_dev = round(L_dev_dense.shape[0] / num_frames)
    Ltrain = np.reshape(L_train_dense, (n_patients_train, num_frames, -1))
    Ldev = np.reshape(L_dev_dense, (n_patients_dev, num_frames, -1))
    Ydev = np.reshape(Y["dev"], (n_patients_dev, num_frames))

    # Subsample five evenly spaced frames (0-based indices 2, 12, 22, 32, 42).
    indices = np.linspace(2, 42, 5).astype(int)
    m_per_task = 5
    T = 5

    Ltrain_small = Ltrain[:, indices, :]
    Ldev_small = Ldev[:, indices, :]
    Ydev_small = Ydev[:, indices]

    Ltrain_small = np.reshape(Ltrain_small,
                              ((n_patients_train * T), m_per_task))
    Ldev_small = np.reshape(Ldev_small, ((n_patients_dev * T), m_per_task))
    Ydev_small = np.reshape(Ydev_small, ((n_patients_dev * T), ))

    # NOTE(review): R_dev keeps every T-th dev label and scales it by
    # 2**T - 1; confirm the label encoding this expects.
    MRI_data_temporal = {
        'Li_train':
        torch.LongTensor(Ltrain_small).view(n_patients_train,
                                            (m_per_task * T)),
        'Li_dev':
        torch.LongTensor(Ldev_small).view(n_patients_dev, (m_per_task * T)),
        'R_dev':
        torch.LongTensor(Ydev_small)[::T] * (2**T - 1),
        'm':
        m_per_task * T,
        'T':
        T,
    }

    # Move R_dev to ``device`` before comparing it with the ``arange`` tensor
    # created there; comparing a CPU tensor with a CUDA tensor fails.
    R_dev_on_device = MRI_data_temporal['R_dev'].to(device)
    label_values = torch.arange(2**T, device=device)
    MRI_data_temporal['class_balance'] = normalize(
        (R_dev_on_device.unsqueeze(1) == label_values.unsqueeze(0))
        .sum(0).float(),
        dim=0,
        p=1)

    max_seed = 10
    temporal_models = []
    for seed in range(max_seed):
        markov_model = DPLabelModel(
            m=m_per_task * T,
            T=T,
            edges=[(i, i + m_per_task) for i in range((T - 1) * m_per_task)],
            coverage_sets=[[t] for t in range(T) for _ in range(m_per_task)],
            mu_sharing=[[t * m_per_task + i for t in range(T)]
                        for i in range(m_per_task)],
            phi_sharing=[[(t * m_per_task + i, (t + 1) * m_per_task + i)
                          for t in range(T - 1)] for i in range(m_per_task)],
            device=device,
            class_balance=MRI_data_temporal['class_balance'],
            seed=seed)
        optimize(markov_model,
                 L_hat=MRI_data_temporal['Li_train'],
                 num_iter=1000,
                 lr=1e-5,
                 momentum=0.8,
                 clamp=True,
                 verbose=False,
                 seed=seed)
        temporal_models.append(markov_model)

    # Score every seed only after all models have been trained.
    for seed, model in enumerate(temporal_models):
        R_pred = model.predict(MRI_data_temporal['Li_dev'].cpu())
        F1 = metric_score(MRI_data_temporal['R_dev'].cpu() > 0,
                          R_pred.cpu() > 0, 'f1')
        accuracy = metric_score(MRI_data_temporal['R_dev'].cpu(), R_pred.cpu(),
                                'accuracy')
        print(f"seed={seed}  accuracy={accuracy:.3f}  F1={F1:.3f}")
    list(acc_df.transpose().index),
    acc_df.transpose()[0],
    "bo-",
    label="DaG",
)
plt.legend()

# In[23]:

# Fit the label model on columns 0-6 of the vote matrix.
lf_columns = slice(0, 7)
label_model.train_model(
    correct_L[:, lf_columns],
    n_epochs=1000,
    print_every=200,
    seed=100,
    lr=0.01,
    l2=1.08,
)

# Score against the curated labels of the train split: a positive
# ``curated_dsh`` value becomes 1, anything else becomes 2.
label_model.score((
    correct_L_train[:, lf_columns],
    candidate_dfs['train']['curated_dsh']
    .apply(lambda x: 1 if x > 0 else 2)
    .values,
))

# In[24]:

# Collect (lf_name, estimated parameter, parameter label) rows from the
# label model's ``mu`` tensor, two rows of ``mu`` per labeling function.
mu_values = label_model.mu.detach().clone().numpy()

# Pair each LF name with the index of its first row in ``mu``.
lf_stats = zip(lf_names, range(0, mu_values.shape[0], 2))

# Clip with the array's own method: the ``pd.np`` alias was deprecated in
# pandas 1.0 and removed in pandas 2.0.
estimated_param = mu_values.clip(0.01, 0.99)
value_type = ["P(L=1|Y=1)", "P(L=1|Y=2)", "P(L=2|Y=1)", "P(L=2|Y=2)"]
data = []

for lf_name, lf_index in lf_stats:
    # Two consecutive rows of ``mu`` flattened into one sequence of values.
    lf_params = estimated_param[lf_index:lf_index + 2, :].flatten()
    data.extend(zip([lf_name] * len(value_type), lf_params, value_type))
Пример #5
0
# Count the test labels once and reuse the counter below.
label_counts = Counter(Y_test)
balance2 = label_counts.values()

# Normalise the count in each ``balance`` entry by the total number of test
# labels; the total is computed once instead of on every iteration.
total_labels = sum(balance2)
new_balance = [elem[1] / total_labels for elem in balance]
print(sorted(label_counts.items()))
print(balance)
print(new_balance)

# Train the label model with the class balance computed above.
label_model = LabelModel(k=2, seed=123)
label_model.train_model(Ls[0],
                        class_balance=new_balance,
                        n_epochs=500,
                        log_train_every=50)

score = label_model.score((Ls[1], Ys[1]))

print('Trained Label Model Metrics:')
scores = label_model.score((Ls[1], Ys[1]),
                           metric=['accuracy', 'precision', 'recall', 'f1'])

# Majority-vote baseline on the same split.
mv = MajorityLabelVoter(seed=123)
print('Majority Label Voter Metrics:')
scores = mv.score((Ls[1], Ys[1]),
                  metric=['accuracy', 'precision', 'recall', 'f1'])

# Label-model outputs: probabilities for Ls[0], predictions for Ls[1].
Y_train_ps = label_model.predict_proba(Ls[0])

Y_dev_p = label_model.predict(Ls[1])
"""
mv2 = MajorityClassVoter()
Пример #6
0
def train_model(args):
    """Fit a label model, then random-search an LSTM end model on its output.

    The label model is trained on the train-split votes and compared with a
    majority-vote baseline on dev. Its ``predict_proba`` output becomes the
    training target for ``EndModel`` candidates explored by a
    ``RandomSearchTuner``; the model returned by the search is scored on dev.

    Args:
        args: Parsed command-line namespace. ``batch_size``, ``num_workers``,
            ``seed`` and ``checkpoint_dir`` are read here; the namespace is
            also forwarded to ``load_labels`` and ``load_dataset``.

    Returns:
        None.
    """
    lstm_hidden = 128
    n_classes = 2
    frame_feature_dim = 1000  # using get_frm_output_size()
    report_metrics = ['accuracy', 'precision', 'recall', 'f1']

    votes, gold = load_labels(args)

    # ---- Label model ----
    # Labelling-function analysis on the dev split.
    print(lf_summary(votes["dev"], Y=gold["dev"]))

    label_model = LabelModel(k=n_classes, seed=123)
    label_model.train_model(
        votes["train"],
        gold["dev"],
        n_epochs=500,
        log_train_every=50,
    )

    print('Trained Label Model Metrics:')
    label_model.score((votes["dev"], gold["dev"]), metric=report_metrics)

    # Majority vote over the LFs, for comparison.
    majority_voter = MajorityLabelVoter(seed=123)
    print('Majority Label Voter Metrics:')
    majority_voter.score((votes["dev"], gold["dev"]), metric=report_metrics)

    train_probs = label_model.predict_proba(votes["train"])

    # ---- End model ----
    train_set, dev_set, test_set = load_dataset(
        args, train_probs, gold["dev"], gold["test"])
    loaders = get_data_loader(
        train_set, dev_set, test_set, args.batch_size, args.num_workers)

    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    # LSTM input module with the frame encoder in front of it.
    lstm_module = LSTMModule(
        frame_feature_dim,
        lstm_hidden,
        bidirectional=False,
        verbose=False,
        lstm_reduction="attention",
        encoder_class=FrameEncoderOC,
    )

    # Keyword arguments for every candidate's ``train_model`` call.
    fit_kwargs = {
        'seed': args.seed,
        'progress_bar': True,
        'log_train_every': 1,
    }

    # Keyword arguments for every candidate's ``EndModel`` constructor.
    end_model_kwargs = {
        "input_module": lstm_module,
        "optimizer": "adam",
        "verbose": False,
        "input_batchnorm": True,
        "use_cuda": torch.cuda.is_available(),
        'checkpoint_dir': args.checkpoint_dir,
        'seed': args.seed,
        'device': device,
    }

    # Hyperparameter space; key order is kept as-is.
    search_space = {
        'n_epochs': [10],
        'batchnorm': [True],
        'dropout': [0.1, 0.25, 0.4],
        'lr': {'range': [1e-3, 1e-2], 'scale': 'log'},
        'l2': {'range': [1e-5, 1e-4], 'scale': 'log'},
    }

    # Logger and searcher.
    tuner = RandomSearchTuner(
        EndModel,
        log_dir="./run_logs",
        run_name='cnn_lstm_oc',
        log_writer_class=TensorBoardWriter,
        validation_metric='accuracy',
        seed=1701,
    )

    disc_model = tuner.search(
        search_space,
        valid_data=loaders["dev"],
        train_args=[loaders["train"]],
        init_args=[[lstm_hidden, n_classes]],
        init_kwargs=end_model_kwargs,
        train_kwargs=fit_kwargs,
        max_search=5,
        clean_up=False,
    )

    # Evaluate the model returned by the search on dev.
    disc_model.score(loaders["dev"], verbose=True, metric=report_metrics)
Пример #7
0
# Remap the validation ground truth with the label-remapping helper.
val_ground = remap_labels(loader.val_ground)

# Rebuild each vote matrix from its remapped stored values (same indices and
# index pointers), then transpose the result.
train_values = remap_labels(L_train_sparse.data)
L_train_sparse = sparse.csc_matrix(
    (train_values, L_train_sparse.indices, L_train_sparse.indptr)
).T
val_values = remap_labels(L_val_sparse.data)
L_val_sparse = sparse.csc_matrix(
    (val_values, L_val_sparse.indices, L_val_sparse.indptr)
).T

# ---- METAL label model, default class balance ----
print('\n\n####### Running METAL Label Model ########')
label_model = LabelModel()
label_model.train_model(
    L_train_sparse,
    n_epochs=200,
    print_every=50,
    seed=123,
    verbose=False,
)
train_marginals = label_model.predict_proba(L_train_sparse)
label_model.score((L_train_sparse, train_ground), metric=metrics)

# ---- METAL label model with exact class balance ----
print(
    '\n\n####### Running METAL Label Model with exact class balance ########')

# Fraction of labels equal to 1 and to 2 in each split.
train_class_balance = np.array(
    [np.sum(train_ground == label) / loader.train_num for label in (1, 2)])
val_class_balance = np.array(
    [np.sum(val_ground == label) / loader.val_num for label in (1, 2)])
print('Train set class balance:', train_class_balance)
print('Val set class balance:', val_class_balance)
label_model2 = LabelModel(seed=123)
Пример #8
0
def train_model(args):
    """Fit a label model, then train, checkpoint and score an LSTM end model.

    The label model is trained on the train-split votes and compared with a
    majority-vote baseline on dev. Its ``predict_proba`` output is the
    training target of the end model. The end model's constructor arguments
    are pickled to ``<checkpoint_dir>/init_kwargs.pickle`` before training.

    Args:
        args: Parsed command-line namespace. ``batch_size``, ``num_workers``,
            ``lstm_reduction``, ``requires_grad``, ``seed``,
            ``checkpoint_dir``, ``weight_decay``, ``lr`` and ``n_epochs`` are
            read here; the namespace is also forwarded to ``load_labels``
            and ``load_dataset``.

    Returns:
        None. The results of the ``score`` calls are not returned.
    """
    hidden_size = 128
    num_classes = 2
    encode_dim = 1000  # using get_frm_output_size()
    label_metrics = ['accuracy', 'precision', 'recall', 'f1']
    end_metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc-auc', 'ndcg']

    L, Y = load_labels(args)

    # ---- Label model ----
    # Labelling-function analysis on the dev split.
    print(lf_summary(L["dev"], Y=Y["dev"]))

    label_model = LabelModel(k=num_classes, seed=123)
    label_model.train_model(L["train"],
                            Y["dev"],
                            n_epochs=2000,
                            log_train_every=100)

    print('Trained Label Model Metrics:')
    label_model.score((L["dev"], Y["dev"]), metric=label_metrics)

    # Majority vote over the LFs, for comparison.
    mv = MajorityLabelVoter(seed=123)
    print('Majority Label Voter Metrics:')
    mv.score((L["dev"], Y["dev"]), metric=label_metrics)

    # Label-model output used as the end model's training labels.
    Ytrain_p = label_model.predict_proba(L["train"])

    # ---- End model ----
    train, dev, test = load_dataset(args, Ytrain_p, Y["dev"], Y["test"])
    data_loader = get_data_loader(train, dev, test, args.batch_size,
                                  args.num_workers)

    # Input encoder class handed to the LSTM module.
    cnn_encoder = FrameEncoderOCDense

    cuda_available = torch.cuda.is_available()
    device = 'cuda' if cuda_available else 'cpu'

    lstm_module = LSTMModule(
        encode_dim,
        hidden_size,
        bidirectional=False,
        verbose=False,
        lstm_reduction=args.lstm_reduction,
        encoder_class=cnn_encoder,
        encoder_kwargs={"requires_grad": args.requires_grad})

    init_kwargs = {
        "layer_out_dims": [hidden_size, num_classes],
        "input_module": lstm_module,
        "optimizer": "adam",
        "verbose": False,
        "input_batchnorm": False,
        # Derived from torch directly; this function never defines a local
        # ``cuda`` name.
        "use_cuda": cuda_available,
        'seed': args.seed,
        'device': device,
    }

    end_model = EndModel(**init_kwargs)

    # ``exist_ok`` avoids the check-then-create race and also creates any
    # missing parent directories.
    os.makedirs(args.checkpoint_dir, exist_ok=True)

    # Persist the constructor arguments next to the checkpoints.
    init_kwargs_path = os.path.join(args.checkpoint_dir, 'init_kwargs.pickle')
    with open(init_kwargs_path, "wb") as f:
        pickle.dump(init_kwargs, f, protocol=pickle.HIGHEST_PROTOCOL)

    dropout = 0.4
    end_model.train_model(
        train_data=data_loader["train"],
        valid_data=data_loader["dev"],
        l2=args.weight_decay,
        lr=args.lr,
        n_epochs=args.n_epochs,
        log_train_every=1,
        verbose=True,
        progress_bar=True,
        loss_weights=[0.55, 0.45],
        input_dropout=0.1,
        middle_dropout=dropout,
        checkpoint_dir=args.checkpoint_dir,
    )

    # Evaluate the end model on dev and test.
    print("Dev Set Performance")
    end_model.score(data_loader["dev"], verbose=True, metric=end_metrics)
    print("Test Set Performance")
    end_model.score(data_loader["test"], verbose=True, metric=end_metrics)