def generate_idf_string(unsafe_user_inputs):
    # Sanitize the raw user inputs before building the model.
    cleaned_inputs = validation.Validation().cleaned_inputs(unsafe_user_inputs)
    # Build the IDF from the cleaned inputs plus the derived geometry
    # configuration, then serialize it (Python 2 `unicode`).
    unicode_idf_file = unicode(
        IDF(idf_list=building.Building(
            _geometry_configurations(cleaned_inputs),
            cleaned_inputs).output_EP_list()))
    return unicode_idf_file
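# Illustrative call for generate_idf_string; the input dict and its keys are
# hypothetical and depend on what Validation.cleaned_inputs actually expects.
raw_inputs = {"floor_area": "120", "orientation": "south"}
idf_text = generate_idf_string(raw_inputs)
print(idf_text[:200])  # preview the beginning of the generated IDF string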
def test_data(pathname, dbName, table, columns, filename, threshold):
    validate = validation.Validation(threshold)
    connect = connectBD.Connection()

    # dict[table][column] = DataFrame
    #dict_tab_col_df = validate.filterData(connect.getColumnsDB(pathname, table, columns), verbose=False)
    dict_tab_col_df = validate.filterData(
        connect.getColumnsDB2(dbName, table, columns), verbose=False)

    dict_tab_col_df_pred = dict()
    for tab in dict_tab_col_df:
        dict_tab_col_df_pred[tab] = dict()

    # Test automatic detection
    flag_SVM = False
    for tab, col_df in dict_tab_col_df.items():
        for c in col_df:
            pred = validate.checkPattern(col_df[c], c)
            if pred is None:
                flag_SVM = True
            else:
                dict_tab_col_df_pred[tab][c] = pred

    # SVM prediction: run only if at least one column still needs to be predicted
    if flag_SVM:
        try:
            classifier, vocabulary_ = load(filename)
        except Exception:
            print("Saved model not loaded (filename as '{}')".format(filename))
            print("Please, train the entire data: python3.6 {} --train".format(
                sys.argv[0]))
            exit(2)
        vectorizer = getVectorizer(dict_tab_col_df, vocabulary_)
        for tab, col_df in dict_tab_col_df.items():
            for c in col_df:
                if c not in dict_tab_col_df_pred[tab]:
                    df = pre_processing(col_df[c])
                    doc = vectorizer.transform(list(set(df)))
                    # Keep the most frequent predicted class for the column
                    dict_tab_col_df_pred[tab][c] = np.unique(
                        classifier.predict(doc), return_counts=True)[0][0]
    return dict_tab_col_df_pred
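# A usage sketch for test_data, assuming a model previously saved by
# train_data. The database, table, and column names below are hypothetical.
predictions = test_data("data/", "customers.db", "clients",
                        ["name", "cpf", "email"],
                        "classifier_01012024_120000.joblib", threshold=0.8)
for table, col_pred in predictions.items():
    for column, predicted_class in col_pred.items():
        print(table, column, predicted_class)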
def __init__(self, model_conf: ModelConfig):
    """
    :param model_conf: reads the project configuration file
    """
    self.model_conf = model_conf
    self.validation = validation.Validation(self.model_conf)
def train_data(pathname, threshold, path_out):
    validate = validation.Validation(threshold)
    bd = connectBD.Connection()

    # Phase 1: filtering the data
    print("Phase 1: Filtering the data...")
    # dict[database][column] = DataFrame
    dict_bd_column_df = validate.filterData(bd.readFilesDB(pathname))

    # Phase 2: detecting patterns
    print("Phase 2: Detecting patterns...")
    y_true = list()  # ground-truth classes
    y_pred = list()  # predicted classes
    dict_bd_column_true = dict()  # manually labeled
    dict_bd_column_pred = dict()  # predicted by pattern
    # TODO: to remove - only for checking the results
    #dict_bd_column_examples = dict()  # examples

    # dict[column] = class
    column_class = select_attributes.getAnnotatedColumns()
    noClass_columns = set()
    for bd, column_df in dict_bd_column_df.items():
        dict_bd_column_true[bd] = dict()
        dict_bd_column_pred[bd] = dict()
        for column, df in column_df.items():
            if column not in column_class:
                noClass_columns.add(column)
            pred = validate.checkPattern(df, column)
            if pred is None:
                pred = "nenhuma"
            y_pred.append(pred)
            dict_bd_column_pred[bd][column] = pred
            if column in column_class:
                y_true.append(column_class[column])
                dict_bd_column_true[bd][column] = column_class[column]
            else:
                y_true.append("nenhuma")
                dict_bd_column_true[bd][column] = "nenhuma"
            # TODO: to remove - only for checking the results
            """instances = ""
            n_instances = 1
            c = column
            if len(df.unique()) > 0:
                for value in df.unique():
                    instances = instances + str(value) + "#"
                    if n_instances == 5:
                        break
                    n_instances = n_instances + 1
            if bd not in dict_bd_column_examples:
                dict_bd_column_examples[bd] = dict()
            dict_bd_column_examples[bd][column] = instances"""

    # Results of the automatic pattern checking
    #print(classification_report(y_true, y_pred))

    # Phase 3: predicting classes
    print("Phase 3: Training the classifier...")
    # Selecting columns that need to be trained: any class whose pattern-based
    # f1-score falls below the threshold goes to the SVM.
    columns_SVM = set()
    for cl, d in dict(
            classification_report(y_true, y_pred, output_dict=True,
                                  zero_division=0)).items():
        if type(d) is dict:
            if d['f1-score'] < threshold and d['support'] != 0:
                columns_SVM.update(select_attributes.getAnnotatedColumn(cl))
        else:
            # Stop at the 'accuracy' entry; the remaining items are aggregates
            break
    columns_SVM.update(noClass_columns)

    # Assigning the class of the subclasses (strip the "_suffix" part)
    for c, cl in column_class.items():
        column_class[c] = cl if "_" not in cl else cl[:cl.index("_")]

    X, y, vectorizer = get_X_and_y(dict_bd_column_df, columns_SVM, column_class)
    classifier = build_model(X, y)
    y_pred, y_true = y_predict(vectorizer, classifier, dict_bd_column_df,
                               columns_SVM, column_class)

    print("Saving Model and Vocabulary...")
    outputName = (path_out if path_out.endswith("/") else path_out + "/") + \
        "classifier_" + datetime.now().strftime("%d%m%Y_%H%M%S") + ".joblib"
    dump([classifier, vectorizer.vocabulary_], outputName)
    print("Objects saved as '{}'.".format(outputName))
    #print(classification_report(y_true, y_pred))

    index = 0
    for bd, column_df in dict_bd_column_df.items():
        for c in column_df:
            if c in columns_SVM:
                # If it is not a false positive, then update it
                if dict_bd_column_pred[bd][c] == "nenhuma":
                    dict_bd_column_pred[bd][c] = y_pred[index]
                index += 1

    y_true = list()  # ground-truth classes
    y_pred = list()  # predicted classes
    for bd, column_df in dict_bd_column_pred.items():
        for c in column_df:
            cl = dict_bd_column_pred[bd][c]
            y_pred.append(cl if "_" not in cl else cl[:cl.index("_")])
            cl = dict_bd_column_true[bd][c]
            y_true.append(cl if "_" not in cl else cl[:cl.index("_")])

    """ # TODO: to remove - only for checking the results
    f_write = open("output_checkingData.tsv", "w")
    f_write.write("column\tmanual\tpredita\tnum_DB\tinstances\n")
    for bd, column_df in dict_bd_column_pred.items():
        for c in column_df:
            f_write.write("{}\t{}\t{}\t{}\t{}\n".format(
                c,
                "nenhuma" if c not in column_class else column_class[c],
                dict_bd_column_pred[bd][c],
                bd,
                dict_bd_column_examples[bd][c]))
    f_write.close() """

    print("Classification report:")
    print(classification_report(y_true, y_pred))
import time
import sys

import pandas as pd

import validation as vd
import constants as ct

# Create instance of class Validation
validation = vd.Validation()


def get_filters():
    """Take user input for a city, month, and/or day to filter data.

    Return:
        (str) city - name of the city to filter data
        (str) timeframe - user's choice of timeframe to filter data
        (str) month - name of the month to filter data
        (str) day - name of the day of week to filter data
    """
    month = ''
    day = ''
    city = validation.validate_city(
        input('Would you like to see data for Chicago, New York or Washington:\n'))
    timeframe = validation.validate_timeframe(
        input('\nWould you like to filter the data by month, day, both or not at all? Type "none" for no filter\n'))
    if timeframe == 'month':
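# A minimal sketch of the re-prompt pattern that validate_city presumably
# implements (the Validation class itself is not shown here): normalize the
# answer and keep asking until it names one of the supported cities. The
# function name and prompt text are hypothetical.
def validate_city_sketch(answer):
    valid_cities = ('chicago', 'new york', 'washington')
    city = answer.strip().lower()
    while city not in valid_cities:
        # Re-prompt until the user supplies a recognized city name
        city = input('Please type Chicago, New York or Washington:\n').strip().lower()
    return city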
def transfer_learning(num_epochs=3, resize=320, batch_size=16, posw=1,
                      data_rate=1, normalize=False, feature_extract=False,
                      pre_trained=False, pre_trained_PATH="", from_checkpoint="",
                      root_PATH=root_PATH, learning_rate=0.0001, num_workers=5,
                      root_PATH_dataset=root_PATH_dataset,
                      saved_model_PATH=saved_model_PATH):
    # Batch transformation
    if normalize:
        transform = transforms.Compose([
            transforms.Resize((resize, resize)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, ), (0.5, )),
            transforms.Lambda(lambda x: torch.cat([x, x, x], 0))
        ])
    else:
        transform = transforms.Compose([
            transforms.Resize((resize, resize)),
            transforms.ToTensor(),
            transforms.Lambda(lambda x: torch.cat([x, x, x], 0))
        ])
        #,transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))

    # DenseNet-121 backbone with a new 5-way classifier head
    model = models.densenet121(pretrained=pre_trained)
    model.classifier = torch.nn.Linear(1024, 5)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if posw:
        criterion = nn.BCEWithLogitsLoss(
            pos_weight=chexpert_load.load_posw()).to(device=device)
    else:
        criterion = nn.BCEWithLogitsLoss().to(device=device)

    plot_loss = []
    kwarg_Common = {
        "num_epochs": num_epochs,
        "learning_rate": learning_rate,
        "batch_size": batch_size
    }
    kwargs = {"Common": kwarg_Common}

    if pre_trained_PATH or from_checkpoint:
        loader = load_model.Load_Model(method="TL",
                                       pre_trained=pre_trained_PATH,
                                       from_checkpoint=from_checkpoint,
                                       kwargs=kwargs,
                                       model=model,
                                       plot_loss=plot_loss,
                                       use_cuda=use_cuda)
        file_name, optimizer, plot_loss = loader()
    else:
        if pre_trained:
            print("pretrained on ImageNet")
        else:
            print("training from scratch")
        # Encode the run configuration in the checkpoint file name
        file_name = "from_scratch_epoch" + str(num_epochs) + "_batch" + str(
            batch_size) + "_learning_rate" + str(learning_rate) + ".tar"
        file_name = "data_rate" + str(
            data_rate) + "_" + file_name if data_rate != 1 else file_name
        file_name = "pre_trainedIMAGENET_" + file_name if pre_trained else file_name
        file_name = "no_posw_" + file_name if not posw else file_name
        file_name = "normalized" + file_name if normalize else file_name

    if feature_extract:
        # Freeze the feature extractor and train only the classifier head
        for param in model.features.parameters():
            param.requires_grad = False
        optimizer = torch.optim.Adam(model.classifier.parameters(),
                                     lr=learning_rate)
        file_name = "FE_" + file_name

    saved_model_PATH = saved_model_PATH + "saved_models/transfer_learning/" + file_name[:-4]
    if not os.path.exists(saved_model_PATH):
        os.mkdir(saved_model_PATH)

    labels_path = root_PATH + "SummerThesis/code/custom_lib/chexpert_load/labels.pt"
    cheXpert_train_dataset, dataloader = chexpert_load.chexpert_load(
        root_PATH + "SummerThesis/code/custom_lib/chexpert_load/train.csv",
        transform,
        kwarg_Common["batch_size"],
        num_workers=num_workers,
        data_rate=data_rate,
        labels_path=labels_path,
        root_dir=root_PATH_dataset)

    currentDT = datetime.datetime.now()
    model = model.to(device=device)
    print("started training")
    print('START--', file_name)
    model.train()
    for epoch in range(num_epochs):
        # Load a batch of images with its (index, data, class)
        for i, (images, labels, _) in enumerate(dataloader):
            images = images.to(device=device, dtype=torch.float)
            labels = labels.to(device=device, dtype=torch.float)
            # Forward pass: compute the output class given an image
            outputs = model(images).to(device=device)
            # Compute the loss: difference between the output and the given label
            loss = criterion(outputs, labels)
            optimizer.zero_grad()  # Clear the gradients from the previous step
            loss.backward()  # Backward pass: compute the gradients
            optimizer.step()  # Optimizer: update the weights

            if (i + 1) % 100 == 0:
                # Logging
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' %
                      (epoch + 1, num_epochs, i + 1,
                       len(cheXpert_train_dataset) // batch_size, loss))
                aftertDT = datetime.datetime.now()
                c = aftertDT - currentDT
                mins, sec = divmod(c.days * 86400 + c.seconds, 60)
                print(mins, "mins ", sec, "secs")
            if i % 200 == 0:
                plot_loss.append(loss)

    aftertDT = datetime.datetime.now()
    c = aftertDT - currentDT
    mins, sec = divmod(c.days * 86400 + c.seconds, 60)
    print(mins, "mins ", sec, "secs")
    print('END--', file_name)

    # Calculating validation error, plotting AUC and precision-recall curves,
    # plotting the loss, saving figures, and printing AUC differences
    PATH = saved_model_PATH + "/" + file_name
    val = validation.Validation(chexpert_load=chexpert_load,
                                model=model,
                                plot_loss=plot_loss,
                                bs=16,
                                transform=transform,
                                root_PATH=root_PATH,
                                root_PATH_dataset=root_PATH_dataset,
                                saved_model_PATH=saved_model_PATH,
                                file_name=file_name,
                                gpu=use_cuda)
    val()

    torch.save(
        {
            'epoch': num_epochs,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': plot_loss
        }, PATH)
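# Illustrative invocation: fine-tune the ImageNet-initialized DenseNet-121 on
# CheXpert, training only the classifier head (feature extraction). The paths
# come from the module-level defaults; the epoch/batch values are examples.
transfer_learning(num_epochs=3,
                  batch_size=16,
                  pre_trained=True,
                  feature_extract=True,
                  normalize=True)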
def test_validatePassword_uppercase(self):
    validation = v.Validation()
    self.assertTrue(validation.validatePassword('Testcase123?'))
    self.assertFalse(validation.validatePassword('testcase123?'))

def test_validateLogin(self):
    validation = v.Validation()
    self.assertTrue(validation.validateLogin('Testcase123'))
    self.assertFalse(validation.validateLogin('Testca'))

def test_validatePasswordMatch(self):
    validation = v.Validation()
    self.assertTrue(
        validation.validatePasswordMatch('Testcase123?', 'Testcase123?'))
    self.assertFalse(
        validation.validatePasswordMatch('Testcase123?', 'testcase123?'))

def test_validatePassword_special(self):
    validation = v.Validation()
    self.assertTrue(validation.validatePassword('Testcase123?'))
    self.assertFalse(validation.validatePassword('Testcase123'))
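# A minimal sketch of a Validation class consistent with the assertions above
# (the real implementation is not shown): validatePassword requires an
# uppercase letter, a digit, and a special character; validateLogin enforces a
# minimum length (8 here is one value consistent with the tests, which accept
# 'Testcase123' and reject 'Testca'); validatePasswordMatch compares exactly.
import re
import unittest

class ValidationSketch:
    def validateLogin(self, login):
        return len(login) >= 8

    def validatePassword(self, password):
        return (bool(re.search(r'[A-Z]', password))      # uppercase letter
                and bool(re.search(r'\d', password))      # digit
                and bool(re.search(r'[^A-Za-z0-9]', password)))  # special char

    def validatePasswordMatch(self, password, confirmation):
        return password == confirmation

if __name__ == '__main__':
    unittest.main()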