def main(_):
    print("-" * 80)
    if not os.path.isdir(FLAGS.output_dir):
        print("Path {} does not exist. Creating.".format(FLAGS.output_dir))
        os.makedirs(FLAGS.output_dir)
    elif FLAGS.reset_output_dir:
        print("Path {} exists. Remove and remake.".format(FLAGS.output_dir))
        shutil.rmtree(FLAGS.output_dir)
        os.makedirs(FLAGS.output_dir)

    print("-" * 80)
    log_file = os.path.join(FLAGS.output_dir, "stdout")
    print("Logging to {}".format(log_file))
    sys.stdout = Logger(log_file)

    utils.print_user_flags()

    train()
def Eval_NN():
    print("-" * 80)
    if not os.path.isdir(FLAGS.output_dir):
        print("Path {} does not exist. Creating.".format(FLAGS.output_dir))
        os.makedirs(FLAGS.output_dir)
    elif FLAGS.reset_output_dir:
        print("Path {} exists. Remove and remake.".format(FLAGS.output_dir))
        shutil.rmtree(FLAGS.output_dir)
        os.makedirs(FLAGS.output_dir)

    print("-" * 80)
    log_file = os.path.join(FLAGS.output_dir, "stdout")
    print("Logging to {}".format(log_file))
    sys.stdout = Logger(log_file)

    utils.print_user_flags()

    '''
    # below is for batch evaluation of all arcs defined in the structure_path
    if not FLAGS.structure_path:
        exit()
    with open(FLAGS.structure_path, 'r') as fp:
        lines = fp.readlines()
    lines = [eval(line.strip()) for line in lines]
    structures = []
    for line in lines:
        row = []
        for ele in line:
            row += ele
        structures.append(row)
    n = len(lines)

    # eval the first structure
    Acc = []
    eva = Eval()
    eva.eval(structures[0])
    eva.eval(structures[1])
    acc = eva.eval(structures[0])
    print(acc)
    pdb.set_trace()
    '''

    eva = Eval()
    return eva
def main(_):
    print("-" * 80)
    if not os.path.isdir(FLAGS.output_dir):
        print("Path {} does not exist. Creating.".format(FLAGS.output_dir))
        os.makedirs(FLAGS.output_dir)
    elif FLAGS.reset_output_dir:
        print("Path {} exists. Remove and remake.".format(FLAGS.output_dir))
        shutil.rmtree(FLAGS.output_dir)
        os.makedirs(FLAGS.output_dir)

    print("-" * 80)
    log_file = os.path.join(FLAGS.output_dir, "stdout")
    print("Logging to {}".format(log_file))
    sys.stdout = Logger(log_file)

    utils.print_user_flags()

    model_file = os.path.join(FLAGS.output_dir, "models.csv")
    if FLAGS.child_fixed_arc is None:
        with open(model_file, 'a+') as f:
            headers = ['num_layers', 'accuracy', 'models_arc']
            writer = csv.DictWriter(f, headers, delimiter=',', lineterminator='\n')
            writer.writeheader()
            for i in range(FLAGS.search_from, FLAGS.child_num_layers + 1):
                tf.compat.v1.logging.info(
                    "Searching with constraint, num_layers: %d" % i)
                map_task = train(i)
                for k, v in map_task.items():
                    writer.writerow({
                        'num_layers': i,
                        'accuracy': k,
                        'models_arc': v
                    })
                f.flush()
    else:
        _ = train(FLAGS.child_num_layers)
import os

from src import config
from src.utils import IndexDatabase, InvertedIndexBuilder, Logger

if __name__ == '__main__':
    logger = Logger().get_logger(__name__)
    doc_dir = os.getenv('DOCUMENTS_PATH', config.DOCUMENTS_PATH)
    try:
        index = InvertedIndexBuilder(doc_dir)
    except Exception:
        logger.exception('Error building index.')
    else:
        IndexDatabase().write_index(index.get(), index.total_docs_count)
        IndexDatabase().insert_api_keys(config.TOKENS)
import abc
import sys

import numpy as np
import tensorflow as tf

from src.utils import Logger, __fn__, mkdir, filter_params, pickle_dump, pickle_load

logger = Logger(__fn__())


class BaseModel(object, metaclass=abc.ABCMeta):

    NAME = 'BaseModel'

    TENSORS = dict(loss='Loss/LOSS',
                   regularizer='Loss/REGL',
                   acc3='Evaluation/ACC3',
                   pred='Output/PRED',
                   alpha='Attention/ALPHA',
                   X='X',
                   asp='asp',
                   lx='lx',
                   y='y',
                   dropout_keep='dropout_keep')

    OPS = dict(train_op='TrainOp/TRAIN_OP')

    OPTIMIZERS = dict(adagrad=tf.train.AdagradOptimizer,
                      adam=tf.train.AdamOptimizer,
                      sgd=tf.train.GradientDescentOptimizer,
                      momentum=tf.train.MomentumOptimizer,
                      rmsprop=tf.train.RMSPropOptimizer)
# Load transformer with Adam optimizer and MSE loss function
net = Transformer(d_input, d_model, d_output, q, v, h, N,
                  attention_size=attention_size,
                  dropout=dropout, chunk_mode=chunk_mode, pe=pe).to(device)
optimizer = optim.Adam(net.parameters(), lr=LR)
loss_function = OZELoss(alpha=0.3)

logger = Logger(f'logs/training.csv', params=['loss'])

with tqdm(total=EPOCHS) as pbar:
    # Fit model
    loss = fit(net, optimizer, loss_function, dataloader_train,
               dataloader_val, epochs=EPOCHS, pbar=pbar, device=device)

    # Log
    logger.log(loss=loss)
dataloader_train = DataLoader(dataset_train,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=NUM_WORKERS)
dataloader_val = DataLoader(dataset_val,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            num_workers=NUM_WORKERS)

# Start search
n_steps = np.prod(
    [len(search_range) for search_range in search_params.values()])
logger = Logger('search_log.csv', search_params)

with tqdm(total=n_steps * EPOCHS) as pbar:
    for params in itertools.product(*search_params.values()):
        params = {
            key: params[idx]
            for idx, key in enumerate(search_params.keys())
        }
        pbar.set_postfix(params)

        # Load transformer with Adam optimizer and MSE loss function
        net = Transformer(d_input=d_input,
                          d_output=d_output,
                          dropout=dropout,
                          chunk_mode=chunk_mode,
                          pe=pe,
                               reduction='none',
                               occupation=occupation),
    'r2_tint': lambda y_true, y_pred: np.array([
        r2_score(y_true[:, i, -1], y_pred[:, i, -1])
        for i in range(y_true.shape[1])
    ]),
    'r2_cold': lambda y_true, y_pred: np.array([
        r2_score(y_true[:, i, 0:-1], y_pred[:, i, 0:-1])
        for i in range(y_true.shape[1])
    ])
}

logger = Logger(
    f'logs/training.csv',
    model_name=net.name,
    params=[y for key in metrics.keys() for y in (key, key + '_std')])

# Fit model
with tqdm(total=EPOCHS) as pbar:
    loss = fit(net, optimizer, loss_function, dataloader_train,
               dataloader_val, epochs=EPOCHS, pbar=pbar, device=device)

# Switch to evaluation
_ = net.eval()
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
]

staining_train_dataset = StainingDataset(dataset_dir=opt.dataroot,
                                         transform=transforms_,
                                         unaligned=True)
dataset_train_loader = DataLoader(staining_train_dataset,
                                  batch_size=opt.batchSize,
                                  shuffle=True,
                                  num_workers=opt.n_cpu)
# dataloader = DataLoader(ImageDataset(opt.dataroot, transforms_=transforms_, unaligned=True),
#                         batch_size=opt.batchSize, shuffle=True, num_workers=opt.n_cpu)

print('Train Model')
im_per_epoch = 10

# Loss plot
logger = Logger(opt.n_epochs, im_per_epoch)

###################################
# ###### Training ######
for epoch in range(opt.epoch, opt.n_epochs):
    for i, batch in enumerate(dataset_train_loader):
        # Set model input
        real_A = Variable(input_A.copy_(batch['HE_image']))
        real_B = Variable(input_B.copy_(batch['C4D_image']))

        ###### Generators A2B and B2A ######
        optimizer_G.zero_grad()

        # Identity loss
d_input = 38  # From dataset
d_output = 8  # From dataset

# Config
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device {device}")

# Load dataset
ozeDataset = OzeDataset(DATASET_PATH)

# Load network
# Load transformer with Adam optimizer and MSE loss function
loss_function = OZELoss(alpha=0.3)

logger = Logger(f'logs/crossvalidation_log.csv', params=['loss'])

kfoldIterator = kfold(ozeDataset,
                      n_chunk=CHUNKS,
                      batch_size=BATCH_SIZE,
                      num_workers=NUM_WORKERS)

with tqdm(total=CHUNKS * EPOCHS) as pbar:
    for dataloader_train, dataloader_val in kfoldIterator:
        # Load transformer with Adam optimizer and MSE loss function
        # net = Transformer(d_input, d_model, d_output, q, v, h, N, attention_size=attention_size,
        #                   dropout=dropout, chunk_mode=chunk_mode, pe=pe).to(device)
        net = BiGRU(d_input, d_model, d_output,
import os

import cv2

from src.utils import Logger, Visualizer

logger = Logger.get_logger('VideoProcessor')


class VideoProcessor(object):

    def __init__(self, path, score_fn, annotated_path):
        self.score_fn = score_fn
        self.annotated_path = annotated_path
        self.visualizer = Visualizer()
        if not os.path.exists(path):
            raise IOError('file {} does not exist'.format(path))
        self.capture = cv2.VideoCapture(path)
        if os.path.exists(annotated_path):
            os.remove(annotated_path)
        self.writer = cv2.VideoWriter(annotated_path,
                                      cv2.VideoWriter_fourcc(*'XVID'),
                                      50.0, (640, 360))
        while not self.capture.isOpened():
            cv2.waitKey(1000)
            logger.debug('Wait for header')

    def start(self, max_frame_num=2 << 32, fps=1000):
        num_frames = min(int(self.capture.get(cv2.CAP_PROP_FRAME_COUNT)),
# for testing purposes:
val_data = (val_data[0][:5000], val_data[1][:5000])
print('WARNING: only using 5000 points for validation')
# test_data = (test_data[0][:500], test_data[1][:500])

print('POLICY: ', args.policy)

# this is the policy by which one should choose acquisition functions
policy = policy_parser(args.policy, args)

# this is the reward that is calculated based on previous acc/val
# and current acc/val
reward_process = RewardProcess(args.reward)

# logger to record experiments
logger = Logger(experiment_name=args.policy, folder=args.folder)
logger.save_args(args)
print('Saving to ', logger.save_folder)
print('Starting Experiment')

"""
GET INITIAL ESTIMATE OF VALIDATION ACCURACY
"""
model = cnn(input_shape=x_train.shape[1:],
            output_classes=n_classes,
            bayesian=args.model == 'bayesian',
            train_size=x_train.shape[0],
            weight_constant=weight_constant)
def main(_):
    # Prepare directory
    pdb.set_trace()
    print("-" * 80)
    if not os.path.isdir(FLAGS.output_dir):
        print("Path {} does not exist. Creating.".format(FLAGS.output_dir))
        os.makedirs(FLAGS.output_dir)
    elif FLAGS.reset_output_dir:
        print("Path {} exists. Remove and remake.".format(FLAGS.output_dir))
        shutil.rmtree(FLAGS.output_dir)
        os.makedirs(FLAGS.output_dir)

    # Redirect stdout1 ------------------------------------------------------
    print("-" * 80)
    log_file = os.path.join(FLAGS.output_dir, "stdout1")
    if not os.path.exists(log_file):
        os.mknod(log_file)
    print("Logging to {}".format(log_file))
    sys.stdout = Logger(log_file)
    utils.print_user_flags()

    print('Reserving gpu memory...')
    tf.Session()

    # Load pickles file
    print('Loading pickled file...')
    with open('/home/yuwei/projects/vincent/pickleRick/allCrops1.pkl') as p_crop:
        allCrops1 = cPickle.load(p_crop)
    with open('/home/yuwei/projects/vincent/pickleRick/allCrops2.pkl') as p_crop:
        allCrops2 = cPickle.load(p_crop)
    with open('/home/yuwei/projects/vincent/pickleRick/labels.pkl', 'r') as p_crop:
        labels1 = cPickle.load(p_crop)
        labels2 = cPickle.load(p_crop)
        labels3 = cPickle.load(p_crop)
        labels1_Brio1 = cPickle.load(p_crop)
        labels1_Brio2 = cPickle.load(p_crop)
        labels2_Brio1 = cPickle.load(p_crop)

    # Prepare and divide data
    autoTrainNN = AutoTrain()
    combined_1 = zip(allCrops1 + allCrops2, np.concatenate((labels1, labels2)))
    autoTrainNN.addLabelledData(combined_1)
    # train(autoTrainNN)

    # Redirect stdout2 ------------------------------------------------------
    print("-" * 80)
    log_file = os.path.join(FLAGS.output_dir, "stdout2")
    if not os.path.exists(log_file):
        os.mknod(log_file)
    print("Logging to {}".format(log_file))
    sys.stdout.log = open(log_file, "a")  # Change log file

    # Load pickles file
    print('Loading pickled file...')
    with open('/home/yuwei/projects/vincent/pickleRick/allCrops3.pkl') as p_crop:
        allCrops3 = cPickle.load(p_crop)
    with open('/home/yuwei/projects/vincent/pickleRick/brio1/allCrops1.pkl') as p_crop:
        allCrops1_Brio1 = cPickle.load(p_crop)

    combined_2 = zip(allCrops3 + allCrops1_Brio1, np.concatenate((labels3, labels1_Brio1)))
    autoTrainNN.addLabelledData(combined_2)
    # train(autoTrainNN)

    # Redirect stdout3 ------------------------------------------------------
    print("-" * 80)
    log_file = os.path.join(FLAGS.output_dir, "stdout3")
    if not os.path.exists(log_file):
        os.mknod(log_file)
    print("Logging to {}".format(log_file))
    sys.stdout.log = open(log_file, "a")  # Change log file
    utils.print_user_flags()

    # Load pickles file
    print('Loading pickled file...')
    with open('/home/yuwei/projects/vincent/pickleRick/brio2/allCrops1.pkl') as p_crop:
        allCrops1_Brio2 = cPickle.load(p_crop)
    with open('/home/yuwei/projects/vincent/pickleRick/brio1/allCrops2.pkl') as p_crop:
        allCrops2_Brio1 = cPickle.load(p_crop)

    combined_3 = zip(allCrops1_Brio2 + allCrops2_Brio1, np.concatenate((labels1_Brio2, labels2_Brio1)))
    autoTrainNN.addLabelledData(combined_3)
    train(autoTrainNN)
def train(model, optimizer, criterion, train_loader, num_epoch, device,
          val_loader=None, scheduler=None, save_best=True,
          weights_path='', model_name='best_model.pt'):
    """
    Starts the training process of the input model, using the specified optimizer.

    :param model: torch model
    :param optimizer: torch optimizer
    :param criterion: torch criterion
    :param train_loader: torch dataloader instance of the training set
    :param num_epoch: number of epochs to train
    :param device: device to train on
    :param val_loader: torch dataloader instance of the validation set
    :param scheduler: optional learning-rate scheduler, stepped on the validation loss
    :param save_best: if True, save the model with the lowest validation loss
    :param weights_path: directory where model weights are saved
    :param model_name: file name of the best model
    """
    loss_logger = Logger()
    best_loss = float('inf')

    for epoch in range(num_epoch):
        model.train()
        loss_logger.reset()
        for sample in train_loader:
            X, Y_true = sample['X'], sample['Y']
            # transfer tensors to the current device
            X = X.to(device)
            Y_true = Y_true.to(device)
            # zero all gradients
            optimizer.zero_grad()
            # forward propagate
            Y_pred = model(X)
            loss = criterion(Y_pred, Y_true)
            loss_logger.update(loss.item())
            # backprop and update the params
            loss.backward()
            optimizer.step()

        print(f"Epoch: {epoch} | Train loss: {loss_logger.average} |", end=" ")

        # evaluation of model performance on validation set
        loss_logger.reset()
        model.eval()
        for sample in val_loader:
            X = sample['X'].to(device)
            Y_true = sample['Y'].to(device)
            with torch.no_grad():
                Y_pred = model(X)
                val_loss = criterion(Y_pred, Y_true)
                loss_logger.update(val_loss.item())
        print(f"Val loss: {loss_logger.average}")

        # scheduler
        if scheduler:
            scheduler.step(loss_logger.average)

        # save the best model
        if loss_logger.average < best_loss and save_best:
            save_model(model, os.path.join(weights_path, model_name))
            best_loss = loss_logger.average

        # save checkpoint
        save_model(model, os.path.join(weights_path, 'checkpoint.pt'))
def main(args):
    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)

    ## Options
    dataset = args.dataset
    cluster_option = args.cluster_option
    data_dir = osp.join(args.data_dir, dataset)
    output_path = data_dir
    if not osp.exists(data_dir):
        os.makedirs(data_dir)

    ## Plotting options
    plot_option_clusters_vs_lambda = args.plot_option_clusters_vs_lambda
    plot_option_fairness_vs_clusterE = args.plot_option_fairness_vs_clusterE
    plot_option_balance_vs_clusterE = args.plot_option_balance_vs_clusterE
    plot_option_convergence = args.plot_option_convergence

    ## Data load
    savepath_compare = osp.join(data_dir, dataset + '.npz')
    if not os.path.exists(savepath_compare):
        X_org, demograph, K = read_dataset(dataset, data_dir)
        if X_org.shape[0] > 200000:
            np.savez_compressed(savepath_compare, X_org=X_org, demograph=demograph, K=K)
        else:
            np.savez(savepath_compare, X_org=X_org, demograph=demograph, K=K)
    else:
        datas = np.load(savepath_compare)
        X_org = datas['X_org']
        demograph = datas['demograph']
        K = datas['K'].item()

    log_path = osp.join(data_dir, cluster_option + '_log.txt')
    sys.stdout = Logger(log_path)

    # Scale and Normalize Features
    X_org = scale(X_org, axis=0)
    X = normalizefea(X_org)

    N, D = X.shape
    print('Cluster number for dataset {} is {}'.format(dataset, K))

    V_list = [np.array(demograph == j) for j in np.unique(demograph)]
    V_sum = [x.sum() for x in V_list]
    print('Balance of the dataset {}'.format(min(V_sum) / max(V_sum)))
    print('Number of points in the dataset {}'.format(N))

    # J = len(V_sum)

    # demographic probability for each V_j
    u_V = [x / N for x in V_sum]  # proportional
    print('Demographic-probabilites: {}'.format(u_V))
    print('Demographic-numbers per group: {}'.format(V_sum))

    #############################################################################
    ######################## Run Fair clustering ###############################
    #############################################################################

    fairness = True  # Setting False only runs unfair clustering
    elapsetimes = []
    avg_balance_set = []
    min_balance_set = []
    fairness_error_set = []
    E_cluster_set = []
    E_cluster_discrete_set = []
    bestacc = 1e10
    best_avg_balance = -1
    best_min_balance = -1

    if args.lmbda_tune:
        print('Lambda tune is true')
        lmbdas = np.arange(0, 10000, 100).tolist()
    else:
        lmbdas = [args.lmbda]

    length_lmbdas = len(lmbdas)

    l = None

    if ('A' not in locals()) and cluster_option == 'ncut':
        alg_option = 'flann' if N > 50000 else 'None'
        affinity_path = osp.join(data_dir, dataset + '_affinity_ncut.npz')
        knn = 20
        if not osp.exists(affinity_path):
            A = utils.create_affinity(X, knn, savepath=affinity_path, alg=alg_option)
        else:
            A = utils.create_affinity(X, knn, W_path=affinity_path)

    init_C_path = osp.join(data_dir, '{}_init_{}_{}.npz'.format(dataset, cluster_option, K))
    if not osp.exists(init_C_path):
        print('Generating initial seeds')
        C_init, l_init = km_init(X, K, 'kmeans_plus')
        np.savez(init_C_path, C_init=C_init, l_init=l_init)
    else:
        temp = np.load(init_C_path)
        C_init = temp['C_init']  # Load initial seeds
        l_init = temp['l_init']

    for count, lmbda in enumerate(lmbdas):
        print('Inside Lambda ', lmbda)

        if cluster_option == 'ncut':
            C, l, elapsed, S, E = fair_clustering(X, K, u_V, V_list, lmbda, fairness,
                                                  cluster_option, C_init=C_init,
                                                  l_init=l_init, A=A)
        else:
            C, l, elapsed, S, E = fair_clustering(X, K, u_V, V_list, lmbda, fairness,
                                                  cluster_option, C_init=C_init,
                                                  l_init=l_init)

        min_balance, avg_balance = get_fair_accuracy(u_V, V_list, l, N, K)
        fairness_error = get_fair_accuracy_proportional(u_V, V_list, l, N, K)

        print('lambda = {}, \n fairness_error {: .2f} and \n avg_balance = {: .2f} \n min_balance = {: .2f}'
              .format(lmbda, fairness_error, avg_balance, min_balance))

        # Plot the figure with clusters
        if dataset in ['Synthetic', 'Synthetic-unequal'] and plot_option_clusters_vs_lambda == True:
            cluster_plot_location = osp.join(output_path, 'cluster_output')
            if not osp.exists(cluster_plot_location):
                os.makedirs(cluster_plot_location)

            filename = osp.join(
                cluster_plot_location,
                'cluster-plot_fair_{}-{}_lambda_{}.png'.format(cluster_option, dataset, lmbda))
            plot_clusters_vs_lambda(X_org, l, filename, dataset, lmbda, fairness_error)
        #
        if avg_balance > best_avg_balance:
            best_avg_balance = avg_balance
            best_lambda_avg_balance = lmbda

        if min_balance > best_min_balance:
            best_min_balance = min_balance
            best_lambda_min_balance = lmbda

        if fairness_error < bestacc:
            bestacc = fairness_error
            best_lambda_acc = lmbda

        if plot_option_convergence == True and count == 0:
            filename = osp.join(output_path,
                                'Fair_{}_convergence_{}.png'.format(cluster_option, dataset))
            E_fair = E['fair_cluster_E']
            plot_convergence(cluster_option, filename, E_fair)

        print('Best fairness_error %0.4f' % bestacc, '|Error lambda = ', best_lambda_acc)
        print('Best Avg balance %0.4f' % best_avg_balance, '| Avg Balance lambda = ', best_lambda_avg_balance)
        print('Best Min balance %0.4f' % best_min_balance, '| Min Balance lambda = ', best_lambda_min_balance)

        elapsetimes.append(elapsed)
        avg_balance_set.append(avg_balance)
        min_balance_set.append(min_balance)
        fairness_error_set.append(fairness_error)
        E_cluster_set.append(E['cluster_E'][-1])
        E_cluster_discrete_set.append(E['cluster_E_discrete'][-1])

    avgelapsed = sum(elapsetimes) / len(elapsetimes)
    print('avg elapsed ', avgelapsed)

    if plot_option_fairness_vs_clusterE == True and length_lmbdas > 1:
        savefile = osp.join(
            data_dir,
            'Fair_{}_fairness_vs_clusterEdiscrete_{}.npz'.format(cluster_option, dataset))
        filename = osp.join(
            output_path,
            'Fair_{}_fairness_vs_clusterEdiscrete_{}.png'.format(cluster_option, dataset))
        plot_fairness_vs_clusterE(cluster_option, savefile, filename, lmbdas,
                                  fairness_error_set, min_balance_set,
                                  avg_balance_set, E_cluster_discrete_set)

    if plot_option_balance_vs_clusterE == True and length_lmbdas > 1:
        savefile = osp.join(
            data_dir,
            'Fair_{}_balance_vs_clusterEdiscrete_{}.npz'.format(cluster_option, dataset))
        filename = osp.join(
            output_path,
            'Fair_{}_balance_vs_clusterEdiscrete_{}.png'.format(cluster_option, dataset))
        plot_balance_vs_clusterE(cluster_option, savefile, filename, lmbdas,
                                 fairness_error_set, min_balance_set,
                                 avg_balance_set, E_cluster_discrete_set)
SSD_TO_RAW_CLASS_MAPPING = {
    7: 1,     # vehicle
    15: 2,    # pedestrian
    2: 3,     # cyclist
    # 21: 20,  # traffic lights
}

RAW_TO_SSD_CLASS_MAPPING = {
    1: 7,     # vehicle
    2: 15,    # pedestrian
    3: 2,     # cyclist
    # 20: 21,  # traffic lights
}

logger = Logger.get_logger('SSD')


class SSDModel(BaseModel):
    """
    SSD Model
    """

    def __init__(self):
        BaseModel.__init__(self, ModelConstants.MODEL_NAME)
        self.session = None
        self.image_4d = None
        self.predictions = None
        self.localisations = None
        self.img_input = None  # tf placeholder
        self.bbox_img = None
        self.net_shape = (300, 300)
        self.ssd_anchors = None
d_output = 8  # From dataset

# Config
sns.set()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device {device}")

# Load dataset
ozeDataset = OzeDataset(DATASET_PATH)

# Load network
# Load transformer with Adam optimizer and MSE loss function
loss_function = OZELoss(alpha=0.3)

logger = Logger('learningcurve_log.csv')

learningcurveIterator = leargnin_curve(ozeDataset,
                                       n_part=PARTS,
                                       validation_split=VALIDATION_SPLIT,
                                       batch_size=BATCH_SIZE,
                                       num_workers=NUM_WORKERS)

with tqdm(total=PARTS * EPOCHS) as pbar:
    for dataloader_train, dataloader_val in learningcurveIterator:
        # Load transformer with Adam optimizer and MSE loss function
        net = Transformer(d_input, d_model, d_output, q, v, h, N,
                          attention_size=attention_size,
                          dropout=dropout, chunk_mode=chunk_mode, pe=pe).to(device)
        optimizer = optim.Adam(net.parameters(), lr=LR)

        # Fit model
        loss = fit(net, optimizer, loss_function, dataloader_train,
import os

from src.utils import Config, Logger

logger = Logger.get_logger('BaseModel')


class BaseModel(object):

    def __init__(self, model_name):
        self.asset_dir = os.path.join(Config.get('models_dir'), model_name)
        os.system('mkdir -p {}'.format(self.asset_dir))
        self.asset_url_map = {}
        model_configs = Config.get('models')
        for conf in model_configs:
            if conf.get('name') == model_name:
                asset_urls = conf.get('asset_urls')
                for asset in asset_urls:
                    self.asset_url_map[asset['name']] = asset['url']

    def _download_asset(self, asset_name):
        logger.debug('Downloading asset: {}'.format(asset_name))
        full_asset_name = os.path.join(self.asset_dir, asset_name)
        if os.path.exists(full_asset_name):
            logger.debug('Skip downloading, use cached files instead.')
            return
        os.system('wget {} -O {}'.format(self.asset_url_map.get(asset_name),
                                         full_asset_name))
import os

import ujson

from src.model import SSDModel
from src.utils import Config, Logger, VideoProcessor

logger = Logger.get_logger('ServeHandler')


class ServeHandler(object):
    model = None
    scores = []
    frame_cnt = 0
    use_precomputed = False

    @classmethod
    def handle(cls):
        if Config.get('model') == 'ssd':
            cls.model = SSDModel()

        logger.debug('Start serving ...')
        full_video_path = os.path.join(Config.get('videos_dir'),
                                       Config.get('serve').get('video'))
        url = None
        precomputed_labels = None
        full_annotated_path = None
        confs = Config.get('videos')
        for conf in confs:
            if conf.get('name') == Config.get('serve').get('video'):
                url = conf.get('url')
def __init__(self):
    self._empty_query_msg = 'Empty search query.'
    self._invalid_result_size_msg = 'Invalid result size.'
    self.__index = InvertedIndex()
    self.__logger = Logger().get_logger(__name__)
from src.data import Processor
from src.utils import Config, Logger
import urllib

logger = Logger.get_logger('TrainHandler')


class TrainHandler(object):
    train_sets = Config.get('train').get('train_sets', [])
    test_sets = Config.get('train').get('test_sets', [])

    @classmethod
    def handle(cls):
        cls._download_data()
        cls._convert_data()
        cls._split_data()
        cls._train()

    @classmethod
    def _download_data(cls):
        logger.debug('Fetching data sets: ' + str(cls.train_sets))
        for name in cls.train_sets:
            Processor.download(name)
        for name in cls.test_sets:
            Processor.download(name)

    @classmethod
    def _convert_data(cls):
        pass
class DataReader:
    """
    Class for loading and processing raw tweets.

    Attributes
    ----------
    df : pd.DataFrame
        Data frame with raw text and cleared tokens. Columns:
            Name: raw_tweets, dtype: str
            Name: tokens, dtype: List[str]
            Name: tokens_count, dtype: int
            Name: tag, dtype: int
    """

    def __init__(self, text_file: str, tags_file: str = None,
                 force_reload: bool = False) -> None:
        self._logger = Logger('io')
        self._preprocessor = Preprocessor()
        self.df = self._load_data(text_file, tags_file, force_reload)
        self._stats = None
        self.stats

    def _load_data(self, tweets_path: str, tags_path: str,
                   force_reload: bool = False) -> pd.DataFrame:
        """
        Load dataframe with cleared and tokenized tweets.

        First tries to load processed data from pickle. If pickle not found,
        or ``force_reload`` is True, reads raw data and runs processing.

        Parameters
        ----------
        tweets_path : str
            Name of a file with raw texts.
        tags_path : str
            Name of a file with tags.
        force_reload : bool
            If True, loads from raw data even if pickle found.

        Returns
        -------
        pd.DataFrame
            Data frame with raw text and cleared tokens.
        """
        pickle_path = tweets_path.replace('.txt', '.pkl').replace('raw', 'processed')
        pickle_folder, pickle_name = os.path.split(pickle_path)

        if (pickle_name in os.listdir(pickle_folder)) and not force_reload:
            self._logger.log('reading from pickle')
            with open(pickle_path, "rb") as f:
                df = pickle.load(f)
        else:
            self._logger.log('processing raw data')
            df = self._build_dataframe(tweets_path, tags_path)
        self._logger.log('data ready')
        return df

    def _build_dataframe(self, tweets_path: str, tags_path: str) -> pd.DataFrame:
        """
        Clear and tokenize raw texts. Pickle processed data.

        Parameters
        ----------
        tweets_path : str
            Name of a file with raw texts.
        tags_path : str
            Name of a file with tags.

        Returns
        -------
        pd.DataFrame
            Data frame with raw text and cleared tokens.
        """
        with open(tweets_path) as f:
            raw_tweets = f.readlines()
        df = pd.DataFrame(raw_tweets, columns=['raw_tweets'])
        df['tokens'] = self._preprocessor.transform(raw_tweets)
        df['tokens_count'] = df['tokens'].apply(len)
        if tags_path is not None:
            df['tag'] = pd.read_fwf(tags_path, header=None)[0]
        else:
            df['tag'] = np.nan
        pickle_path = tweets_path.replace('.txt', '.pkl').replace('raw', 'processed')
        with open(pickle_path, "wb") as p:
            pickle.dump(df, p)
        return df

    @property
    def stats(self):
        self._stats = dict()
        self._stats['tweets count'] = self.df.shape[0]
        self._stats['tokens in tweet distribution'] = \
            self.df['tokens_count'].describe([.25, .5, .75, .95, .99])
        self._stats['unique tokens'] = len(
            {toc for tweet_toc in self.df['tokens'] for toc in tweet_toc})
        self._stats['tags count'] = self.df['tag'].value_counts().sort_index()
        print("-------- stats --------")
        for stat, value in self._stats.items():
            print(f"=======================\n{stat}:\n{value}")
class Preprocessor(BaseEstimator):
    """
    Class for cleaning and tokenizing tweet's raw text.

    Steps:
        1. remove ``@anonymized_account`` tag
        2. remove chars other than letters and spaces
        3. remove duplicate spaces
        4. apply lowercase
        5. lemmatize tokens with ``pl_spacy_model``
        6. convert polish diacritics to latin letters
        7. drop adjacent equal letters
        8. collapse words exploded with spaces
        9. remove zero/one letter tokens
    """

    def __init__(self, min_tok_len: int = 2):
        self._min_tok_len = min_tok_len
        self._logger = Logger('preproc')
        self._nlp = None

    def fit(self, tweets: Tweets, tags: Tags = None) -> Preprocessor:
        return self

    def transform_tweet(self, tweet: Tweet) -> Tokens:
        tweet: Tweet = self._base_cleanup(tweet)
        tokens: Tokens = self._tokenizer(tweet)
        tokens = [Preprocessor._latinize_diacritics(tok) for tok in tokens]
        tokens = [Preprocessor._drop_adjacent_equals(tok) for tok in tokens]
        tokens = [Preprocessor._collapse_exploded(tok) for tok in tokens]
        tokens = [tok for tok in tokens if len(tok) >= self._min_tok_len]
        return tokens

    def transform(self, tweets: Tweets, tags: Tags = None) -> List[Tokens]:
        tokens = [self.transform_tweet(tweet) for tweet in tweets]
        return tokens

    @staticmethod
    def _base_cleanup(tweet: Tweet) -> Tweet:
        """Keep only letters and spaces, apply lowercase, remove
        ``@anonymized_account`` and extra spaces."""
        tweet = tweet.strip()
        tweet = re.sub(r'@anonymized_account', '', tweet)
        tweet = re.sub(r'[^\w\s]', '', tweet)
        tweet = re.sub(r'[0-9]', '', tweet)
        tweet = re.sub(r' +', ' ', tweet)
        tweet = tweet.lower()
        tweet = tweet.strip()
        return tweet

    def load_spacy_model(self) -> None:
        """Load the spacy model if it is not loaded yet."""
        if self._nlp is None:
            self._logger.log('loading spacy model')
            self._nlp = spacy.load('pl_spacy_model')

    def _tokenizer(self, tweet: Tweet) -> Tokens:
        """Tokenize and lemmatize a tweet."""
        self.load_spacy_model()
        tokens = [tok.lemma_ for tok in self._nlp(tweet)]
        return tokens

    @staticmethod
    def _drop_adjacent_equals(tok: Token) -> Token:
        """
        Remove adjacent duplicate characters.

        Examples
        --------
        >>> _drop_adjacent_equals('kkk')
        'k'

        >>> _drop_adjacent_equals('lekkie pióórko')
        'lekie piórko'
        """
        return ''.join(c[0] for c in itertools.groupby(tok))

    @staticmethod
    def _collapse_exploded(tok: Token, separators: str = ' .-_') -> Token:
        """
        Collapse word exploded with ``separators``.

        Example
        -------
        >>> _collapse_exploded('jesteś b r z y d k i')
        'jesteś brzydki'
        """
        if len(tok) < 5:
            return tok
        remove = []
        for i, l in enumerate(tok[2:-1]):
            if l in separators:
                if (tok[i - 2] in separators) & (tok[i + 2] in separators):
                    if (tok[i - 1].isalpha()) & (tok[i + 1].isalpha()):
                        remove.append(i)
                        remove.append(i + 2)
        return ''.join([l for i, l in enumerate(tok) if i not in remove])

    @staticmethod
    def _latinize_diacritics(tok: Token) -> Token:
        """
        Convert polish diacritics to latin letters.

        Example
        -------
        >>> _latinize_diacritics('gęśl')
        'gesl'
        """
        letters_diac = 'ąćęłńóśżźĄĆĘŁŃÓŚŻŹ'
        letters_latin = 'acelnoszzACELNOSZZ'
        table = str.maketrans(letters_diac, letters_latin)
        return tok.translate(table)
from rest_framework.views import APIView

from custom_decoraters.request_body_validator import request_body_validator
from django.contrib.auth import authenticate, login
from src.utils import HttpStatus, HttpResponse, MethodNotAllowedException, Logger
from src.constants import LoginType
from apps.models import BlogsAuth
from custom_middlewares.validator import RequestBodyValidatorMiddleware
from custom_decoraters import request_body_validator
from apps.json_schema_validators import login_json_schema
from custom_auth_backend.jwt.token import Token

logger = Logger()


class Login(APIView):

    def get(self, request):
        return HttpResponse(http_status=HttpStatus.HTTP_200_OK,
                            data="Kept only for testing using drf view")

    @request_body_validator(login_json_schema)
    def post(self, request):
        self.sanitize_request_data(request.data)
        auth_obj = authenticate(request, auth_id="", password=request.data['password'])
        if isinstance(auth_obj, HttpResponse):
            return auth_obj
        elif auth_obj is not None:
            # login(request, auth_obj)  # method is invoked to get same method invoked while register
            return self.login_response(auth_obj)
d_input = 38  # From dataset
d_output = 8  # From dataset

# Config
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device {device}")

# Load dataset
ozeDataset = OzeDataset(DATASET_PATH)

# Load network
# Load transformer with Adam optimizer and MSE loss function
loss_function = OZELoss(alpha=0.3)

logger = Logger(f'crossvalidation_log_{attention_size}_{h}_{N}.csv')

kfoldIterator = kfold(ozeDataset,
                      n_chunk=CHUNKS,
                      batch_size=BATCH_SIZE,
                      num_workers=NUM_WORKERS)

with tqdm(total=CHUNKS * EPOCHS) as pbar:
    for dataloader_train, dataloader_val in kfoldIterator:
        # Load transformer with Adam optimizer and MSE loss function
        net = Transformer(d_input, d_model, d_output, q, v,
dataloader_train = DataLoader(dataset_train,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=NUM_WORKERS)
dataloader_val = DataLoader(dataset_val,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            num_workers=NUM_WORKERS)

# Start search
n_steps = np.prod(
    [len(search_range) for search_range in search_params.values()])
logger = Logger('logs/search_log.csv', list(search_params.keys()) + ['loss'])

with tqdm(total=n_steps * EPOCHS) as pbar:
    for params in itertools.product(*search_params.values()):
        params = {
            key: params[idx]
            for idx, key in enumerate(search_params.keys())
        }
        pbar.set_postfix(params)

        # Load transformer with Adam optimizer and MSE loss function
        net = Transformer(d_input=d_input,
                          d_output=d_output,
                          dropout=dropout,
                          chunk_mode=chunk_mode,
                          pe=pe,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=NUM_WORKERS
                              )
dataloader_val = DataLoader(dataset_val,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            num_workers=NUM_WORKERS
                            )

# Start search
n_steps = np.prod([len(search_range) for search_range in search_params.values()])
logger = Logger('search_log.csv', list(search_params.keys()))

with tqdm(total=n_steps*EPOCHS) as pbar:
    for params in itertools.product(*search_params.values()):
        params = {key: params[idx] for idx, key in enumerate(search_params.keys())}
        pbar.set_postfix(params)

        # Load transformer with Adam optimizer and MSE loss function
        net = Transformer(d_input=d_input, d_output=d_output, dropout=dropout,
                          chunk_mode=chunk_mode, pe=pe, **params).to(device)
        optimizer = optim.Adam(net.parameters(), lr=LR)
else:
    # Download if not exist already
    if not os.path.isfile(X_train_path):
        urlretrieve(args.blob_path + "/X_train.npy", X_train_path)
    if not os.path.isfile(y_train_path):
        urlretrieve(args.blob_path + "/y_train.npy", y_train_path)
    if not os.path.isfile(X_valid_path):
        urlretrieve(args.blob_path + "/X_valid.npy", X_valid_path)
    if not os.path.isfile(y_valid_path):
        urlretrieve(args.blob_path + "/y_valid.npy", y_valid_path)

X_t = np.load(X_train_path)
y_t = np.load(y_train_path)
X_v = np.load(X_valid_path)
y_v = np.load(y_valid_path)

params = vars(args)
mnt_path = os.path.join(os.getenv('TEST_TMPDIR', '/tmp'), 'tensorflow')  # azurefile mount path
ts = int(round(time.time() * 1000))
params['model_dir'] = os.path.join(mnt_path, '{}_model'.format(ts))
params['log_dir'] = os.path.join(mnt_path, '{}_logs'.format(ts))

logger = Logger(None, 'katib')
# This is a hack: the model id is stored as a metric in order to record it.
logger.log('model_id', ts)

train(X_t, y_t, X_v, y_v, logger=logger, **params)
help="Directory for loading model") args, _ = parser.parse_known_args() # Download and load data mnist_path = os.path.join('data', 'mnist') os.makedirs(mnist_path, exist_ok=True) X_test_path = os.path.join(mnist_path, 'X_test.npy') y_test_path = os.path.join(mnist_path, 'y_test.npy') if not args.blob_path: raise ValueError("Data path should be provided") else: # Download if not exist already if not os.path.isfile(X_test_path): urlretrieve(args.blob_path + "/X_test.npy", X_test_path) if not os.path.isfile(y_test_path): urlretrieve(args.blob_path + "/y_test.npy", y_test_path) X_t = np.load(X_test_path) y_t = np.load(y_test_path) test_acc = test( os.path.join(args.model_dir, "mnist-tf.model.meta"), os.path.join(args.model_dir, "mnist-tf.model"), X_t, y_t, logger=Logger(logging, 'python'), verbose=False, )