def normalize_features(self, extraname=''):
    # Setting up folders and filenames
    self._feat_dir = self.get_unnormalized_feat_dir(extraname)
    self._feat_dir_norm = self.get_normalized_feat_dir(extraname)
    utils.create_folder(self._feat_dir_norm)
    normalized_features_wts_file = self.get_normalized_wts_file()

    # pre-processing starts
    print('Estimating weights for normalizing feature files:')
    print('\t\tfeat_dir {}'.format(self._feat_dir))

    spec_scaler = joblib.load(normalized_features_wts_file)

    print('Normalizing feature files:')
    # spec_scaler = joblib.load(normalized_features_wts_file)  # load weights again using this command
    for file_cnt, file_name in enumerate(os.listdir(self._feat_dir)):
        print(file_cnt, file_name)
        feat_file = np.load(os.path.join(self._feat_dir, file_name))
        feat_file = spec_scaler.transform(
            np.concatenate((np.abs(feat_file), np.angle(feat_file)), axis=1))
        np.save(os.path.join(self._feat_dir_norm, file_name), feat_file)
        del feat_file

    print('normalized files written to {} folder and the scaler to {}'.format(
        self._feat_dir_norm, normalized_features_wts_file))
def tool(is_testing, column, mode, option):
    global TRAIN, TEST

    if is_testing:
        TRAIN = TESTING_TRAIN_FILE
        TEST = TESTING_TEST_FILE

    if mode == "purge":
        purge_duplicated_records(column)
    elif mode == "restructure":
        for filetype in ["train", "test"]:
            hierarchical_folder_structure(column, filetype)
    elif mode == "repair":
        repair_missing_records(column)
    elif mode == "aggregation":
        columns = [COLUMNS[c] for c in column.split(",")]
        output_filepath = os.path.join(STATS_PATH, "{}.csv".format("_".join(columns)))
        create_folder(output_filepath)

        aggregation(columns, output_filepath)
    elif mode == "cc":
        column, column_value = option
        filepath = os.path.join(SPLIT_PATH, COLUMNS[column], "train", "{}.csv".format(column_value))
        cc(filepath)
    else:
        log("Not found this mode {}".format(mode), ERROR)
        sys.exit(101)
def compute_scaler(data_type):
    """Compute and write out scaler of data."""
    workspace = config.workspace

    if data_type == 'train':
        snr = config.Tr_SNR

    # Load data.
    t1 = time.time()
    hdf5_path = os.path.join(workspace, "packed_features", "spectrogram",
                             data_type, "%ddb" % int(snr), "data.h5")
    with h5py.File(hdf5_path, 'r') as hf:
        x = hf.get('x')
        x = np.array(x)  # (n_segs, n_concat, n_freq)

    # Compute scaler.
    (n_segs, n_concat, n_freq) = x.shape
    x2d = x.reshape((n_segs * n_concat, n_freq))
    scaler = StandardScaler(with_mean=True, with_std=True).fit(x2d)
    # print(scaler.mean_)
    # print(scaler.scale_)

    # Write out scaler.
    out_path = os.path.join(workspace, "packed_features", "spectrogram",
                            data_type, "%ddb" % int(snr), "scaler.p")
    create_folder(os.path.dirname(out_path))
    pickle.dump(scaler, open(out_path, 'wb'))

    print("Save scaler to %s" % out_path)
    print("Compute scaler finished! %s s" % (time.time() - t1,))
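# A minimal usage sketch, not part of the original project: load the scaler pickled by
# compute_scaler() and apply it to feature segments shaped (n_segs, n_concat, n_freq).
# The "scaler.p" path layout mirrors the function above; the helper name is hypothetical.
def scale_segments(x, scaler_path):
    import pickle

    with open(scaler_path, 'rb') as f:
        scaler = pickle.load(f)

    n_segs, n_concat, n_freq = x.shape
    x2d = scaler.transform(x.reshape((n_segs * n_concat, n_freq)))
    return x2d.reshape((n_segs, n_concat, n_freq))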
def __init__(self, info_dir=os.path.join(os.path.dirname(__file__), "request_info"),
             request_num=100, request_kind='nym',
             seed='000000000000000000000000Trustee1', thread_num=1, log=False):
    super().__init__(log, seed)
    self.info_dir = info_dir
    self.req_num = request_num
    self.req_kind = request_kind

    if thread_num <= 0:
        self.thread_num = 1
    elif request_num < thread_num:
        self.thread_num = request_num
    else:
        self.thread_num = thread_num

    self.info_file_path = "{}_{}_{}.txt".format(
        self.req_kind + "_requests_info",
        str(threading.get_ident()),
        time.strftime("%d-%m-%Y_%H-%M-%S"))
    self.info_file_path = os.path.join(self.info_dir, self.info_file_path)
    self.req_info = list()
    self.threads = list()
    utils.create_folder(self.info_dir)
def preprocess(mode, columns, n_jobs):
    global WORKSPACE, TRAIN_FILE, TEST_FILE

    if mode:
        if mode.lower() == "producer":
            producer(columns)
        else:
            consumer(n_jobs=n_jobs)
    else:
        queue = Queue.Queue()

        for filepath in [TRAIN_FILE, TEST_FILE]:
            df = pd.read_csv(filepath)

            for column in columns.split(","):
                column = COLUMNS[column]

                output_folder = os.path.join(WORKSPACE, "split", column,
                                             os.path.basename(filepath).replace(".csv", ""))
                create_folder(os.path.join(output_folder, "1.txt"))

                for n in range(0, n_jobs):
                    thread = SplitThread(kwargs={"df": df, "queue": queue})
                    thread.setDaemon(True)
                    thread.start()

                for unique_value in df[column].unique():
                    output_filepath = os.path.join(output_folder, "{}.csv".format(unique_value))
                    if os.path.exists(output_filepath):
                        log("Found {} so skipping it".format(output_filepath), INFO)
                    else:
                        queue.put((output_filepath, None, column, unique_value, None))
                        log("Put {} into the queue".format(output_filepath), INFO)

                queue.join()
def reduce_predicates(folder_name_pred, output_file_name, file_name_filters=""):
    """
    Goes through all files in a folder, reads the content and checks whether all
    predicates in the file are true. Writes the reduced result to the output file.
    """
    # Creates output file
    create_folder(const.FOLDER_SORT_REDUCTION)
    file_name_reduction = "%s%s" % (const.FOLDER_SORT_REDUCTION, output_file_name)
    file_reduction = open(file_name_reduction, "w+")

    for file_name in os.listdir(folder_name_pred):
        if not verify_file_name(file_name, file_name_filters, const.FILE_EXTENSION):
            continue

        # Reduces all predicates to only one predicate: "True" or "False"
        with open(folder_name_pred + file_name) as file_sort:
            file_content = file_sort.read()

        predicates_true = all(int(predicate) for predicate in file_content.split())
        sort_name = file_name.replace("_", " ")
        sort_output = "%s%s%s" % (sort_name, const.SEPARATOR, predicates_true)
        print(sort_output, file=file_reduction)

    file_reduction.close()
def main(argv):
    # Argument parsing
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config',
                        help="config file (default: config/development.conf)",
                        default="./config/development.conf")
    args = parser.parse_args()

    config = utils.read_config(args.config)
    years = range(config['scraping']['years_range'][0],
                  config['scraping']['years_range'][1] + 1)
    n_proc = config['scraping']['n_proc']

    # Create the folders into which the posters will be downloaded
    for year in years:
        utils.create_folder('{}/{}/posters'.format(PATH_IMGS, year))
        utils.create_folder('{}/{}/thumbnails'.format(PATH_IMGS, year))

    # Download the posters with multiprocessing (much faster than a single process)
    print('Retrieve url of posters')
    with Pool(n_proc) as p:
        yearly_urls = p.map(get_yearly_url_imgs, years)
    yearly_urls = list(itertools.chain.from_iterable(yearly_urls))

    # Push to db
    session = db_manager.get_db(config['general']['db_uri'])
    objects = [db_manager.Poster(x) for x in yearly_urls]
    session.bulk_save_objects(objects)
    session.commit()
def generate_images_for_single_image_masks(dicom_images, inference_results, output_folder):
    """
    This function will save images to disk to preview how a mask looks on the input images.
    It saves one image for each input DICOM file with the corresponding `inference_results`
    mask applied as an overlay.

    - dicom_images: Array of DCM_Image or path to a folder with images
    - inference_results: Array with mask buffers (one for each image)
    - output_folder: Where the output images will be saved

    The difference with `generate_images_with_masks` is that `generate_images_with_masks`
    applies each mask to the whole volume, while this function applies each mask to one image.
    """
    images, masks = _get_images_and_masks(dicom_images, inference_results)
    create_folder(output_folder)

    mask_alpha = 0.5
    for index, (image, mask) in enumerate(zip(images, masks)):
        dcm = pydicom.dcmread(image.path)
        pixels = _get_pixels(dcm)
        max_value = np.iinfo(pixels.dtype).max

        image_mask = mask
        pixels = np.reshape(pixels, (-1, 3))
        # np.float was removed in NumPy 1.24; the built-in float keeps the original intent.
        pixels[image_mask > 128] = pixels[image_mask > 128] * (1 - mask_alpha) + \
            (mask_alpha * np.array(get_colors(0, max_value)).astype(float)).astype(np.uint8)

        output_filename = os.path.join(
            output_folder, str(index) + os.path.basename(os.path.normpath(image.path)))
        output_filename += '.png'

        pixels = np.reshape(pixels, (dcm.Rows, dcm.Columns, 3))
        plt.imsave(output_filename, pixels)
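# A self-contained sketch of the same alpha-blend idea used above, on plain NumPy arrays
# instead of DICOM input (assumptions: RGB pixels and a single overlay colour; the helper
# name is hypothetical and not part of the original code).
import numpy as np

def blend_mask(pixels, mask, color=(255, 0, 0), alpha=0.5, threshold=128):
    # Overlay `color` onto RGB `pixels` wherever `mask` exceeds `threshold`.
    out = pixels.reshape(-1, 3).astype(np.float64)
    hit = mask.reshape(-1) > threshold
    out[hit] = out[hit] * (1 - alpha) + alpha * np.array(color, dtype=np.float64)
    return out.reshape(pixels.shape).astype(np.uint8)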
def save_cache(obj, filepath, is_json=False, is_hdb=False):
    if is_hdb:
        filepath += ".hdb"

        hdb = db.DB()
        hdb.open(filepath, None, db.DB_HASH, db.DB_CREATE)
        for test_id, info in obj.items():
            hdb.put(str(test_id), pickle.dumps(info))
        hdb.sync()
        hdb.close()

        log("Save cache in BerkeleyDB format({})".format(filepath), INFO)
    elif is_json:
        filepath += ".json.bz2"

        with BZ2File(filepath, "wb") as OUTPUT:
            json.dump(obj, OUTPUT)

        log("Save cache in JSON format({})".format(filepath), INFO)
    else:
        create_folder(filepath)
        with open(filepath, "wb") as OUTPUT:
            pickle.dump(obj, OUTPUT)

        log("Save {}'s cache in {}".format(obj.__class__, filepath), INFO)
def _split_originals(AUDIO_FILE_PATH):
    output_folder = "/audio_files/dataset/original_splits"
    if os.path.exists(output_folder):
        return output_folder

    audio_files = list()
    for root, dirnames, filenames in os.walk(AUDIO_FILE_PATH):
        for f in filenames:
            ext = f.split('.')[-1]
            if ext in ['mp3', 'wav', 'm4a']:
                file_path = os.path.join(root, f)
                audio_files.append(file_path)

    np.random.shuffle(audio_files)
    train_cnt = int(np.ceil(len(audio_files) * 0.8))

    for src_path in audio_files:
        folder_name = ''
        if train_cnt > 0:
            train_cnt -= 1
            folder_name = 'train'
        else:
            folder_name = 'test'

        dirname = os.path.basename(os.path.dirname(src_path))
        outname = os.path.basename(src_path)

        dst_path = os.path.join(output_folder, folder_name, dirname)
        utils.create_folder(dst_path)
        dst_path = os.path.join(dst_path, outname)

        shutil.copyfile(src_path, dst_path)
        print(src_path, dst_path)

    return output_folder
def analyze_model(self, epoch):
    filename_model = os.path.join(self.dir_models, 'epoch_{}.pth'.format(epoch))
    g = Generator(self.nb_channels_first_layer, self.dim)
    g.cuda()
    g.load_state_dict(torch.load(filename_model))
    g.eval()

    nb_samples = 50
    batch_z = np.zeros((nb_samples, 32 * self.nb_channels_first_layer, 4, 4))
    # batch_z = np.maximum(5 * np.random.randn(nb_samples, 32 * self.nb_channels_first_layer, 4, 4), 0)
    # batch_z = 5 * np.random.randn(nb_samples, 32 * self.nb_channels_first_layer, 4, 4)

    for i in range(4):
        for j in range(4):
            batch_z[:, :, i, j] = create_path(nb_samples)
    # batch_z[:, :, 0, 0] = create_path(nb_samples)
    # batch_z[:, :, 0, 1] = create_path(nb_samples)
    # batch_z[:, :, 1, 0] = create_path(nb_samples)
    # batch_z[:, :, 1, 1] = create_path(nb_samples)
    batch_z = np.maximum(batch_z, 0)

    z = Variable(torch.from_numpy(batch_z)).type(torch.FloatTensor).cuda()
    temp = g.main._modules['4'].forward(z)
    for i in range(5, 10):
        temp = g.main._modules['{}'.format(i)].forward(temp)
    g_z = temp.data.cpu().numpy().transpose((0, 2, 3, 1))

    folder_to_save = os.path.join(self.dir_experiment,
                                  'epoch_{}_path_after_linear_only00_path'.format(epoch))
    create_folder(folder_to_save)

    for idx in range(nb_samples):
        filename_image = os.path.join(folder_to_save, '{}.png'.format(idx))
        Image.fromarray(np.uint8((g_z[idx] + 1) * 127.5)).save(filename_image)
def __init__(self, parameters):
    dir_datasets = os.path.expanduser('~/datasets')
    dir_experiments = os.path.expanduser('~/experiments')

    dataset = parameters['dataset']
    train_attribute = parameters['train_attribute']
    test_attribute = parameters['test_attribute']
    embedding_attribute = parameters['embedding_attribute']

    self.dim = parameters['dim']
    self.nb_channels_first_layer = parameters['nb_channels_first_layer']
    name_experiment = parameters['name_experiment']

    self.dir_x_train = os.path.join(dir_datasets, dataset, '{0}'.format(train_attribute))
    self.dir_x_test = os.path.join(dir_datasets, dataset, '{0}'.format(test_attribute))
    self.dir_z_train = os.path.join(dir_datasets, dataset,
                                    '{0}_{1}'.format(train_attribute, embedding_attribute))
    self.dir_z_test = os.path.join(dir_datasets, dataset,
                                   '{0}_{1}'.format(test_attribute, embedding_attribute))

    self.dir_experiment = os.path.join(dir_experiments, 'gsn_hf', name_experiment)
    self.dir_models = os.path.join(self.dir_experiment, 'models')
    self.dir_logs = os.path.join(self.dir_experiment, 'logs')
    create_folder(self.dir_models)
    create_folder(self.dir_logs)

    self.batch_size = 128
    self.nb_epochs_to_save = 1
def _generate_path(dir_z, dir_x, train_test):
    # Note: relies on `g`, `epoch` and `self.dir_experiment` from the enclosing scope.
    dataset = EmbeddingsImagesDataset(dir_z, dir_x)
    fixed_dataloader = DataLoader(dataset, 2, shuffle=True)
    fixed_batch = next(iter(fixed_dataloader))

    z0 = fixed_batch['z'][[0]].numpy()
    z1 = fixed_batch['z'][[1]].numpy()

    batch_z = np.copy(z0)

    nb_samples = 100
    interval = np.linspace(0, 1, nb_samples)
    for t in interval:
        if t > 0:
            zt = normalize((1 - t) * z0 + t * z1)
            batch_z = np.vstack((batch_z, zt))

    z = Variable(torch.from_numpy(batch_z)).type(torch.FloatTensor).cuda()
    g_z = g.forward(z)

    # filename_images = os.path.join(self.dir_experiment, 'path_epoch_{}_{}.png'.format(epoch, train_test))
    # temp = make_grid(g_z.data, nrow=nb_samples).cpu().numpy().transpose((1, 2, 0))
    # Image.fromarray(np.uint8((temp + 1) * 127.5)).save(filename_images)

    g_z = g_z.data.cpu().numpy().transpose((0, 2, 3, 1))

    folder_to_save = os.path.join(self.dir_experiment,
                                  'epoch_{}_{}_path'.format(epoch, train_test))
    create_folder(folder_to_save)

    for idx in range(nb_samples):
        filename_image = os.path.join(folder_to_save, '{}.png'.format(idx))
        Image.fromarray(np.uint8((g_z[idx] + 1) * 127.5)).save(filename_image)
def init_log_file(path: str):
    """Initiate log file."""
    RequestsSender.close_log_file()
    utils.create_folder(os.path.dirname(path))
    RequestsSender.__log_file = open(path, 'w')
def resample_folder(inpath, outpath, timeColHeader, gapTolerance=np.inf, samplingRate=None):
    '''
    :param inpath: folder containing the input CSV files
    :param outpath: folder where the resampled CSV files are written
    :param timeColHeader: name of the timestamp column
    :param gapTolerance: maximum gap to interpolate across
    :param samplingRate: target sampling rate passed to resample()
    :return: None
    '''
    create_folder(outpath)
    files = list_files_in_directory(inpath)

    for file in files:
        if not file.startswith('.'):
            dataDf = pd.read_csv(os.path.join(inpath, file))
            if len(dataDf):
                if 'date' in dataDf.columns:
                    dataDf = dataDf.drop(columns=['date'])
                # print(dataDf.dtypes)
                # dataDf = dataDf.astype({"Time": float})
                # Note: gapTolerance is hard-coded to np.inf here; the argument is not forwarded.
                newDf = resample(dataDf, timeColHeader, samplingRate,
                                 gapTolerance=np.inf, fixedTimeColumn=None)
                newDf.to_csv(os.path.join(outpath, file), index=None)
def combine_bands():
    '''Combines all specified bands per file'''
    print("-> Start combining bands...")

    # Create out folder
    TEMP_FOLDERS["combined_bands"] = create_folder(OUT_FOLDER, "03_combined_bands")

    # Iterate over each day
    for day in listdir(TEMP_FOLDERS["unzipped"]):
        # Create one folder per day
        folder_day = create_folder(TEMP_FOLDERS["combined_bands"], day)

        # Iterate over each granule
        for granule in listdir("{0}/{1}".format(TEMP_FOLDERS["extracted"], day)):
            # Get list of input files
            band_path_list = get_paths_for_files_in_folder(
                "{0}/{1}/{2}/".format(TEMP_FOLDERS["extracted"], day, granule))
            band_path_list.sort()  # make sure bands are always in the same order

            # Build out_path (filename without file extension and band number)
            out_filename = "{0}.vrt".format(
                band_path_list[0].split("/")[-1].split(".")[0][:-4])
            out_path = "{0}/{1}".format(folder_day, out_filename)

            # Combine all dataset bands in one vrt file
            gdal.BuildVRT(out_path, band_path_list, separate=True, srcNodata=0)
            print("   - Combined bands for {0}".format(out_filename))

    print("-> Finished combining bands.")
def view_samples(args):
    # Visualize some sample images
    all_image_path = glob(os.path.join(args.path, '*', '*.jpg'))
    imageid_path_dict = {
        os.path.splitext(os.path.basename(x))[0]: x
        for x in all_image_path
    }

    save_path = "./images/sample"
    create_folder(save_path)

    fig = plt.figure(figsize=(5, 5))
    columns, rows = 3, 2
    # random.randint is inclusive on both ends, so the upper bound is len - 1
    start, end = 0, len(imageid_path_dict) - 1
    ax = []

    import random
    for i in range(columns * rows):
        k = random.randint(start, end)
        img = mpimg.imread(all_image_path[k])
        # Create subplot and append to ax
        ax.append(fig.add_subplot(rows, columns, i + 1))
        plt.xticks([])
        plt.yticks([])
        plt.imshow(img)

    plt.tight_layout()
    plt.title('Sample input images', fontdict={'size': 10})
    plt.savefig(os.path.join(save_path, 'input_image.png'))

    # Checking the size and number of channels in the image
    arr = np.asarray(Image.open(all_image_path[10]))
    print(f"The shape of each image is {arr.shape}")
def extract_content(self):
    classes = self.extract_classes()

    for klass in classes[1:]:  # Exclude ONLINE CLASS
        folder_name = remove_accents(klass['class'])
        create_folder(folder_name)
        print('Extracting Class: {0}'.format(klass['class']))

        self.browser.get('https://unipac-bomdespacho.blackboard.com{0}'.format(klass['href']))
        self.browser.find_element_by_id('header::0-whatsNewView::CO').click()  # Open content list

        block_class_contents = self.browser.find_element_by_id('block::0-whatsNewView::CO')
        class_contents = block_class_contents.find_elements_by_css_selector(
            "a[onclick*='nautilus_utils.actionSelected']"
        )

        i_content = 0
        for i_content in range(i_content, len(class_contents)):
            try:
                block_classes_contents = self.browser.find_element_by_id('block::0-whatsNewView::CO')
                class_contents = block_classes_contents.find_elements_by_css_selector(
                    "a[onclick*='nautilus_utils.actionSelected']"
                )
                class_contents[i_content].click()

                self.check_visibility(By.CLASS_NAME, "individualContent-link")
                file_link = self.browser.find_element_by_class_name(
                    'individualContent-link').get_attribute('href')

                cookies = self.browser.get_cookies()
                download(cookies, file_link, folder_name)

                self.browser.back()
                self.check_visibility(By.ID, "block::0-whatsNewView::CO")
            except TimeoutException:
                print("Error in: {0} - {1}".format(klass['class'], klass['href']))
def main():
    img_dirs = [
        './img/org', './img/bg_0', './img/bg_127', './img/bg_255',
        './img/obj_0', './img/obj_127', './img/obj_255'
    ]
    arch = "resnet152"
    target_layer = "layer4"
    cuda = True
    topk = 1

    for img_dir in img_dirs:
        output_dir = "./cam_{}_{}_{}".format(arch, target_layer, img_dir.replace("/", '_'))
        create_folder(output_dir)

        image_paths = []
        for i in range(0, 50000):
            filename = "ILSVRC2012_val_000{:05}.JPEG".format(i + 1)
            image_path = os.path.join(img_dir, filename)
            image_paths.append(image_path)

        for images in list(chunks(image_paths, n=16)):
            # process_a_batch(images, target_layer, arch, topk, output_dir, cuda)
            arguments = [
                "python3", "run.py",
                "-a", "{}".format(arch),
                "-t", "{}".format(target_layer),
                "-o", "{}".format(output_dir),
                "-i"
            ]
            for img in images:
                # arguments.append("-i")
                arguments.append("{}".format(img))

            print(arguments)
            subprocess.call(arguments)
def generate_images_with_boxes(images, boxes, output_folder):
    # Generate images for boxes. `boxes` should be an array of dict
    # Format: {'label': '?', 'SOPInstanceUID': dcm.SOPInstanceUID,
    #          'top_left': [5, 5], 'bottom_right': [10, 10]}
    create_folder(output_folder)

    for index, image in enumerate(images):
        dcm = pydicom.dcmread(image.path)
        pixels = get_pixels(dcm)
        pixels = np.reshape(pixels, (dcm.Rows, dcm.Columns, 3))

        pil_image = Image.fromarray(pixels)
        draw = ImageDraw.Draw(pil_image)

        image_boxes = [
            box for box in boxes if image.instanceUID == box['SOPInstanceUID']
        ]
        for box in image_boxes:
            # Apply box
            ul = box['top_left']
            br = box['bottom_right']
            points = [
                tuple(ul), (br[0], ul[1]), tuple(br), (ul[0], br[1]), tuple(ul)
            ]
            draw.line(points, fill="red", width=5)
            boxes.remove(box)

        # Write image to output folder
        output_filename = os.path.join(
            output_folder,
            str(index) + '_' + os.path.basename(os.path.normpath(image.path)))
        output_filename += '.png'
        pil_image.save(output_filename)
def visualise_epoch(data_container, model, args, cuda, base_path):
    base_path_ae = os.path.join(base_path, 'ae_vis')
    base_path_dualatt = os.path.join(base_path, 'dualatt_vis')
    create_folder(base_path_ae)
    create_folder(base_path_dualatt)

    model.eval()
    i = 0
    for x, _, audio_names in tqdm(data_container['val_dataloader']):
        if cuda:
            x = x.cuda()

        out_dict = model(x)
        y_pred = out_dict['y_pred'].cpu().detach().numpy()
        x_rec = out_dict['x_rec'].cpu().detach().numpy()
        class_x = out_dict['class_wise_input'].cpu().detach().numpy()
        mel_attw = out_dict['mel_attw'].cpu().detach().numpy()
        time_attw = out_dict['time_attw'].cpu().detach().numpy()
        mel_x = out_dict['mel_x'].cpu().detach().numpy()
        time_x = out_dict['time_x'].cpu().detach().numpy()
        x = x.cpu().detach().numpy()

        # here i maintains sample count (global)
        # here j maintains count inside batch (local)
        for j in range(x.shape[0]):
            reconstruction_plot(x[j], x_rec[j], args, audio_names[j], base_path_ae)
            attention_plot(mel_x[j], mel_attw[j], time_x[j], time_attw[j],
                           args, audio_names[j], base_path_dualatt)
            i = i + 1
def _place_info_index(df, range_x, range_y, size_x, size_y, output_folder):
    for window_size_x, window_size_y in zip(size_x, size_y):
        folder = output_folder + "_windown_size={},{}".format(window_size_x, window_size_y)

        for x in range_x:
            start_x, end_x = x, min(11, x + window_size_x)
            c2 = (df["x"].values >= start_x) & (df["x"].values < end_x)

            for y in range_y:
                start_y, end_y = y, min(11, y + window_size_y)
                c3 = (df["y"].values >= start_y) & (df["y"].values < end_y)

                if df[c2 & c3].shape[0] > 0:
                    filepath_output = os.path.join(folder, "{}_{}.csv".format(start_x, start_y))

                    if not os.path.exists(filepath_output):
                        create_folder(filepath_output)

                        with open(filepath_output, "wb") as OUTPUT:
                            place_ids, counts = np.unique(df[c2 & c3]["place_id"].values,
                                                          return_counts=True)
                            for place_id, count in zip(place_ids, counts):
                                OUTPUT.write("{},{}\n".format(place_id, count))

                        log("Save file in {}".format(filepath_output), INFO)
                    else:
                        log("Skip {}".format(filepath_output), INFO)
def reproject():
    '''Reprojects all UTM zones into specified reference system'''
    print("-> Start reprojection...")

    # Create out folder with subfolders for each day
    TEMP_FOLDERS["reproject"] = create_folder(OUT_FOLDER, "05_reproject")

    for day in listdir(TEMP_FOLDERS["unzipped"]):
        # Create one folder per day
        folder_day = create_folder(TEMP_FOLDERS["reproject"], day)

        for utm_file in listdir("{0}/{1}".format(TEMP_FOLDERS["utm"], day)):
            # Get input path
            in_path = "{0}/{1}/{2}".format(TEMP_FOLDERS["utm"], day, utm_file)

            # Build output path
            out_filename = "{0}_epsg{1}.vrt".format(utm_file.split(".")[0], OUT_EPSG)
            out_path = "{0}/{1}".format(folder_day, out_filename)

            # Reproject each utm file to the configured EPSG code
            gdal.Warp(out_path, in_path, dstSRS="EPSG:{0}".format(OUT_EPSG), format="vrt")
            print("   - Reprojected {0}".format(out_filename))

    print("-> Finished reprojection.")
def prepare_mozilla_common_data(AUDIO_FILE_PATH):
    OUTPUT_FOLDER = "/audio_files/common_voice_corpus_1"

    exists = os.path.exists(OUTPUT_FOLDER)
    if not exists or (exists and len(os.listdir(OUTPUT_FOLDER)) <= 20000):
        _fix_duration_and_convert_audio(AUDIO_FILE_PATH, OUTPUT_FOLDER, file_size_thresh=17)

    audio_clips = os.listdir(OUTPUT_FOLDER)  # total clips

    OUTPUT_FOLDER_2 = "/audio_files/dataset/classes/non-target"
    exists = os.path.exists(OUTPUT_FOLDER_2)
    if not exists or (exists and len(os.listdir(OUTPUT_FOLDER_2)) != 10000):
        utils.create_folder(OUTPUT_FOLDER_2)

        # Choose 10000 audio clips randomly
        # np.random.shuffle(audio_clips)
        # audio_clips = audio_clips[:10000]

        # Save audio clips
        for aud_clip in audio_clips:
            src_file = os.path.join(OUTPUT_FOLDER, aud_clip)
            dst_file = os.path.join(OUTPUT_FOLDER_2, aud_clip)
            shutil.copyfile(src_file, dst_file)
            print('copying', src_file, dst_file)
    else:
        print("Nothing to do")
def image_downloader(img_links, folder_name):
    img_names = []
    try:
        parent = os.getcwd()
        try:
            folder = os.path.join(os.getcwd(), folder_name)
            utils.create_folder(folder)
            os.chdir(folder)
        except Exception:
            print("Error in changing directory.")

        for link in img_links:
            img_name = "None"
            if link != "None":
                img_name = (link.split(".jpg")[0]).split("/")[-1] + ".jpg"
                # This is the image id when there's no profile pic
                if img_name == selectors.get("default_image"):
                    img_name = "None"
                else:
                    try:
                        urllib.request.urlretrieve(link, img_name)
                    except Exception:
                        img_name = "None"

            img_names.append(img_name)

        os.chdir(parent)
    except Exception:
        print("Exception (image_downloader):", sys.exc_info()[0])

    return img_names
def write_result(initial, word, results):
    create_folder(BASE_FOLDER, MODEL_NAME, TIME_STR, initial)
    processed_results = sorted(results, key=lambda k: (k[0], -k[1]))
    write_data(
        f"./{BASE_FOLDER}/{MODEL_NAME}/{TIME_STR}/{initial}/{word}.txt",
        processed_results,
    )
def _generate_path(dir_z, dir_x, train_test):
    # Note: relies on `g`, `dir_to_save` and `epoch_to_load` from the enclosing scope.
    dataset = EmbeddingsImagesDataset(dir_z, dir_x)
    fixed_dataloader = DataLoader(dataset, 2, shuffle=True)
    fixed_batch = next(iter(fixed_dataloader))

    z0 = fixed_batch['z'][[0]].numpy()
    z1 = fixed_batch['z'][[1]].numpy()

    batch_z = np.copy(z0)

    nb_samples = 100
    interval = np.linspace(0, 1, nb_samples)
    for t in interval:
        if t > 0:
            # zt = normalize((1 - t) * z0 + t * z1)
            zt = (1 - t) * z0 + t * z1
            batch_z = np.vstack((batch_z, zt))

    z = torch.from_numpy(batch_z).float().cuda()
    g_z = g.forward(z)

    # filename_images = os.path.join(self.dir_experiment, 'path_epoch_{}_{}.png'.format(epoch, train_test))
    # temp = make_grid(g_z.data, nrow=nb_samples).cpu().numpy().transpose((1, 2, 0))
    # Image.fromarray(np.uint8((temp + 1) * 127.5)).save(filename_images)

    g_z = g_z.data.cpu().numpy().transpose((0, 2, 3, 1))

    folder_to_save = dir_to_save / 'epoch_{}_{}_path'.format(epoch_to_load, train_test)
    create_folder(folder_to_save)

    for idx in range(nb_samples):
        filename_image = os.path.join(folder_to_save, '{}.png'.format(idx))
        Image.fromarray(np.uint8((g_z[idx] + 1) * 127.5)).save(filename_image)
def train(self, X_train, X_test, y_train, y_test):
    # Compile model
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

    num_outputs = self.model.outputs[0].shape[-1]
    if num_outputs == 1:
        loss = 'binary_crossentropy'
        metrics = [
            tf.keras.metrics.BinaryAccuracy(),
            tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall()
        ]
    else:
        # Note: the original had a stray trailing comma here, which made `loss` a 1-tuple.
        loss = 'categorical_crossentropy'
        y_train = tf.one_hot(y_train, num_outputs)
        y_test = tf.one_hot(y_test, num_outputs)
        metrics = [
            tf.keras.metrics.CategoricalAccuracy(),
            tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall()
        ]

    self.model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    # self.model.summary()

    callbacks = list()

    log_dir = "output/logs"
    create_folder(log_dir)
    callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=log_dir))

    checkpoint_filepath = 'output/checkpoints/chk-{epoch:02d}-{val_loss:.8f}.ckpt'
    checkpoint_dir = os.path.dirname(checkpoint_filepath)
    create_folder(checkpoint_dir)
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        verbose=1)
    callbacks.append(model_checkpoint_callback)

    # Train model
    try:
        self.model.fit(X_train, y_train,
                       validation_data=(X_test, y_test),
                       batch_size=168,
                       epochs=100,
                       callbacks=callbacks)
    except KeyboardInterrupt:
        pass

    self.export()  # load model with best weights and export
    self.model.evaluate(X_test, y_test, verbose=2)
def _time_split_data(df, new_column, value, output_folder):
    idx = (df[new_column] == value)

    filepath_output = os.path.join(output_folder, "{}.csv".format(value))
    create_folder(filepath_output)

    df[idx].to_csv(filepath_output, index=False)
    log("Save file in {}".format(filepath_output), INFO)
def save_file(self, message_shown=True):
    create_folder()
    f = open("logs/data_third.txt", "w")
    f.write(str(self.result_value))
    if message_shown:
        # Message (Ukrainian): "Information saved to the file logs/data_third.txt"
        show_message("Інформацію збережено до файлу logs/data_third.txt",
                     QMessageBox.Information)
    f.close()
def load(self, dir_name):
    rmtree(self._log_dir)
    create_folder(self._log_dir)
    onlyfiles = [f for f in listdir(dir_name) if isfile(join(dir_name, f))]
    for file in onlyfiles:
        copyfile(join(dir_name, file), join(self._log_dir, file))
def _seperate_one(self, _file):
    time = self._get_time(_file)
    folder = self._get_time_folder(time)
    fd_path = path_join([self.mv_path, folder, ''])
    if not path_exists(fd_path):
        create_folder(fd_path)
    print _file, fd_path
    print copy(_file, fd_path)
def save_file(self, message_shown=True):
    create_folder()
    f = open("logs/data_fourth.txt", "w")
    f.write(str(self.logic.get_z()))
    if message_shown:
        # Message (Ukrainian): "Information saved to the file logs/data_fourth.txt"
        show_message("Інформацію збережено до файлу logs/data_fourth.txt",
                     QMessageBox.Information)
    f.close()
def submit(self, model, filepath, mode="training", n_top=10):
    create_folder(filepath)

    (training_dataset, testing_dataset), results, predicted_proba = self.get_dataset(), None, None

    if mode == "training":
        if self.method == "classifier":
            if self.objective.find("binary") > -1:
                predicted_proba = model.predict_proba(training_dataset)[:, 1]
            else:
                predicted_proba = model.predict_proba(training_dataset)
        elif self.method == "regressor":
            predicted_proba = model.predict(training_dataset)
        else:
            raise NotImplementedError

        # Use the resolved estimator's classes (falls back to the model itself)
        estimator = model
        if hasattr(model, "best_estimator_"):
            estimator = model.best_estimator_

        pool = [dict(zip(estimator.classes_, probas)) for probas in predicted_proba]
        for idx, pair in enumerate(pool):
            class_names = []
            for class_name, class_proba in sorted(pair.items(), key=lambda kv: kv[1], reverse=True)[:n_top]:
                class_names.append(class_name)

            pool[idx] = " ".join(class_names)

        results = {"Target": self.train_y, "Predicted_Proba": pool}
    else:
        if self.method == "classifier":
            if self.objective.find("binary") > -1:
                predicted_proba = model.predict_proba(testing_dataset)[:, 1]
            else:
                predicted_proba = model.predict_proba(testing_dataset)
        elif self.method == "regressor":
            predicted_proba = model.predict(testing_dataset)
        else:
            raise NotImplementedError

        estimator = model
        if hasattr(model, "best_estimator_"):
            estimator = model.best_estimator_

        pool = [dict(zip(estimator.classes_, probas)) for probas in predicted_proba]
        for idx, pair in enumerate(pool):
            class_names = []
            for class_name, class_proba in sorted(pair.items(), key=lambda kv: kv[1], reverse=True)[:n_top]:
                class_names.append(class_name)

            pool[idx] = " ".join(class_names)

        results = {"ID": self.test_id, "Target": pool}

    if not os.path.exists(filepath):
        log("Compile a submission results for kaggle in {}".format(filepath), INFO)
        save_kaggle_submission(results, filepath)
def split_by_hour(file, NECKLACE_DIR):
    create_folder(NECKLACE_DIR)

    # This is a date days before the study, to remove the 1969 error data.
    starttimestamp = 1000000000000  # September 8, 2001 8:46:40 PM GMT-05:00 DST
    localtz = settings['TIMEZONE']

    print(file)
    df = pd.read_csv(file)
    print('len', len(df), '\n')

    df = df[~df['Time'].isin(['Time'])]
    print('Remove redundant headers...\n')
    print('len', len(df), '\n')

    l1 = len(df)
    df = df.dropna()
    df['Time'] = pd.to_numeric(df['Time'], errors='ignore')
    df = df[df['Time'] > starttimestamp]
    print('len', len(df), '\n')
    l2 = len(df)
    print('# Timestamp 1969 Error Lines: ', str(l1 - l2))

    df = df.sort_values('Time')
    df['date'] = pd.to_datetime(df['Time'], unit='ms')
    df = df.set_index(['date'])
    df.index = df.index.tz_localize('UTC').tz_convert(settings['TIMEZONE'])

    # dt: absolute hour of the first timestamp
    dt = datetime(year=df.index[0].year, month=df.index[0].month,
                  day=df.index[0].day, hour=df.index[0].hour, minute=0, second=0)
    dt = localtz.localize(dt)
    print(df.index[0])
    print(df.index[-1])

    # ===================================================================================
    # Split each hour into a separate file under the day folder
    # ===================================================================================
    startHour = dt
    endHour = dt + timedelta(hours=1)
    while endHour < df.index[-1] + timedelta(hours=1):
        dfHr = df[(df.index >= startHour) & (df.index < endHour)]
        if len(dfHr):
            file = datetime_to_filename(startHour)
            dfHr.to_csv(os.path.join(NECKLACE_DIR, file))
            print(startHour)
            print(endHour)
            print(len(dfHr))
            print(file)
        startHour += timedelta(hours=1)
        endHour += timedelta(hours=1)
def run_the_node(self, status_file, status_dict):
    """
    Run the step represented by the node and updates the status.json file
    which gives a live output of the running process.
    It uses the status_file (location of the status file) and the status_dict
    (python dictionary representing the status.json file) to give a live report
    of the node being processed.
    """
    utils.create_folder(self.output_folder)

    status_dict[self.name] = {}
    status_dict[self.name]["status"] = "in progress"
    status_dict[self.name]["progress"] = 0
    utils.update_json_file(status_file, status_dict)

    cmd_line = []
    cmd_line.append(self.binary_name)
    for option, value in self.add_locations_to_command_line():
        cmd_line.append(option)
        cmd_line.append(value)
    for option, value in self.add_parameters_to_command_line():
        cmd_line.append(option)
        cmd_line.append(value)

    log = open(self.log_dir, 'w')

    # Dealing with DepthMap particular case
    if (self.name == "depth_map"):
        # Dividing the task if needed
        group_size = self.parameters["groupSize"]
        number_of_groups = (self.nb_of_images + (group_size - 1)) // group_size
        for group_iter in range(number_of_groups):
            range_start = group_size * group_iter
            range_size = min(group_size, self.nb_of_images - range_start)
            print("DepthMap Group {}/{} : {}, {}".format(
                group_iter + 1, number_of_groups, range_start, range_size))

            cmd = cmd_line + [
                '--rangeStart', str(range_start),
                '--rangeSize', str(range_size)
            ]
            print(cmd)
            subprocess.run(cmd, stderr=log)

            status_dict[self.name]["progress"] = ((group_iter + 1) / number_of_groups) * 100
            print(status_dict)
            utils.update_json_file(status_file, status_dict)
    else:
        print(cmd_line)
        subprocess.run(cmd_line, stderr=log)
        status_dict[self.name]["progress"] = 100
        utils.update_json_file(status_file, status_dict)

    log.close()

    status_dict[self.name]["status"] = "done"
    status_dict[self.name]["progress"] = 100
    utils.update_json_file(status_file, status_dict)
    return 0
def get_agent_file(agent_name):
    directory_name = os.environ.get("MODELS_PATH") + "export/"
    file_name = agent_name + ".json"
    file_path = directory_name + file_name

    remove_file_or_dir(file_path)
    dic = AgentsService.get_instance().create_agent_file(agent_name)

    create_folder(directory_name)
    with open(file_path, "w+") as f:
        json.dump(dic, f)

    return send_from_directory(directory=directory_name,
                               filename="./" + file_name,
                               as_attachment=True)
def main():
    # Create file with $1_$2.log
    # Create file with $1_$2_long.log
    # Start bot
    global log, log_long, log_directory
    create_folder(log_directory)
    log = create_file(log_directory + username + "_" + stream + ".log")
    log_long = create_file(log_directory + username + "_" + stream + "_long.log")
    bot()
def screenshot(self, event):
    self.assert_context_state()

    folder_path = self.configuration['results_folder']
    screenshot_path = os.path.join(folder_path, "%s_%s.ppm" % (self.identifier, event))

    create_folder(folder_path)

    with open(screenshot_path, 'wb') as screenshot_file:
        screenshot_stream = screenshot(self.context)
        screenshot_file.write(screenshot_stream)
def run():
    config = utils.load_cfg('conf.cfg')

    template_path = config.get('image', 'filename')
    markers = {'school': config.getint('markers', 'school'),
               'year': config.getint('markers', 'year'),
               'level': config.getint('markers', 'level'),
               'award': config.getint('markers', 'award'),
               'recipient': config.getint('markers', 'recipient')}
    font = {'color': config.get('font', 'color'),
            'name': config.get('font', 'name'),
            'size': config.getint('font', 'size')}
    images_per_pdf = config.getint('pdf', 'images_per_pdf')
    if images_per_pdf > 6:
        # 6 is the maximum allowed number of images per pdf
        exit()
    csv_file = config.get('csv', 'filename')

    cc = CsvReader(csv_file)
    csv_data = cc.read()

    school = 'John Scottus School'
    year = '2016'

    image_folder = utils.create_folder('images')
    pdf_folder = utils.create_folder('pdfs')

    count = 0
    images = []
    draw_tool = ImageWriter(font)
    for row in csv_data:
        im = draw_tool.open_image(template_path)
        im = draw_tool.write_text(im, markers['school'], school)
        im = draw_tool.write_text(im, markers['year'], year)
        im = draw_tool.write_text(im, markers['level'], row['Level'])
        im = draw_tool.write_text(im, markers['award'], row['Award'])
        im = draw_tool.write_text(im, markers['recipient'], row['Recipient'])

        im_path = image_folder + '/' + 'image_' + utils.timestamp() + '.png'
        draw_tool.save_image(im, im_path)

        count += 1
        images.append(im_path)
        if count % images_per_pdf == 0:
            create_pdf(pdf_folder, images)
            count = 0
            images = []

    create_pdf(pdf_folder, images)
def memory_snapshot(self, event):
    folder_path = self.configuration['results_folder']
    file_name = "%s_%s.bin%s" % (
        event,
        datetime.now().replace(microsecond=0).time().strftime("%H%M%S"),
        self.configuration.get('compress_snapshots', False) and '.gz' or '')
    snapshot_path = os.path.join(folder_path, file_name)

    create_folder(folder_path)
    self.dump_memory(snapshot_path)

    return snapshot_path
def start_trace_handler(self, event):
    folder_path = self.configuration['results_folder']

    self.logger.debug("Event %s: starting network tracing.", event)

    create_folder(folder_path)

    self.pcap_path = os.path.join(folder_path, "%s.pcap" % self.identifier)
    self.tracer_process = launch_process(
        TSHARK, '-w', self.pcap_path, '-i', self.context.network.bridgeName())

    self.context.trigger("network_tracing_started", path=self.pcap_path)
    self.logger.info("Network tracing started.")
def snapshot_to_checkpoint(volume, snapshot, folder_path):
    """Turns a QEMU internal snapshot into a QCOW file."""
    create_folder(folder_path)

    name = snapshot.getName()
    path = os.path.join(folder_path, '%s.qcow2' % name)

    process = launch_process(QEMU_IMG, "convert", "-f", "qcow2",
                             "-o", "backing_file=%s" % volume_backing_path(volume),
                             "-O", "qcow2", "-s", name,
                             volume_path(volume), path)
    collect_process_output(process)

    return path
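# A small follow-up sketch (assumption: the qemu-img binary is on PATH; the helper name is
# hypothetical): inspect the QCOW2 checkpoint produced by snapshot_to_checkpoint() with
# `qemu-img info`.
def checkpoint_info(checkpoint_path):
    import subprocess

    result = subprocess.run(["qemu-img", "info", checkpoint_path],
                            check=True, capture_output=True, text=True)
    return result.stdout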
def consumer(ip=IP_BEANSTALK, port=PORT_BEANSTALK, task=COMPETITION_GROUP_NAME, n_jobs=1):
    global WORKSPACE, TRAIN_FILE, TEST_FILE

    df_train = pd.read_csv(TRAIN_FILE)
    df_test = pd.read_csv(TEST_FILE)

    talk = beanstalkc.Connection(host=ip, port=port)
    talk.watch(task)

    hostname = socket.gethostname()

    queue = Queue.Queue()
    for n in range(0, n_jobs):
        thread = SplitThread(kwargs={"df_train": df_train, "df_test": df_test, "queue": queue})
        thread.setDaemon(True)
        thread.start()

    while True:
        job = talk.reserve(timeout=TIMEOUT_BEANSTALK)
        if job:
            o = json.loads(job.body)
            filetype, output_filepaths, column, values = \
                o["filetype"], o["output_filepath"], o["column"], o["value"]

            output_folder = None
            for output_filepath, value in zip(output_filepaths, values):
                output_folder = os.path.dirname(output_filepath)

                create_folder(output_filepath)
                queue.put((output_filepath, filetype, column, value))

            queue.join()

            if hostname != ip:
                p = subprocess.Popen(["scp", "{}/*.csv".format(output_folder),
                                      "RungChiChen@{}:{}".format(IP_BEANSTALK, output_folder)])
                pid, sts = os.waitpid(p.pid, 0)
                log("Transfer {} successfully({})".format(output_filepath, sts), INFO)

                if sts == 0:
                    for f in os.listdir(output_folder):
                        if f.endswith(".csv"):
                            filepath = os.path.join(output_folder, f)

                            os.remove(filepath)
                            log("Remove {}".format(filepath), INFO)

            job.delete()

    queue.join()
    talk.close()
def _pos_split_data(df, x, range_y, window_size_x, window_size_y, output_folder):
    start_x, end_x = x, min(x + window_size_x, 11)
    c2 = (df["x"].values >= start_x) & (df["x"].values < end_x)

    for y in range_y:
        start_y, end_y = y, min(y + window_size_y, 11)
        c3 = (df["y"].values >= start_y) & (df["y"].values < end_y)

        filepath_output = os.path.join(output_folder,
                                       "windown_size={},{}".format(window_size_x, window_size_y),
                                       "{}_{}.csv".format(start_x, start_y))
        if not os.path.exists(filepath_output):
            create_folder(filepath_output)

            final_df = df[c2 & c3]
            if final_df.shape[0] > 0:
                final_df.to_csv(filepath_output, index=False)
                log("Save file in {}".format(filepath_output), INFO)
def download_file(url, name, dest=".", number=1):
    print "  {0}) In: {1}".format(number, url)

    filepath = os.path.join(create_folder(dest), name)
    try:
        urllib.urlretrieve(url, filepath)
    except:
        print "  !!!! FAIL:", url
    print "  Out: {}\n".format(filepath)
def reduce_sort_timings(folder_name, array_lens, file_name_filters=[], reduce_func=sort_rate):
    """Reduces sort timings and outputs them to files."""
    for distribution in os.listdir(folder_name):
        folder_dist_input = "%s%s/" % (folder_name, distribution)
        folder_dist_output = "%s%s/" % (const.FOLDER_SORT_REDUCTION, distribution)
        create_folder(folder_dist_output)

        # Creates output file
        file_name_output = "%s%s%s" % (
            folder_dist_output, '_'.join(file_name_filters), const.FILE_EXTENSION
        )
        file_output = open(file_name_output, "w+")

        # Saves header to output file
        header = "%s%s" % (const.SEPARATOR, lengths_to_log(array_lens))
        print(header, file=file_output)

        for file_name_sort in os.listdir(folder_dist_input):
            if not verify_file_name(file_name_sort, file_name_filters, const.FILE_EXTENSION):
                continue

            # Reduces sort timings
            with open("%s%s" % (folder_dist_input, file_name_sort), "r") as file_sort:
                content = file_sort.read()

            lines = content.split(const.FILE_NEW_LINE_CHAR)[:-1]
            timings = [[float(t) for t in l.split(const.SEPARATOR)] for l in lines]
            timings_reduced = [reduce_func(t, l) for t, l in zip(timings, array_lens)]

            # Generates sort name
            sort_name = str(file_name_sort)
            for file_filter in file_name_filters:
                sort_name = sort_name.replace(file_filter, "")

            # Outputs sort timings
            sort_name = sort_name[:-len(const.FILE_EXTENSION)]
            sort_name = " ".join(s for s in sort_name.split("_") if s)
            timings_output = const.SEPARATOR.join(str(t).replace(".", ",") for t in timings_reduced)
            output = "%s%s%s" % (sort_name, const.SEPARATOR, timings_output)
            print(output, file=file_output)

        file_output.close()
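# An illustrative reduce_func compatible with reduce_sort_timings() above: it receives the
# list of repeated timings measured for one array length plus that length, and must return
# a single number. The default, sort_rate, is defined elsewhere in the project; this
# averaging variant is only an example (the helper name is hypothetical).
def mean_timing(timings, array_len):
    return sum(timings) / len(timings)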
def median_solution(week, output_filepath, filepath, solution):
    log("Store the solution in {}".format(output_filepath), INFO)
    create_folder(output_filepath)

    ts = time.time()
    with open(output_filepath, "wb") as OUTPUT:
        log("Read {}".format(filepath), INFO)

        header = True

        if week < 10:
            OUTPUT.write("Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,MEDIAN_Demanda_uni_equil\n")

            with open(filepath) as INPUT:
                for line in INPUT:
                    if header:
                        header = False
                    else:
                        w, agency_id, channel_id, route_id, client_id, product_id, _, _, _, _, _ = line.strip().split(",")
                        w = int(w)

                        if w == week:
                            prediction_median = get_median(solution[0], solution[1],
                                                           {COLUMN_AGENCY: agency_id,
                                                            COLUMN_PRODUCT: product_id,
                                                            COLUMN_CLIENT: client_id})
                            OUTPUT.write("{}\n".format(",".join([str(w), agency_id, channel_id,
                                                                 route_id, client_id, product_id,
                                                                 str(prediction_median)])))
                        else:
                            pass
        else:
            OUTPUT.write("id,Demanda_uni_equil\n")

            with open(filepath, "rb") as INPUT:
                for line in INPUT:
                    if header:
                        header = False
                    else:
                        row_id, w, agency_id, channel_id, route_id, client_id, product_id = line.strip().split(",")
                        prediction_median = get_median(solution[0], solution[1],
                                                       {COLUMN_AGENCY: agency_id,
                                                        COLUMN_PRODUCT: product_id,
                                                        COLUMN_CLIENT: client_id})
                        OUTPUT.write("{},{}\n".format(row_id, prediction_median))
    te = time.time()

    log("Cost {:4f} seconds to generate the solution".format(te - ts), INFO)
def _complex_split_data(df, time_column, time_id, range_x, range_y, size_x, size_y, output_folder):
    c1 = (df[time_column] == time_id)
    folder = os.path.join(output_folder, "{}={}".format(time_column, time_id))

    for window_size_x, window_size_y in zip(size_x, size_y):
        for x in range_x:
            start_x, end_x = x, min(11, x + window_size_x)
            c2 = (df["x"].values >= start_x) & (df["x"].values < end_x)

            for y in range_y:
                start_y, end_y = y, min(11, y + window_size_y)
                c3 = (df["y"].values >= start_y) & (df["y"].values < end_y)

                filepath_output = os.path.join(folder,
                                               "windown_size={},{}".format(window_size_x, window_size_y),
                                               "{}_{}.csv".format(start_x, start_y))
                if not os.path.exists(filepath_output):
                    create_folder(filepath_output)

                    df[c1 & c2 & c3].to_csv(filepath_output, index=False)
                    log("Save file in {}".format(filepath_output), INFO)
                else:
                    log("Skip {}".format(filepath_output), INFO)
def _place_info(df_all, place_id, output_folder):
    filepath_output = os.path.join(output_folder, "{}.csv".format(place_id))
    if os.path.exists(filepath_output):
        log("Skip {}".format(filepath_output), INFO)
    else:
        df = df_all[df_all["place_id"] == place_id]

        results = {"place_id": [place_id]}
        results["left_top"] = ["{},{}".format(df["x"].min(), df["y"].max())]
        results["right_bottom"] = ["{},{}".format(df["x"].max(), df["y"].min())]
        results["std_x"] = [df["x"].std()]
        results["std_y"] = [df["y"].std()]
        results["count"] = [df.shape[0]]
        results["centroid"] = ["{},{}".format(df["x"].median(), df["y"].median())]

        create_folder(filepath_output)
        pd.DataFrame.from_dict(results, orient="index").T.to_csv(filepath_output, index=False)

        log("Save file in {}".format(filepath_output), INFO)
def consumer(ip=IP_BEANSTALK, port=PORT_BEANSTALK, task=COMPETITION_GROUP_NAME, n_jobs=1):
    # Note: `queue` is assumed to be a module-level Queue shared with SplitThread;
    # it is not created inside this function.
    df_train = pd.read_csv(TRAIN_FILE)
    log("Load {} completely".format(TRAIN_FILE))

    df_test = pd.read_csv(TEST_FILE)
    log("Load {} completely".format(TEST_FILE))

    week = 10
    median_route_solution = (load_median_solution(week - 1, "route_id", ROUTE_GROUPS), ROUTE_GROUPS)
    median_agency_solution = (load_median_solution(week - 1, "agency_id", AGENCY_GROUPS), AGENCY_GROUPS)

    talk = beanstalkc.Connection(host=ip, port=port)
    talk.watch(task)

    for n in range(0, n_jobs):
        thread = SplitThread(kwargs={"df_train": df_train, "df_test": df_test,
                                     "median_route_solution": median_route_solution,
                                     "median_agency_solution": median_agency_solution,
                                     "queue": queue})
        thread.setDaemon(True)
        thread.start()

    while True:
        job = talk.reserve(timeout=TIMEOUT_BEANSTALK)
        if job:
            o = json.loads(job.body)
            filetype, output_filepaths, column, values = \
                o["filetype"], o["output_filepath"], o["column"], o["value"]

            output_folder = None
            for output_filepath, value in zip(output_filepaths, values):
                output_folder = os.path.dirname(output_filepath)

                create_folder(output_filepath)
                queue.put((output_filepath, filetype, column, value))

            queue.join()
            job.delete()

    queue.join()
    talk.close()
def hierarchical_folder_structure(column, filetype):
    prefixs = set()

    folder = os.path.join(SPLIT_PATH, COLUMNS[column], filetype.lower())
    if not os.path.isdir(folder):
        log("{} is not a folder".format(folder), INFO)
        return

    timestamp_start = time.time()
    for filepath in glob.iglob("{}/*.csv".format(folder)):
        filename = os.path.basename(filepath)

        prefix = filename[0:3]
        prefixs.add(prefix)

        new_folder = os.path.join(folder, prefix)
        new_filepath = os.path.join(new_folder, filename)
        create_folder(new_filepath)

        os.rename(filepath, new_filepath)
        log("Move {} to {}".format(filepath, new_filepath), INFO)
    timestamp_end = time.time()
    log("Cost {:4f} seconds to move files to the sub-folders".format(timestamp_end - timestamp_start), INFO)

    hostname = socket.gethostname()
    if hostname != IP_BEANSTALK:
        timestamp_start = time.time()
        for prefix in prefixs:
            filepath = os.path.join(folder, prefix)

            p = subprocess.Popen(["scp", "-r", filepath,
                                  "RungChiChen@{}:\"{}\"".format(IP_BEANSTALK, folder.replace(" ", "\\\\ "))])
            pid, sts = os.waitpid(p.pid, 0)
            log("Transfer {} successfully({})".format(filepath, sts), INFO)
        timestamp_end = time.time()
        log("Cost {:4f} seconds to copy files to the {}".format(timestamp_end - timestamp_start, IP_BEANSTALK), INFO)
def cache_median(filepath, filetype, week=9, output_folder=MEDIAN_SOLUTION_PATH):
    df = pd.read_csv(filepath)

    shape = df.shape
    df = df[df[COLUMN_WEEK] <= week]
    new_shape = df.shape
    log("After filtering, the shape is modified from {} to {}".format(shape, new_shape), INFO)

    drop_columns = [COLUMN_WEEK, 'Venta_uni_hoy', 'Venta_hoy', 'Dev_uni_proxima', 'Dev_proxima']
    df.drop(drop_columns, inplace=True, axis=1)

    target = {COLUMN_PREDICTION: np.median}

    groups = None
    if filetype == MONGODB_COLUMNS[COLUMN_ROUTE]:
        groups = ROUTE_GROUPS
    elif filetype == MONGODB_COLUMNS[COLUMN_AGENCY]:
        groups = AGENCY_GROUPS

    for group in groups:
        median = df.groupby(group).agg(target).to_dict()

        solution = {}
        for key, value in median[COLUMN_PREDICTION].items():
            if isinstance(key, np.int64):
                solution[str(key)] = value
            else:
                solution["_".join([str(s) for s in key])] = value
        log("There are {} records in median_solution".format(len(solution)), INFO)

        output_filepath = os.path.join(output_folder, filetype, "week={}".format(week),
                                       "{}.json".format("_".join([str(s) for s in group])))
        create_folder(output_filepath)

        with open(output_filepath, "wb") as OUTPUT:
            json.dump(solution, OUTPUT)

        log("Write median solution to {}".format(output_filepath), INFO)
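# A hypothetical lookup sketch for the JSON files written by cache_median(): the project's
# own load_median_solution/get_median helpers live elsewhere, so this standalone version
# only illustrates the key scheme used above (single keys stored as str(key), composite
# keys joined with "_").
def lookup_median(json_filepath, group_values):
    import json

    with open(json_filepath) as f:
        solution = json.load(f)

    key = "_".join(str(v) for v in group_values)
    return solution.get(key)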
def __init__(self):
    self.options = Options().args
    self.tester = None

    temp = 0
    for mode in PerformanceTestRunner.modes:
        if mode in sys.argv:
            temp += 1

    if temp == 0:
        utils.print_error(
            'Cannot determine any kind of request for testing')
        utils.print_error(
            'Maybe you are missing the arguments "-a" or "-b" or "-t" or "-l"')
        sys.exit(1)

    if temp > 1:
        utils.force_print_error_to_console(
            '"-a" and "-g" and "-t" and "-l" '
            'cannot exist at the same time\n')
        sys.exit(1)

    self.list_tester = list()

    self.start_time = self.finish_time = 0
    self.lowest = self.fastest = 0
    self.passed_req = self.failed_req = 0

    self.result_path = os.path.join(os.path.dirname(__file__), 'results')
    utils.create_folder(self.result_path)

    log_path = os.path.join(os.path.dirname(__file__), 'logs')
    utils.create_folder(log_path)

    now = time.strftime("%d-%m-%Y_%H-%M-%S")
    self.result_path = os.path.join(self.result_path, 'result_{}.txt'.format(now))
    log_path = os.path.join(log_path, self.create_log_file_name())

    requests_sender.RequestsSender.init_log_file(log_path)

    utils.create_folder(self.options.info_dir)
def __init__(self, req_info_file_path=None, log=False):
    self.log = log
    self.req_info_file_path = req_info_file_path
    self.path = os.path.join(os.path.dirname(__file__), 'temp')
    utils.create_folder(self.path)