def release_dbcfg(cfg, from_path, to_path):
    tic = time.time()
    log.info("\nrelease_dbcfg:-----------------------------")
    timestamp = ("{:%d%m%y_%H%M%S}").format(datetime.datetime.now())

    base_from_path = common.getBasePath(from_path)
    log.info("base_from_path: {}".format(base_from_path))

    base_to_path = common.getBasePath(to_path)
    log.info("base_to_path: {}".format(base_to_path))

    dbcfg = create_dbcfg(cfg, base_from_path)
    dbcfg['TIMESTAMP'] = timestamp

    filepath = save_dbcfg(base_to_path, dbcfg)

    log.info('\nDone (t={:0.2f}s)\n'.format(time.time() - tic))

    return filepath
def remove_ann(json_file):
    log.info("json_file : {}".format(json_file))

    new_json = []
    ## keep a sample only if its lane has more than this many valid (non -2) points
    dist = 30
    timestamp = ("{:%d%m%y_%H%M%S}").format(datetime.datetime.now())

    with open(json_file, 'r') as file:
        json_lines = file.readlines()

    for line_index, val in tqdm.tqdm(enumerate(json_lines), total=len(json_lines)):
        json_line = json_lines[line_index]
        sample = json.loads(json_line)
        lanes = sample['lanes']

        for lane in lanes:
            ## count the valid (non -2) points in the lane
            count = 0
            for lane_id in lane:
                if lane_id == -2:
                    continue
                else:
                    count = count + 1
            if count > dist:
                new_json.append(sample)
                break
            ## NOTE: the unconditional break means only the first lane is inspected;
            ## this preserves the original behaviour
            break

    json_basepath = common.getBasePath(json_file)
    json_name = json_file.split('/')[-1]
    new_json_name = json_name.split('.')[0]

    with open(json_basepath + '/' + new_json_name + '-filtered-' + timestamp + '.json', 'w') as outfile:
        for items in new_json:
            # log.info("items : {}".format(items))
            json.dump(items, outfile)
            outfile.write('\n')
def get_metadata(from_path, task, subset, year):
    metadata = {
        "annotation_file": None,
        "annotation_filepath": None,
        "image_dir": None,
        "task": None,
        "year": None,
        "base_from_path": None
    }

    base_from_path = common.getBasePath(from_path)
    log.info("base_from_path: {}".format(base_from_path))

    ## TODO: fix the subset issue
    if task == "panoptic":
        annotation_file = "{}_{}{}.json".format(task + "_instances", subset, year)
        subset = task + "_" + subset
    else:
        annotation_file = "{}_{}{}.json".format(task, subset, year)

    log.info("annotation_file: {}".format(annotation_file))

    annotation_filepath = os.path.join(base_from_path, annotation_file)
    log.info("annotation_filepath: {}".format(annotation_filepath))

    if not os.path.exists(annotation_filepath):
        raise Exception("File: {} does not exist!".format(annotation_filepath))

    if subset == "minival" or subset == "valminusminival":
        subset = "val"

    image_dir = "{}/{}{}".format(base_from_path, subset, year)
    log.info("image_dir: {}".format(image_dir))

    metadata['task'] = task
    metadata['year'] = year
    metadata['base_from_path'] = base_from_path
    metadata['annotation_file'] = annotation_file
    metadata['annotation_filepath'] = annotation_filepath
    metadata['image_dir'] = image_dir

    return metadata
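## Usage sketch for get_metadata (illustrative only). A COCO-style layout is
## assumed, where the annotation JSON sits directly under the dataset root and
## the images live in a "<subset><year>" folder; the paths below are hypothetical
## and the annotation file must already exist, otherwise the call raises.
def _example_get_metadata():
    meta = get_metadata(
        from_path="/data/coco",   ## hypothetical dataset root
        task="instances",
        subset="train",
        year="2017")
    ## With the assumed layout, the returned dict would contain:
    ##   meta['annotation_filepath'] -> /data/coco/instances_train2017.json
    ##   meta['image_dir']           -> /data/coco/train2017
    return meta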
def via_to_tusimple(json_file):
    with open(json_file, 'r') as file:
        new_json = []
        timestamp = ("{:%d%m%y_%H%M%S}").format(datetime.datetime.now())
        via = json.load(file)

        for line_index, val in enumerate(via.values()):
            tusimple = {"lanes": []}
            lanes = []
            rawfile = val['filename']
            r = val["regions"]
            resize = 1.5
            for j in r:
                lane = []
                x_axis = j["shape_attributes"]["all_points_x"]
                # print("x_axis : {}".format(x_axis))
                resized_x_axis = [int(x / resize) for x in x_axis]
                # print("resized_x_axis : {}".format(resized_x_axis))
                y_axis = j["shape_attributes"]["all_points_y"]
                # print("y_axis : {}".format(y_axis))
                resized_y_axis = [int(y / resize) for y in y_axis]
                # print("resized_y_axis : {}".format(resized_y_axis))
                lane.append(resized_x_axis)
                lane.append(resized_y_axis)
                lanes.append(lane)
            tusimple["lanes"] = lanes
            tusimple["raw_file"] = rawfile
            new_json.append(tusimple)

    json_basepath = common.getBasePath(json_file)
    json_name = json_file.split('/')[-1]
    new_json_name = json_name.split('.')[0]

    with open(json_basepath + '/' + new_json_name + '-' + timestamp + '.json', 'w') as outfile:
        for items in new_json:
            json.dump(items, outfile)
            outfile.write('\n')

    print("Done!!")
    print("Saved in path -> {}".format(json_basepath + '/' + new_json_name + '-' + timestamp + '.json'))
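## Shape of the conversion above, as a minimal hand-made sketch. The VIA fields
## (filename, regions, shape_attributes, all_points_x/y) follow the VIA export
## read by the function; the image name and values are invented. Each VIA region's
## polyline becomes one "lane" entry of the form [x_points, y_points], with every
## coordinate divided by the resize factor (1.5) and truncated to int.
_EXAMPLE_VIA_ENTRY = {
    "img001.jpg-12345": {
        "filename": "img001.jpg",
        "regions": [
            {"shape_attributes": {"all_points_x": [150, 300, 450],
                                  "all_points_y": [720, 540, 360]}}
        ]
    }
}
## ...which the loop above would write out as one JSON line:
## {"lanes": [[[100, 200, 300], [480, 360, 240]]], "raw_file": "img001.jpg"}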
def copy_images(json_file):
    assert os.path.exists(json_file), '{:s} does not exist'.format(json_file)
    print("json_file : {}".format(json_file))

    base_path = getBasePath(json_file)
    # base_path = os.path.join(os.path.dirname(json_file),'')
    print("base_path : {}".format(base_path))

    path = os.path.join(base_path, 'test_images')
    print("path : {}".format(path))
    mkdir_p(path)

    with open(json_file, 'r') as file:
        json_lines = file.readlines()

    images = []
    for line_index, val in enumerate(json_lines):
        # print(line_index)
        json_line = json_lines[line_index]
        sample = json.loads(json_line)
        lanes = sample['lanes']
        image = sample['raw_file']
        images.append(image)

    print(len(images))
    for im in images:
        # print(im)
        shutil.copy(im, path)
def via_to_tusimple(json_file):
    with open(json_file, 'r') as file:
        new_json = []
        timestamp = ("{:%d%m%y_%H%M%S}").format(datetime.datetime.now())
        json_lines = file.readlines()
        line_index = 0

        # Iterate over each image
        while line_index < len(json_lines):
            json_line = json_lines[line_index]
            tusimple = {"lanes": []}
            lanes = []
            sample = json.loads(json_line)
            x_axis = sample['x_axis']
            y_axis = sample['y_axis']
            rawfile = sample['image_name']
            for i in range(len(x_axis)):
                lane = []
                lane.append(x_axis[i])
                lane.append(y_axis[i])
                lanes.append(lane)
            tusimple["lanes"] = lanes
            tusimple["raw_file"] = rawfile
            new_json.append(tusimple)
            line_index += 1

    json_basepath = common.getBasePath(json_file)
    json_name = json_file.split('/')[-1]
    new_json_name = json_name.split('.')[0]

    with open(json_basepath + '/' + new_json_name + '-' + timestamp + '.json', 'w') as outfile:
        for items in new_json:
            json.dump(items, outfile)
            outfile.write('\n')

    print("Done!!")
    print("Saved in path -> {}".format(json_basepath + '/' + new_json_name + '-' + timestamp + '.json'))
def split_train_test(json_file):
    assert ops.exists(json_file), '{:s} does not exist'.format(json_file)

    base_path = common.getBasePath(json_file)
    print("base_path : {}".format(base_path))

    train_dir = ops.join(base_path, "train")
    test_dir = ops.join(base_path, "test")
    print("train_dir : {}".format(train_dir))
    print("test_dir : {}".format(test_dir))
    # os.makedirs(train_dir, exist_ok=True)
    # os.makedirs(test_dir, exist_ok=True)

    res_lanes = {
        '0_lanes': 0,
        '1_lanes': 0,
        '2_lanes': 0,
        '3_lanes': 0,
        '4_lanes': 0,
        '5_lanes': 0,
        '6_lanes': 0
    }
    no_of_lanes = {
        "zero_lane": [],
        "one_lane": [],
        "two_lane": [],
        "three_lane": [],
        "four_lane": [],
        "five_lane": [],
        "six_lane": []
    }
    train = []
    test = []

    with open(json_file, 'r') as file:
        json_lines = file.readlines()

    ## Iterate over each image and bucket it by the number of non-empty lanes
    for line_index, val in enumerate(json_lines):
        json_line = json_lines[line_index]
        sample = json.loads(json_line)
        # image_name = sample['raw_file']
        lanes = sample['lanes']

        res_lane = []
        for lane in lanes:
            lane_id_found = False
            for lane_id in lane:
                if lane_id == -2:
                    continue
                else:
                    lane_id_found = True
                    break
            if lane_id_found:
                res_lane.append(lane)

        if len(res_lane) == 0:
            no_of_lanes["zero_lane"].append(json_line)
            res_lanes['0_lanes'] = len(no_of_lanes["zero_lane"])
        elif len(res_lane) == 1:
            no_of_lanes["one_lane"].append(json_line)
            res_lanes['1_lanes'] = len(no_of_lanes["one_lane"])
        elif len(res_lane) == 2:
            no_of_lanes["two_lane"].append(json_line)
            res_lanes['2_lanes'] = len(no_of_lanes["two_lane"])
        elif len(res_lane) == 3:
            no_of_lanes["three_lane"].append(json_line)
            res_lanes['3_lanes'] = len(no_of_lanes["three_lane"])
        elif len(res_lane) == 4:
            no_of_lanes["four_lane"].append(json_line)
            res_lanes['4_lanes'] = len(no_of_lanes["four_lane"])
        elif len(res_lane) == 5:
            no_of_lanes["five_lane"].append(json_line)
            res_lanes['5_lanes'] = len(no_of_lanes["five_lane"])
        elif len(res_lane) == 6:
            no_of_lanes["six_lane"].append(json_line)
            res_lanes['6_lanes'] = len(no_of_lanes["six_lane"])

    print(res_lanes)

    ## Split each lane-count bucket 85/15 into train/test
    for i, k in enumerate(no_of_lanes.keys()):
        key = no_of_lanes[k]
        random.shuffle(key)
        tmp_train = []
        tmp_test = []
        split_size = int(0.85 * len(key))
        # print("split_size :{}".format(split_size))
        if split_size > 0:
            tmp_train = key[:split_size]
            tmp_test = key[split_size:]
        train.append(tmp_train)
        test.append(tmp_test)

    train = [item for sublist in train for item in sublist]
    print('train : {}'.format(len(train)))
    test = [item for sublist in test for item in sublist]
    print('test : {}'.format(len(test)))

    with open(train_dir + "-tusimple" + ".json", 'w') as outfile:
        for item1 in train:
            outfile.write(item1)

    with open(test_dir + "-tusimple" + ".json", 'w') as outfile:
        for item2 in test:
            outfile.write(item2)
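## Usage sketch (hypothetical path): split_train_test expects a TuSimple-style
## JSON-lines label file and writes "train-tusimple.json" / "test-tusimple.json"
## next to it, splitting each lane-count bucket 85/15 after shuffling.
def _example_split_train_test():
    split_train_test('/data/tusimple/label_data.json')
    ## afterwards, next to the input file:
    ##   /data/tusimple/train-tusimple.json
    ##   /data/tusimple/test-tusimple.json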
def merge_ann(cfg, from_path, to_path, move_file=False):
    """Copy annotation data from multiple folders to a single destination folder
    """
    tic = time.time()
    log.info("\nrelease_anndb:-----------------------------")
    timestamp = ("{:%d%m%y_%H%M%S}").format(datetime.datetime.now())

    base_from_path = common.getBasePath(from_path)
    log.info("base_from_path: {}".format(base_from_path))

    base_to_path = common.getBasePath(to_path)
    log.info("base_to_path: {}".format(base_to_path))

    ## Get only top level directories
    ## Ref:
    ## https://stackoverflow.com/questions/141291/how-to-list-only-top-level-directories-in-python
    ## https://stackoverflow.com/questions/4568580/python-glob-multiple-filetypes
    aijobs = next(os.walk(base_from_path))[1]
    aijobs_path = [os.path.join(base_from_path, x) for x in aijobs]

    exts = appcfg['ALLOWED_IMAGE_TYPE']
    files_to_copy = {x: {
        'annotations': glob.glob(os.path.join(base_from_path, x, 'annotations', '*.json'))
        # ,'images': [item for sublist in [glob.glob(os.path.join(base_from_path, x, 'images') + '/*/*'+ext) for ext in exts for x in aijobs] for item in sublist]
    } for x in aijobs}

    images_annotated = {
        'files': []
        ,'unique': set()
        ,'not_found': set()
    }

    stats = {}
    groups = []

    IMAGE_API = cfg['IMAGE_API']
    USE_IMAGE_API = IMAGE_API['ENABLE']
    SAVE_LOCAL_COPY = True
    # NO_OF_ANNON_FILES_THRESHOLD = 5

    for i, x in enumerate(files_to_copy):
        log.info("\n[{}]x:-----------------------------{}".format(i, x))
        for y in files_to_copy[x]:
            log.info("y:-------{}".format(y))
            filepaths = files_to_copy[x][y]

            if y not in stats:
                stats[y] = {'count': 0, 'unique': set(), 'total': 0}
                groups.append(y)

            stats[y]['total'] += len(filepaths)

            for j, src_filepath in enumerate(filepaths):
                index = -1 if y == 'annotations' else -2
                filename = os.path.basename(src_filepath)

                ## if annotations, read the file and fetch the images referenced in it
                if y == 'annotations':
                    with open(src_filepath, 'r') as fr:
                        ref = annonutils.parse_annon_filename(src_filepath)
                        annotations = json.load(fr)
                        annon_file_name = {}
                        for ak, av in annotations.items():
                            # imgpath, base_path_img = annonutils.getImgPath(base_from_path, ref['image_dir'])
                            base_path_img = os.path.join(base_to_path, 'images', ref['image_dir'])
                            filepath_img = os.path.join(base_path_img, av['filename'])

                            if av['filename'] not in annon_file_name:
                                annon_file_name[av['filename']] = {
                                    'annotations': []
                                    ,'imagename': av['filename']
                                    ,'metadata': {}
                                    ,'image_dir': ref['image_dir']
                                }

                            annon_file_name[av['filename']]['annotations'].append(av['regions'])

                            if USE_IMAGE_API:
                                get_img_from_url_success = annonutils.get_image_from_url(IMAGE_API, av['filename'], base_path_img, save_local_copy=SAVE_LOCAL_COPY)
                                if get_img_from_url_success:
                                    images_annotated['files'].append(av['filename'])
                                    images_annotated['unique'].add(av['filename'])
                                else:
                                    images_annotated['not_found'].add(av['filename'])

                basedir = os.path.sep.join(os.path.dirname(src_filepath).split(os.path.sep)[index:])
                dst_to_basedir = os.path.join(base_to_path, basedir)

                stats[y]['unique'].add(filename)
                stats[y]['count'] += 1

                log.info("stats[y]['count']:{}, [x:j]:[{}:{}]: Exists: {} \n src_filepath: {}".format(stats[y]['count'], x, j, os.path.exists(src_filepath), src_filepath))
                log.info("basedir: {}".format(basedir))
                log.info("dst_to_basedir: {}".format(dst_to_basedir))

                ## Ref: https://www.pythoncentral.io/how-to-copy-a-file-in-python-with-shutil/
                common.mkdir_p(dst_to_basedir)
                shutil.copy2(src_filepath, dst_to_basedir)

    for g in groups:
        stats[g]['unique'] = len(stats[g]['unique'])

    stats['images_annotated'] = {
        'files': len(images_annotated['files'])
        ,'unique': len(images_annotated['unique'])
        ,'not_found': len(images_annotated['not_found'])
    }

    log.info("\nstats: {}".format(stats))
    log.info('\nDone (t={:0.2f}s)\n'.format(time.time() - tic))
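## Hypothetical invocation of merge_ann. The cfg keys shown are inferred from the
## code above (an IMAGE_API block with an ENABLE flag), not a documented schema,
## and the paths are made up. With ENABLE set to False only the annotation JSONs
## found under <from_path>/<job>/annotations/ are copied into <to_path>; with it
## enabled, the referenced images are also fetched via annonutils.get_image_from_url.
def _example_merge_ann():
    cfg = {'IMAGE_API': {'ENABLE': False}}
    merge_ann(cfg, '/data/annon_jobs', '/data/annon_merged')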
def release_db(cfg, args):
    """Entry point to parse VIA based annotations for creating and saving basic data structures - IMAGES, ANNOTATIONS, LABELS and related data

    Implements the DRC - Design Rule Checks and acts as a gatekeeper, also reports any possible errors

    Creates data structures to be parsed in the 2nd pass to create the AIDS - AI Datasets with the actual splits

    Test Cases:
    ## /some/path/AIML_Annotation/ods_job_230119/annotations/images-p1-230119_AT1_via205_250119.json
    ## /some/path/AIML_Annotation/ods_job_230119/annotations/
    ## /some/path/AIML_Annotation/ods_job_230119/annotations/
    """
    ## Check required args
    for d in ['from_path']:
        if d not in args:
            log.info("'{}' is not present.\n".format(d))
            sys.exit(-1)

    if not os.path.exists(args.from_path):
        raise NotADirectoryError("{}".format(args.from_path))

    dbname = None
    if 'to_path' in args and not os.path.exists(args.to_path):
        dbname = args.to_path

    from_path = args.from_path

    tic = time.time()
    log.info("\nrelease_db:-----------------------------")

    base_from_path = common.getBasePath(from_path)
    log.info("base_from_path: {}".format(base_from_path))

    uuid_rel = common.createUUID('rel')

    timestamp = cfg['RELEASE']['COLS']['timestamp'] = cfg['LOG']['COLS']['timestamp'] = cfg['TIMESTAMP']
    cfg['RELEASE']['COLS']['rel_id'] = cfg['LOG']['COLS']['rel_id'] = uuid_rel
    cfg['SAVE_TO_FILE'] = False

    log.info("-------")
    log.info("cfg: {}".format(cfg))

    if os.path.isdir(from_path):
        ## normalizes and takes care of path ending with slash or not as the user input
        files = glob.glob(os.path.join(base_from_path, cfg['ANNON_FILENAME_PREFIX']+'*.json'))
    else:
        files = [from_path]

    total_files = len(files)

    log.info("-------")
    log.debug("\nfiles: {}".format(files))
    log.info("-------")
    log.info("\nTotal files to process =======>: {}".format(total_files))

    total_annon_file_processed = 0
    total_annon_file_existed = 0

    DBCFG = cfg['DBCFG']
    ANNONCFG = DBCFG['ANNONCFG']
    mclient = MongoClient('mongodb://'+ANNONCFG['host']+':'+str(ANNONCFG['port']))
    dbname = ANNONCFG['dbname'] if not dbname else dbname
    log.info("dbname: {}".format(dbname))
    db = mclient[dbname]

    rel_tblname = annonutils.get_tblname('RELEASE')
    annonutils.create_unique_index(db, rel_tblname, 'rel_id')
    rel_collection = db.get_collection(rel_tblname)

    log_tblname = annonutils.get_tblname('LOG')
    annonutils.create_unique_index(db, log_tblname, 'created_on')
    log_collection = db.get_collection(log_tblname)

    for annon_filepath in files:
        log.info("-------")
        tic2 = time.time()

        annon_filename = os.path.basename(annon_filepath)

        ## check if the file is already parsed: in normal mode skip processing an already parsed file
        # res = log_collection.find_one({'rel_filename': annon_filename})
        res = log_collection.find_one({'rel_filepath': annon_filepath})

        ## TODO: in update mode
        ## delete the entries of annotations and images before inserting the values of the same file again
        if not res:
            log.info(" annon_filename: {} \n annon_filepath: {}".format(annon_filename, annon_filepath))

            created_on = common.now()
            cfg['RELEASE']['COLS']['created_on'] = cfg['LOG']['COLS']['created_on'] = created_on
            log.info("created_on: {}".format(created_on))

            cfg['LOG']['COLS']['rel_filename'] = annon_filename
            cfg['LOG']['COLS']['rel_filepath'] = annon_filepath

            annondata = annon_parser.parse_annon_file(cfg, annon_filepath, base_from_path)
            total_annon_file_processed += 1

            save_parsed_data(cfg, annondata, db=db)

            cfg['LOG']['COLS']['modified_on'] = None

            toc2 = time.time()
            cfg['LOG']['COLS']['total_exec_time'] = '{:0.2f}s'.format(toc2 - tic)

            ## if an exception occurs or the run terminates, save what has been processed so far in the log,
            ## instead of a one-shot update of the log outside the for loop;
            ## this helps to recover from an abrupt termination and restart from the previously successfully processed file
            log_collection.update_one(
                {'created_on': created_on}
                ,{'$setOnInsert': cfg['LOG']['COLS']}
                ,upsert=True
            )

            log.info("=======> Total Execution Time: {:0.2f}s, Processed files: {}, Remaining files: {}".format(toc2 - tic2, total_annon_file_processed, total_files - total_annon_file_processed))

            ## Update the LOG table here itself
        else:
            log.info("Already Exists in Database: annon_filename: {} \n annon_filepath: {}".format(annon_filename, annon_filepath))
            log.info("Use update / delete command to process this file again")
            total_annon_file_existed += 1

    cfg['RELEASE']['COLS']['total_annon_file_processed'] = total_annon_file_processed
    # cfg['RELEASE']['COLS']['total_exec_time'] = '{:0.2f}s'.format(time.time() - tic)
    cfg['RELEASE']['COLS']['total_exec_time_in_sec'] = '{:0.2f}'.format(time.time() - tic)

    if total_annon_file_processed:
        rel_collection.update_one(
            {'rel_id': uuid_rel}
            ,{'$setOnInsert': cfg['RELEASE']['COLS']}
            ,upsert=True
        )

    log.info("total_files = total_annon_file_processed + total_annon_file_existed: {} = {} + {}".format(total_files, total_annon_file_processed, total_annon_file_existed))

    mclient.close()

    return timestamp
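## Hypothetical driver for release_db (argparse-style). The cfg structure referenced
## above (TIMESTAMP, RELEASE/LOG column dicts, ANNON_FILENAME_PREFIX, DBCFG.ANNONCFG
## with host/port/dbname) is assumed to come from the project's config loader; the
## path is taken from the docstring's test cases. Note that the code above repurposes
## an optional to_path attribute as a MongoDB database-name override whenever it is
## not an existing filesystem path.
def _example_release_db(cfg):
    import argparse
    args = argparse.Namespace(
        from_path='/some/path/AIML_Annotation/ods_job_230119/annotations/')
    return release_db(cfg, args)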
def release_files(cfg, args):
    """Entry point to parse VIA based annotations for creating and saving basic data structures - IMAGES, ANNOTATIONS, LABELS and related data

    Implements the DRC - Design Rule Checks and acts as a gatekeeper, also reports any possible errors

    Creates data structures to be parsed in the 2nd pass to create the AIDS - AI Datasets with the actual splits

    Test Cases:
    ## /some/path/AIML_Annotation/ods_job_230119/annotations/images-p1-230119_AT1_via205_250119.json
    ## /some/path/AIML_Annotation/ods_job_230119/annotations/
    ## /some/path/AIML_Annotation/ods_job_230119/annotations/
    """
    ## Check required args
    for d in ['from_path', 'to_path']:
        if d not in args:
            log.info("'{}' is not present.\n".format(d))
            sys.exit(-1)

    if not os.path.exists(args.from_path):
        raise NotADirectoryError("{}".format(args.from_path))
    if not os.path.exists(args.to_path):
        raise NotADirectoryError("{}".format(args.to_path))

    from_path, to_path = args.from_path, args.to_path

    tic = time.time()
    log.info("\nrelease_files:-----------------------------")

    cfg['TIMESTAMP'] = ("{:%d%m%y_%H%M%S}").format(datetime.datetime.now())

    base_from_path = common.getBasePath(from_path)
    log.info("base_from_path: {}".format(base_from_path))

    base_to_path = common.getBasePath(to_path)
    log.info("base_to_path: {}".format(base_to_path))

    cfg['LOG']['COLS']['timestamp'] = cfg['TIMESTAMP']

    ## Create Base Directories
    db_dir = os.path.join(base_to_path, cfg['BASEDIR_NAME']['DB'])
    log.info("db_dir: {}".format(db_dir))
    common.mkdir_p(db_dir)

    db_data_dir = os.path.join(db_dir, cfg['TIMESTAMP'])
    log.info("ANNDB db_data_dir: {}".format(db_data_dir))
    common.mkdir_p(db_data_dir)

    rel_dir = os.path.join(base_to_path, cfg['BASEDIR_NAME']['RELEASE'], cfg['TIMESTAMP'])
    log.info("rel_dir: {}".format(rel_dir))
    common.mkdir_p(rel_dir)

    log_dir = os.path.join(base_to_path, cfg['BASEDIR_NAME']['LOG'])
    log.info("log_dir: {}".format(log_dir))
    common.mkdir_p(log_dir)

    ant_data_dir = os.path.join(db_data_dir, cfg["BASEDIR_NAME"]["ANNON"])
    log.info("ant_data_dir: {}".format(ant_data_dir))
    common.mkdir_p(ant_data_dir)

    cfg['BASE_PATH']['DB_DIR'] = db_dir
    cfg['BASE_PATH']['DB_DATA_DIR'] = db_data_dir
    cfg['BASE_PATH']['RELEASE_DIR'] = rel_dir
    cfg['BASE_PATH']['LOG_DIR'] = log_dir
    cfg['BASE_PATH']['ANT_DATA_DIR'] = ant_data_dir

    log.info("-------")
    log.info("cfg: {}".format(cfg))

    if os.path.isdir(from_path):
        ## normalizes and takes care of path ending with slash or not as the user input
        files = glob.glob(os.path.join(base_from_path, cfg['ANNON_FILENAME_PREFIX']+'*.json'))
    else:
        files = [from_path]

    log.info("-------")
    log.info("\nfiles: {}".format(files))
    log.info("-------")
    log.info("\nTotal files to process =======>: {}".format(len(files)))

    total_annon_file_processed = 0
    log_tblname = annonutils.get_tblname('LOG')

    for annon_filepath in files:
        log.info("-------")
        tic2 = time.time()

        annon_filename = os.path.basename(annon_filepath)

        ## TODO: check if the file is already parsed: in normal mode skip processing an already parsed file
        res = False

        ## TODO: in update mode
        ## delete the entries of annotations and images before inserting the values of the same file again
        if not res:
            log.info(" annon_filename: {} \n annon_filepath: {}".format(annon_filename, annon_filepath))

            created_on = cfg['LOG']['COLS']['created_on'] = common.now()
            log.info("created_on: {}".format(created_on))

            cfg['LOG']['COLS']['rel_filename'] = annon_filename
            cfg['LOG']['COLS']['rel_filepath'] = annon_filepath

            annondata = annon_parser.parse_annon_file(cfg, annon_filepath, base_from_path)
            total_annon_file_processed += 1

            ## if the annon_filepath is an absolute path, the base path gets ignored and thus the dst_dir is the file's directory
            ## dst_dir = os.path.join(base_from_path, os.path.splitext(annon_filepath)[0])
            ## log.info("annon_filepath: {}".format(annon_filepath))
            ## dst_dir = os.path.join(db_dir, os.path.splitext(annon_filepath)[0])
            dst_dir = os.path.join(rel_dir, os.path.splitext(annon_filename)[0])
            ## log.info("dst_dir: {}".format(dst_dir))
            common.mkdir_p(dst_dir)

            save_parsed_data(cfg, annondata, dst_dir=dst_dir, ant_data_dir=ant_data_dir, annon_filepath=annon_filepath)

            cfg['LOG']['COLS']['modified_on'] = None

            toc2 = time.time()
            total_exec_time = '{:0.2f}s'.format(toc2 - tic)
            cfg['LOG']['COLS']['total_exec_time'] = total_exec_time

            ## TODO:
            ## if an exception occurs or the run terminates, save what has been processed so far in the log,
            ## instead of a one-shot update of the log outside the for loop;
            ## this helps to recover from an abrupt termination and restart from the previously successfully processed file

            log.info("=======> Total Execution Time: {:0.2f}s, Processed files: {}, Remaining files: {}".format(toc2 - tic2, total_annon_file_processed, len(files) - total_annon_file_processed))

            ## Update the LOG table here itself
        else:
            log.info("Already Exists in Database: annon_filename: {} \n annon_filepath: {}".format(annon_filename, annon_filepath))
            log.info("Use update / delete command to process this file again")

    ## Every execution of the script is a release
    ## For every release, recreate the entire database, either for a directory or for a specific file release

    ## create and save db data i.e. consolidated data with index structure
    db_data = create_db_data(cfg, rel_dir)
    save_db_data(cfg, db_dir, db_data)

    log.info("total_annon_file_processed: {}".format(total_annon_file_processed))

    return db_data_dir
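## Hypothetical driver for release_files. Both paths must already exist on disk;
## the function creates the db/release/log sub-directories (named via
## cfg['BASEDIR_NAME']) under to_path and returns the timestamped db data directory.
## Paths are made up and cfg is assumed to come from the project's config loader.
def _example_release_files(cfg):
    import argparse
    args = argparse.Namespace(
        from_path='/some/path/AIML_Annotation/ods_job_230119/annotations/',
        to_path='/some/path/release/')
    db_data_dir = release_files(cfg, args)
    return db_data_dir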