parser.add_argument('--source', type=str, default='fgr', help='fgr')
args = parser.parse_args()

if args.dataset is None:
    raise ValueError('Must specify dataset, e.g. redwood or scannet, etc.')
if args.source is None:
    raise ValueError('Must specify input source, e.g. fgr or Super4PCS, etc.')

home = env()
dataset = args.dataset
source = args.source
pathlib.Path('%s/relative_pose/summary/%s/%s' % (home, dataset, source)).mkdir(
    exist_ok=True, parents=True)

reader = Reader()
PATH_SUMMARY = '%s/relative_pose/summary/{}/{}/{}.mat' % home

for sceneid in reader.list_scenes(dataset):
    scanids = reader.get_scanids(dataset, sceneid)
    output_mat = PATH_SUMMARY.format(dataset, source, sceneid)
    # skip scenes whose summary has already been computed
    if os.path.exists(output_mat):
        continue
    n = len(scanids)
    scanid_map = {str(scanid): i for i, scanid in enumerate(scanids)}
    T = np.zeros((n * 4, n * 4))
    sigma = np.zeros((n, n))
    # initialize pairwise angular/translation errors to a large sentinel value
    aerr = np.zeros((n, n)) + 10000000.0
    terr = np.zeros((n, n)) + 10000000.0
    RLlist = reader.list_relative_poses(dataset, source, sceneid)
def preprocess_dataset(dataset_info):
    """
    Preprocess the file information and insert it into the database.
    Supported file types are csv, txt, and xls. The file information
    should be hardcoded in the config file. The function allows adding
    information to the database incrementally.
    @param dataset_info: The preprocess dataset information list. Each item
        in the list is a dictionary containing the dataset name and all the
        insert task names. The insert task names should be defined in the
        config.
    @return: None
    """
    for info in dataset_info:
        dataset_name, insert_tasks = info["dataset"], info["insert_tasks"]
        # get dataset preprocess config and basic information
        config = get_preprocess_config(dataset_name, insert_tasks)
        print("dataset: ", dataset_name)
        dataset = db[dataset_name]
        # delete all the data in the current dataset; uncomment when developing
        # delete_all_date(dataset)
        # get all the patient ids in the current dataset
        all_patient_id = {
            patient_id["patient_id"]
            for patient_id in query_field(dataset, field={
                "_id": 0,
                "patient_id": 1
            })
        }
        # get the raw data for incremental adding
        raw_data = {
            result["patient_id"]: {
                field: result[field]
                for field in result if field != "patient_id"
            }
            for result in query_field(dataset)
        }
        data = defaultdict(dict)
        # for each sub-dataset insert task
        for insert_task in insert_tasks:
            # get sub-dataset basic information
            filenames = config[insert_task]["filename"]
            fields = config[insert_task]["select_column"]
            # ASSUMPTION: every insert task has the field patient_id with the
            # same meaning. D1NAMO breaks this assumption; its patient id is
            # taken ad hoc from the file name.
            patient_idx = sum(
                i for i in range(len(fields)) if fields[i] == "patient_id")
            for filename in filenames:
                # get the real path of the file
                file = os.path.join(config["root_dir"], config["dataset"],
                                    filename)
                print("processing file", file)
                # ASSUMPTION: all files in an insert task share the same file type.
                # get the file reader and line count
                if config[insert_task]["file_type"] == "xls":
                    cnt = line_count_xls(file)
                    readable = Reader(
                        xlrd.open_workbook(file).sheets()[0],
                        config[insert_task]["file_type"])
                # file type is txt or csv
                else:
                    cnt, readable = line_count(file), Reader(
                        open(file), config[insert_task]["file_type"])
                # use tqdm to show progress
                with tqdm(total=cnt) as bar:
                    for line_cnt in range(cnt):
                        # get file content
                        line = readable.readline()
                        # skip the header line
                        if line_cnt != 0:
                            # get patient_id
                            if dataset_name == "D1NAMO":
                                patient_id = int(file.split("/")[-2])
                            else:
                                patient_id = str(int(float(line[patient_idx])))
                            # if the patient id is not in the dataset, add this
                            # patient to the database
                            if patient_id not in all_patient_id:
                                insert_one_data(dataset,
                                                {"patient_id": patient_id})
                                all_patient_id.add(patient_id)
                            # get the line timestamp;
                            # if there is no timestamp, it will be 0
                            timestamp = 0
                            if "datetime" in fields:
                                timestamp += sum(
                                    datetime_to_int(
                                        line[i],
                                        config[insert_task]["basedate"],
                                        config[insert_task]["pattern"])
                                    for i in range(len(fields))
                                    if fields[i] == "datetime")
                            else:
                                if "date" in fields:
                                    timestamp += sum(
                                        date_to_int(
                                            line[i],
                                            config[insert_task]["basedate"],
                                            config[insert_task]["pattern"])
                                        for i in range(len(fields))
                                        if fields[i] == "date")
                                if "timestamp" in fields:
                                    timestamp += sum(
                                        timestamp_to_int(
                                            line[i],
                                            config[insert_task]["pattern"])
                                        for i in range(len(fields))
                                        if fields[i] == "timestamp")
                            row_combine_field = dict()
                            for idx in range(len(line)):
                                # guard against rows with more columns than configured
                                if idx >= len(fields):
                                    continue
                                content, field = line[idx], fields[idx]
                                # skip fields that should not be appended and empty content
                                if field == '' or len(content) == 0:
                                    continue
                                # patient_id and timestamp-related fields were handled above
                                if field in {
                                        "patient_id", "datetime", "date",
                                        "timestamp"
                                }:
                                    continue
                                # status fields are not stored in list style
                                if field in status_field_set:
                                    # ad hoc for field trouble_sleep_inverse
                                    if field == "trouble_sleep_inverse":
                                        data[patient_id]["trouble_sleep"] = \
                                            str(5 - int(content))
                                    # ad hoc for field low_gl
                                    elif field == "low_gl":
                                        data[patient_id]["low_gl"] = \
                                            content.split(" ")[0]
                                    else:
                                        data[patient_id][field] = content
                                # ad hoc for field weight_units (weight should
                                # already be in data)
                                elif field == "weight_units":
                                    if content == "lbs":
                                        data[patient_id]["weight"] = str(
                                            LBS_TO_KG *
                                            float(data[patient_id]["weight"]))
                                # fields that need to be stored with a timestamp
                                elif field in timestamp_field_set:
                                    # ad hoc for field raw_gl
                                    if field == "raw_gl":
                                        content = str(float(content) * 18)
                                        field = "gl"
                                    # if the field is not in the patient's data,
                                    # initialize it from the raw data in the database
                                    if field not in data[patient_id]:
                                        data[patient_id][field] = \
                                            list() if patient_id not in raw_data or field not in raw_data[patient_id] \
                                            else raw_data[patient_id][field]
                                    # append the content with its timestamp
                                    data[patient_id][field].append(
                                        [content, timestamp])
                                # fields that need to be combined into another field
                                elif field in combine_field_set:
                                    combine_field = combine_field_set[field]
                                    if combine_field not in row_combine_field:
                                        row_combine_field[combine_field] = 0
                                    row_combine_field[combine_field] += float(
                                        content)
                                # common fields are stored in list style
                                else:
                                    # if the field is not in the patient's data,
                                    # initialize it from the raw data in the database
                                    if field not in data[patient_id]:
                                        data[patient_id][field] = \
                                            list() if patient_id not in raw_data or field not in raw_data[patient_id] \
                                            else raw_data[patient_id][field]
                                    data[patient_id][field].append(content)
                            # ASSUMPTION: combined fields are common fields
                            # (not status fields and not stored with a timestamp)
                            for field in row_combine_field:
                                if field not in data[patient_id]:
                                    data[patient_id][field] = list()
                                data[patient_id][field].append(
                                    str(row_combine_field[field]))
                        # update the progress bar
                        bar.update()
        # insert the preprocessed data into the database
        print("start to insert data to:", dataset_name)
        # time.clock() was removed in Python 3.8; use perf_counter instead
        start = time.perf_counter()
        for patient_id in data:
            for field in data[patient_id]:
                # update the field in the database
                update_data(dataset, {"patient_id": patient_id},
                            {'$set': {
                                field: data[patient_id][field]
                            }})
        print("use time to insert:", time.perf_counter() - start)
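# A minimal usage sketch for preprocess_dataset, following the shape described
# in its docstring: each item carries a dataset name and its insert task names.
# The insert task names below ("glucose", "survey") are hypothetical
# placeholders; the real values must match the entries defined in the
# preprocess config file.
dataset_info = [
    {
        "dataset": "D1NAMO",              # dataset name, keyed into db[...]
        "insert_tasks": ["glucose"],      # hypothetical insert task name
    },
    {
        "dataset": "another_dataset",     # hypothetical second dataset
        "insert_tasks": ["survey"],
    },
]
preprocess_dataset(dataset_info)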
def __init__(self, options):
    self.reader = Reader(options.data_dir, options.data_augment)
    self.options = options