import logging
import os
from collections import OrderedDict

# `Source` is the project-local data-source class; its reader methods are
# called below and are assumed to be imported alongside this module.


def read_data_sources(self, folder_paths):
    """Read each dataset folder's data/ and model/ subfolders into Source objects."""
    logging.info("Reading data sources...")
    for folder_name in folder_paths:
        folder_path = os.path.join(self.data_folder, folder_name)
        logging.info("--> folder: %s", folder_path)
        source_map = OrderedDict()
        data_folder_path = os.path.join(folder_path, "data")
        model_folder_path = os.path.join(folder_path, "model")
        for filename in os.listdir(data_folder_path):
            extension = os.path.splitext(filename)[1]
            if ".DS" in filename:  # skip macOS .DS_Store artifacts
                continue
            logging.info("    ...file: %s", filename)
            source = Source(os.path.splitext(filename)[0])
            file_path = os.path.join(data_folder_path, filename)
            # Pick a reader from the folder hint ("full") or the file extension.
            if "full" in data_folder_path:
                source.read_data_from_wc_csv(file_path)
            elif extension == ".csv":
                source.read_data_from_csv(file_path)
            elif extension == ".json":
                source.read_data_from_json(file_path)
            elif extension == ".xml":
                source.read_data_from_xml(file_path)
            else:
                source.read_data_from_text_file(file_path)
            source_map[filename] = source
        if os.path.exists(model_folder_path):
            for filename in os.listdir(model_folder_path):
                if ".DS" in filename:
                    continue
                # Model files may carry a double extension, so try the
                # twice-stripped base name first, then the raw filename.
                try:
                    source = source_map[os.path.splitext(os.path.splitext(filename)[0])[0]]
                except KeyError:
                    source = source_map[filename]
                extension = os.path.splitext(filename)[1]
                if extension == ".json":
                    source.read_semantic_type_json(os.path.join(model_folder_path, filename))
                else:
                    source.read_semantic_type_from_gold(os.path.join(model_folder_path, filename))
        self.dataset_map[folder_name] = source_map
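
# Sketch (not in the original code): the extension dispatch above can be made
# table-driven. `_READERS` and `_read_into_source` are hypothetical names; the
# Source reader methods are the ones the function above actually calls.
_READERS = {
    ".csv": "read_data_from_csv",
    ".json": "read_data_from_json",
    ".xml": "read_data_from_xml",
}


def _read_into_source(source, file_path, extension):
    # Unknown extensions fall back to the plain-text reader, mirroring the
    # `else` branch above.
    reader_name = _READERS.get(extension, "read_data_from_text_file")
    getattr(source, reader_name)(file_path)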

def read_data_sources(self, folder_paths):
    """Same reader as above, but also prunes labeled-but-empty columns and
    tallies distinct semantic types and attribute counts across all datasets."""
    semantic_type_set = set()
    attr_count = 0
    for folder_name in folder_paths:
        self.logger.debug("Read dataset: %s", folder_name)
        folder_path = "data/datasets/%s" % folder_name
        source_map = OrderedDict()
        data_folder_path = os.path.join(folder_path, "data")
        model_folder_path = os.path.join(folder_path, "model")
        for filename in sorted(os.listdir(data_folder_path)):
            extension = os.path.splitext(filename)[1]
            if ".DS" in filename:
                continue
            self.logger.debug("  -> read: %s", filename)
            source = Source(os.path.splitext(filename)[0])
            file_path = os.path.join(data_folder_path, filename)
            if "full" in data_folder_path:
                source.read_data_from_wc_csv(file_path)
            elif extension == ".csv":
                source.read_data_from_csv(file_path)
            elif extension == ".json":
                source.read_data_from_json(file_path)
            elif extension == ".xml":
                source.read_data_from_xml(file_path)
            else:
                source.read_data_from_text_file(file_path)
            source_map[filename] = source
            # NOTE(BINH): delete empty columns here; blindly follows the code in indexer:36
            for key in list(source.column_map.keys()):
                column = source.column_map[key]
                if column.semantic_type and not column.value_list:
                    del source.column_map[key]
                    source.empty_val_columns[key] = column
                    logging.warning(
                        "Indexer: IGNORE COLUMN `%s` in source `%s` because of empty values",
                        column.name, source.name)
            for column in source.column_map.values():
                semantic_type_set.add(column.semantic_type)
            attr_count += len(source.column_map)
        if os.path.exists(model_folder_path):
            for filename in os.listdir(model_folder_path):
                if ".DS" in filename:
                    continue
                try:
                    source = source_map[os.path.splitext(os.path.splitext(filename)[0])[0]]
                except KeyError:
                    source = source_map[filename]
                extension = os.path.splitext(filename)[1]
                if extension == ".json":
                    source.read_semantic_type_json(os.path.join(model_folder_path, filename))
                else:
                    self.logger.debug("Read gold model for: %s", source)
                    source.read_semantic_type_from_gold(os.path.join(model_folder_path, filename))
        self.dataset_map[folder_name] = source_map
    # print(semantic_type_set)
    print(len(semantic_type_set))
    print(attr_count)
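
# Sketch (not in the original code) of the empty-column pruning above as a
# standalone helper; `prune_empty_columns` is a hypothetical name, while
# `column_map`, `empty_val_columns`, `semantic_type`, and `value_list` are the
# attributes the method actually touches.
def prune_empty_columns(source):
    """Move labeled-but-empty columns out of `column_map` so the indexer
    never sees columns that have no values."""
    for key in list(source.column_map.keys()):
        column = source.column_map[key]
        if column.semantic_type and not column.value_list:
            del source.column_map[key]
            source.empty_val_columns[key] = column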

def read_data_sources(self, folder_paths):
    """Variant for datasets laid out as tables/ and models/ subfolders;
    also drops the synthetic `rowNumber` column before pruning."""
    semantic_type_set = set()
    attr_count = 0
    for folder_name in folder_paths:
        self.logger.debug("Read dataset: %s", folder_name)
        folder_path = "data/datasets/%s" % folder_name
        source_map = OrderedDict()
        data_folder_path = os.path.join(folder_path, "tables")
        model_folder_path = os.path.join(folder_path, "models")
        for filename in sorted(os.listdir(data_folder_path)):
            extension = os.path.splitext(filename)[1]
            if ".DS" in filename:
                continue
            self.logger.debug("  -> read: %s", filename)
            source = Source(os.path.splitext(filename)[0])
            file_path = os.path.join(data_folder_path, filename)
            if "full" in data_folder_path:
                source.read_data_from_wc_csv(file_path)
            elif extension == ".csv":
                source.read_data_from_csv(file_path)
            elif extension == ".json":
                source.read_data_from_json(file_path)
            elif extension == ".xml":
                source.read_data_from_xml(file_path)
            else:
                source.read_data_from_text_file(file_path)
            source_map[filename] = source
            # Drop the synthetic row-number column if the exporter added one.
            if "rowNumber" in source.column_map:
                del source.column_map["rowNumber"]
            # NOTE(BINH): delete empty columns here; blindly follows the code in indexer:36
            for key in list(source.column_map.keys()):
                column = source.column_map[key]
                if column.semantic_type and not column.value_list:
                    del source.column_map[key]
                    source.empty_val_columns[key] = column
                    logging.warning(
                        "Indexer: IGNORE COLUMN `%s` in source `%s` because of empty values",
                        column.name, source.name)
            for column in source.column_map.values():
                semantic_type_set.add(column.semantic_type)
            attr_count += len(source.column_map)
        if os.path.exists(model_folder_path):
            for filename in os.listdir(model_folder_path):
                if ".DS" in filename:
                    continue
                try:
                    source = source_map[os.path.splitext(os.path.splitext(filename)[0])[0]]
                except KeyError:
                    source = source_map[filename]
                extension = os.path.splitext(filename)[1]
                if extension == ".json":
                    source.read_semantic_type_json(os.path.join(model_folder_path, filename))
                else:
                    self.logger.debug("Read gold model for: %s", source)
                    source.read_semantic_type_from_gold(os.path.join(model_folder_path, filename))
        self.dataset_map[folder_name] = source_map
    # print(semantic_type_set)
    print(len(semantic_type_set))
    print(attr_count)
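
# Sketch (not in the original code) of how a model file is matched back to a
# source above: the lookup first tries the filename with two extensions
# stripped, then falls back to the raw filename. `source_key_candidates` and
# the sample filename are hypothetical.
def source_key_candidates(filename):
    base = os.path.splitext(os.path.splitext(filename)[0])[0]
    return [base, filename]


assert source_key_candidates("s01.csv.json") == ["s01", "s01.csv.json"]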