import pandas as pd
from sklearn import preprocessing

# External names used below (import paths are project-specific and assumed):
#   DBManager - wrapper around the database connection
#   Utils     - static helpers (normalize_uri, normalize_name, find_feature_type, ...)
#   THRESHOLD - sparsity cutoff used by clean_sparse_features
#   Error     - the DB driver's exception class (e.g. mysql.connector.Error)


class Extractor:
    """ Extract entity features from a DB table and normalize them into a DataFrame """

    def __init__(self, db_table):
        """ Initializing """
        self.db_manager = DBManager()
        self.db_table = db_table
        self.size = 0
        self.features = pd.DataFrame()
        self.extracted_data = dict()
        self.common_keys = dict()

    def extract_features(self, lookup):
        """
        Extract new features from the DB:
        map each feature to its data (from the DB),
        handle missing data,
        optionally drop sparse features below THRESHOLD,
        normalize feature values (LabelEncoder),
        and close the DB connection via DBManager.
        :param lookup: list of entity names/URIs to extract
        :return:
        """
        self.build(lookup)
        self.extract()
        self.data_handling(clean_sparse=False)
        self.db_manager.db_close()

    def build(self, entities):
        """
        Fetch the stored data for every entity in the lookup list.
        :param entities: list of entity names/URIs
        :return:
        """
        try:
            self.size = len(entities)
            self.prepare_database()
            for entity in entities:
                if entity:
                    self.fetch(Utils.normalize_name(Utils.normalize_uri(entity)))
        except Error as e:
            print("Error while building features", e)

    def fetch(self, entity):
        """
        Fetch the entity data from the database (if it exists).
        :param entity: normalized entity name
        :return:
        """
        # db_fetch returns a list of items
        result = self.db_manager.db_fetch(self.db_table, entity)
        if result:
            for item in result:
                if item.get('name'):
                    self.extracted_data[item['name']] = item
                    if item['data']:
                        self.prepare_common_keys(item['data'])

    def prepare_common_keys(self, item):
        """
        Count how many entities share each data key.
        :param item: the entity's data dictionary
        :return:
        """
        for key in item.keys():
            self.common_keys[key] = self.common_keys.get(key, 0) + 1

    def extract(self):
        """
        Build one feature column per common key, aligned to the lookup size.
        :return:
        """
        for feature in self.common_keys.keys():
            values = [None] * self.size
            for i, name in enumerate(self.extracted_data):
                entity = self.extracted_data.get(name)
                if feature in entity['data'].keys():
                    values[i] = entity['data'][feature]
            self.features[feature] = values

    def data_handling(self, clean_sparse=True):
        """
        Optionally drop sparse features,
        handle missing data,
        and fit & transform the values (normalize).
        :param clean_sparse: whether to drop sparse features
        :return:
        """
        if clean_sparse:
            self.clean_sparse_features()
        le = preprocessing.LabelEncoder()
        for feature, feature_data in self.features.items():
            feature_type = Utils.find_feature_type(self.features[feature])
            values = Utils.init_values(feature_type, self.size)
            for i, value in enumerate(feature_data):
                if feature_type is list:
                    # List-valued features are encoded by their length
                    values[i] = 0 if value is None else len(value)
                else:
                    if isinstance(value, list):
                        value = value[0]
                    values[i] = "" if value is None else value.replace("\"", "")
            feature_type = Utils.find_feature_type(values)
            values = Utils.missing_data(feature_type, values)
            values = le.fit_transform(values)
            self.features[feature] = values

    def clean_sparse_features(self):
        """
        Drop sparse features below THRESHOLD.
        :return:
        """
        for feature in list(self.features.columns):
            values_count = self.features[feature].value_counts()
            if round(len(values_count) / len(self.features[feature]), 2) < THRESHOLD:
                self.features.drop(columns=[feature], inplace=True)

    def prepare_database(self):
        """
        Check if the DB is connected, else establish a connection.
        :return: True if a connection is available
        """
        if self.db_manager is None:
            return False
        if self.db_manager.is_connected():
            return True
        self.db_manager.db_connect()
        return self.db_manager.is_connected()
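# --- Usage sketch (not part of the original module) ---
# A minimal illustration of the intended flow; the table name and the lookup
# entities below are hypothetical examples, as is the assumption that the
# table was previously populated by Parser.
def demo_extract_features():
    extractor = Extractor(db_table="film")  # hypothetical table name
    extractor.extract_features(["Inception", "The Matrix"])  # hypothetical lookups
    # extractor.features now holds one label-encoded column per common key
    print(extractor.features.head())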
import gzip
import json
import re

# External names used below (import paths are project-specific and assumed):
#   DBManager        - wrapper around the database connection
#   Utils            - static helpers (parse_triple, handle_duplicate, handle_language, clean_lang, ...)
#   ALLOWED_ENTITIES - mapping of entity-type patterns to DB table names
#   Error            - the DB driver's exception class (e.g. mysql.connector.Error)


class Parser:
    """ Parse Freebase triples into DB tables according to entity type """

    def __init__(self):
        """ Initializing """
        self.db_manager = DBManager()

    def init_database(self):
        """
        Connect to the DB and initialize the schema if not already connected.
        :return:
        """
        if self.db_manager is not None and not self.db_manager.is_connected():
            self.db_manager.db_connect()
            self.db_manager.db_init()
            return self.db_manager.is_connected()
        return False

    def read_data(self, file):
        """
        Stream a gzipped Freebase dump and group consecutive triples by subject.
        :param file: path to the gzipped triples file
        :return:
        """
        try:
            iTotal = 0
            current_mid = ""
            current_topic = dict()
            with gzip.open(file, 'rt') as f:
                for line in f:
                    subject, predicate, obj = Utils.parse_triple(line)
                    if subject == current_mid:
                        # Same subject: keep accumulating predicate/object pairs
                        if predicate not in current_topic:
                            current_topic[predicate] = [obj]
                        else:
                            current_topic[predicate].append(obj)
                    else:
                        # New subject: persist the finished topic, then start a new one
                        if current_mid:
                            self.prepare_to_save(current_mid, current_topic)
                        current_topic.clear()
                        current_mid = subject
                        current_topic[predicate] = [obj]
                    iTotal = iTotal + 1
                    if 0 == (iTotal % 1000000):
                        print("iTotal: ", iTotal)
                        print()
            # Persist the last accumulated topic
            if current_mid and current_topic:
                self.prepare_to_save(current_mid, current_topic)
        except Error as e:
            print("Error while reading file", e)

    def prepare_to_save(self, subject, current_topic):
        """
        Save the topic to every table whose allowed entity type matches one of its types.
        :param subject: the topic's mid
        :param current_topic: the accumulated predicate/object mapping
        :return:
        """
        if '/type/object/type' in current_topic:
            for iType in current_topic['/type/object/type']:
                for allowed_type_key, allowed_type_table in ALLOWED_ENTITIES.items():
                    if re.search(allowed_type_key, iType):
                        # Save to DB
                        if self.prepare_database():
                            self.save_to_database(allowed_type_table, subject, current_topic)
                        break

    def prepare_database(self):
        """
        Check if the DB is connected, else establish a connection.
        :return: True if a connection is available
        """
        if self.db_manager is None:
            return False
        if self.db_manager.is_connected():
            return True
        self.db_manager.db_connect()
        return self.db_manager.is_connected()

    def save_to_database(self, table, subject, current_topic):
        """
        Insert the topic into the given table with its display name and mid.
        :param table: target DB table
        :param subject: the topic's mid
        :param current_topic: the accumulated predicate/object mapping
        :return:
        """
        current_topic = Utils.handle_duplicate(current_topic)
        current_topic = Utils.handle_language(current_topic)
        if current_topic is None or len(current_topic) == 0:
            return
        name = ''
        if '/type/object/name' in current_topic.keys():
            if isinstance(current_topic['/type/object/name'], list):
                name = current_topic['/type/object/name'][0]
            else:
                name = current_topic['/type/object/name']
            current_topic.pop('/type/object/name')
        elif 'label' in current_topic.keys():
            if isinstance(current_topic['label'], list):
                name = current_topic['label'][0]
            else:
                name = current_topic['label']
            current_topic.pop('label')
        if name and current_topic:
            name = Utils.clean_lang(name)
            self.db_manager.db_insert(table, name, subject, json.dumps(current_topic))
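# --- Usage sketch (not part of the original module) ---
# A minimal illustration of the intended flow; the dump filename below is a
# hypothetical placeholder for a gzipped Freebase triples file.
def demo_parse_dump():
    parser = Parser()
    if parser.init_database():
        # Streams the dump, groups triples by subject, and inserts topics whose
        # types match ALLOWED_ENTITIES into their corresponding tables.
        parser.read_data("freebase-rdf-latest.gz")  # hypothetical path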