def __init__(self, db_name, lock):
    """Set up the extractor worker.

    :param db_name: value stored as ``self.db_name``; no connection is
        opened here, ``self.cur_db`` starts out as ``None``.
    :param lock: forwarded to the parent constructor together with the
        fixed name ``"document"``.
    """
    # Parent initialisation goes first, exactly as before.
    super(Extractor, self).__init__("document", lock)

    # No database connection yet.
    self.cur_db = None
    self.db_name = db_name

    # Cleaner instance kept on the worker for later use.
    self.entity_cleaner = EntityCleaner()
def __init__(self):
    """Create the fixtures: a cleaner, the expected result and the banned characters."""
    self.ec = EntityCleaner()
    self.expected = 'test'

    # Every punctuation character except '@' and '.', in the order in which
    # ``string.punctuation`` lists them.
    kept = ('@', '.')
    self.exclude = [symbol for symbol in string.punctuation if symbol not in kept]
class Extractor(ProcessBase):
    """Worker that extracts entities from queued HTML files into ``documents.db``.

    Each queue item is a ``(document_id, path_to_html)`` pair.  For every
    item the file is read and e-mail addresses, phone numbers and the
    PERSON / ORGANIZATION / LOCATION entities reported by ``html_ner`` are
    cleaned and inserted through ``insert_entity``.

    NOTE(review): ``self.queue``, ``self.lock``, ``self.added``,
    ``self.timer``, ``self.start_time``, ``self.running`` and
    ``self.completed`` are not defined in this class; presumably they come
    from ``ProcessBase`` -- confirm there.
    """

    # Tag reported by ``html_ner`` -> entity type written to the database.
    _NER_ENTITY_TYPES = {
        'PERSON': 'Name',
        'ORGANIZATION': 'Organisation',
        'LOCATION': 'Location',
    }

    # The e-mail pattern also matches image file names such as
    # ``logo@2x.png``; matches ending in one of these suffixes are dropped.
    _IMAGE_SUFFIXES = ('.png', '.gif')

    def __init__(self, db_name, lock):
        """Store the database location and create the entity cleaner.

        :param db_name: directory part of the database path; ``run`` opens
            ``db_name + '/documents.db'``.
        :param lock: forwarded to the parent constructor together with the
            fixed name ``"document"``.
        """
        super(Extractor, self).__init__("document", lock)
        self.entity_cleaner = EntityCleaner()
        self.db_name = db_name
        self.cur_db = None

    def run(self):
        """Process queue items until the queue is found empty.

        Opens the database, then repeatedly acquires ``self.lock``, takes one
        item from ``self.queue`` and extracts its entities.  After each item,
        if the queue is empty, the elapsed time is printed and the loop ends.

        The lock is released, the item is acknowledged with ``task_done()``
        and the connection is closed in ``finally`` clauses, so an exception
        raised while extracting still propagates but no longer leaves the
        lock held, the queue item unacknowledged or the connection open.
        """
        self.cur_db = sqlite3.connect(self.db_name + '/documents.db')
        try:
            while True:
                self.running = self.lock.acquire()
                item_taken = False
                try:
                    document_id, path_to_html = self.queue.get()
                    item_taken = True
                    if not self.added:
                        # Only the very first item triggers timer().
                        self.timer()
                        self.added = True
                    self.extract_insert_info(document_id, path_to_html)
                finally:
                    self.running = False
                    self.lock.release()
                    # task_done() must only follow a successful get().
                    if item_taken:
                        self.queue.task_done()

                if self.queue.empty():
                    # Kill the timer
                    self.completed = False
                    end_time = time.time()
                    hours, rem = divmod(end_time - self.start_time, 3600)
                    minutes, seconds = divmod(rem, 60)
                    print("[*] Extraction elapsed time: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
                    break
        finally:
            self.cur_db.close()

    def clean_insert_entity(self, cursor, doc_id, entity, entity_type):
        """Clean *entity* and insert it for *doc_id* under *entity_type*.

        Nothing is inserted when the cleaner returns ``None``.
        """
        cleaned_entity = self.entity_cleaner.clean(entity)
        if cleaned_entity is None:
            return
        insert_entity(cursor, doc_id, cleaned_entity, entity_type)

    def extract_insert_info(self, document_id, path_to_html):
        """Extract every supported entity from one file and commit the inserts.

        :param document_id: identifier passed through to ``insert_entity``.
        :param path_to_html: path of the file to read; it is decoded as UTF-8.
        """
        cur_db_cursor = self.cur_db.cursor()
        with codecs.open(path_to_html, 'rt', encoding='utf8') as fp:
            raw_text = fp.read()

        for email in self._extract_email(raw_text):
            # The pattern is case-insensitive, so compare the suffix
            # case-insensitively too (``logo@2x.PNG`` is not an address).
            if email.lower().endswith(self._IMAGE_SUFFIXES):
                continue
            self.clean_insert_entity(cur_db_cursor, document_id, email, 'Email')

        for phone_no in self._extract_phone_no(raw_text):
            self.clean_insert_entity(cur_db_cursor, document_id, phone_no, 'Phone')

        # extract using NLP
        for tag, data in html_ner(raw_text):
            entity_type = self._NER_ENTITY_TYPES.get(tag)
            if entity_type is not None:
                self.clean_insert_entity(cur_db_cursor, document_id, data, entity_type)

        self.cur_db.commit()

    @staticmethod
    def _extract_email(text):
        """Return the set of e-mail-like substrings found in *text*."""
        results = re.findall(r'[a-z0-9\.]+@[a-z0-9\.]+\.[a-z]{2,}', text, flags=re.IGNORECASE)
        return set(results)

    @staticmethod
    def _extract_phone_no(text):
        """Return the phone numbers found in *text*, in international format.

        Candidates are matched with a regular expression, rejected when their
        parentheses are unbalanced or occur more than once, parsed with
        ``phonenumbers`` (no default region is supplied) and kept only when
        ``phonenumbers.is_possible_number`` accepts them.  The result is a
        list in match order and may contain duplicates.
        """
        def has_sane_parentheses(candidate):
            # At most one '(' and one ')', and they must come as a pair.
            open_parent = candidate.count('(')
            close_parent = candidate.count(')')
            return open_parent == close_parent and open_parent <= 1

        phone_numbers = []
        for candidate in re.findall(r'\+?[0-9\- \(\)]{8,16}[0-9]', text):
            if not has_sane_parentheses(candidate):
                continue
            try:
                parsed = phonenumbers.parse(candidate, None)
            except phonenumbers.phonenumberutil.NumberParseException:
                # Not parseable as a phone number: skip this candidate.
                continue
            if phonenumbers.is_possible_number(parsed):
                phone_numbers.append(
                    phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.INTERNATIONAL))
        return phone_numbers
class TestEntityCleaner(object):
    """Checks that ``EntityCleaner.clean`` removes punctuation appended to a word.

    Every test appends one or more punctuation characters to ``expected``
    and asserts that the cleaned value equals ``expected`` and contains none
    of the characters in ``exclude``.
    """

    # Fixtures live at class level / in ``setup_method`` rather than in
    # ``__init__``: pytest refuses to collect a ``Test*`` class that defines
    # its own ``__init__``, which would silently skip every test below.
    expected = 'test'

    # All punctuation except '@' and '.'.
    exclude = list(string.punctuation.replace('@', '').replace('.', ''))

    def setup_method(self, method=None):
        """Create a fresh cleaner before each test."""
        self.ec = EntityCleaner()

    def _assert_cleaned(self, suffix):
        """Assert that cleaning ``expected + suffix`` yields exactly ``expected``."""
        content = self.expected + suffix
        cleaned = self.ec.clean(content)
        assert not any(letter in self.exclude for letter in cleaned)
        assert cleaned == self.expected

    def test_entity_cleaner_exclamation(self):
        self._assert_cleaned('!')

    def test_entity_cleaner_double_quote(self):
        self._assert_cleaned('"')

    def test_entity_cleaner_sharp(self):
        self._assert_cleaned('#')

    def test_entity_cleaner_dollar(self):
        self._assert_cleaned('$')

    def test_entity_cleaner_percent(self):
        self._assert_cleaned('%')

    def test_entity_cleaner_single_quote(self):
        # Previously appended '%' (copy of the percent test), so the single
        # quote was never actually exercised.
        self._assert_cleaned("'")

    def test_entity_cleaner_parentheses(self):
        self._assert_cleaned('(){}[]')

    def test_entity_cleaner_operators(self):
        self._assert_cleaned('*+-/')

    def test_entity_cleaner_colon(self):
        self._assert_cleaned(':')

    def test_entity_cleaner_semi_colon(self):
        self._assert_cleaned(';')

    def test_entity_cleaner_question(self):
        self._assert_cleaned('?')

    def test_entity_cleaner_back_slash(self):
        self._assert_cleaned('\\')

    def test_entity_cleaner_special(self):
        self._assert_cleaned('^`_|~')