def contacts(self):
    """Build contact records from groups of e-mail/phone elements.

    ``self.email_els`` and ``self.phone_els`` are grouped with
    ``utils.group_by_common_parents``. The group parents are handed to
    ``utils.find_similar_selector_paths``, and every element that
    ``utils.get_elements_for_path`` returns for a path is scanned for
    e-mails, phone numbers, addresses and links.

    Returns:
        A list of dicts with the keys ``emails``, ``phones``,
        ``addresses``, ``urls`` and ``social_urls``. A record is only
        included when at least one of its values is non-empty.
    """
    def to_address(city, state, zip_code):
        # One ``address_re`` match -> address record; only the city is stripped.
        return {'city': city.strip(), 'state': state, 'zip': zip_code}

    group_result = utils.group_by_common_parents(self.email_els, self.phone_els)
    doc = self.body.document()
    group_parents = [group_parent for _email, _phone, group_parent in group_result]

    results = []
    for path, _els in utils.find_similar_selector_paths(group_parents):
        for group_parent in utils.get_elements_for_path(doc, path):
            _email_els, emails = self.emails(group_parent)
            _phone_els, phones = self.phones(group_parent)

            all_links = utils.traverse(group_parent, match_el=lambda el: el.tagName() == 'A')

            # Single pass over the anchors: read ``href`` once and run each
            # regex once per link instead of re-evaluating them.
            unique_urls = set()
            social_urls = []
            for a in all_links:
                href = a.attribute('href')
                if validators.url_no_path_re.match(href):
                    unique_urls.add(unicode(href))
                social_match = validators.social_url_re.match(href)
                if social_match:
                    names = [unicode(name) for name in social_match.groups() if name]
                    # Skip a match whose groups are all empty rather than
                    # failing with IndexError on ``names[0]``.
                    if names:
                        social_urls.append({'type': names[0], 'url': unicode(href)})

            addresses = []
            for _el, matches in utils.traverse_extract(
                    group_parent,
                    match_text=lambda s: validators.address_re.findall(s)):
                addresses.extend(to_address(*match) for match in matches)

            result = {
                'emails': emails,
                'phones': phones,
                'addresses': addresses,
                'urls': list(unique_urls),
                'social_urls': social_urls,
            }
            # Drop records in which nothing at all was found.
            if any(result.values()):
                results.append(result)
    return results
def phones(self, parent=None):
    """Find phone numbers below ``parent`` (or ``self.body``).

    Args:
        parent: Element to search; when falsy, ``self.body`` is used.

    Returns:
        A ``(phone_els, phones)`` tuple. ``phone_els`` is whatever
        ``utils.traverse`` returns for the text predicate. ``phones`` is a
        list of dicts with the keys ``type``, ``raw_number`` and ``number``,
        one per match that ``phonenumbers.PhoneNumberMatcher`` (region
        ``'US'``) reports in an element's plain text.
    """
    def number_match(text):
        # A non-empty list of matches is truthy; used as the text predicate.
        return list(phonenumbers.PhoneNumberMatcher(text, 'US'))

    phone_els = utils.traverse(parent or self.body, match_text=number_match)

    phones = []
    for el in phone_els:
        # Convert the element text once per element, not once per match.
        text = unicode(el.toPlainText())
        for match in phonenumbers.PhoneNumberMatcher(text, 'US'):
            raw_number = match.raw_string
            phones.append({
                'type': utils.number_type(text, raw_number),
                'raw_number': raw_number,
                'number': utils.format_us_phone_number(raw_number),
            })
    return phone_els, phones
def emails(self, parent=None):
    """Find e-mail addresses below ``parent`` (or ``self.body``).

    Args:
        parent: Element to search; when falsy, ``self.body`` is used.

    Returns:
        A ``(email_els, emails)`` tuple. ``email_els`` is whatever
        ``utils.traverse`` returns for the predicates below. ``emails`` is a
        de-duplicated list of everything ``utils.find_emails`` extracts from
        each element's ``href`` attribute and plain text.
    """
    def href_has_email(el):
        return utils.find_emails(el.attribute('href'))

    def text_has_email(s):
        return utils.find_emails(s)

    root = parent or self.body
    email_els = utils.traverse(
        root,
        match_el=href_has_email,
        match_text=text_has_email,
        ignore_tags=[]
    )

    # Gather addresses from the href first, then from the visible text.
    per_element = []
    for el in email_els:
        from_href = utils.find_emails(el.attribute('href'))
        from_text = utils.find_emails(unicode(el.toPlainText()))
        per_element.append(from_href + from_text)

    unique_emails = set()
    for found in per_element:
        unique_emails.update(found)
    return email_els, list(unique_emails)