def __fetch_linkedin_c_data(self):
    """Fill the ``self.domain.linkedin_c_*`` fields from the LinkedIn company page.

    Does nothing when ``self.domain.linkedin_c`` is falsy. Otherwise the page
    at that URL is fetched and parsed, and one value per field is extracted
    with a fixed XPath expression. Fetch and parse failures are logged at
    debug level and leave the fields untouched (best effort).

    Each field is set to whatever ``get_from_list_by_index(matches, 0)``
    returns for the list of XPath matches.
    """
    company_url = self.domain.linkedin_c
    if not company_url:
        return

    try:
        # The second positional argument is always True here; in __init__ the
        # same position carries ``proxy_always`` -- presumably a proxy flag.
        data = fetch(company_url, True)
    except Exception:
        logger.debug("Can't fetch LinkedIn data for {}".format(company_url))
        return

    try:
        tree = html.fromstring(data)
    except Exception:
        logger.debug("Can't create LXML tree for {}".format(company_url))
        return

    # (attribute on self.domain, XPath expression) pairs. The expressions are
    # tied to the page layout and select text nodes only.
    field_paths = (
        ('linkedin_c_short_desc',
         '//*[@id="body"]/div[2]/div[1]/div[1]/div/p[1]/text()'),
        ('linkedin_c_followers',
         '//*[@id="biz-follow-mod"]/div/div/p/strong/text()'),
        ('linkedin_c_spe',
         '//*[@id="content"]/div[2]/div[2]/div/p/text()'),
        ('linkedin_c_website',
         '//*[@id="content"]/div[1]/div[2]/ul/li[1]/p/a/text()'),
        ('linkedin_c_industry',
         '//*[@id="content"]/div[1]/div[2]/ul/li[2]/p/text()'),
        ('linkedin_c_address',
         '//*[@id="content"]/div[1]/div[2]/ul/li[4]/p/span[1]/text()'),
        ('linkedin_c_comp_size',
         '//*[@id="content"]/div[1]/div[2]/ul/li[5]/p/text()'),
        ('linkedin_c_founded',
         '//*[@id="content"]/div[2]/div[2]/ul/li[6]/p/text()'),
    )

    for attr_name, path in field_paths:
        # xpath() takes the expression as its only positional argument; the
        # page source is already contained in ``tree``.
        matches = tree.xpath(path)
        setattr(self.domain, attr_name, get_from_list_by_index(matches, 0))
def __init__(self, name, proxy_always=False, links_content=('about', 'contact', 'info'),
             emails_black_list=tuple(), *args, **kwargs):
    """Fetch a domain's home page and related pages, then extract contact data.

    Steps, in order:

    1. Call the parent initializer with ``name`` and validate argument types.
    2. Fetch ``http://<domain name>``. On failure an error is logged and
       initialization stops early: ``self.pages`` stays empty and none of
       the ``self.domain`` fields below are assigned by this method.
    3. Fetch every link returned by ``self.get_similar_links`` for the home
       page; links that fail to load are logged and skipped.
    4. Fill ``self.domain`` with the Google Apps flag, e-mail addresses and
       the first Facebook / Twitter / LinkedIn link found, then load the
       LinkedIn company details.

    Args:
        name: Passed to the parent initializer (``self.domain`` is expected
            to exist afterwards -- presumably created there).
        proxy_always: Forwarded as the second argument of every ``fetch``
            call made here.
        links_content: Tuple forwarded to ``self.get_similar_links``.
        emails_black_list: Tuple of addresses removed from the found e-mails.
        *args: Accepted and ignored.
        **kwargs: Accepted and ignored.

    Raises:
        TypeError: If ``links_content`` or ``emails_black_list`` is not a
            tuple.
    """
    super().__init__(name)

    if not isinstance(links_content, tuple):
        raise TypeError("Invalid links content argument type. Can't create Cache Walker instance.")
    if not isinstance(emails_black_list, tuple):
        raise TypeError("Invalid emails black list argument type. Can't create Cache Walker instance.")

    self.proxy_always = proxy_always
    self.pages = list()

    domain_name = self.domain.get_name()
    root_url = 'http://{}'.format(domain_name)

    try:
        home_page_data = fetch(root_url, self.proxy_always)
    except Exception:
        # Without a home page there is nothing to analyse; leave the domain
        # in whatever state the parent initializer gave it.
        logger.error("Can't fetch home page for '{}'. Domain is not active".format(domain_name))
        return

    self.domain.active = True
    self.pages.append(home_page_data)

    for link in self.get_similar_links(home_page_data, root_url, links_content):
        try:
            self.pages.append(fetch(link, self.proxy_always))
        except Exception:
            # Secondary pages are best effort: log and move on.
            logger.warning("Can't fetch data from {}.".format(link))
            continue

    self.domain.gapps = self.check_gapps(domain_name)

    # Raw string keeps the regex escapes out of Python's own escape handling.
    # The negative lookahead rejects domain labels that start with "example".
    email_pattern = r'([\w\-]+@(?:(?!example)[\w\-]+\.)+[a-z]+)'
    self.domain.emails = set(self.parse_by_regex(email_pattern)) - set(emails_black_list)

    self.domain.facebook = get_from_list_by_index(self.parse_by_regex(FACEBOOK_LINKS_PATTERN), 0)
    self.domain.twitter = get_from_list_by_index(self.parse_by_regex(TWITTER_LINKS_PATTERN), 0)
    self.domain.linkedin_c = get_from_list_by_index(self.parse_by_regex(LINKEDIN_COMPANY_LINKS_PATTERN), 0)
    self.domain.linkedin_g = get_from_list_by_index(self.parse_by_regex(LINKEDIN_GROUP_LINKS_PATTERN), 0)
    self.domain.linkedin_u = get_from_list_by_index(self.parse_by_regex(LINKEDIN_USER_LINKS_PATTERN), 0)

    self.__fetch_linkedin_c_data()
def check_gapps(name):
    """Check whether Google's hosted login page for a domain is a sign-in page.

    Fetches ``https://www.google.com/a/<name>/ServiceLogin`` and compares the
    text of the page's ``<title>`` with ``'Sign in - Google Accounts'``.

    Args:
        name: Domain name inserted into the URL; must be a ``str``.

    Returns:
        ``True`` if the title matches, ``False`` if the page was fetched and
        parsed but the title differs or is missing, and ``None`` if fetching
        or parsing failed.

    Raises:
        ValueError: If ``name`` is not a ``str``.
    """
    if not isinstance(name, str):
        raise ValueError('Invalid name argument type.')

    url = 'https://www.google.com/a/{}/ServiceLogin'.format(name)

    try:
        data = fetch(url)
    except Exception:
        # Best effort: an unreachable page means "unknown", not "no".
        return None

    try:
        tree = html.fromstring(data)
        # First <title> text node, or None when the page has no title.
        result = (tree.xpath('//title/text()') or (None,))[0]
    except Exception:
        return None

    logger.debug('Google Apps login page title for %s: %r', name, result)
    return result == 'Sign in - Google Accounts'