Пример #1
0
    def __fetch_linkedin_c_data(self):
        """Scrape company details from the domain's LinkedIn company page.

        Does nothing when ``self.domain.linkedin_c`` is falsy.  Otherwise the
        page is fetched, parsed with ``html.fromstring`` and a fixed set of
        XPath queries is evaluated against the resulting tree.  For each
        query, ``get_from_list_by_index(matches, 0)`` is stored on the
        corresponding ``self.domain.linkedin_c_*`` attribute.

        A failed fetch or a failed parse is logged at debug level and ends
        the method before any ``linkedin_c_*`` attribute is assigned.
        """
        url = self.domain.linkedin_c
        if not url:
            return

        try:
            # The second positional argument is the slot in which __init__
            # passes its ``proxy_always`` flag; here it is always True.
            data = fetch(url, True)
        except Exception:
            logger.debug('Can\'t fetch LinkedIn data for {}'.format(url))
            return

        try:
            tree = html.fromstring(data)
        except Exception:
            logger.debug('Can\'t create LXML tree for {}'.format(url))
            return

        # (attribute set on self.domain, XPath evaluated against the page).
        queries = (
            ('linkedin_c_short_desc', '//*[@id="body"]/div[2]/div[1]/div[1]/div/p[1]/text()'),
            ('linkedin_c_followers', '//*[@id="biz-follow-mod"]/div/div/p/strong/text()'),
            ('linkedin_c_spe', '//*[@id="content"]/div[2]/div[2]/div/p/text()'),
            ('linkedin_c_website', '//*[@id="content"]/div[1]/div[2]/ul/li[1]/p/a/text()'),
            ('linkedin_c_industry', '//*[@id="content"]/div[1]/div[2]/ul/li[2]/p/text()'),
            ('linkedin_c_address', '//*[@id="content"]/div[1]/div[2]/ul/li[4]/p/span[1]/text()'),
            ('linkedin_c_comp_size', '//*[@id="content"]/div[1]/div[2]/ul/li[5]/p/text()'),
            # NOTE(review): this path starts at div[2]/div[2] while the other
            # list items use div[1]/div[2] -- confirm it is intentional.
            ('linkedin_c_founded', '//*[@id="content"]/div[2]/div[2]/ul/li[6]/p/text()'),
        )

        for attribute, query in queries:
            # xpath() takes the expression as its first argument.  The page
            # text must not be passed here: the tree already holds the parsed
            # document, and an extra leading argument would be treated as the
            # expression itself.
            matches = tree.xpath(query)
            setattr(self.domain, attribute, get_from_list_by_index(matches, 0))
Пример #2
0
    def __init__(self, name, proxy_always=False, links_content=('about', 'contact', 'info'), emails_black_list=tuple(), *args, **kwargs):
        """Fetch a domain's pages and extract contact and social-media data.

        The home page ``http://<domain name>`` is fetched first.  If that
        fails, an error is logged and construction stops early: ``self.pages``
        stays empty and none of the ``self.domain`` attributes below are
        assigned.  Otherwise the domain is marked active, the pages returned
        by ``get_similar_links`` are fetched (individual failures are logged
        and skipped), and the results of ``check_gapps`` and of the regex
        searches are stored on ``self.domain``.  Finally the LinkedIn company
        page is scraped.

        Args:
            name: Forwarded to the parent initializer.
            proxy_always: Passed as the second argument to every ``fetch``
                call made here.
            links_content: Tuple forwarded to ``get_similar_links`` together
                with the home page data and root URL.
            emails_black_list: Tuple of e-mail addresses removed from the
                extracted set.
            *args: Accepted but unused.
            **kwargs: Accepted but unused.

        Raises:
            TypeError: If ``links_content`` or ``emails_black_list`` is not a
                tuple.
        """
        # NOTE(review): self.domain is used below but never assigned here;
        # presumably the parent initializer creates it -- confirm.
        super().__init__(name)

        if not isinstance(links_content, tuple):
            raise TypeError('Invalid links content argument type. Can\'t create Cache Walker instance.')

        if not isinstance(emails_black_list, tuple):
            raise TypeError('Invalid emails black list argument type. Can\'t create Cache Walker instance.')

        self.proxy_always = proxy_always
        self.pages = list()

        root_url = 'http://{}'.format(self.domain.get_name())

        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed mid-download.
        try:
            home_page_data = fetch(root_url, self.proxy_always)
        except Exception:
            logger.error('Can\'t fetch home page for \'{}\'. Domain is not active'.format(self.domain.get_name()))
            return

        self.domain.active = True
        self.pages.append(home_page_data)

        for link in self.get_similar_links(home_page_data, root_url, links_content):
            try:
                self.pages.append(fetch(link, self.proxy_always))
            except Exception:
                # Best effort: a page that cannot be fetched is skipped.
                logger.warning('Can\'t fetch data from {}.'.format(link))

        self.domain.gapps = self.check_gapps(self.domain.get_name())

        # Raw string: same pattern text as before, without relying on
        # invalid escape sequences (\w, \-, \.) in a normal string literal.
        found_emails = self.parse_by_regex(r'([\w\-]+@(?:(?!example)[\w\-]+\.)+[a-z]+)')
        self.domain.emails = set(found_emails) - set(emails_black_list)

        self.domain.facebook = get_from_list_by_index(self.parse_by_regex(FACEBOOK_LINKS_PATTERN), 0)
        self.domain.twitter = get_from_list_by_index(self.parse_by_regex(TWITTER_LINKS_PATTERN), 0)
        self.domain.linkedin_c = get_from_list_by_index(self.parse_by_regex(LINKEDIN_COMPANY_LINKS_PATTERN), 0)
        self.domain.linkedin_g = get_from_list_by_index(self.parse_by_regex(LINKEDIN_GROUP_LINKS_PATTERN), 0)
        self.domain.linkedin_u = get_from_list_by_index(self.parse_by_regex(LINKEDIN_USER_LINKS_PATTERN), 0)

        # Must run after linkedin_c is set: the scraper reads that attribute.
        self.__fetch_linkedin_c_data()
Пример #3
0
    def check_gapps(name):

        if not isinstance(name, str):
            raise ValueError('Invalid name argument type.')

        url = 'https://www.google.com/a/{}/ServiceLogin'.format(name)

        try:
            data = fetch(url)

        except:
            return

        try:
            tree = html.fromstring(data)
            result = (tree.xpath('//title/text()') or (None,))[0]
            print(result)
        except:
            return

        if result == 'Sign in - Google Accounts':
            return True

        return False