Example #1
    def _contains_blog_keyword_canonical_page(self, target):
        """
        This variant of _contains_blog_keyword works with the scriptless (canonical) version of the page,
        if the site implements one.
        :param target: page object exposing a parsed lxml tree (target.tree) and the page url (target.url)
        :return: True if a blog keyword is found on the canonical page, otherwise False
        """
        # the page declares <meta name="fragment" content="!">, so we load the canonical
        # (_escaped_fragment_) version of the page and check it for blog keywords
        if len(target.tree.xpath(
                "//meta[@name='fragment'][@content='!']")) > 0:
            log.info('Checking canonical page for tags')
            try:
                url_parsed = urlparse.urlparse(target.url)
                new_url = '%s://%s/?_escaped_fragment_' % (url_parsed.scheme,
                                                           url_parsed.netloc)
                try:
                    r = requests.get(new_url,
                                     timeout=10,
                                     headers=utils.browser_headers())
                except SSLError:
                    r = requests.get(new_url,
                                     timeout=10,
                                     headers=utils.browser_headers(),
                                     verify=False)

                content = r.content.lower()
                for kw in BLOG_KEYWORDS:
                    log.info('Checking %s' % kw)
                    if kw.lower() in content:
                        log.info('Found blog keyword %r in canonical page', kw)
                        return True
            except Exception as e:
                log.exception(e)

        return False
Example #2
    def get_profile(self, profile_id):
        if not profile_id.isdigit() and not profile_id.startswith('+'):
            profile_id = '+' + profile_id

        url = self.profile_url.format(profile_id)
        try:
            response = requests.get(url, headers=browser_headers())
        except SSLError:
            # retrying with verify=False if we encounter an SSLError
            response = requests.get(url,
                                    headers=browser_headers(),
                                    verify=False)
        return self.parse_profile(response.content.decode('utf-8'))
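Examples #1 and #2 (and #8 below) repeat the same fallback: make a normal request and, only when an SSLError is raised, retry with verify=False. A minimal sketch of a shared helper capturing that pattern; the name fetch_with_ssl_fallback is hypothetical, and headers would be supplied via utils.browser_headers() as in the examples above.

import requests
from requests.exceptions import SSLError


def fetch_with_ssl_fallback(url, timeout=10, headers=None):
    """Hypothetical helper: GET url, retrying with verify=False only on SSLError."""
    try:
        # first attempt with normal certificate verification
        return requests.get(url, timeout=timeout, headers=headers)
    except SSLError:
        # certificate validation failed - retry without verification
        return requests.get(url, timeout=timeout, headers=headers, verify=False)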
Example #3
    def get_description(cls, url, xb=None):
        """
        Gets the description field from YouTube. For now, we just collect links to other platforms so that
        we can validate whether this url belongs to the blog.
        """
        # remove query params
        # e.g.: http://www.youtube.com/user/zoella280390?feature=mhee => http://www.youtube.com/user/zoella280390
        url = utils.remove_query_params(url)
        if url.endswith('/'):
            about_page = url + "about"
        else:
            about_page = url + "/about"

        res = set()
        try:
            # first attempt with default SSL verification; on SSLError we retry below with verify=False
            r = requests.get(about_page, headers=utils.browser_headers())
            tree = lxml.html.fromstring(r.content)
            social_links = tree.xpath(
                '//a[contains(@class,"about-channel-link")]/@href')
            for s in social_links:
                res.add(s)
        except SSLError:
            # encountered SSLError - retrying with verify=False
            r = requests.get(about_page,
                             headers=utils.browser_headers(),
                             verify=False)
            tree = lxml.html.fromstring(r.content)
            social_links = tree.xpath(
                '//a[contains(@class,"about-channel-link")]/@href')
            for s in social_links:
                res.add(s)

        return '\n'.join(res)
Example #4
def _get_twitter_page(screen_name):
    twitter_url = 'https://twitter.com/%s' % screen_name
    r = requests.get(twitter_url, headers=utils.browser_headers())

    # Poor man's throttling. Just wait 2 seconds.
    time.sleep(2)
    return r.content
Example #5
def fetch_social_url(url, timeout=10):
    kwargs = {'url': url, 'timeout': timeout, 'verify': False}

    # TODO: the two lines below were previously commented out
    if social_platform_name_from_url(None, url) != 'Facebook':
        kwargs['headers'] = utils.browser_headers()

    r = requests.get(**kwargs)
    return r
Example #6
def fetch_title_simple(url):
    """This function uses a simpler algorithm in BeatifulSoup
    to avoid parsing errors in lxml.
    """
    r = requests.get(url,
                     timeout=20,
                     headers=utils.browser_headers(),
                     verify=False)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup.title.string
Example #7
    def _get(self):
        try:
            # setting verify=False to skip SSL certificate validation for some blogs
            # http://docs.python-requests.org/en/master/user/advanced/#ssl-cert-verification
            r = requests.get(self.url,
                             timeout=feedparsing.FEED_FETCH_TIMEOUT,
                             headers=utils.browser_headers(),
                             verify=False)
            r.raise_for_status()
            self.content = r.content
            self.redirected_url = r.url
        except requests.exceptions.RequestException:
            log.exception('Error in feed resolution fetching %r', self.url)
Example #8
    def fetch(self):
        attempts = 0
        while attempts < 3:
            try:
                try:
                    r = requests.get(self.url,
                                     timeout=10,
                                     headers=utils.browser_headers())
                except SSLError:
                    r = requests.get(self.url,
                                     timeout=10,
                                     headers=utils.browser_headers(),
                                     verify=False)

                self.content = r.content
                self.http_status_code = r.status_code
                self.http_headers = r.headers
                break
            except Exception:
                log.exception(
                    'While fetching content for classification from: {}'.
                    format(self.url))
                attempts += 1
Example #9
def scrape_pin_source(post_id):
    post = models.Posts.objects.get(id=int(post_id))
    r = requests.get(post.url, headers=utils.browser_headers())
    tree = lxml.html.fromstring(r.text)
    anchor_els = tree.xpath('//div[@class="sourceFlagWrapper"]/a')
    if not anchor_els:
        log.warn('No anchor els')
        return
    href = anchor_els[0].attrib.get('href')
    if not href:
        log.warn('No href')
        return
    post.pin_source = utils.remove_fragment(href)
    post.save()
    log.info('Saved pin source %r', post.pin_source)
Example #10
def fetch_title(url=None, content=None):
    """This function must be given either an url, or downloaded content
    """
    if content is None:
        assert url is not None
        r = requests.get(url,
                         timeout=5,
                         headers=utils.browser_headers(),
                         verify=False)
        content = r.text
    tree = lxml.html.fromstring(content)
    title_els = tree.xpath('//title')
    if not title_els:
        return None
    title = (title_els[0].text or '').strip()
    if not title:
        return None
    return title
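As the docstring says, fetch_title takes either a url or content that was already downloaded. A hypothetical call of each form:

# fetch by url, or reuse content already in hand
title = fetch_title(url='https://example.com/')
title = fetch_title(content='<html><head><title>Hello</title></head></html>')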
Example #11
    def read(self):
        # TODO: raising a chardet unicode detection error on 404 feed responses
        # ValueError: Expected a bytes object, not a unicode object
        # e.g. http://www.hautemimi.com/feed/asdfasdfas
        try:
            # we encountered this kind of 'smart' feed: http://feeds.feedblitz.com/freebiefindingmom
            # it renders an html page when it sees a browser user-agent, otherwise it serves an xml Atom feed,
            # so we check the url's domain and behave accordingly

            # setting verify=False to skip SSL certificate validation for some blogs
            # http://docs.python-requests.org/en/master/user/advanced/#ssl-cert-verification

            is_feedblitz = urlparse(self.url).netloc == 'feeds.feedblitz.com'

            r = requests.get(
                self.url,
                timeout=FEED_FETCH_TIMEOUT,
                headers=None if is_feedblitz else utils.browser_headers(),
                verify=False)

            # extra check if it was a redirect to feedblitz
            if not is_feedblitz and urlparse(
                    r.url).netloc == 'feeds.feedblitz.com':
                self.url = r.url
                r = requests.get(self.url,
                                 timeout=FEED_FETCH_TIMEOUT,
                                 verify=False)

            r.raise_for_status()
            self.content = r.content
            self.headers = r.headers
        except requests.RequestException:
            self.content = b''
            self.headers = {}

        return self.content
Example #12
    def is_url_a_blog(self, url=None, profile=None):
        """
        Checks if url is a blog
        :param url:
        :return:
        """
        log.info('Checking if url is blog: %s' % url)

        # checking that this url is real and reachable
        try:
            resp = requests.get(url=url,
                                headers=browser_headers(),
                                timeout=15,
                                verify=False)

            if resp.status_code < 400:
                # looks like this url is OK
                url_parsed = urlparse(resp.url)
                dmn = url_parsed.netloc

                if dmn.lower().endswith('.livejournal.com'):
                    # looks like we found a LiveJournal blog
                    return True, resp.url

                # normalizing domain for blogspot
                if 'blogspot.com' in dmn:
                    dmn = '%s.blogspot.com' % dmn.split('.blogspot.com')[0]

                if dmn.endswith('.blogspot.com') or dmn.endswith(
                        '.wordpress.com'):
                    # a blogspot or wordpress url found at this point is considered the best hit
                    return True, url_parsed._replace(netloc=dmn).geturl()

                if '<!-- This is Squarespace. -->' in resp.content:
                    # looks like we found a Squarespace blog
                    return True, resp.url

                # checking if 'blog' is in root domain (2nd level domain)
                root_domain = dmn.split('.')[-2] if len(
                    dmn.split('.')) >= 2 else None
                if root_domain is not None and 'blog' in root_domain:
                    # high chances that this is a blog
                    return True, resp.url

                if profile is not None:

                    # if liketoknow hashtag appears in the profile's description, then it's a blogger for sure
                    desc = profile.get_description_from_api()
                    if '#liketoknow' in desc.lower():
                        return True, resp.url

                    classification = self.url_classifier.classify(url=resp.url)
                    if classification == 'blog':
                        # looks like our Classifier defined it as blog
                        return True, resp.url

                # TODO: Unreliable? May be use here some regexp */*blog*/* in the path (not for now)?
                # if 'blog' in dmn or '/blog/' in url_parsed.path:
                #     # Looks like it is some blog?
                #     best_result = url_parsed.geturl()
                #     result.add(dmn)
                #     break

                return None, None

            else:
                # skipping urls that respond with an error status code
                log.info('url %s returned %s code, skipping it' %
                         (url, resp.status_code))
                return None, None

        except Exception as e:
            log.exception(e)
            return None, None
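The method returns a pair: (True, blog_url) when a blog is detected and (None, None) otherwise. A hypothetical call, where checker stands in for whatever object defines is_url_a_blog:

is_blog, blog_url = checker.is_url_a_blog(url='http://someone.blogspot.com/')
if is_blog:
    log.info('Found blog at %s', blog_url)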
Example #13
def _do_import_from_blogger_profile(blogger_profile_url, opr, to_save=True):
    log.info('Processing profile %r', blogger_profile_url)

    r = requests.get(blogger_profile_url,
                     headers=utils.browser_headers(),
                     proxies=get_proxy_config())

    blogurls_names = []

    if utils.domain_from_url(r.url) == 'plus.google.com':
        gplus_user_id = r.url.rstrip('/').split('/')[-1]
        gplus_user = requests.get(
            GOOGLE_PLUS_PEOPLE_TEMPLATE.format(user_id=gplus_user_id)).json()
        log.info('Got gplus data:\n%s', pprint.pformat(gplus_user))
        if not gplus_user.get('urls'):
            log.warn('No gplus urls')
            return
        blog_url = gplus_user['urls'][0]['value']
        name = gplus_user['displayName']
        log.info('Gplus url and name: %r %r', blog_url, name)
        blogurls_names.append((blog_url, name))
    else:
        tree = lxml.html.fromstring(r.content)

        name_els = tree.xpath('//div[@class="vcard"]//h1')
        if not name_els:
            log.warn('No name els')
            name = None
        else:
            name = name_els[0].text.strip()
            if not name:
                log.warn('Empty name')
        log.info('Blogger name: %r', name)

        blog_url_els = tree.xpath('//a[contains(@rel, "contributor-to")]')
        if not blog_url_els:
            log.warn('No blog url')
            utils.write_to_file('/tmp/last_no_blog.html', r.text)
            blog_url = None
            if r.text.strip().lower() == 'proxy authorization required':
                raise Exception('Proxy error')
        else:
            for el in blog_url_els:
                blog_url = el.attrib['href'].strip()
                log.info('Blog url: %r', blog_url)
                blogurls_names.append((blog_url, name))
        if ALSO_CRAWL_OTHER_BLOGS_FOLLOWED:
            observed_els = tree.xpath('//li[@class="sidebar-item"]/a')
            for el in observed_els:
                blogurls_names.append((el.attrib.get('href'), None))

    log.info('Collected blogurls_names: %r', blogurls_names)
    data = {'inf_id_existing': [], 'inf_id_created': []}
    for blog_url, name in blogurls_names:
        if not blog_url:
            continue
        blog_pl_name = fetcher.create_platforms_from_urls(
            [blog_url], True)[0].platform_name

        dup_infs = models.Influencer.find_duplicates(blog_url,
                                                     exclude_blacklisted=False)
        if helpers.all_blacklisted(dup_infs):
            log.error(
                'All duplicate influencers blacklisted for url %r, not importing',
                blog_url)
            continue
        if dup_infs:
            inf = helpers.select_valid_influencer(dup_infs)
            log.warn('Existing inf found: %r', inf)
            data['inf_id_existing'].append(inf.id)
        else:
            inf = models.Influencer(blog_url=blog_url,
                                    name=name,
                                    source='comments_import')
            log.info('Created new influencer %r', inf)
            if to_save:
                inf.save()
            # inf.id is populated only after save()
            data['inf_id_created'].append(inf.id)

        blog_pl_dups = models.Platform.find_duplicates(inf, blog_url,
                                                       blog_pl_name)
        if blog_pl_dups:
            log.warn('Blog platform with url %r is already inserted: %r',
                     blog_url, blog_pl_dups)
            continue

        blog_pl = models.Platform(platform_name=blog_pl_name,
                                  url=blog_url,
                                  influencer=inf)
        log.info('Created new platform %r', blog_pl)
        if to_save:
            blog_pl.save()
    opr.data = data
    time.sleep(SLEEP_AFTER_PROCESSING_BLOGGER)
Example #14
    def create_profile(self,
                       url=None,
                       tag=None,
                       category=None,
                       pipeline_class=None,
                       **kwargs):
        """
        Creating profile by Instagram post url or raw content object (in future, if needed)
        """
        def append_hashtags_mentions_commentors_to_description(
                profile, hashtags, mentions, commentors):
            if not hashtags:
                hashtags = []
            if not mentions:
                mentions = []
            if not commentors:
                commentors = []
            for h in hashtags:
                log.info("checking hashtags %r" % h)
                if not profile.profile_description:
                    # print("Adding hashtag from own comment: %r" % h)
                    profile.profile_description = h
                if profile.profile_description and h not in profile.profile_description:
                    # print("Adding hashtag from own comment: %r" % h)
                    profile.profile_description += ' ' + h
            for m in mentions:
                log.info("checking mentions %r" % m)
                if not profile.profile_description:
                    # print("Adding mentions from own comment: %r" % m)
                    profile.profile_description = '@' + m
                if profile.profile_description and '@' + m not in profile.profile_description:
                    # print("Adding mentions from own comment: %r" % m)
                    profile.profile_description += ' @' + m
            # save commentors as well with !*_<username> type
            for c in commentors:
                log.info("checking commentor %r" % c)
                if not profile.profile_description:
                    # print("Adding commentor: %r" % c)
                    profile.profile_description = '!*_' + c
                if profile.profile_description and '!*_' + c not in profile.profile_description:
                    # print("Adding commentor: %r" % c)
                    profile.profile_description += ' !*_' + c

            profile.save()

        def find_hashtags_mentions_commentors_in_comments(
                post_username, post_user_id, comments):
            hashtags = []
            mentions = []
            commentors = set()
            for j in comments:
                m = j['user']['id']
                commentor = j['user']['username']
                commentors.add(commentor)
                content = ''
                if post_user_id == m:
                    content += j['text']
                if len(content) > 0:
                    print("Content = %r" % content)
                    if xutils.is_html(content):
                        cleaned_content = xutils.strip_html_tags(content)
                        print("Needed to clean it up, it's now: %r" %
                              cleaned_content)
                    else:
                        cleaned_content = content
                    set1 = find_hashtags(cleaned_content)
                    set2 = find_mentions(cleaned_content)
                    if set1:
                        hashtags.extend(set1)
                    if set2:
                        mentions.extend(set2)
            return hashtags, mentions, commentors

        log.info("Scraping url: %s" % url)

        # category and tag come in as parameters; if tag is missing, we try to detect it from the url
        # category = kwargs.get('category', None)
        # tag = kwargs.get('tag', None)
        if not tag:
            # try to get tag from the url
            if 'tagged' in url:
                loc = url.find('tagged=') + len('tagged=')
                tag = url[loc:]
                log.info("No tag given, but found tag = %s from url %s" %
                         (tag, url))

        # getting page's content
        r = requests.get(url, headers=utils.browser_headers())

        # Poor man's throttling. Just wait 2 seconds.
        time.sleep(2)

        # TODO: need some check of requests result

        # getting instagram data, the post owner's username (mention) and the creator's id
        soup = BeautifulSoup(r.content)
        instagram_data = self.__extract_instagram_data(soup)

        owner_data = instagram_data.get('PostPage')[0].get('media').get(
            'owner')
        mention = owner_data.get('username')
        post_creator_id = owner_data.get('id')

        log.info("In %r found mention: %s and tag: %r and category: %s" %
                 (url, mention, tag, category))

        # creating a pending profile from the mention and tag
        res, created = self.create_pending_profile(mention, tag)

        log.info('PROFILE_CHECK_01 created=%s id=%s date_created=%s' %
                 (created, res.id, res.date_created))

        if res.friends_count and res.friends_count < MINIMUM_FRIENDS_COUNT:
            log.info(
                "Small number of followers %d (lesser than %s) for %s, so returning"
                % (res.friends_count, MINIMUM_FRIENDS_COUNT, res.username))
            return res, None

        # get hashtags & mentions from captions
        caption = instagram_data.get('PostPage')[0].get('media').get(
            'caption', None)
        hashtags_in_caption = find_hashtags(caption)
        mentions_in_caption = find_mentions(caption)
        append_hashtags_mentions_commentors_to_description(
            res, hashtags_in_caption, mentions_in_caption, None)

        # get hashtags & mentions from comments made by the author herself (very common)
        comments = instagram_data.get('PostPage')[0].get('media').get(
            'comments').get('nodes')
        hashtags_in_comments, mention_in_comments, commentors = find_hashtags_mentions_commentors_in_comments(
            mention, post_creator_id, comments)
        append_hashtags_mentions_commentors_to_description(
            res, hashtags_in_comments, mention_in_comments, commentors)

        if category:
            if res.tags and category in res.tags:
                log.info(
                    "Category %r already exists in %r, let's not do more analysis"
                    % (category, res))
                return res, commentors
            #save the tag as well as the hashtag
            res.append_tag(category)
            if tag:
                append_hashtags_mentions_commentors_to_description(
                    res, [tag], [], [])

        MentionInPost.objects.filter(
            platform_name='Instagram',
            mention=mention).update(influencer_imported=True)

        # If this profile was freshly created, has the required prerequisites (desired number of followers, etc.),
        # its hashtags and mentions were set, and a pipeline_class was provided,
        # then we run that pipeline for the profile.
        if created and pipeline_class is not None:
            try:
                # adding pipeline tag for profile to know from which pipeline it came
                res.append_tag('PIPELINE_%s' % pipeline_class)

                # getting a 'pipeline' by its name
                log.info('Loading pipeline %s for profile %s' %
                         (pipeline_class, res.id))
                pipeline_cls = locate('social_discovery.pipelines.%s' %
                                      pipeline_class)
                if pipeline_cls is None:
                    # locate() returns None when the class cannot be found
                    raise KeyError(pipeline_class)

                # creating an instance of the pipeline class
                pipeline = pipeline_cls()

                log.info('Running pipeline %s for profile %s' %
                         (pipeline_class, res.id))
                # calling the required function with appropriate params
                pipeline.run_pipeline(res.id)
            except KeyError:
                log.error('Pipeline %s not found' % pipeline_class)

        log.info('PROFILE_CHECK_02 created=%s id=%s date_created=%s' %
                 (created, res.id, res.date_created))
        return res, commentors
Example #15
def get_youtube_channel_for_url(url=None):
    """
    Returns the youtube channel url for a video url, if the given url is a valid youtube video url.

    Channel urls are like:
    https://www.youtube.com/channel/UCIzI6LQzuudmdbtXhuZpBmA/videos
    http://www.youtube.com/user/raechelmyers/videos
    https://www.youtube.com/c/Minimalistbaker

    Video urls are like:
    https://www.youtube.com/watch?v=Fky6hpTlBZU
    https://youtu.be/Fky6hpTlBZU
    http://y2u.be/Fky6hpTlBZU

    :param url: youtube video or channel url
    :return: the channel url, or None if it cannot be determined
    """

    good_video_urls_regexp = r'(?:.*youtube.com\/watch\?.+|.*youtu.be\/.+|.*y2u.be\/.+)'
    good_channel_urls_regexp = r'(?:.*youtube.com\/channel\/.+|.*youtube.com\/user\/.+|.*youtube.com\/c\/.+)'

    if url is None:
        return None

    elif re.match(good_channel_urls_regexp, url):
        # it is already a channel url
        return url

    elif re.match(good_video_urls_regexp, url):
        # it is a video url, fetching the channel url from the video page
        import requests
        import lxml.html
        # need headers={...} and verify=False, otherwise requests raises an SSLError:
        # bad handshake: Error([('SSL routines', 'SSL3_GET_SERVER_CERTIFICATE', 'certificate verify failed')],)
        # and never returns a usable response
        r = requests.get(url, headers=utils.browser_headers(), verify=False)
        tree = lxml.html.fromstring(r.content)
        elems = tree.xpath("//div[@class='yt-user-info']/a")
        if elems and len(elems) > 0:
            elem = elems[0]
            v = elem.attrib.get('href')
            if v:
                channel_url = "https://www.youtube.com" + v
                return channel_url
        return None

        # with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
        #                        load_no_images=True, timeout=10) as xb:
        #
        #     # setting timeouts to xb instance
        #     xb.driver.set_script_timeout(5)
        #     xb.driver.implicitly_wait(5)
        #
        #     xb.driver.get(url)
        #     module_time.sleep(2)
        #
        #     channel_node = xb.driver.find_element_by_xpath("//div[@class='yt-user-info']/a")
        #     channel_node.click()
        #     module_time.sleep(2)
        #     channel_url = xb.driver.current_url
        #     return channel_url

    else:
        return None
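Per the docstring, channel urls are returned unchanged, video urls are resolved by fetching the watch page, and anything else yields None. A hypothetical usage with urls taken from the docstring:

# channel url: returned as-is
get_youtube_channel_for_url('http://www.youtube.com/user/raechelmyers/videos')

# video url: the watch page is fetched and the channel link extracted
get_youtube_channel_for_url('https://www.youtube.com/watch?v=Fky6hpTlBZU')

# anything else
get_youtube_channel_for_url('https://example.com/')  # -> None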