Example #1
def discover_blogs(profile_id):
    with OpRecorder('twitter_crawl_discover_blogs'):
        profile = models.TwitterProfile.objects.get(pk=profile_id)
        blogs = find_blogs(profile)

        if not blogs:
            log.info('No blogs discovered for profile %s', profile)
            _no_blogs(profile)
            return

        if not profile_is_valid_influencer(profile):
            log.info('Profile %s not valid. Flagging and skipping...', profile)
            _no_blogs(profile)
            return

        influencer = helpers.create_influencer_and_blog_platform_bunch(
            blogs, 'discovered_via_twitter', category=None)
        # the helper returns a collection; take the first influencer, if any
        influencer = next(iter(influencer), None)

        # parentheses matter here: 'and' binds tighter than 'or'
        if influencer and (not influencer.tw_url or
                           profile.screen_name not in influencer.tw_url):
            influencer.tw_url = 'https://twitter.com/%s' % profile.screen_name
            influencer.save()
            log.info("Updated influencer %d tw_url to '%s'", influencer.pk,
                     influencer.tw_url)

        if influencer:
            profile.discovered_influencer = influencer
            profile.valid_influencer = True
            profile.save()
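
A note on the tw_url check above: `and` binds tighter than `or` in Python, so without the parentheses the membership test would run even when no influencer was found. A minimal sketch, with a made-up screen name:

influencer = None
screen_name = 'example_user'

# without parentheses the condition would group as:
#   (influencer and not influencer.tw_url) or (screen_name not in influencer.tw_url)
# and the right-hand side would raise AttributeError when influencer is None
if influencer and (not influencer.tw_url or screen_name not in influencer.tw_url):
    print('would update tw_url')
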
Example #2
    @classmethod
    def update_profile_details(cls, profile_id):
        with OpRecorder('instagram_crawl_profile_details'):
            profile = InstagramProfile.objects.get(pk=profile_id)
            log.info('Updating details for: %s, pending: %r', profile.username,
                     profile.update_pending)
            details = scrape_profile_details(profile)
            profile.update_from_web_data(details)
Example #3
def fetch_friends(screen_name, platform_id=None, cursor=-1, batch_size=200):
    with OpRecorder('twitter_crawl_fetch_friends'):
        cache = get_cache('default')
        rate_limit = cache.get('twitter_rate_limited_until')
        if rate_limit and rate_limit > datetime.utcnow():
            log.info('Still rate limited. Retrying later...')
            fetch_friends.retry(kwargs=dict(screen_name=screen_name,
                                            platform_id=platform_id,
                                            cursor=cursor,
                                            batch_size=batch_size),
                                countdown=15 * 60)
            return

        try:
            client = create_client()
            profile = get_profile(screen_name, platform_id)

            log.info(
                "Fetching twitter friends. screen_name: %s, platform_id: %s, cursor: %s",
                screen_name, platform_id, cursor)
            response = client.friends.list(screen_name=profile.screen_name,
                                           count=batch_size,
                                           cursor=cursor)
            next_cursor = response['next_cursor']
            friends_batch = response['users']

            save_friends(profile, friends_batch)

            if next_cursor == 0:
                log.info('Last friend batch for Twitter profile: %s',
                         profile.screen_name)
                profile.friends_updated = datetime.utcnow()
                profile.save()
            else:
                log.info(
                    'Queuing next friend batch for Twitter profile: %s, cursor: %s',
                    profile.screen_name, next_cursor)
                queue_fetch_task(fetch_friends,
                                 screen_name=screen_name,
                                 platform_id=platform_id,
                                 cursor=next_cursor,
                                 batch_size=batch_size)
        except twitter.TwitterError:
            log.info(
                'Got a Twitter fetch error. Setting rate limit flag and retrying...'
            )
            cache.set('twitter_rate_limited_until',
                      datetime.utcnow() + timedelta(minutes=15))
            fetch_friends.retry(kwargs=dict(screen_name=screen_name,
                                            platform_id=platform_id,
                                            cursor=cursor,
                                            batch_size=batch_size),
                                countdown=15 * 60)
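
The rate-limit handling here uses a shared cache flag so every worker backs off together instead of each independently hammering the API. A minimal sketch of the same pattern, assuming the pre-1.9 Django cache API used above (the `get_cache` call and the `twitter_rate_limited_until` key are taken from the example):

from datetime import datetime, timedelta

from django.core.cache import get_cache

RATE_LIMIT_KEY = 'twitter_rate_limited_until'


def rate_limited(cache=None):
    # True while the shared back-off window is still open
    cache = cache or get_cache('default')
    until = cache.get(RATE_LIMIT_KEY)
    return until is not None and until > datetime.utcnow()


def flag_rate_limit(minutes=15, cache=None):
    # open a back-off window visible to all workers sharing this cache
    cache = cache or get_cache('default')
    cache.set(RATE_LIMIT_KEY, datetime.utcnow() + timedelta(minutes=minutes))
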
Example #4
def import_from_mention(mention_id):
    with OpRecorder('twitter_crawl_import_from_mention'):
        m = dmodels.MentionInPost.objects.get(pk=mention_id)
        if m.influencer_imported:
            log.info("Already imported mention %d for %s", m.pk, m.mention)
            return

        screen_name = m.mention.strip().lower()
        create_pending_profile(screen_name)

        dmodels.MentionInPost.objects.filter(
            platform_name='Twitter',
            mention=m.mention,
            influencer_imported=False).update(influencer_imported=True)
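
create_pending_profile is not shown in these examples; a plausible sketch, purely an assumption based on how it is called here and in the Google-discovery example below, would be a get_or_create that flags the profile for the detail-update task:

def create_pending_profile(screen_name):
    # hypothetical sketch: ensure a TwitterProfile row exists and mark it
    # so update_profile_details picks it up later
    profile, created = models.TwitterProfile.objects.get_or_create(
        screen_name=screen_name,
        defaults={'update_pending': True})
    if created:
        log.info('Created pending profile for %s', screen_name)
    return profile
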
Example #5
def update_profile_details(profile_id):
    with OpRecorder('twitter_crawl_update_profile_details'):
        profile = models.TwitterProfile.objects.get(pk=profile_id)
        log.info('Updating details for: %s, pending: %r', profile.screen_name,
                 profile.update_pending)
        details = scrape_profile_details(profile.screen_name)

        profile.friends_count = details['following']
        profile.followers_count = details['followers']
        profile.post_count = details['tweets']
        profile.profile_description = details['description_html']
        profile.last_post_time = details['last_post_time']
        profile.api_data = details

        profile.update_pending = False
        profile.save()
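
scrape_profile_details is not shown, but the keys read above imply it returns a dict shaped roughly like this (the values are illustrative only):

from datetime import datetime

details = {
    'following': 1523,                # -> profile.friends_count
    'followers': 48210,               # -> profile.followers_count
    'tweets': 9876,                   # -> profile.post_count
    'description_html': '<p>Fashion blogger.</p>',
    'last_post_time': datetime(2015, 6, 1, 12, 30),
    # ...plus whatever else the scraper captured; the whole dict is
    # stored verbatim in profile.api_data
}
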
Example #6
    def create_new_profiles(self,
                            hashtags=None,
                            submission_tracker=None,
                            num_pages_to_load=20,
                            pipeline_class=None,
                            **kwargs):
        """
        Iterates over a list of hashtags by mask https://instagram.com/explore/tags/<hashtag>/
        Issues a task to perform

        Note: 'hashtags' should be a dict with categories and tags like:
        {'singapore': ['oo7d', 'anothertag', 'onemoretag', ...], ...}

        """
        if not isinstance(hashtags, dict):
            log.error(
                'hashtags parameter should be a dict of categories and lists '
                'of their corresponding hashtags, not a %s', type(hashtags))
            return None

        log.info('Issuing tasks to obtain profiles for hashtags: %s', hashtags)

        with OpRecorder('instagram_crawl_scrape_instagram_feeds'):
            for cat, tags in hashtags.items():
                for tag in tags:
                    crawler_task.apply_async(
                        kwargs={
                            'klass_name': 'CreatorByInstagramHashtags',
                            'task_type': 'perform_feed',
                            'tag': tag,
                            'num_pages': num_pages_to_load,
                            'category': cat,
                            'pipeline_class': pipeline_class
                        },
                        # Queue where tasks to perform separate feeds are put
                        queue='instagram_feed_scraper',
                    )

                    if submission_tracker is not None:
                        submission_tracker.count_task(
                            'crawlers.scrape_instagram_feed_for_tag')
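
A hypothetical invocation, assuming create_new_profiles is a method of the CreatorByInstagramHashtags class named in the task kwargs and that it takes a no-argument constructor (categories and tags are made up):

crawler = CreatorByInstagramHashtags()
crawler.create_new_profiles(
    hashtags={
        'singapore': ['sgfood', 'sgtravel'],
        'fashion': ['ootd'],
    },
    num_pages_to_load=10)
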
Example #7
def discover_from_google(bio_search, page=0, max_pages=1000):
    with OpRecorder('twitter_crawl_discover_from_google'):
        log.info('Discovering for bio search %r, page: %d, max_pages: %d',
                 bio_search, page, max_pages)
        discovered_screen_names = google_search.get_twitter_profiles_with_bio(
            bio_search, page=page)

        for screen_name in discovered_screen_names:
            create_pending_profile(screen_name)

        if page < max_pages - 1:
            discover_from_google.apply_async(
                kwargs=dict(bio_search=bio_search,
                            page=page + 1,
                            max_pages=max_pages),
                queue='twitter_discover_from_google',
                routing_key='twitter_discover_from_google',
            )
Example #8
def fetch_alexa_data(pl_id):
    pl = Platform.objects.get(id=pl_id)
    log.info('Fetching alexa data for platform %r', pl)
    with OpRecorder('fetch_alexa_data', platform=pl):
        alexa = AlexaAPIWapper(pl)
        alexa.fetch()
Example #9
    def perform_feed(self,
                     tag,
                     num_pages,
                     category,
                     pipeline_class=None,
                     **kwargs):
        """
        This scrapes the instagram tags page for a given tag
        blog_discovery.hashtags[category] = {list of tags}.
        """
        with OpRecorder('instagram_crawl_feed_for_tag'):
            from xpathscraper import xbrowser
            from django.conf import settings
            page_count = 0
            image_urls = set()
            old_image_urls_count = 0
            log.info("Starting scraping for tag %r" % tag)
            with xbrowser.XBrowser(
                    headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                    load_no_images=True) as xb:
                url = 'https://instagram.com/explore/tags/%s/' % tag
                xb.load_url(url)
                time.sleep(2)

                # check the post count against any cached value for this tag
                posts_qty = None
                posts_qty_nodes = xb.driver.find_elements_by_xpath(
                    '//header/span/span[@class]')
                if len(posts_qty_nodes) > 0:
                    try:
                        posts_qty = posts_qty_nodes[0].text
                        posts_qty = int(posts_qty.strip().replace(',', ''))
                        cached_posts_qty = cache.get('instagram_tag__%s' % tag)
                        if cached_posts_qty is not None and (
                                posts_qty - int(cached_posts_qty)) <= 100:
                            log.info(
                                'Cached posts quantity is %s, now it is %s, '
                                'too few new posts - skipping this feed.' %
                                (cached_posts_qty, posts_qty))
                            return
                        else:
                            log.info(
                                'Cached posts quantity is %s, now it is %s, performing this feed.'
                                % (cached_posts_qty, posts_qty))
                    except ValueError:
                        log.error(
                            'Could not parse posts quantity to number: %s, please check format'
                            % posts_qty)
                else:
                    log.info(
                        'No posts quantity node detected, possible Instagram page HTML structure changed.'
                    )

                # scroll to the bottom so the 'load more pages' button appears
                xb.driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                while page_count < num_pages:
                    # find all images on the page so far and add them to our set
                    try:
                        # Instagram structure changed: posts are now plain
                        # <article> anchors (the old PostsGrid__root links
                        # no longer match)
                        images = xb.driver.find_elements_by_xpath(
                            '//article//a')
                    except Exception:
                        # the DOM became unreadable; stop paging
                        page_count = num_pages
                        continue
                    all_image_urls = set()
                    for i in images:
                        all_image_urls.add(i.get_attribute('href'))

                    new_image_urls = all_image_urls - image_urls
                    image_urls = all_image_urls
                    if len(image_urls) == old_image_urls_count:
                        page_count = num_pages
                        continue
                    old_image_urls_count = len(image_urls)

                    log.info(
                        'new images: %d, so far we have %d image urls for tag %r',
                        len(new_image_urls), len(image_urls), tag)
                    for i in new_image_urls:
                        try:
                            crawler_task.apply_async(
                                kwargs={
                                    'klass_name': 'CreatorByInstagramHashtags',
                                    'task_type': 'create_profile',
                                    'url': i,
                                    'tag': tag,
                                    'category': category,
                                    'pipeline_class': pipeline_class
                                },
                                # Queue where tasks to create new profiles for separate posts in feed are put
                                queue='scrape_instagram_posts_new',
                            )
                        except Exception:
                            log.exception('Error queuing profile task for %s', i)
                    # find the 'Load more' button (the old
                    # moreLoadingIndicator xpath no longer matches)
                    el = xb.driver.find_elements_by_xpath(
                        '//a[contains(text(), "Load more")]')

                    if page_count == 0 and len(el) > 0:
                        # the button is only clicked on the first page; after
                        # that, new posts load via infinite scroll
                        el[0].click()
                        log.info(
                            "Found next page button for page %s, clicking and waiting."
                            % page_count)
                    else:
                        log.info(
                            "'Load more' button not found, scrolling to trigger loading.")
                        # jiggle the scroll position to trigger infinite loading
                        xb.driver.execute_script("window.scrollTo(0, 50);")
                        xb.driver.execute_script(
                            "window.scrollTo(0, 1000000);")
                    time.sleep(3)
                    page_count += 1

                # caching post quantity for this tag
                if tag is not None and isinstance(posts_qty, int):
                    cache.set('instagram_tag__%s' % tag, posts_qty)
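
The cached post count acts as a cheap change detector: a feed is skipped when fewer than 100 posts have appeared since the last crawl. The same logic in isolation (the key prefix and threshold come from the code above; the helper name is hypothetical):

POSTS_DELTA_THRESHOLD = 100  # named constant for the hard-coded 100 above


def feed_needs_crawl(cache, tag, current_posts_qty):
    # compare the live post count with the count stored after the last crawl
    cached = cache.get('instagram_tag__%s' % tag)
    if cached is not None and current_posts_qty - int(cached) <= POSTS_DELTA_THRESHOLD:
        return False  # too few new posts since last time
    return True
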