예제 #1
0
def create_single_platform_from_url(url,
                                    use_api=False,
                                    platform_name_fallback=False):
    """
    :return: a :class:`debra.models.Platform` instance (not saved)
    :param url: an url for which to create a platform
    :param use_api: if to use api calls to detect platform_name (can result in an exception)
    :param platform_name_fallback: if platform_name cannot be detected, this tells if a platform with platform_name ``None`` should be created.
    :raises UnknownPlatformName: when ``platform_name_fallback == False`` and platform_name could not be detected
    :raises FetcherException: when ``platform_name_fallback == False`` and ``use_api == True`` and there was a fetcher error during platform name detection
    :raises requests.RequestException: when url resolving fails
    """
    # NOTE: url is intentionally NOT resolved through HTTP redirects here,
    # because resolving can raise an exception.

    # Social platforms (Twitter, Facebook, ...) are detectable from the url
    # alone.
    social_pl_name = platformutils.social_platform_name_from_url(None, url)
    if social_pl_name != platformutils.PLATFORM_NAME_DEFAULT:
        return models.Platform(platform_name=social_pl_name, url=url)

    # Hosted blog platforms are detectable from the domain suffix.
    handle = platformutils.url_to_handle(url)
    for suffix, pname in (('blogspot.com', 'Blogspot'),
                          ('wordpress.com', 'Wordpress'),
                          ('tumblr.com', 'Tumblr')):
        if handle.endswith(suffix):
            return models.Platform(platform_name=pname, url=url)

    # Squarespace leaves a marker comment in the page source. This check is
    # best-effort: any failure (network error, bad status, decoding) just
    # skips it. Was a bare ``except:`` which also swallowed
    # KeyboardInterrupt/SystemExit; narrowed to Exception.
    try:
        r = requests.get(url=url, timeout=20, headers=REQUESTS_HEADERS)
        r.raise_for_status()
        if '<!-- This is Squarespace. -->' in r.content:
            return models.Platform(platform_name='Squarespace', url=url)
    except Exception:
        pass

    if not use_api:
        if platform_name_fallback:
            return models.Platform(platform_name=None, url=url)
        raise UnknownPlatformName()

    # use_api is True from here on (redundant ``assert use_api`` removed).
    try:
        fc_name, _ = try_detect_platform_name(url)
        if fc_name:
            return models.Platform(platform_name=fc_name, url=url)
        if platform_name_fallback:
            return models.Platform(platform_name=None, url=url)
        raise UnknownPlatformName()
    except fetcherbase.FetcherException:
        if platform_name_fallback:
            return models.Platform(platform_name=None, url=url)
        raise
예제 #2
0
    def setUp(self):
        """Build an influencer with mocked persistence and attach a platform."""
        # Mock out DB-touching methods so the tests never hit the database.
        inf = models.Influencer()
        inf.calculate_activity_level = Mock()
        inf.save = Mock()
        self.influencer = inf

        # Platform under test, linked to the mocked influencer.
        pl = models.Platform()
        pl.influencer = inf
        self.platform = pl
예제 #3
0
    def test_new(self):
        """A recently inserted platform with an older post is ACTIVE_NEW."""
        # Dead assignment removed: a models.Platform() was created here and
        # immediately overwritten by self.TestPlatform().
        platform = self.TestPlatform()
        platform.last_post_date = datetime.today() - timedelta(days=5)
        platform.insert_date = datetime.today() - timedelta(days=1)
        platform.calculate_activity_level()

        self.assertEqual(models.ActivityLevel.ACTIVE_NEW, platform.activity_level)
예제 #4
0
def influencer_platforms_from_url_fields(influencer):
    """
    Build (unsaved) Platform objects from the influencer's url fields.

    For every (platform_name, field_name) pair declared in
    ``models.Influencer.platform_name_to_field``, each whitespace-separated
    url stored in that field yields one Platform.
    """
    platforms = []
    field_map = models.Influencer.platform_name_to_field.items()
    for pname, field_name in field_map:
        urls_value = getattr(influencer, field_name)
        if urls_value:
            platforms.extend(
                models.Platform(influencer=influencer,
                                platform_name=pname,
                                url=single_url)
                for single_url in urls_value.split())
    return platforms
예제 #5
0
    def get_platform():
        """
        Return a Platform for the enclosing scope's url/name, reusing DB
        rows where possible.

        NOTE(review): ``platform_url``, ``platform_name``, ``description``,
        ``influencer_search_kwargs`` and ``influencer_create_kwargs`` are
        free variables from an enclosing function not visible in this
        chunk — confirm their meaning against the caller.

        Lookup order:
        1. an existing Platform matching url + platform_name;
        2. otherwise find or create an Influencer, and return one of its
           duplicate platforms if any exist;
        3. otherwise create, save and return a brand new Platform.
        """
        existing_platforms = models.Platform.objects.filter(
            url=platform_url, platform_name=platform_name)
        if existing_platforms.exists():
            log.info('Follower\'s platform found in DB: %r',
                     existing_platforms[0])
            return existing_platforms[0]
        log.info('No existing %s platforms found for url %r', platform_name,
                 platform_url)

        # Prefer an influencer matched by the search kwargs; fall back to
        # get_or_create with the (presumably stricter) create kwargs.
        inf_q = models.Influencer.objects.filter(**influencer_search_kwargs)
        if inf_q.exists():
            inf = inf_q[0]
            log.info('Found existing influencer: %r', inf)
        else:
            inf, inf_created = models.Influencer.objects.get_or_create(
                **influencer_create_kwargs)
            if inf_created:
                log.info('Created new influencer: %r', inf)
                # Mark provenance and creation time only for brand new rows.
                inf.source = 'followers'
                inf.date_created = datetime.datetime.now()
                inf.save()
            else:
                log.info(
                    'Found existing influencer using influencer_create_kwargs: %r',
                    inf)

        # A duplicate platform of this influencer counts as "found".
        dup_platforms = models.Platform.find_duplicates(
            inf, platform_url, platform_name)
        if dup_platforms:
            log.info('Detected duplicated platforms: %r', dup_platforms)
            return dup_platforms[0]

        # Nothing reusable: create and persist a new platform.
        pl = models.Platform()
        pl.influencer = inf
        pl.platform_name = platform_name
        pl.url = platform_url
        pl.description = description
        pl.processing_state = models.Platform.PROCESSING_STATE_NEW_FOLLOWERS_PLATFORM
        pl.save()
        log.info('Created new platform: %r', pl)
        return pl
예제 #6
0
def create_platforms_from_urls(urls,
                               use_api=False,
                               platform_name_fallback=False):
    """
    Return a list of platforms built with
    :func:`create_single_platform_from_url` for each url in ``urls``.

    Urls whose processing raises ``requests.RequestException`` are logged
    and skipped. When ``create_single_platform_from_url`` raises
    ``UnknownPlatformName`` (possible only when
    ``platform_name_fallback == False``), a platform with the "Custom"
    platform_name is appended instead.
    """
    platforms = []
    for single_url in urls:
        try:
            platforms.append(
                create_single_platform_from_url(single_url, use_api,
                                                platform_name_fallback))
        except requests.RequestException:
            log.exception('Exception during url resolving, skipping url %r',
                          single_url)
        except UnknownPlatformName:
            platforms.append(
                models.Platform(platform_name='Custom', url=single_url))
    return platforms
예제 #7
0
    def test_row(self, row):
        """
        Run email extraction for a single spreadsheet ``row`` and compare
        the extracted emails with the row's ``valid_emails`` column,
        updating the found/unknown/notfound/incorrect counters.
        """
        url = row['url']
        log.info('%s Processing', url)
        pl_candidates = models.Platform.objects.filter(
            url=url, platform_name='Blogspot')
        if not pl_candidates.exists():
            log.warn('%s No Platform', url)
            if self.with_posts_only:
                return
            # No DB platform: build a transient one just for extraction.
            pl = models.Platform()
            pl.platform_name = 'TestData'
            pl.url = row['url']
        else:
            pl = pl_candidates[0]

        if not pl.posts_set.exists():
            log.warn('No posts for %r', pl)
            if self.with_posts_only:
                return

        # Reset the thread-local result slots filled by the extractor.
        emailextractor.tlocal._latest_validated, emailextractor.tlocal._latest_not_validated = None, None
        try:
            extracted = self.extraction_fun(platform_object=pl,
                                            to_save=False,
                                            disable_cleanup=self.procs == 1)
        except Exception:
            # Narrowed from a bare ``except:``.
            log.exception('%s During platform extraction, skipping this row',
                          url)
            self.error_urls.append(row['url'])
            return

        log.info('%s Extracted emails: %r', url, extracted)

        if emailextractor.tlocal._latest_validated is not None:
            # BUGFIX: the original iterated bare items as ``email`` while
            # referencing an undefined name ``reason`` (NameError).
            # Assumes _latest_validated holds (email, reason) pairs,
            # mirroring the platform-extraction variant of this test
            # -- TODO confirm against emailextractor.
            for email, reason in emailextractor.tlocal._latest_validated:
                self.validated[(pl.platform_name, reason)] += 1
        if emailextractor.tlocal._latest_not_validated is not None:
            # BUGFIX: use a distinct loop variable; the original rebound
            # ``pl``, clobbering the platform under test.
            for nv_pl in emailextractor.tlocal._latest_not_validated:
                self.not_validated[nv_pl.platform_name] += 1
                self.not_validated_pls.append((url, nv_pl))

        valid = row.get('valid_emails', '').split()
        log.info('%s Valid emails: %r', url, valid)

        # Compare case-insensitively.
        extracted = [e.lower() for e in extracted]
        valid = [e.lower() for e in valid]

        if (not valid) and (not extracted):
            # BUGFIX: ``platform_name`` was an undefined name here.
            log.info('%s *** No usernames from both sources for %s', url,
                     pl.platform_name)
        elif set(valid) == set(extracted):
            log.warn('%s +++ Test passed', url)
            self.found += 1
        elif (not valid):
            log.warn('%s ??? Test unknown', url)
            self.unknown += 1
        elif valid and (not extracted):
            log.warn('%s --- Test fail', url)
            self.notfound += 1
        elif set(valid) != set(extracted):
            log.warn('%s !!! Test incorrect', url)
            self.incorrect += 1
예제 #8
0
def import_blogurlsraw_single(blogurlsraw_id, to_save=True):
    """
    Import a single :class:`models.BlogUrlsRaw` row as an influencer with a
    blog platform and a social (Lookbook/Fashiolista) platform.

    :param blogurlsraw_id: id of the ``BlogUrlsRaw`` row to import
    :param to_save: if True, persist the created objects
    :raises ValueError: when ``m.source`` is neither lookbook.nu nor
        fashiolista.com
    """
    m = models.BlogUrlsRaw.objects.get(id=int(blogurlsraw_id))
    if not m.source:
        log.error('No source set for %r', m)
        return
    if not m.blog_url:
        log.error('No blog_url set for %r', m)
        return

    # Reuse an existing, non-blacklisted influencer when possible.
    dup_infs = models.Influencer.find_duplicates(m.blog_url,
                                                 exclude_blacklisted=False)
    if helpers.all_blacklisted(dup_infs):
        log.error(
            'All duplicate influencers blacklisted for url %r, not importing',
            m.blog_url)
        return
    if dup_infs:
        inf = helpers.select_valid_influencer(dup_infs)
        log.warn('Existing inf found: %r', inf)
    else:
        inf = models.Influencer(blog_url=m.blog_url,
                                name=m.name,
                                source='blogurlsraw')
        if to_save:
            inf.save()

    # Blog platform detected from the blog_url; falls back to 'Custom'.
    blog_pls = fetcher.create_platforms_from_urls([m.blog_url], True)
    if blog_pls:
        blog_pl = blog_pls[0]
        blog_pl.influencer = inf
    else:
        log.warn(
            'Could not create blog platform from blog_url %r, using Custom platform_name',
            m.blog_url)
        blog_pl = models.Platform(platform_name='Custom',
                                  url=m.blog_url,
                                  influencer=inf)
    log.info('Blog platform from blog_url: %r', blog_pl)
    if to_save:
        # This handles duplicates
        # set appropriate state
        blog_pl.platform_state = models.Platform.PLATFORM_STATE_STARTED
        blog_pl.save()

    # The social platform described directly by the raw row.
    pl = models.Platform(url=m.url,
                         num_followers=m.num_followers,
                         description=m.description,
                         influencer=inf)
    if 'lookbook.nu' in m.source:
        pl.platform_name = 'Lookbook'
    elif 'fashiolista.com' in m.source:
        pl.platform_name = 'Fashiolista'
    else:
        # BUGFIX: was ``assert False`` -- asserts are stripped under -O,
        # which would silently leave platform_name unset.
        raise ValueError('unknown source %r' % m.source)
    if to_save:
        # This handles duplicates
        pl.save()

    # site_url can contain additional social handle
    if m.site_url:
        site_pls = fetcher.create_platforms_from_urls([m.site_url], True)
        if site_pls:
            site_pl = site_pls[0]
            site_pl.influencer = inf
            if to_save:
                # This handles duplicates
                site_pl.save()

    m.have_been_processed = True
    if to_save:
        m.save()
예제 #9
0
def _do_import_from_content(content,
                            opr,
                            to_save,
                            blacklisted_domains=BLACKLISTED_DOMAINS):
    """
    Create new platforms from ``content`` by searching it for urls
    (except those given in ``blacklisted_domains``) and attach each
    platform to a found-or-created influencer.

    :param content: text to scan for urls
    :param opr: operation-report object; its ``data`` attribute receives
        influencer ids/urls (skipped when falsy)
    :param to_save: if True, persist platforms whose influencer was found
    :return: the list of created (possibly unsaved) platforms, or None
        when ``content`` is empty

    Limitation: it works only for building new 'blog' platforms, and
    doesn't work for creating new social platforms.
    """
    if not content:
        log.warn('No content, doing nothing')
        return
    urls = contentfiltering.find_all_urls(content)
    log.info('Found %d urls: %r', len(urls), urls)
    platforms = []
    for url in urls:
        # BUGFIX: log message said 'Oring url'.
        log.info('Orig url: %r', url)
        try:
            url = utils.resolve_http_redirect(url)
        except Exception:
            # Narrowed from a bare ``except:``; redirect resolution is
            # best-effort and a failing url is simply skipped.
            log.exception('While resolve_http_redirect, skipping')
            continue
        log.info('Redirected url: %r', url)
        vurl = platformutils.url_to_handle(url)
        if not vurl:
            log.info('No handle computed from url %r, skipping', url)
            continue
        domain = utils.domain_from_url(vurl)
        if domain in blacklisted_domains:
            log.info('Domain %r is blacklisted', domain)
            continue
        blog_url = utils.url_without_path(url)
        if domain.endswith('.wordpress.com'):
            platforms.append(
                models.Platform(platform_name='Wordpress', url=blog_url))
        elif domain.endswith('.blogspot.com'):
            platforms.append(
                models.Platform(platform_name='Blogspot', url=blog_url))
        else:
            # Unknown domain: fetch the page and look for blog metatags.
            # BUGFIX: the fetched page no longer shadows the ``content``
            # parameter.
            page_content = xutils.fetch_url(blog_url)
            if page_content:
                discovered_pname = xutils.contains_blog_metatags(page_content)
                if discovered_pname:
                    platforms.append(
                        models.Platform(platform_name=discovered_pname,
                                        url=blog_url))
                    continue
            platforms.append(
                models.Platform(platform_name='Custom', url=blog_url))

    influencers = []
    influencers_created = []
    for plat in platforms:
        inf, inf_created = helpers.get_or_create_influencer(
            plat.url, 'comments_content_import', to_save)
        if not inf:
            log.warn(
                'Skipping url %r because influencer with this url is blacklisted',
                plat.url)
            continue
        plat.influencer = inf
        influencers.append(inf)
        if inf_created:
            influencers_created.append(inf)

    if opr:
        opr.data = {
            'influencer_ids': [influencer.id for influencer in influencers],
            'influencer_created_ids':
            [influencer.id for influencer in influencers_created],
            'influencer_blog_urls':
            [influencer.blog_url for influencer in influencers],
        }

    log.info('Platforms from content: %r', platforms)
    if to_save:
        for plat in platforms:
            # influencer of None means we got a blacklisted influencer
            # when we searched by URL.
            if plat.influencer is not None:
                plat.save()

    return platforms
예제 #10
0
def _do_import_from_blogger_profile(blogger_profile_url, opr, to_save=True):
    """
    Import influencers and blog platforms from a Blogger (or Google+)
    profile page.

    :param blogger_profile_url: profile url to scrape for blog urls
    :param opr: operation-report object; its ``data`` attribute receives
        the created/existing influencer ids (skipped when None)
    :param to_save: if True, persist created influencers and platforms
    """
    log.info('Processing profile %r', blogger_profile_url)

    r = requests.get(blogger_profile_url,
                     headers=utils.browser_headers(),
                     proxies=get_proxy_config())

    blogurls_names = []

    if utils.domain_from_url(r.url) == 'plus.google.com':
        # Profile redirected to Google+: take blog url and display name
        # from the Google+ people API.
        gplus_user_id = r.url.rstrip('/').split('/')[-1]
        gplus_user = requests.get(
            GOOGLE_PLUS_PEOPLE_TEMPLATE.format(user_id=gplus_user_id)).json()
        log.info('Got gplus data:\n%s', pprint.pformat(gplus_user))
        if not gplus_user.get('urls'):
            log.warn('No gplus urls')
            return
        blog_url = gplus_user['urls'][0]['value']
        name = gplus_user['displayName']
        log.info('Gplus url and name: %r %r', blog_url, name)
        blogurls_names.append((blog_url, name))
    else:
        # Plain Blogger profile: scrape name and "contributor-to" links.
        tree = lxml.html.fromstring(r.content)

        name_els = tree.xpath('//div[@class="vcard"]//h1')
        if not name_els:
            log.warn('No name els')
            name = None
        else:
            name = name_els[0].text.strip()
            if not name:
                log.warn('Empty name')
        log.info('Blogger name: %r', name)

        blog_url_els = tree.xpath('//a[contains(@rel, "contributor-to")]')
        if not blog_url_els:
            log.warn('No blog url')
            utils.write_to_file('/tmp/last_no_blog.html', r.text)
            blog_url = None
            if r.text.strip().lower() == 'proxy authorization required':
                raise Exception('Proxy error')
        else:
            for el in blog_url_els:
                blog_url = el.attrib['href'].strip()
                log.info('Blog url: %r', blog_url)
                blogurls_names.append((blog_url, name))
        if ALSO_CRAWL_OTHER_BLOGS_FOLLOWED:
            observed_els = tree.xpath('//li[@class="sidebar-item"]/a')
            for el in observed_els:
                blogurls_names.append((el.attrib.get('href'), None))

    log.info('Collected blogurls_names: %r', blogurls_names)
    data = {'inf_id_existing': [], 'inf_id_created': []}
    for blog_url, name in blogurls_names:
        if not blog_url:
            continue
        # BUGFIX: guard against an empty result instead of indexing [0]
        # unconditionally (IndexError when the request for blog_url fails);
        # mirrors the handling in import_blogurlsraw_single.
        blog_pls = fetcher.create_platforms_from_urls([blog_url], True)
        if not blog_pls:
            log.warn('No platform could be created from blog url %r, skipping',
                     blog_url)
            continue
        blog_pl_name = blog_pls[0].platform_name

        dup_infs = models.Influencer.find_duplicates(blog_url,
                                                     exclude_blacklisted=False)
        if helpers.all_blacklisted(dup_infs):
            log.error(
                'All duplicate influencers blacklisted for url %r, not importing',
                blog_url)
            continue
        if dup_infs:
            inf = helpers.select_valid_influencer(dup_infs)
            log.warn('Existing inf found: %r', inf)
            data['inf_id_existing'].append(inf.id)
        else:
            inf = models.Influencer(blog_url=blog_url,
                                    name=name,
                                    source='comments_import')
            log.info('Created new influencer %r', inf)
            if to_save:
                inf.save()
            # BUGFIX: record the id after saving -- before the save it is
            # still None.
            data['inf_id_created'].append(inf.id)

        blog_pl_dups = models.Platform.find_duplicates(inf, blog_url,
                                                       blog_pl_name)
        if blog_pl_dups:
            log.warn('Blog platform with url %r is already inserted: %r',
                     blog_url, blog_pl_dups)
            continue

        blog_pl = models.Platform(platform_name=blog_pl_name,
                                  url=blog_url,
                                  influencer=inf)
        log.info('Created new platform %r', blog_pl)
        if to_save:
            blog_pl.save()
    # Guard added for consistency with _do_import_from_content.
    if opr is not None:
        opr.data = data
    time.sleep(SLEEP_AFTER_PROCESSING_BLOGGER)
예제 #11
0
    def test_row(self, row):
        """
        Run platform extraction for a single spreadsheet ``row`` and compare
        the extracted social-platform usernames against the spreadsheet's
        columns, updating per-platform found/unknown/notfound/incorrect
        counters.
        """
        url = row['url']
        log.info('%s Processing', url)
        pl_candidates = models.Platform.objects.filter(url=url, platform_name='Blogspot')
        if not pl_candidates.exists():
            log.warn('%s No Platform', url)
            if self.with_posts_only:
                return
            # No DB platform: build a transient one just for extraction.
            pl = models.Platform()
            pl.platform_name = 'Outreach'
            pl.url = row['url']
        else:
            pl = pl_candidates[0]

        # Fewer than 5 posts counts as "no posts" for this test.
        if pl.posts_set.count() < 5:
            log.warn('No posts for %r', pl)
            if self.with_posts_only:
                return

        # Reset the thread-local result slots filled by the extractor.
        platformextractor.tlocal._latest_validated, platformextractor.tlocal._latest_not_validated = None, None
        try:
            extracted = self.extraction_fun(platform_object=pl,
                                            to_save=False,
                                            disable_cleanup=self.procs == 1)
        except:
            log.exception('%s During platform extraction, skipping this row', url)
            self.error_urls.append(row['url'])
            return

        # Tally validation outcomes reported through the thread-locals.
        if platformextractor.tlocal._latest_validated is not None:
            for pl, reason in platformextractor.tlocal._latest_validated:
                self.validated[(pl.platform_name, reason)] += 1
        if platformextractor.tlocal._latest_not_validated is not None:
            for pl in platformextractor.tlocal._latest_not_validated:
                self.not_validated[pl.platform_name] += 1
                self.not_validated_pls.append((url, pl))

        log.info('%s Extracted platforms: %r', url, extracted)

        # Group extracted platforms by platform_name; more than one per
        # name is unexpected and only logged.
        extracted_by_platform_name = defaultdict(list)
        for e in extracted:
            extracted_by_platform_name[e.platform_name].append(e)
        log.info('extracted_by_platform_name:\n%s', pformat(dict(extracted_by_platform_name)))
        for pname, pls in extracted_by_platform_name.items():
            if len(pls) != 1:
                log.warn('999 %r Multiple platforms for a single platform_name %s: %s', url, pname, pls)

        for platform_name in ['Facebook', 'Twitter', 'Pinterest', 'Bloglovin', 'Instagram']:

            # Username according to the spreadsheet column, if any.
            from_spreadsheet = row[platform_name].strip()
            if from_spreadsheet:
                username_from_spreadsheet = platformextractor.username_from_platform_url(from_spreadsheet)
            else:
                username_from_spreadsheet = ''

            # Username according to the extractor (first match only).
            from_extracted = extracted_by_platform_name[platform_name][0] \
                if extracted_by_platform_name[platform_name] \
                else ''
            if from_extracted:
                username_extracted = platformextractor.username_from_platform_url(from_extracted.url)
            else:
                username_extracted = ''

            # Compare case-insensitively.
            if username_from_spreadsheet:
                username_from_spreadsheet = username_from_spreadsheet.lower()
            if username_extracted:
                username_extracted = username_extracted.lower()

            if (not username_from_spreadsheet) and (not username_extracted):
                log.info('%s *** No usernames from both sources for %s', url, platform_name)
            elif username_from_spreadsheet == username_extracted:
                log.warn('%s +++ Test passed: %s: spreadsheet: %r, extracted: %r', url, platform_name,
                         username_from_spreadsheet, username_extracted)
                self.found[platform_name] += 1
            elif (not username_from_spreadsheet):
                log.warn('%s ??? Test unknown: %s: spreadsheet: %r, extracted: %r', url, platform_name,
                         username_from_spreadsheet, username_extracted)
                self.unknown[platform_name] += 1
            elif username_from_spreadsheet and (not username_extracted):
                log.warn('%s --- Test fail: %s: spreadsheet: %r, extracted: %r', url, platform_name,
                         username_from_spreadsheet, username_extracted)
                self.notfound[platform_name] += 1
            elif username_from_spreadsheet != username_extracted:
                log.warn('%s !!! Test incorrect: %s: spreadsheet: %r, extracted: %r', url, platform_name,
                         username_from_spreadsheet, username_extracted)
                self.incorrect[platform_name] += 1