Exemplo n.º 1
0
def create_influencer_from_bad_brands(brand, to_save=True):
    """
    Create an influencer from a Brand whose domain is actually a blogger url.

    Example:
        blogspot = Brands.objects.filter(domain_name__icontains='blogspot.')
        blogspot.update(blacklisted=True)
        for b in blogspot:
            create_influencer_from_bad_brands(b, True)

    Double checks:
        - this function should be called only for those Brands that have not
          been passed through this function yet
        - we shouldn't run this for brands with domain_name in 'tumblr.com',
          because these influencers could have a separate blog (say on
          blogspot.com) and then we will have duplicates

    :param brand: brand whose ``domain_name`` is used as the blog url
    :param to_save: forwarded to ``helpers.create_influencer_and_blog_platform``
    :return: always ``None``; the outcome is stored in the recorder's ``data``
    """
    with platformutils.OpRecorder(operation='import_from_bad_brand',
                                  brand=brand) as opr:
        blog_url = brand.domain_name
        blog_domain = utils.domain_from_url(blog_url)
        if blog_domain in BLACKLISTED_DOMAINS:
            # Blacklisted domains are skipped without recording any data.
            log.info('Domain %r is blacklisted', blog_domain)
            return

        created = helpers.create_influencer_and_blog_platform(
            blog_url,
            'discovered_from_brands',
            to_save,
            platform_name_fallback=True)

        skipped = {'inf_cnt_skipped': 1}
        if not created:
            log.error('Blacklisted url: %r', blog_url)
            opr.data = skipped
        elif created.id is not None:
            opr.data = {'inf_id_created': [created.id]}
        else:
            # An influencer object without an id also counts as skipped.
            opr.data = skipped
def campaign_posts_to_collections_batch_performer():
    """
    Queue one ``add_campaign_posts_to_collection_task`` per campaign.

    The ids of all ``BrandJobPost`` rows not flagged as archived are loaded up
    front, one task per id is sent to ``CAMPAIGN_POSTS_TO_COLLECTIONS_QUEUE``
    and the number of submitted tasks is stored in the recorder's ``data``.

    :return: None
    """
    with platformutils.OpRecorder(
            operation='add_campaign_posts_to_collection') as opr:
        tracker = TaskSubmissionTracker()
        campaign_ids = list(
            BrandJobPost.objects.exclude(archived=True).values_list(
                'id', flat=True))

        submitted = 0
        for campaign_id in campaign_ids:
            add_campaign_posts_to_collection_task.apply_async(
                args=[campaign_id],
                queue=CAMPAIGN_POSTS_TO_COLLECTIONS_QUEUE)
            tracker.count_task(CAMPAIGN_POSTS_TO_COLLECTIONS_QUEUE)
            submitted += 1

        log.info('Tasks submitted: %s' % submitted)
        opr.data = {'tasks_submitted': submitted}
Exemplo n.º 3
0
def verify(influencer_id):
    """
    Run every verifier in ``VERIFIERS`` against an influencer.

    Fields reported by the verifiers are merged (order-preserving, without
    duplicates) into ``influencer.autoverified_fields``, which is stored as a
    JSON list. The influencer is saved only when the merged list differs from
    the stored one.

    :param influencer_id: id of the influencer to verify (coerced with ``int``)
    :return: None; the verified fields and per-verifier results are stored in
        the recorder's ``data``
    """
    influencer = models.Influencer.objects.get(id=int(influencer_id))
    with platformutils.OpRecorder(operation='verify',
                                  influencer=influencer) as opr:
        log.info('Verifying %r', influencer)
        fields = []
        info = {}
        for verifier in VERIFIERS:
            try:
                res = verifier.verify(influencer)
            except Exception:
                # Best effort: one failing verifier must not stop the others.
                # Only Exception is caught so that KeyboardInterrupt and
                # SystemExit still abort the loop.
                log.exception('While running verifier %r', verifier)
                continue
            log.info('Result: %s', res)
            if res:
                fields.extend(res)
                info[verifier.__class__.__name__] = res
        log.info('Verified fields: %s', fields)
        log.info('Verification debug info: %s', info)

        old = json.loads(influencer.autoverified_fields or '[]')
        new = utils.unique_sameorder(old + fields)
        if old == new:
            log.info(
                'Verification process did not add new fields, old value: %s',
                old)
        else:
            log.info('New autoverified_fields: %s', new)
            influencer.autoverified_fields = json.dumps(new)
            influencer.save()
        opr.data = {'fields': fields, 'info': info}
Exemplo n.º 4
0
def estimate_if_fashion_blogger(influencer_id, to_save=True):
    """
    Estimate whether an influencer is relevant to fashion from its blog posts.

    :param influencer_id: id of the influencer (coerced with ``int``)
    :param to_save: set on the estimator as ``to_save``; when true the
        influencer is also marked as validated on
        ``constants.ADMIN_TABLE_INFLUENCER_FASHION`` and saved
    :return: the estimator's result, or ``None`` when the influencer has no
        posts on any blog platform
    """
    influencer = models.Influencer.objects.get(id=int(influencer_id))
    posts = models.Posts.objects.filter(
        influencer=influencer,
        platform__platform_name__in=models.Platform.BLOG_PLATFORMS)
    with platformutils.OpRecorder('estimate_if_fashion_blogger',
                                  influencer=influencer) as opr:
        # Count once: each .count() call issues its own COUNT query.
        posts_count = posts.count()
        opr.data = {'posts_count': posts_count}
        if posts_count == 0:
            log.warn(
                'estimate_if_fashion_blogger didnt start for %r because it has no blog posts yet',
                influencer)
            opr.data = dict(opr.data, explanation='no_posts')
            return
        estimator = get_relevant_to_fashion_estimator()
        estimator.to_save = to_save
        res = estimator.estimate(influencer)
        opr.data = dict(opr.data,
                        explanation=utils.limit_lens(estimator.explanation,
                                                     10))
        log.info('Saved explanation:\n%s', pprint.pformat(opr.data))
        if to_save:
            influencer.append_validated_on(
                constants.ADMIN_TABLE_INFLUENCER_FASHION)
            influencer.save()
        return res
Exemplo n.º 5
0
def detect_dead_blog(influencer_id):
    """
    Probe an influencer's blog url and blacklist the influencer when the blog
    looks dead.

    A probe succeeds only when the url answers with HTTP 200. The influencer
    is blacklisted when this probe and the latest
    ``DEAD_BLOG_TESTS_TO_BLACKLIST - 1`` finished, error-free
    ``detect_dead_blog`` operations all recorded a failure.

    :param influencer_id: id of the influencer to check
    :return: None; the probe result is stored in the recorder's ``data``
    """
    inf = Influencer.objects.get(id=influencer_id)
    with platformutils.OpRecorder(operation='detect_dead_blog', influencer=inf) as opr:
        data = {}
        try:
            r = requests.get(inf.blog_url, timeout=30)
        except Exception:
            # Any request failure counts as a failed probe. Only Exception is
            # caught so KeyboardInterrupt/SystemExit are not turned into a
            # "dead blog" verdict.
            success = False
        else:
            success = r.status_code == 200
            data['status_code'] = r.status_code
        log.info('detect_dead_blog result for %r: %s', inf.blog_url, success)
        data['success'] = success

        needed_previous = DEAD_BLOG_TESTS_TO_BLACKLIST - 1
        previous_runs = inf.platformdataop_set.filter(
            operation='detect_dead_blog',
            finished__isnull=False,
            error_msg__isnull=True).order_by('-finished')[:needed_previous]
        if len(previous_runs) < needed_previous:
            log.info('Not enough previous ops to check if should be disabled')
        else:
            recent_successes = [success] + [json.loads(pdo.data_json)['success']
                                            for pdo in previous_runs]
            # Compare against False explicitly: a missing/None result must not
            # be counted as a failure.
            if all(x == False for x in recent_successes):  # noqa: E712
                log.warn('%d consecetive failures, blacklisting %r', DEAD_BLOG_TESTS_TO_BLACKLIST, inf)
                inf.blacklisted = True
                inf.save()
                data['blacklisted'] = True

        opr.data = data
Exemplo n.º 6
0
def _handle_post_url(inf, to_save):
    """
    Handle an influencer whose ``blog_url`` points below the blog root.

    When the url has a non-empty path and the detected platform is Blogspot
    or Wordpress, the url is considered fixable. With ``to_save`` the path is
    stripped, the blog url is updated through ``helpers.update_blog_url`` and
    duplicates are handled.

    :param inf: influencer whose ``blog_url`` is inspected
    :param to_save: when true, actually rewrite the url and handle duplicates
    :return: True when the url qualifies (and was rewritten if ``to_save``),
        False otherwise, including when any error occurred
    """
    parsed = urlparse.urlsplit(inf.blog_url)
    if not parsed.path.rstrip('/'):
        return False
    try:
        detected = fetcher.try_detect_platform_name(inf.blog_url)
        if detected is None:
            return False
        platform_name, _corrected_url = detected
        # A None platform name is rejected by the membership test as well.
        if platform_name not in ('Blogspot', 'Wordpress'):
            return False
        if to_save:
            with platformutils.OpRecorder(operation='dup_pair.handle_post_url',
                                          influencer=inf) as opr:
                new_url = urlparse.urlunsplit(parsed._replace(path=''))
                opr.data = {'orig_url': inf.blog_url, 'new_url': new_url}
                helpers.update_blog_url(inf, new_url)
                inf.handle_duplicates()
        return True
    except Exception:
        # Best effort: failures are logged and reported as "not handled".
        # Only Exception is caught so KeyboardInterrupt/SystemExit propagate.
        log.exception('While handle_post_url')
    return False
Exemplo n.º 7
0
def extract_emails_from_platform(platform_id=None,
                                 platform_object=None,
                                 to_save=True,
                                 disable_cleanup=False):
    """
    Collect e-mail addresses for a platform from its about pages and posts.

    Either ``platform_id`` or ``platform_object`` must be given; the id wins
    when both are passed.

    :param platform_id: id of the platform to load (coerced with ``int``)
    :param platform_object: already loaded platform, used when no id is given
    :param to_save: forwarded to each extractor's ``update_influencers_email``
    :param disable_cleanup: forwarded to ``xbrowsermod.XBrowser``
    :return: the filtered list of found e-mails, or ``None`` when an exception
        was raised (it is logged, not propagated)
    """
    assert platform_id is not None or platform_object is not None
    if platform_id is not None:
        pl = models.Platform.objects.get(id=int(platform_id))
    else:
        pl = platform_object

    try:
        with platformutils.OpRecorder('extract_emails_from_platform',
                                      platform=pl) as opr:
            with xbrowsermod.XBrowser(
                    headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                    disable_cleanup=disable_cleanup) as xb:
                collected = []
                # About pages are searched first, then common posts.
                for extractor_cls in (FromAboutPagesExtractor,
                                      FromCommonPostsExtractor):
                    extractor = extractor_cls(xb, pl)
                    extractor.update_influencers_email(to_save=to_save)
                    collected += extractor.found_emails

                collected = filter_emails(collected)
                opr.data = {'found_emails': collected}
                return collected
    except Exception as e:
        log_context = {
            'platform_id': platform_id,
            'to_save': to_save,
            'disable_cleanup': disable_cleanup
        }
        log.exception(e, extra=log_context)
Exemplo n.º 8
0
 def _prepare_test_influencer(self, op='created_for_testing'):
     """
     Pick an existing influencer, disable it and recreate it for testing.

     The newest (highest id) non-blacklisted 'blog' influencer that is
     relevant to fashion, not shown on search, has a source and is not
     validated on the INFORMATIONS / SELF_MODIFIED admin tables is chosen.
     The original is kept in ``self.orig_inf`` and the recreated one in
     ``self.inf``.

     :param op: operation name recorded for the new influencer
     """
     candidates = models.Influencer.objects.filter(
         relevant_to_fashion=True,
         show_on_search=False,
         source__isnull=False,
         classification='blog',
         blacklisted=False)
     candidates = candidates.exclude(
         validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_INFORMATIONS)
     candidates = candidates.exclude(
         validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_SELF_MODIFIED)
     candidates = candidates.order_by('-id')
     log.info('%d infs', candidates.count())
     assert candidates.exists()

     source_inf = candidates[0]
     self.orig_inf = source_inf
     log.info('Recreating influencer %r', source_inf)
     # Remember url and source before the original gets disabled.
     blog_url = source_inf.blog_url
     source = source_inf.source
     self._disable_inf(source_inf)

     self.inf = helpers.create_influencer_and_blog_platform(
         blog_url,
         source,
         to_save=True,
         platform_name_fallback=True)
     assert self.inf is not None

     with platformutils.OpRecorder(operation=op,
                                   influencer=self.inf) as opr:
         opr.data = {'source_influencer_id': source_inf.id}
     log.info('New influencer for testing: %r', self.inf)
Exemplo n.º 9
0
def import_from_post_content(post_id, to_save=True):
    """
    Import from the content of a single post via ``_do_import_from_content``.

    On first use the domains of the 100 most popular brands (non-blacklisted,
    at least 5 shelved items, name other than 'www', ordered by product
    count) are cached in the module-level ``_DOMAINS_OF_POPULAR_BRANDS``.
    Those domains are passed as blacklisted together with
    ``BLACKLISTED_DOMAINS`` and the url fragment lists from ``estimation``.

    :param post_id: id of the post to process (coerced with ``int``)
    :param to_save: forwarded to ``_do_import_from_content``
    """
    global _DOMAINS_OF_POPULAR_BRANDS

    if _DOMAINS_OF_POPULAR_BRANDS is None:
        log.info('Starting loading _DOMAINS_OF_POPULAR_BRANDS')
        brands_qs = models.Brands.objects.filter(blacklisted=False)
        brands_qs = brands_qs.filter(num_items_shelved__gte=5)
        brands_qs = brands_qs.exclude(name='www')
        brands_qs = brands_qs.annotate(num_products=Count('productmodel'))
        top_brands = brands_qs.order_by('-num_products')[:100]
        _DOMAINS_OF_POPULAR_BRANDS = [
            utils.domain_from_url(brand.domain_name) for brand in top_brands
        ]
        log.info('Finished loading _DOMAINS_OF_POPULAR_BRANDS')

    post = models.Posts.objects.get(id=int(post_id))
    with platformutils.OpRecorder(operation='import_from_post_content',
                                  post=post) as opr:
        log.info('import_from_post_content for %r', post)
        # Everything that must never be imported as an influencer blog.
        ignored_domains = (BLACKLISTED_DOMAINS +
                           _DOMAINS_OF_POPULAR_BRANDS +
                           estimation.URL_FRAGMENTS_NO_RESOLVING +
                           estimation.URL_FRAGMENTS_REQUIRING_RESOLVING +
                           estimation.URL_FRAGMENTS_IN_IFRAMES)
        _do_import_from_content(post.content,
                                opr,
                                to_save,
                                blacklisted_domains=ignored_domains)
Exemplo n.º 10
0
def detect_platform_lang(platform_id):
    """
    Detect the dominant content language of a platform from its posts.

    Up to ``POSTS_TO_CHECK`` posts are inspected. ``platform.content_lang``
    is set (and the platform saved) only when the most common detected
    language covers at least ``MIN_DETECTED_FACTOR`` of the fetched posts.

    :param platform_id: id of the platform (coerced with ``int``)
    :return: None
    """
    platform = models.Platform.objects.get(id=int(platform_id))
    with platformutils.OpRecorder(operation='detect_platform_lang', platform=platform) as opr:
        posts = platform.posts_set.all()[:POSTS_TO_CHECK]
        if len(posts) < MIN_DETECTED_FACTOR * POSTS_TO_CHECK:
            log.warn('Not enough posts to check: %d', len(posts))
            return

        detected = []
        for post in posts:
            if p_has_no_content(post):
                continue
            plain_text = xutils.strip_html_tags(post.content)
            post_lang = detect_language(plain_text)
            log.info('Lang %r detected from content %r', post_lang, plain_text)
            detected.append(post_lang)
        log.info('All langs: %r', detected)

        known = [code for code in detected if code != 'UNKNOWN']
        if not known:
            log.warn('Cannot detect language for any post')
            return

        top_lang, top_count = collections.Counter(known).most_common(1)[0]
        log.info('Most common lang: %r, count: %d', top_lang, top_count)
        if top_count < len(posts) * MIN_DETECTED_FACTOR:
            log.warn('Count IS NOT high enough to set content_lang')
            return
        log.info('Count is high enough to set content_lang')
        platform.content_lang = top_lang
        platform.save()


def p_has_no_content(post):
    """Return True when the post has no (or empty) content."""
    return not post.content
Exemplo n.º 11
0
def submit_daily_social_platform_update_tasks(submission_tracker):
    """
    Submit platform-info update tasks for Gplus and Bloglovin platforms.

    No posts are fetched here -- only platform info is updated. Which
    platforms are due is decided by the ``gplus_update_pending()`` and
    ``bloglovin_update_pending()`` querysets (per the original note:
    platforms never fetched, or not fetched for over a month).

    :param submission_tracker: tracker passed on to
        ``_do_submit_daily_fetch_tasks``; its ``operation(...)`` context
        wraps each platform type
    :return: None; the number of submitted platforms is stored in the
        recorder's ``data``
    """
    with platformutils.OpRecorder(operation='submit_daily_fetch_tasks') as opr:
        counter = TaskCounter()
        all_platforms = debra.models.Platform.objects

        with submission_tracker.operation('gplus_fetch'):
            pending_gplus = all_platforms.all().gplus_update_pending()
            gplus_plats = _do_submit_daily_fetch_tasks(
                counter,
                submission_tracker,
                pending_gplus,
                queue_type='every_day')
            log.info('Gplus: {}'.format(len(gplus_plats)))

        with submission_tracker.operation('bloglovin_fetch'):
            pending_bloglovin = all_platforms.all().bloglovin_update_pending()
            bloglovin_plats = _do_submit_daily_fetch_tasks(
                counter,
                submission_tracker,
                pending_bloglovin,
                queue_type='every_day')
            log.info('Bloglovin: {}'.format(len(bloglovin_plats)))

        submitted = gplus_plats + bloglovin_plats
        opr.data = {'tasks_submitted': len(submitted)}
Exemplo n.º 12
0
def update_url_if_redirected(plat_id, update=False):
    """
    If the platform.url gets redirected to a new one, report it and
    optionally update the platform (and influencers pointing at the old url).

    Steps:
      1. Fetch ``plat.url``. If the page content is not valid, remove
         ADMIN_TABLE_INFLUENCER_INFORMATIONS from the influencer and stop.
      2. Stop when the HTTP status is not 200.
      3. Use the final response url; if it equals the platform url, fall back
         to ``detect_user_level_redirect``.
      4. When the new url really differs (ignoring a trailing slash), record
         it, report an ``InfluencerCheck`` and, with ``update``, rewrite the
         platform url and every influencer ``blog_url`` equal to the old url.

    :param plat_id: is the platform id
    :param update: if True, we update the platform object
    :raises: any exception raised while checking is logged and re-raised so
        that it is registered by the OpRecorder
    """
    plat = Platform.objects.get(id=plat_id)
    with platformutils.OpRecorder(operation='update_url_if_redirected', platform=plat) as opr:
        try:
            # Without a timeout an unresponsive host would block forever.
            resp = requests.get(plat.url, timeout=30)

            if is_page_content_valid(resp.text):
                log.info('Page content is valid')
            else:
                log.info('Invalid page content for %r, removing ADMIN_TABLE_INFLUENCER_INFORMATIONS',
                         plat)
                plat.influencer.remove_from_validated_on(
                    constants.ADMIN_TABLE_INFLUENCER_INFORMATIONS)
                plat.influencer.save()
                opr.data = {'res': 'invalid_page_content'}
                return

            if resp.status_code != 200:
                log.warn('HTTP status code is not 200 for platform %r', plat)
                opr.data = {'res': 'invalid_status_code'}
                return
            new_url = resp.url
            if new_url == plat.url:
                new_url = detect_user_level_redirect(plat.url)
            # NOTE(review): detect_user_level_redirect presumably returns None
            # when nothing is detected (the `update and new_url` check below
            # expects falsy values) -- guard so .rstrip() is not called on None.
            if (new_url is not None and new_url != plat.url
                    and new_url.rstrip('/') != plat.url.rstrip('/')):

                print("\t\t\tNew url: %s, old_url: %s" % (new_url, plat.url))
                opr.data = {'res': 'detected_redirection', 'new_url': new_url, 'old_url': plat.url}
                InfluencerCheck.report(plat.influencer, plat, InfluencerCheck.CAUSE_URL_CHANGED, [],
                                       'Old url: %r, new url: %r' % (plat.url, new_url))
                if update and new_url:
                    old_url = plat.url
                    plat.url = new_url
                    # The handle validated for the old url is reset.
                    plat.validated_handle = None
                    if plat.platform_name_is_blog:
                        redetect_platform_name(plat, update)
                    plat.save(bypass_checks=True)
                    plat.handle_duplicates()

                    # Update blog_urls also
                    infs_to_update = list(Influencer.objects.filter(blog_url=old_url))
                    print("\t\t\tUpdating influencer's blog_url: %r" % infs_to_update)
                    for inf in infs_to_update:
                        assert inf.blog_url == old_url and inf.blog_url
                        inf.blog_url = new_url
                        inf.save(bypass_checks=True)
                        inf.handle_duplicates()

        except:
            log.exception('While checking redirect for %r', plat)
            # re-raise exception so it can be registered by OpRecorder
            raise
Exemplo n.º 13
0
def scrape_platform_data(platform_id):
    """
    Scrape a platform with the scraping fetcher registered for its name.

    :param platform_id: id of the platform (coerced with ``int``)
    :raises Exception: when ``PLATFORM_NAME_TO_SCRAPING_FETCHER`` has no
        entry for the platform's ``platform_name``
    """
    platform = models.Platform.objects.get(id=int(platform_id))
    with platformutils.OpRecorder(operation='scrape_data', platform=platform):
        name = platform.platform_name
        if name in PLATFORM_NAME_TO_SCRAPING_FETCHER:
            fetcher_cls = PLATFORM_NAME_TO_SCRAPING_FETCHER[name]
            scraper = fetcher_cls(platform)
            scraper.scrape()
        else:
            log.error('No scraping fetcher for platform_name %r', name)
            raise Exception('No scraping fetcher for platform_name %r' % name)
Exemplo n.º 14
0
def submit_daily_fetch_tasks():
    """
    Submit fetch tasks for every platform selected by ``for_daily_fetching()``.

    :return: None; the number of submitted platforms is stored in the
        recorder's ``data``
    """
    with platformutils.OpRecorder(operation='submit_daily_fetch_tasks') as opr:
        task_counter = TaskCounter()
        tracker = TaskSubmissionTracker()
        due_platforms = debra.models.Platform.objects.all().for_daily_fetching()
        submitted = _do_submit_daily_fetch_tasks(task_counter, tracker,
                                                 due_platforms)
        opr.data = {'tasks_submitted': len(submitted)}
Exemplo n.º 15
0
 def fetch_for_posts(cls, posts):
     """
     Lazily fetch interactions for each post, sharing one browser session.

     This is a generator: for every post it yields the result of
     ``fetch_interactions()`` of a fetcher built as ``cls(xb, post)``. Each
     post is processed inside its own OpRecorder named
     ``<lowercased class name>_for_post``. Exceptions are logged and end the
     iteration instead of propagating.

     :param posts: posts to process; ``len(posts)`` is used when logging
     """
     # Loop-invariant operation name, e.g. 'myfetcher_for_post'.
     op_name = '{0}_for_post'.format(cls.__name__.lower())
     try:
         with xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                                   load_no_images=True) as xb:
             for post in posts:
                 with platformutils.OpRecorder(operation=op_name, post=post):
                     interactions_fetcher = cls(xb, post)
                     yield interactions_fetcher.fetch_interactions()
     except Exception as e:
         log.exception(e, extra={'posts_len': len(posts)})
Exemplo n.º 16
0
def run_handle_duplicates_for_influencer(influencer_id):
    """
    Run duplicate handling for an influencer if duplicates of it exist.

    Duplicates are looked up with ``Influencer.find_duplicates`` using the
    influencer's ``blog_url`` and ``id``.

    :param influencer_id: id of the influencer to check
    :return: None
    """
    influencer = Influencer.objects.get(id=influencer_id)
    with platformutils.OpRecorder(operation='handle_inf_duplicates', influencer=influencer) as opr:
        dups = Influencer.find_duplicates(influencer.blog_url, influencer.id)
        if dups:
            # The count must be passed explicitly, otherwise the logger emits
            # the literal '%d'.
            log.info('Found %d duplicates, running handle_duplicates', len(dups))
            influencer.handle_duplicates()
        else:
            log.info('No duplicates found')
Exemplo n.º 17
0
def fetch_pins_by_source(influencer_id):
    """
    Run ``PinsBySourceFetcher`` for an influencer.

    Exceptions raised while fetching are logged (with the influencer id as
    extra context) and not propagated.

    :param influencer_id: id of the influencer (coerced with ``int``)
    :return: None
    """
    influencer = models.Influencer.objects.get(id=int(influencer_id))
    try:
        with platformutils.OpRecorder('fetch_pins_by_source',
                                      influencer=influencer):
            pins_fetcher = PinsBySourceFetcher(influencer)
            pins_fetcher.fetch()
    except Exception as e:
        log_context = {'influencer_id': influencer_id}
        log.exception(e, exc_info=1, extra=log_context)
Exemplo n.º 18
0
def update_single_sponsorship(self, sponsorshipinfo_id):
    """
    Re-fetch one sponsorship with the fetcher matching its widget type.

    The fetched sponsorship is then passed to ``detect_sidebar_sponsorships``.
    When the soft time limit is exceeded the call is retried through
    ``self.retry``.

    :param self: object providing ``retry`` (presumably the bound task --
        confirm against the decorator, which is not visible here)
    :param sponsorshipinfo_id: id of the ``SponsorshipInfo`` row to update
    """
    try:
        sp = debra.models.SponsorshipInfo.objects.get(id=sponsorshipinfo_id)
        with platformutils.OpRecorder(operation='update_single_sponsorship', post=sp.post):
            with xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
                fetcher_cls = WIDGET_TYPE_TO_SPONSORSHIP_FETCHER_CLASS[sp.widget_type]
                sponsorship_fetcher = fetcher_cls(xb, sp.post)
                fetched = sponsorship_fetcher.fetch_sponsorship(True)
                detect_sidebar_sponsorships(fetched)
    except SoftTimeLimitExceeded as exc:
        self.retry(exc=exc)
Exemplo n.º 19
0
def cleanup(influencer_id):
    """
    Run ``_do_cleanup`` for each social platform of an influencer.

    Facebook, Pinterest, Twitter and Instagram are processed in that order.
    A failure for one platform is logged and does not stop the others.

    :param influencer_id: id of the influencer to clean up
    :return: None
    """
    SOCIAL_PLATFORMS = ('Facebook', 'Pinterest', 'Twitter', 'Instagram')
    influencer = Influencer.objects.get(id=influencer_id)
    with platformutils.OpRecorder('cleanup', influencer=influencer) as opr:
        for pname in SOCIAL_PLATFORMS:
            try:
                _do_cleanup(influencer, pname)
            except Exception:
                # Best effort per platform. Only Exception is caught so that
                # KeyboardInterrupt/SystemExit still stop the loop.
                log.exception('While _do_cleanup(%r, %r)', influencer, pname)
Exemplo n.º 20
0
def submit_recrawl_campaigns_tasks():
    """
    Task to fetch recent posts for campaign-involved influencers.

    For every non-archived ``BrandJobPost``, the influencers of candidates
    with ``campaign_stage >= 3`` are collected. For each of them a
    ``fetch_platform_data`` task is submitted for the blog platform (if any)
    and for every autovalidated platform not marked ``url_not_found``.
    Tasks go to the queue ``<RECRAWL_CAMPAIGNS_QUEUE_PREFIX>.<platform_name>``.

    :return: None; the number of submitted tasks is stored in the recorder's
        ``data``
    """
    with platformutils.OpRecorder(
            operation='submit_recrawl_campaigns_tasks') as opr:
        tasks_submitted = 0
        submission_tracker = TaskSubmissionTracker()

        # getting platforms for those influencers, blog and social.
        bjps = BrandJobPost.objects.exclude(archived=True)

        for bjp in bjps:
            # influencers involved in the campaign; candidates without a
            # mailbox influencer yield None and are dropped
            candidate_inf_ids = bjp.candidates.filter(
                campaign_stage__gte=3).values_list('mailbox__influencer__id',
                                                   flat=True)
            inf_ids = [iid for iid in candidate_inf_ids if iid is not None]

            for inf_id in inf_ids:
                try:
                    inf = Influencer.objects.get(id=inf_id)

                    try:
                        blog_platform_id = inf.blog_platform.id
                    except (AttributeError, TypeError):
                        blog_platform_id = None

                    platform_ids = list(
                        inf.platform_set.filter(autovalidated=True).exclude(
                            url_not_found=True).values_list('id', flat=True))

                    if blog_platform_id is not None:
                        platform_ids.insert(0, blog_platform_id)

                    for plat in Platform.objects.filter(id__in=platform_ids):
                        queue_name = '{}.{}'.format(
                            RECRAWL_CAMPAIGNS_QUEUE_PREFIX, plat.platform_name)

                        submission_tracker.count_task(queue_name)
                        fetch_platform_data.apply_async(args=[plat.id, None],
                                                        queue=queue_name)
                        # Count what was actually submitted instead of
                        # issuing a second COUNT query for the same ids.
                        tasks_submitted += 1
                except Influencer.DoesNotExist:
                    # The influencer disappeared in the meantime; skip it.
                    pass

        log.info('Tasks submitted: %s' % tasks_submitted)
        opr.data = {'tasks_submitted': tasks_submitted}
Exemplo n.º 21
0
def visit_influencer(influencer_id, pdo_id):
    """
    Visit an influencer's blog url and register the outcome on an existing
    ``PlatformDataOp``.

    :param influencer_id: id of the influencer whose ``blog_url`` is visited
    :param pdo_id: id of the ``PlatformDataOp`` wrapped by the recorder
    :return: None
    """
    influencer = models.Influencer.objects.get(id=influencer_id)
    existing_op = models.PlatformDataOp.objects.get(id=pdo_id)
    log.info('visit_influencer for %r', influencer)
    recorder = platformutils.OpRecorder(_pdo=existing_op)
    try:
        visit_url(influencer.blog_url)
    except:
        # Catch-all on purpose: whatever went wrong is registered on the
        # operation instead of being propagated.
        recorder.register_exception()
        return
    recorder.register_success()
Exemplo n.º 22
0
def blacklist_platforms_with_fetch_errors():
    """
    Mark every platform returned by ``SQL_PLATFORM_IDS_WITH_FETCH_ERRORS`` as
    ``url_not_found``.

    The query is run on the read connection and is expected to return
    one-column rows holding platform ids. Each platform is updated inside its
    own OpRecorder.
    """
    connection = db_util.connection_for_reading()
    cursor = connection.cursor()
    cursor.execute(SQL_PLATFORM_IDS_WITH_FETCH_ERRORS)
    log.info('%d plats to blacklist', cursor.rowcount)
    for row in cursor:
        (plat_id,) = row
        plat = Platform.objects.get(id=plat_id)
        with platformutils.OpRecorder(
                operation='blacklist_platforms_with_fetch_errors',
                platform=plat):
            log.info('Blacklisting platform %r', plat)
            plat.url_not_found = True
            plat.save()
Exemplo n.º 23
0
def reset_url_not_found_for_validated_instagram_platforms_with_ANY_error():
    """
    Clear ``url_not_found`` on Instagram platforms of validated influencers.

    Affects Instagram platforms currently flagged ``url_not_found=True`` that
    belong to non-blacklisted influencers validated on
    ``constants.ADMIN_TABLE_INFLUENCER_INFORMATIONS``. The flag is reset to
    ``None`` and every processed platform is printed.
    """
    infs = Influencer.objects.filter(
        validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_INFORMATIONS
    ).exclude(blacklisted=True)
    plats = Platform.objects.filter(influencer__in=infs,
                                    platform_name='Instagram',
                                    url_not_found=True)
    for plat in plats:
        with platformutils.OpRecorder('reset_url_not_found', platform=plat):
            plat.url_not_found = None
            plat.save()
            # Apply the format explicitly; a comma after the format string
            # would print the raw '%r' followed by the platform.
            print('processed %r' % (plat,))
Exemplo n.º 24
0
def fetch_products_from_post(post_id, shelf_user_id):
    """
    Fetch products from a post; the work is done by
    ``_do_fetch_products_from_post`` inside an OpRecorder.

    Intended flow, per the original description:
    1. We first figure out if this post is a sponsored post or not (using simple keyword matching)
    2. Next, we search for any widgets that the blogger has. If yes, we search for products inside them (sponsorshipfetcher.get_product_urls)
    3. Next, we search for product urls in the post content
    4. Now we iterate over all these product urls

    :param post_id: id of the post to process
    :param shelf_user_id: forwarded to ``_do_fetch_products_from_post``
    """
    log.debug("Fetching products from post_id %s" % post_id)
    # Load the related rows together with the post in one query.
    posts_with_relations = Posts.objects.select_related(
        'influencer', 'influencer__shelf_user', 'platform')
    post = posts_with_relations.get(id=post_id)
    with platformutils.OpRecorder('fetch_products_from_post', post=post):
        _do_fetch_products_from_post(post, shelf_user_id)
Exemplo n.º 25
0
def tag_post(post_id):
    """
    Compute content tags for a post and update per-platform tag counts.

    For every tag produced by ``tag_content`` the matching
    ``ContentTagCount`` row(s) of the post's platform are incremented; a row
    with ``count=1`` is created when none exists yet.

    :param post_id: id of the post to tag (coerced with ``int``)
    :return: the content tags returned by ``tag_content``
    """
    post = models.Posts.objects.get(id=int(post_id))
    with platformutils.OpRecorder(operation='tag_post', post=post) as opr:
        cts = tag_content(DEFAULT_TAGGERS, post.url, post.content, post=post)
        for ct in cts:
            # Increment first: update() returns the number of matched rows,
            # which avoids a separate exists() query and narrows the window
            # for a check-then-create race.
            updated = models.ContentTagCount.objects.filter(
                platform=post.platform,
                tag=ct.tag).update(count=F('count') + 1)
            if not updated:
                models.ContentTagCount.objects.create(platform=post.platform,
                                                      tag=ct.tag,
                                                      count=1)
        return cts
Exemplo n.º 26
0
def tag_influencer(influencer_id, to_save=False):
    """
    Discover tags for an influencer from its most recent blog posts.

    The newest ``POSTS_FOR_INFLUENCER_TAGGING`` posts (by ``create_date``) of
    the influencer's blog platform are used; posts without content are
    ignored.

    :param influencer_id: id of the influencer (coerced with ``int``)
    :param to_save: when true the tags are stored with ``save_content_tags``
    :return: the tagger's ``debug_info``, or ``None`` when the influencer has
        no blog platform
    """
    influencer = models.Influencer.objects.get(id=int(influencer_id))
    with platformutils.OpRecorder(operation='tag_influencer',
                                  influencer=influencer) as opr:
        tagger = FilterAdjectivesTagger()
        blog_platform = influencer.blog_platform
        if not blog_platform:
            log.error('No blog platform for %r', influencer)
            return None

        recent_posts = blog_platform.posts_set.order_by(
            '-create_date')[:POSTS_FOR_INFLUENCER_TAGGING]
        fragments = [(post.url, post.content)
                     for post in recent_posts if post.content]
        found_tags = _include_parent_tag(
            tagger.discover_tags_from_fragments(fragments))
        log.info('All tags: %s', found_tags)
        if to_save:
            save_content_tags(found_tags, influencer=influencer)
        return tagger.debug_info
Exemplo n.º 27
0
def check_if_copyrightable_content(influencer_id):
    """
    Flag an influencer whose blog front page carries a copyright notice.

    The page at ``blog_url`` is fetched, html tags are stripped and the text
    is searched for the copyright sign or the token 'copyright'. The result
    is stored in ``influencer.copyrightable_content`` and saved.

    :param influencer_id: id of the influencer (coerced with ``int``)
    :return: True when a copyright marker was found, False otherwise
    """
    influencer = models.Influencer.objects.get(id=int(influencer_id))
    with platformutils.OpRecorder(operation='check_if_copyrightable_content',
                                  influencer=influencer):
        response = requests.get(influencer.blog_url, timeout=20)
        page_text = xutils.strip_html_tags(response.text)
        # The text is tokenized only when the sign itself is absent.
        # NOTE(review): the word match is case-sensitive, so 'Copyright' is
        # not detected -- confirm whether that is intended.
        found = (u'©' in page_text or
                 'copyright' in nltk.wordpunct_tokenize(page_text))
        log.info('Copyrightable content for %r: %r', influencer, found)
        influencer.copyrightable_content = found
        influencer.save()
        return found
Exemplo n.º 28
0
def handle_influencer_demographics(inf, diff_only=False):
    """
    Geocode an influencer's free-text location and store the normalized data.

    Updates ``demographics_locality`` (country/state/city, when raw geocoder
    data is available), ``demographics_location_normalized`` and the
    latitude/longitude fields of ``inf``.

    :param inf: influencer whose ``demographics_location`` is processed
    :param diff_only: when true, save only if locality, latitude or longitude
        changed; otherwise always save
    :return: whether locality, latitude or longitude changed, or ``None``
        when there is no location or it could not be geocoded
    """
    with platformutils.OpRecorder('normalize_location', influencer=inf) as opr:
        if not inf.demographics_location:
            log.warn('No location to process')
            return
        loc = get_location_data(inf.demographics_location)

        log.info(u'Got location from {}: {}'.format(inf.demographics_location,
                                                    loc))
        if loc is None or not loc.address:
            log.warn(u'Location not geocoded: {}'.format(
                inf.demographics_location))
            return

        changed = False
        if loc.raw is not None:
            # Read the components only after the None check; indexing a None
            # `raw` would raise TypeError.
            address_components = loc.raw['address_components']
            country = extract_address_component('country', address_components)
            state = extract_address_component('administrative_area_level_1',
                                              address_components)
            city = extract_address_component('locality', address_components)
            try:
                locality, _ = models.DemographicsLocality.objects.get_or_create(
                    country=country, state=state, city=city)
            except models.DemographicsLocality.MultipleObjectsReturned:
                # Duplicate localities exist: reuse the first one.
                locality = models.DemographicsLocality.objects.filter(
                    country=country, state=state, city=city)[0]
            changed = inf.demographics_locality != locality
            inf.demographics_locality = locality

        # A change of the normalized address alone does not count as changed.
        inf.demographics_location_normalized = loc.address

        if loc.latitude is not None:
            changed = changed or inf.demographics_location_lat != loc.latitude
            inf.demographics_location_lat = loc.latitude
        if loc.longitude is not None:
            changed = changed or inf.demographics_location_lon != loc.longitude
            inf.demographics_location_lon = loc.longitude
        if changed or not diff_only:
            inf.save()
        return changed
Exemplo n.º 29
0
def search_for_sponsorship(self, post_id):
    """
    Search a post for sponsorships with every known fetcher class.

    Each class in ``SPONSORSHIP_FETCHER_CLASSES`` is tried in turn. Non-None
    results are collected and passed to ``detect_sidebar_sponsorships``. A
    failing fetcher is logged and skipped. When the soft time limit is
    exceeded the call is retried through ``self.retry``.

    :param self: object providing ``retry`` (presumably the bound task --
        confirm against the decorator, which is not visible here)
    :param post_id: id of the post to inspect
    :return: list of the non-None fetch results collected so far
    """
    res = []
    try:
        post = debra.models.Posts.objects.get(id=post_id)
        with platformutils.OpRecorder(operation='search_for_sponsorship', post=post) as opr:
            with xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
                for f_cls in SPONSORSHIP_FETCHER_CLASSES:
                    f = f_cls(xb, post)
                    try:
                        fres = f.fetch_sponsorship(True)
                        if fres is not None:
                            res.append(fres)
                            detect_sidebar_sponsorships(fres)
                    except SoftTimeLimitExceeded:
                        # Must reach the outer handler so the call is
                        # retried; a catch-all here would swallow it.
                        raise
                    except Exception:
                        log.exception('While search_for_sponsorship')
    except SoftTimeLimitExceeded as exc:
        self.retry(exc=exc)
    return res
Exemplo n.º 30
0
def classify_model(brand_id=None, influencer_id=None):
    """
    Classify the content of exactly one brand or one influencer.

    A brand is classified by ``http://<domain_name>``, an influencer by its
    ``blog_url``. The result is stored in the recorder's ``data`` and saved
    on the model through ``save_classification``.

    :param brand_id: id of the brand to classify
    :param influencer_id: id of the influencer to classify
    :raises AssertionError: unless exactly one of the two ids is given
    """
    # we are processing only one id, either an influencer or a brand
    has_brand = brand_id is not None
    has_influencer = influencer_id is not None
    assert has_brand or has_influencer
    assert not (has_brand and has_influencer)

    if has_brand:
        m = models.Brands.objects.get(id=brand_id)
        url = 'http://%s' % m.domain_name
        target_kwarg = 'brand'
    else:
        m = models.Influencer.objects.get(id=influencer_id)
        url = m.blog_url
        target_kwarg = 'influencer'
    opr_kwargs = {'operation': 'content_classification', target_kwarg: m}

    with platformutils.OpRecorder(**opr_kwargs) as opr:
        classifier = Classifier()
        res = classifier.classify(url)
        log.info('Classified object %r url %r as %r', m, url, res)
        opr.data = {'result': res}
        m.save_classification(res)