def create_influencer_from_bad_brands(brand, to_save=True):
    """
    Create an influencer (and its blog platform) from a Brand whose domain is
    actually a blogger url.

    Example:
        blogspot = Brands.objects.filter(domain_name__icontains='blogspot.')
        blogspot.update(blacklisted=True)
        for b in blogspot:
            create_influencer_from_bad_brands(b, True)

    Double checks:
      * this function should be called only for those Brands that have not
        been passed through this function yet;
      * we shouldn't run this for brands with domain_name in 'tumblr.com',
        because these influencers could have a separate blog (say on
        blogspot.com) and then we would have duplicates.

    The outcome is stored in the OpRecorder data: ``inf_id_created`` when an
    influencer with an id was produced, ``inf_cnt_skipped`` otherwise. Nothing
    is stored when the brand domain is found in BLACKLISTED_DOMAINS.

    :param brand: Brand object whose ``domain_name`` is used as the blog url
    :param to_save: passed through to helpers.create_influencer_and_blog_platform
    """
    with platformutils.OpRecorder(operation='import_from_bad_brand', brand=brand) as opr:
        brand_url = brand.domain_name
        brand_domain = utils.domain_from_url(brand_url)
        if brand_domain in BLACKLISTED_DOMAINS:
            log.info('Domain %r is blacklisted', brand_domain)
            return

        influencer = helpers.create_influencer_and_blog_platform(
            brand_url,
            'discovered_from_brands',
            to_save,
            platform_name_fallback=True)
        if not influencer:
            log.error('Blacklisted url: %r', brand_url)

        was_created = bool(influencer) and influencer.id is not None
        if was_created:
            opr.data = {'inf_id_created': [influencer.id]}
        else:
            opr.data = {'inf_cnt_skipped': 1}
def campaign_posts_to_collections_batch_performer():
    """
    Queue one add_campaign_posts_to_collection_task per non-archived
    BrandJobPost.

    Every task is sent to CAMPAIGN_POSTS_TO_COLLECTIONS_QUEUE and counted in a
    TaskSubmissionTracker. The number of submitted tasks is logged and stored
    in the OpRecorder data under ``tasks_submitted``.
    :return:
    """
    with platformutils.OpRecorder(operation='add_campaign_posts_to_collection') as opr:
        tasks_submitted = 0
        tracker = TaskSubmissionTracker()

        # Materialize the ids up front so the loop does not hold the query open.
        active_campaigns = BrandJobPost.objects.exclude(archived=True)
        campaign_ids = list(active_campaigns.values_list('id', flat=True))

        for campaign_id in campaign_ids:
            add_campaign_posts_to_collection_task.apply_async(
                args=[campaign_id],
                queue=CAMPAIGN_POSTS_TO_COLLECTIONS_QUEUE)
            tracker.count_task(CAMPAIGN_POSTS_TO_COLLECTIONS_QUEUE)
            tasks_submitted += 1

        log.info('Tasks submitted: %s' % tasks_submitted)
        opr.data = {'tasks_submitted': tasks_submitted}
def verify(influencer_id):
    """
    Run every verifier from VERIFIERS against an influencer and merge the
    resulting field names into ``influencer.autoverified_fields``.

    :param influencer_id: id of the Influencer (converted with ``int()``)

    A verifier that raises is logged and skipped; the remaining verifiers
    still run. ``autoverified_fields`` is stored as a JSON list; the
    influencer is saved only when the merged list differs from the stored one.
    The OpRecorder data receives the collected ``fields`` and the
    per-verifier ``info`` (keyed by verifier class name).
    """
    influencer = models.Influencer.objects.get(id=int(influencer_id))
    with platformutils.OpRecorder(operation='verify', influencer=influencer) as opr:
        log.info('Verifying %r', influencer)
        fields = []
        info = {}
        for verifier in VERIFIERS:
            try:
                res = verifier.verify(influencer)
            except Exception:
                # Best effort: one failing verifier must not stop the others.
                # ``Exception`` (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                log.exception('While running verifier %r', verifier)
                continue
            log.info('Result: %s', res)
            if res:
                fields.extend(res)
            info[verifier.__class__.__name__] = res
        log.info('Verified fields: %s', fields)
        log.info('Verification debug info: %s', info)

        old = json.loads(influencer.autoverified_fields or '[]')
        new = utils.unique_sameorder(old + fields)
        if old == new:
            log.info('Verification process did not add new fields, old value: %s', old)
        else:
            log.info('New autoverified_fields: %s', new)
            influencer.autoverified_fields = json.dumps(new)
            influencer.save()
        opr.data = {'fields': fields, 'info': info}
def estimate_if_fashion_blogger(influencer_id, to_save=True):
    """
    Run the relevant-to-fashion estimator for an influencer.

    :param influencer_id: id of the Influencer (converted with ``int()``)
    :param to_save: copied onto the estimator; when true the influencer is
        also marked as validated on ADMIN_TABLE_INFLUENCER_FASHION and saved
    :return: the estimator result, or None when the influencer has no posts
        on any of ``Platform.BLOG_PLATFORMS``

    The OpRecorder data holds ``posts_count`` and an ``explanation``
    ('no_posts' or the estimator explanation passed through utils.limit_lens).
    """
    influencer = models.Influencer.objects.get(id=int(influencer_id))
    posts = models.Posts.objects.filter(
        influencer=influencer,
        platform__platform_name__in=models.Platform.BLOG_PLATFORMS)
    with platformutils.OpRecorder('estimate_if_fashion_blogger', influencer=influencer) as opr:
        # Count once: avoids a second COUNT query and guarantees the recorded
        # number is the one the decision below is based on.
        posts_count = posts.count()
        opr.data = {'posts_count': posts_count}
        if posts_count == 0:
            log.warn(
                'estimate_if_fashion_blogger didnt start for %r because it has no blog posts yet',
                influencer)
            opr.data = dict(opr.data, explanation='no_posts')
            return

        estimator = get_relevant_to_fashion_estimator()
        estimator.to_save = to_save
        res = estimator.estimate(influencer)
        opr.data = dict(opr.data, explanation=utils.limit_lens(estimator.explanation, 10))
        log.info('Saved explanation:\n%s', pprint.pformat(opr.data))
        if to_save:
            influencer.append_validated_on(constants.ADMIN_TABLE_INFLUENCER_FASHION)
            influencer.save()
        return res
def detect_dead_blog(influencer_id):
    """
    Probe an influencer's blog url and blacklist the influencer after
    DEAD_BLOG_TESTS_TO_BLACKLIST consecutive failed probes.

    :param influencer_id: id of the Influencer

    A probe succeeds only when the GET request returns HTTP 200; any error
    raised by the request counts as a failure. The result is compared with the
    ``success`` values stored by the most recent finished, error-free
    'detect_dead_blog' operations of the same influencer. The OpRecorder data
    receives ``success``, ``status_code`` (when a response was obtained) and
    ``blacklisted`` (when the influencer got blacklisted).
    """
    inf = Influencer.objects.get(id=influencer_id)
    with platformutils.OpRecorder(operation='detect_dead_blog', influencer=inf) as opr:
        data = {}
        try:
            r = requests.get(inf.blog_url, timeout=30)
        except Exception:
            # Connection errors, timeouts and malformed urls all mean "dead";
            # ``Exception`` keeps KeyboardInterrupt/SystemExit propagating.
            success = False
        else:
            success = r.status_code == 200
            data['status_code'] = r.status_code
        log.info('detect_dead_blog result for %r: %s', inf.blog_url, success)
        data['success'] = success

        # The current run plus (N - 1) previous ones make N consecutive tests.
        needed_previous = DEAD_BLOG_TESTS_TO_BLACKLIST - 1
        previous_runs = inf.platformdataop_set.filter(
            operation='detect_dead_blog',
            finished__isnull=False,
            error_msg__isnull=True).order_by('-finished')[:needed_previous]
        if len(previous_runs) < needed_previous:
            log.info('Not enough previous ops to check if should be disabled')
        else:
            recent_successes = [success] + [
                json.loads(pdo.data_json)['success'] for pdo in previous_runs
            ]
            if all(x is False for x in recent_successes):
                log.warn('%d consecetive failures, blacklisting %r',
                         DEAD_BLOG_TESTS_TO_BLACKLIST, inf)
                inf.blacklisted = True
                inf.save()
                data['blacklisted'] = True
        opr.data = data
def _handle_post_url(inf, to_save):
    """
    Detect a ``blog_url`` that carries a path on a Blogspot or Wordpress
    platform and, when ``to_save`` is true, replace it with the path-less url.

    :param inf: influencer whose ``blog_url`` is inspected
    :param to_save: when true, the url is updated through
        helpers.update_blog_url and ``inf.handle_duplicates()`` is called
    :return: True when the url has a non-empty path and the detected platform
        is Blogspot or Wordpress; False otherwise, including when any error
        occurs during detection or update (the error is logged)
    """
    orig_parsed = urlparse.urlsplit(inf.blog_url)
    if not orig_parsed.path.rstrip('/'):
        return False
    try:
        dres = fetcher.try_detect_platform_name(inf.blog_url)
        if dres is None:
            return False
        platform_name, corrected_url = dres
        if platform_name is None:
            return False
        if platform_name not in ('Blogspot', 'Wordpress'):
            return False
        if to_save:
            with platformutils.OpRecorder(operation='dup_pair.handle_post_url',
                                          influencer=inf) as opr:
                new_url = urlparse.urlunsplit(orig_parsed._replace(path=''))
                opr.data = {'orig_url': inf.blog_url, 'new_url': new_url}
                helpers.update_blog_url(inf, new_url)
                inf.handle_duplicates()
        return True
    except Exception:
        # Deliberately best effort, but a bare ``except:`` would also swallow
        # KeyboardInterrupt/SystemExit.
        log.exception('While handle_post_url')
        return False
def extract_emails_from_platform(platform_id=None, platform_object=None, to_save=True,
                                 disable_cleanup=False):
    """
    Collect e-mail addresses for a platform using the about-pages extractor
    followed by the common-posts extractor.

    :param platform_id: id of the Platform to load; takes precedence over
        ``platform_object`` when given
    :param platform_object: already loaded Platform, used when no id is given
    :param to_save: forwarded to each extractor's ``update_influencers_email``
    :param disable_cleanup: forwarded to the XBrowser constructor
    :return: the e-mails of both extractors passed through ``filter_emails``;
        None when an exception was raised (it is logged, not propagated)
    """
    assert platform_id is not None or platform_object is not None
    if platform_id is not None:
        platform = models.Platform.objects.get(id=int(platform_id))
    else:
        platform = platform_object

    try:
        with platformutils.OpRecorder('extract_emails_from_platform', platform=platform) as opr:
            with xbrowsermod.XBrowser(
                    headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                    disable_cleanup=disable_cleanup) as xb:
                collected = []
                # Order matters: about pages first, then common posts.
                for extractor_cls in (FromAboutPagesExtractor, FromCommonPostsExtractor):
                    extractor = extractor_cls(xb, platform)
                    extractor.update_influencers_email(to_save=to_save)
                    collected += extractor.found_emails

                found_emails = filter_emails(collected)
                opr.data = {'found_emails': found_emails}
                return found_emails
    except Exception as e:
        log.exception(e, extra={
            'platform_id': platform_id,
            'to_save': to_save,
            'disable_cleanup': disable_cleanup
        })
def _prepare_test_influencer(self, op='created_for_testing'):
    """
    Pick an existing influencer, disable it and recreate it from scratch for
    testing.

    The source influencer is the one with the highest id among fashion-relevant,
    not-shown-on-search, non-blacklisted 'blog' influencers that have a source
    and are not validated on the INFORMATIONS or SELF_MODIFIED admin tables.
    It is stored in ``self.orig_inf``; the recreated one in ``self.inf``.

    :param op: operation name recorded for the new influencer
    """
    candidates = models.Influencer.objects.filter(
        relevant_to_fashion=True,
        show_on_search=False,
        source__isnull=False,
        classification='blog',
        blacklisted=False)
    candidates = candidates.exclude(
        validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_INFORMATIONS)
    candidates = candidates.exclude(
        validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_SELF_MODIFIED)
    candidates = candidates.order_by('-id')

    log.info('%d infs', candidates.count())
    assert candidates.exists()

    source_inf = candidates[0]
    self.orig_inf = source_inf
    log.info('Recreating influencer %r', source_inf)

    # Remember what is needed for recreation before the original is disabled.
    blog_url = source_inf.blog_url
    source = source_inf.source
    self._disable_inf(source_inf)

    self.inf = helpers.create_influencer_and_blog_platform(
        blog_url, source, to_save=True, platform_name_fallback=True)
    assert self.inf is not None

    with platformutils.OpRecorder(operation=op, influencer=self.inf) as opr:
        opr.data = {'source_influencer_id': source_inf.id}
    log.info('New influencer for testing: %r', self.inf)
def import_from_post_content(post_id, to_save=True):
    """
    Import from the content of a single post, skipping blacklisted domains.

    :param post_id: id of the Posts row (converted with ``int()``)
    :param to_save: forwarded to ``_do_import_from_content``

    On first use the module-level ``_DOMAINS_OF_POPULAR_BRANDS`` cache is
    filled with the domains of up to 100 non-blacklisted brands (at least 5
    shelved items, name other than 'www') ordered by their product count.
    """
    global _DOMAINS_OF_POPULAR_BRANDS
    if _DOMAINS_OF_POPULAR_BRANDS is None:
        log.info('Starting loading _DOMAINS_OF_POPULAR_BRANDS')
        brands = models.Brands.objects.filter(blacklisted=False)
        brands = brands.filter(num_items_shelved__gte=5)
        brands = brands.exclude(name='www')
        brands = brands.annotate(num_products=Count('productmodel'))
        top_brands = brands.order_by('-num_products')[:100]
        _DOMAINS_OF_POPULAR_BRANDS = [
            utils.domain_from_url(brand.domain_name) for brand in top_brands
        ]
        log.info('Finished loading _DOMAINS_OF_POPULAR_BRANDS')

    post = models.Posts.objects.get(id=int(post_id))
    with platformutils.OpRecorder(operation='import_from_post_content', post=post) as opr:
        log.info('import_from_post_content for %r', post)
        domains_to_skip = (
            BLACKLISTED_DOMAINS +
            _DOMAINS_OF_POPULAR_BRANDS +
            estimation.URL_FRAGMENTS_NO_RESOLVING +
            estimation.URL_FRAGMENTS_REQUIRING_RESOLVING +
            estimation.URL_FRAGMENTS_IN_IFRAMES)
        _do_import_from_content(post.content, opr, to_save,
                                blacklisted_domains=domains_to_skip)
def detect_platform_lang(platform_id):
    """
    Detect the dominant content language of a platform from up to
    POSTS_TO_CHECK of its posts and store it in ``platform.content_lang``.

    :param platform_id: id of the Platform (converted with ``int()``)

    Nothing is stored when fewer than MIN_DETECTED_FACTOR * POSTS_TO_CHECK
    posts are available, when no post yields a language other than 'UNKNOWN',
    or when the most common language covers less than MIN_DETECTED_FACTOR of
    the sampled posts.
    """
    platform = models.Platform.objects.get(id=int(platform_id))
    with platformutils.OpRecorder(operation='detect_platform_lang', platform=platform):
        sample = platform.posts_set.all()[:POSTS_TO_CHECK]
        sample_size = len(sample)
        if sample_size < MIN_DETECTED_FACTOR * POSTS_TO_CHECK:
            log.warn('Not enough posts to check: %d', sample_size)
            return

        langs = []
        for post in sample:
            if post.content:
                text = xutils.strip_html_tags(post.content)
                detected = detect_language(text)
                log.info('Lang %r detected from content %r', detected, text)
                langs.append(detected)
        log.info('All langs: %r', langs)

        lang_counter = collections.Counter(
            detected for detected in langs if detected != 'UNKNOWN')
        if not lang_counter:
            log.warn('Cannot detect language for any post')
            return

        (top_lang, top_count), = lang_counter.most_common(1)
        log.info('Most common lang: %r, count: %d', top_lang, top_count)
        if top_count < sample_size * MIN_DETECTED_FACTOR:
            log.warn('Count IS NOT high enough to set content_lang')
            return

        log.info('Count is high enough to set content_lang')
        platform.content_lang = top_lang
        platform.save()
def submit_daily_social_platform_update_tasks(submission_tracker):
    """
    Get the Gplus and Bloglovin platforms we need to fetch and submit tasks
    for them.

    We are not really fetching posts here -- just updating platform info. We
    select platforms that have never had their info fetched and ones for which
    we have not done so for over a month.

    :param submission_tracker: tracker whose ``operation(...)`` context wraps
        each submission and which is forwarded to _do_submit_daily_fetch_tasks
    """
    with platformutils.OpRecorder(operation='submit_daily_fetch_tasks') as opr:
        counter = TaskCounter()
        all_platforms = debra.models.Platform.objects.all

        with submission_tracker.operation('gplus_fetch'):
            pending_gplus = all_platforms().gplus_update_pending()
            gplus_plats = _do_submit_daily_fetch_tasks(
                counter, submission_tracker, pending_gplus, queue_type='every_day')
            log.info('Gplus: {}'.format(len(gplus_plats)))

        with submission_tracker.operation('bloglovin_fetch'):
            pending_bloglovin = all_platforms().bloglovin_update_pending()
            bloglovin_plats = _do_submit_daily_fetch_tasks(
                counter, submission_tracker, pending_bloglovin, queue_type='every_day')
            log.info('Bloglovin: {}'.format(len(bloglovin_plats)))

        submitted = gplus_plats + bloglovin_plats
        opr.data = {'tasks_submitted': len(submitted)}
def update_url_if_redirected(plat_id, update=False):
    """
    If the platform.url gets redirected to a new one, we should update the
    platform.url

    :param plat_id: is the platform id
    :param update: if True, we update the platform object (and the blog_url of
        every influencer that pointed to the old url)

    The OpRecorder data gets ``res`` set to 'invalid_page_content',
    'invalid_status_code' or 'detected_redirection' (together with
    ``new_url``/``old_url``). Invalid page content also removes
    ADMIN_TABLE_INFLUENCER_INFORMATIONS from the influencer's validated_on.
    Any exception is logged and re-raised.
    """
    plat = Platform.objects.get(id=plat_id)
    with platformutils.OpRecorder(operation='update_url_if_redirected', platform=plat) as opr:
        try:
            # Without a timeout requests may wait forever on a dead host and
            # block the worker; a Timeout error is logged and re-raised below.
            resp = requests.get(plat.url, timeout=30)
            if is_page_content_valid(resp.text):
                log.info('Page content is valid')
            else:
                log.info('Invalid page content for %r, removing ADMIN_TABLE_INFLUENCER_INFORMATIONS',
                         plat)
                plat.influencer.remove_from_validated_on(
                    constants.ADMIN_TABLE_INFLUENCER_INFORMATIONS)
                plat.influencer.save()
                opr.data = {'res': 'invalid_page_content'}
                return
            if resp.status_code != 200:
                log.warn('HTTP status code is not 200 for platform %r', plat)
                opr.data = {'res': 'invalid_status_code'}
                return

            new_url = resp.url
            if new_url == plat.url:
                # No HTTP-level redirect; fall back to the user-level check.
                new_url = detect_user_level_redirect(plat.url)

            # NOTE(review): detect_user_level_redirect presumably returns None
            # when nothing is found (the `update and new_url` check below
            # anticipates a falsy value) -- guard so .rstrip() cannot crash.
            if (new_url is not None and new_url != plat.url and
                    new_url.rstrip('/') != plat.url.rstrip('/')):
                print("\t\t\tNew url: %s, old_url: %s" % (new_url, plat.url))
                opr.data = {'res': 'detected_redirection',
                            'new_url': new_url,
                            'old_url': plat.url}
                InfluencerCheck.report(plat.influencer, plat,
                                       InfluencerCheck.CAUSE_URL_CHANGED, [],
                                       'Old url: %r, new url: %r' % (plat.url, new_url))
                if update and new_url:
                    old_url = plat.url
                    #plat.influencer.update_url_references(old_url, new_url)
                    plat.url = new_url
                    plat.validated_handle = None
                    if plat.platform_name_is_blog:
                        redetect_platform_name(plat, update)
                    plat.save(bypass_checks=True)
                    plat.handle_duplicates()

                    # Update blog_urls also
                    infs_to_update = list(Influencer.objects.filter(blog_url=old_url))
                    print("\t\t\tUpdating influencer's blog_url: %r" % infs_to_update)
                    for inf in infs_to_update:
                        assert inf.blog_url == old_url and inf.blog_url
                        inf.blog_url = new_url
                        inf.save(bypass_checks=True)
                        inf.handle_duplicates()
        except:
            log.exception('While checking redirect for %r', plat)
            # re-raise exception so it can be registered by OpRecorder
            raise
def scrape_platform_data(platform_id):
    """
    Scrape a platform with the fetcher registered for its platform_name.

    :param platform_id: id of the Platform (converted with ``int()``)
    :raises Exception: when PLATFORM_NAME_TO_SCRAPING_FETCHER has no entry for
        the platform's name (the problem is logged first)
    """
    platform = models.Platform.objects.get(id=int(platform_id))
    with platformutils.OpRecorder(operation='scrape_data', platform=platform):
        name = platform.platform_name
        if name not in PLATFORM_NAME_TO_SCRAPING_FETCHER:
            log.error('No scraping fetcher for platform_name %r', name)
            raise Exception('No scraping fetcher for platform_name %r' % name)

        fetcher_cls = PLATFORM_NAME_TO_SCRAPING_FETCHER[name]
        fetcher_cls(platform).scrape()
def submit_daily_fetch_tasks():
    """
    Submit fetch tasks for every platform selected by ``for_daily_fetching()``
    and store the number of returned platforms in the OpRecorder data.
    """
    with platformutils.OpRecorder(operation='submit_daily_fetch_tasks') as opr:
        task_counter = TaskCounter()
        tracker = TaskSubmissionTracker()
        daily_platforms = debra.models.Platform.objects.all().for_daily_fetching()
        submitted = _do_submit_daily_fetch_tasks(task_counter, tracker, daily_platforms)
        opr.data = {'tasks_submitted': len(submitted)}
def fetch_for_posts(cls, posts):
    """
    Generator yielding ``fetch_interactions()`` for each post, sharing one
    XBrowser (images disabled) across all posts.

    Each post is processed inside its own OpRecorder named
    '<lowercased class name>_for_post'. An exception ends the iteration; it is
    logged together with the number of posts and not propagated.

    :param posts: sized collection of posts (``len(posts)`` is used for logging)
    """
    try:
        with xbrowsermod.XBrowser(
                headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                load_no_images=True) as xb:
            for post in posts:
                op_name = '{0}_for_post'.format(cls.__name__.lower())
                with platformutils.OpRecorder(operation=op_name, post=post):
                    interactions_fetcher = cls(xb, post)
                    yield interactions_fetcher.fetch_interactions()
    except Exception as e:
        log.exception(e, extra={'posts_len': len(posts)})
def run_handle_duplicates_for_influencer(influencer_id):
    """
    Run ``handle_duplicates()`` for an influencer when
    ``Influencer.find_duplicates`` reports duplicates of its blog url.

    :param influencer_id: id of the Influencer
    """
    influencer = Influencer.objects.get(id=influencer_id)
    with platformutils.OpRecorder(operation='handle_inf_duplicates', influencer=influencer):
        dups = Influencer.find_duplicates(influencer.blog_url, influencer.id)
        if dups:
            # The original call had a %d placeholder but no argument, so the
            # literal "%d" ended up in the log.
            log.info('Found %d duplicates, running handle_duplicates', len(dups))
            influencer.handle_duplicates()
        else:
            log.info('No duplicates found')
def fetch_pins_by_source(influencer_id):
    """
    Run PinsBySourceFetcher for an influencer.

    :param influencer_id: id of the Influencer (converted with ``int()``)

    Exceptions raised while fetching are logged (with the influencer id as
    extra data) and not propagated.
    """
    influencer = models.Influencer.objects.get(id=int(influencer_id))
    try:
        with platformutils.OpRecorder('fetch_pins_by_source', influencer=influencer):
            pins_fetcher = PinsBySourceFetcher(influencer)
            pins_fetcher.fetch()
    except Exception as e:
        log_extra = {'influencer_id': influencer_id}
        log.exception(e, exc_info=1, extra=log_extra)
def update_single_sponsorship(self, sponsorshipinfo_id):
    """
    Re-fetch one sponsorship with the fetcher class registered for its widget
    type and run sidebar detection on the result.

    :param self: object providing ``retry(exc=...)`` (presumably the bound
        task instance -- confirm against the decorator)
    :param sponsorshipinfo_id: id of the SponsorshipInfo

    SoftTimeLimitExceeded is turned into ``self.retry``.
    """
    try:
        sponsorship = debra.models.SponsorshipInfo.objects.get(id=sponsorshipinfo_id)
        with platformutils.OpRecorder(operation='update_single_sponsorship',
                                      post=sponsorship.post):
            with xbrowsermod.XBrowser(
                    headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
                fetcher_cls = WIDGET_TYPE_TO_SPONSORSHIP_FETCHER_CLASS[sponsorship.widget_type]
                sponsorship_fetcher = fetcher_cls(xb, sponsorship.post)
                fetched = sponsorship_fetcher.fetch_sponsorship(True)
                detect_sidebar_sponsorships(fetched)
    except SoftTimeLimitExceeded as exc:
        self.retry(exc=exc)
def cleanup(influencer_id):
    """
    Run ``_do_cleanup`` for each social platform name of an influencer.

    :param influencer_id: id of the Influencer

    A failure for one platform name is logged and does not prevent the
    remaining ones from being processed.
    """
    SOCIAL_PLATFORMS = ['Facebook', 'Pinterest', 'Twitter', 'Instagram']
    influencer = Influencer.objects.get(id=influencer_id)
    with platformutils.OpRecorder('cleanup', influencer=influencer):
        for pname in SOCIAL_PLATFORMS:
            try:
                _do_cleanup(influencer, pname)
            except Exception:
                # Best effort per platform; ``Exception`` (instead of a bare
                # except) lets KeyboardInterrupt/SystemExit propagate.
                log.exception('While _do_cleanup(%r, %r)', influencer, pname)
def submit_recrawl_campaigns_tasks():
    """
    Task to fetch recent posts for campaign-involved influencers.

    For every non-archived BrandJobPost, each candidate with
    ``campaign_stage >= 3`` that is linked to an influencer gets a
    fetch_platform_data task for its blog platform and for each autovalidated
    platform that is not marked url_not_found. Tasks are routed to the queue
    '<RECRAWL_CAMPAIGNS_QUEUE_PREFIX>.<platform_name>'. The number of tasks
    submitted is logged and stored in the OpRecorder data.
    :return:
    """
    with platformutils.OpRecorder(operation='submit_recrawl_campaigns_tasks') as opr:
        tasks_submitted = 0
        submission_tracker = TaskSubmissionTracker()

        # getting platforms for those influencers, blog and social.
        bjps = BrandJobPost.objects.exclude(archived=True)
        for bjp in bjps:
            # fetching influencers involved in campaigns and their autovalidated social and blog platforms
            # inf_ids = list(bjp.candidates.filter(campaign_stage=6).values_list('mailbox__influencer__id', flat=True))
            candidate_inf_ids = bjp.candidates.filter(campaign_stage__gte=3).values_list(
                'mailbox__influencer__id', flat=True)
            inf_ids = [iid for iid in candidate_inf_ids if iid is not None]

            for inf_id in inf_ids:
                try:
                    inf = Influencer.objects.get(id=inf_id)

                    try:
                        blog_platform_id = inf.blog_platform.id
                    except (AttributeError, TypeError):
                        blog_platform_id = None

                    platform_ids = list(
                        inf.platform_set.filter(autovalidated=True).exclude(
                            url_not_found=True).values_list('id', flat=True))
                    if blog_platform_id is not None:
                        platform_ids.insert(0, blog_platform_id)

                    for plat in Platform.objects.filter(id__in=platform_ids):
                        queue_name = '{}.{}'.format(RECRAWL_CAMPAIGNS_QUEUE_PREFIX,
                                                    plat.platform_name)
                        submission_tracker.count_task(queue_name)
                        fetch_platform_data.apply_async(args=[plat.id, None],
                                                        queue=queue_name)
                        # Count what was actually submitted instead of issuing
                        # a second COUNT query that could disagree with it.
                        tasks_submitted += 1
                except Influencer.DoesNotExist:
                    pass

        log.info('Tasks submitted: %s' % tasks_submitted)
        opr.data = {'tasks_submitted': tasks_submitted}
def visit_influencer(influencer_id, pdo_id):
    """
    Visit an influencer's blog url and register the outcome on an existing
    PlatformDataOp.

    :param influencer_id: id of the Influencer whose ``blog_url`` is visited
    :param pdo_id: id of the PlatformDataOp the OpRecorder is bound to

    An error raised by ``visit_url`` is registered on the recorder and not
    propagated; otherwise success is registered.
    """
    influencer = models.Influencer.objects.get(id=influencer_id)
    pdo = models.PlatformDataOp.objects.get(id=pdo_id)
    log.info('visit_influencer for %r', influencer)
    opr = platformutils.OpRecorder(_pdo=pdo)
    try:
        visit_url(influencer.blog_url)
    except Exception:
        # ``Exception`` rather than a bare except so KeyboardInterrupt and
        # SystemExit are not silently turned into a recorded failure.
        opr.register_exception()
    else:
        opr.register_success()
def blacklist_platforms_with_fetch_errors():
    """
    Set ``url_not_found = True`` on every platform whose id is returned by
    SQL_PLATFORM_IDS_WITH_FETCH_ERRORS.

    The ids are read through the connection from
    ``db_util.connection_for_reading()``; each platform is updated inside its
    own OpRecorder.
    """
    connection = db_util.connection_for_reading()
    cur = connection.cursor()
    try:
        cur.execute(SQL_PLATFORM_IDS_WITH_FETCH_ERRORS)
        log.info('%d plats to blacklist', cur.rowcount)
        # Fetch everything first so the cursor can be released before the
        # (potentially long) per-platform updates start.
        plat_ids = [plat_id for plat_id, in cur.fetchall()]
    finally:
        cur.close()

    for plat_id in plat_ids:
        plat = Platform.objects.get(id=plat_id)
        with platformutils.OpRecorder(operation='blacklist_platforms_with_fetch_errors',
                                      platform=plat):
            log.info('Blacklisting platform %r', plat)
            plat.url_not_found = True
            plat.save()
def reset_url_not_found_for_validated_instagram_platforms_with_ANY_error():
    """
    Reset ``url_not_found`` to None on Instagram platforms currently flagged
    url_not_found=True, for influencers that are not blacklisted and are
    validated on ADMIN_TABLE_INFLUENCER_INFORMATIONS.

    Each platform is updated inside its own 'reset_url_not_found' OpRecorder
    and reported on stdout.
    """
    infs = Influencer.objects.filter(
        validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_INFORMATIONS
    ).exclude(blacklisted=True)
    plats = Platform.objects.filter(influencer__in=infs,
                                    platform_name='Instagram',
                                    url_not_found=True)
    for plat in plats:
        with platformutils.OpRecorder('reset_url_not_found', platform=plat):
            plat.url_not_found = None
            plat.save()
        # The original `print 'processed %r', plat` printed the raw format
        # string followed by the object; apply the format instead.
        print('processed %r' % (plat,))
def fetch_products_from_post(post_id, shelf_user_id):
    """
    Load a post and hand it to ``_do_fetch_products_from_post``, which follows
    these steps:

    1. We first figure out if this post is a sponsored post or not (using
       simple keyword matching)
    2. Next, we search for any widgets that the blogger has. If yes, we search
       for products inside them (sponsorshipfetcher.get_product_urls)
    3. Next, we search for product urls in the post content
    4. Now we iterate over all these product urls

    :param post_id: id of the Posts row
    :param shelf_user_id: forwarded unchanged to the worker function
    """
    log.debug("Fetching products from post_id %s" % post_id)
    # Pull the related rows in the same query as the post itself.
    posts_with_relations = Posts.objects.select_related(
        'influencer', 'influencer__shelf_user', 'platform')
    post = posts_with_relations.get(id=post_id)
    with platformutils.OpRecorder('fetch_products_from_post', post=post):
        _do_fetch_products_from_post(post, shelf_user_id)
def tag_post(post_id):
    """
    Tag a post's content with DEFAULT_TAGGERS and bump the per-platform
    ContentTagCount of every resulting tag.

    :param post_id: id of the Posts row (converted with ``int()``)
    :return: the content tags returned by ``tag_content``
    """
    post = models.Posts.objects.get(id=int(post_id))
    with platformutils.OpRecorder(operation='tag_post', post=post):
        #assert not post.contenttag_set.exists(), 'Tags for this post were already computed'
        cts = tag_content(DEFAULT_TAGGERS, post.url, post.content, post=post)
        for ct in cts:
            # Increment first; update() reports how many rows matched, so a
            # separate exists() query is unnecessary and the window in which
            # a concurrent writer can interfere is smaller.
            updated = models.ContentTagCount.objects.filter(
                platform=post.platform, tag=ct.tag).update(count=F('count') + 1)
            if not updated:
                models.ContentTagCount.objects.create(platform=post.platform,
                                                      tag=ct.tag,
                                                      count=1)
        return cts
def tag_influencer(influencer_id, to_save=False):
    """
    Discover tags for an influencer from the newest
    POSTS_FOR_INFLUENCER_TAGGING posts of its blog platform.

    :param influencer_id: id of the Influencer (converted with ``int()``)
    :param to_save: when true the discovered tags are stored through
        ``save_content_tags``
    :return: the tagger's ``debug_info``; None when the influencer has no
        blog platform
    """
    influencer = models.Influencer.objects.get(id=int(influencer_id))
    with platformutils.OpRecorder(operation='tag_influencer', influencer=influencer):
        tagger = FilterAdjectivesTagger()
        if not influencer.blog_platform:
            log.error('No blog platform for %r', influencer)
            return None

        newest_first = influencer.blog_platform.posts_set.order_by('-create_date')
        recent_posts = newest_first[:POSTS_FOR_INFLUENCER_TAGGING]
        # Posts without content cannot contribute any tags.
        fragments = [(post.url, post.content) for post in recent_posts if post.content]

        discovered = tagger.discover_tags_from_fragments(fragments)
        tags = _include_parent_tag(discovered)
        log.info('All tags: %s', tags)
        if to_save:
            save_content_tags(tags, influencer=influencer)
        return tagger.debug_info
def check_if_copyrightable_content(influencer_id):
    """
    Check whether an influencer's blog front page carries a copyright notice
    and store the answer in ``influencer.copyrightable_content``.

    :param influencer_id: id of the Influencer (converted with ``int()``)
    :return: True when the page text (html tags stripped) contains the
        copyright sign or the word 'copyright' in any letter case
    """
    influencer = models.Influencer.objects.get(id=int(influencer_id))
    with platformutils.OpRecorder(operation='check_if_copyrightable_content',
                                  influencer=influencer):
        r = requests.get(influencer.blog_url, timeout=20)
        text = xutils.strip_html_tags(r.text)
        if u'©' in text:
            res = True
        else:
            words = nltk.wordpunct_tokenize(text)
            # Notices are usually capitalised ("Copyright 2014 ..."), so the
            # comparison has to ignore case.
            res = any(word.lower() == 'copyright' for word in words)
        log.info('Copyrightable content for %r: %r', influencer, res)
        influencer.copyrightable_content = res
        influencer.save()
        return res
def handle_influencer_demographics(inf, diff_only=False):
    """
    Geocode ``inf.demographics_location`` and store the normalized address,
    coordinates and DemographicsLocality on the influencer.

    :param inf: influencer to update
    :param diff_only: when true the influencer is saved only if the locality,
        latitude or longitude changed; otherwise it is always saved
    :return: whether locality/latitude/longitude changed; None when there is
        no location to process or it could not be geocoded
    """
    with platformutils.OpRecorder('normalize_location', influencer=inf):
        if not inf.demographics_location:
            log.warn('No location to process')
            return

        loc = get_location_data(inf.demographics_location)
        log.info(u'Got location from {}: {}'.format(inf.demographics_location, loc))
        if loc is None or not loc.address:
            log.warn(u'Location not geocoded: {}'.format(inf.demographics_location))
            return

        changed = False
        if loc.raw is not None:
            # Read the components only after the None check; the original
            # indexed loc.raw before testing it.
            address_components = loc.raw['address_components']
            country = extract_address_component('country', address_components)
            state = extract_address_component('administrative_area_level_1',
                                              address_components)
            city = extract_address_component('locality', address_components)
            try:
                locality, _ = models.DemographicsLocality.objects.get_or_create(
                    country=country, state=state, city=city)
            except models.DemographicsLocality.MultipleObjectsReturned:
                # Duplicates already exist: settle for the first match.
                locality = models.DemographicsLocality.objects.filter(
                    country=country, state=state, city=city)[0]
            changed = inf.demographics_locality != locality
            inf.demographics_locality = locality

        # changed = changed or inf.demographics_location_normalized != loc.address
        inf.demographics_location_normalized = loc.address
        if loc.latitude is not None:
            changed = changed or inf.demographics_location_lat != loc.latitude
            inf.demographics_location_lat = loc.latitude
        if loc.longitude is not None:
            changed = changed or inf.demographics_location_lon != loc.longitude
            inf.demographics_location_lon = loc.longitude

        if changed or not diff_only:
            inf.save()
        return changed
def search_for_sponsorship(self, post_id):
    """
    Run every fetcher from SPONSORSHIP_FETCHER_CLASSES against a post and
    collect the sponsorships found.

    :param self: object providing ``retry(exc=...)`` (presumably the bound
        task instance -- confirm against the decorator)
    :param post_id: id of the Posts row
    :return: list of non-None ``fetch_sponsorship`` results; sidebar detection
        is run on each of them

    A fetcher that fails is logged and skipped. SoftTimeLimitExceeded is never
    treated as a fetcher failure: it is passed on to ``self.retry``.
    """
    res = []
    try:
        post = debra.models.Posts.objects.get(id=post_id)
        with platformutils.OpRecorder(operation='search_for_sponsorship', post=post):
            with xbrowsermod.XBrowser(
                    headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
                for f_cls in SPONSORSHIP_FETCHER_CLASSES:
                    f = f_cls(xb, post)
                    try:
                        fres = f.fetch_sponsorship(True)
                        if fres is not None:
                            res.append(fres)
                            detect_sidebar_sponsorships(fres)
                    except SoftTimeLimitExceeded:
                        # Must reach the outer handler; the former bare
                        # except swallowed it and the retry never happened.
                        raise
                    except Exception:
                        log.exception('While search_for_sponsorship')
    except SoftTimeLimitExceeded as exc:
        self.retry(exc=exc)
    return res
def classify_model(brand_id=None, influencer_id=None):
    """
    Classify the content behind a brand's domain or an influencer's blog url
    and save the classification on that object.

    Exactly one of the two ids must be given.

    :param brand_id: id of the Brands row; its url is 'http://<domain_name>'
    :param influencer_id: id of the Influencer; its url is ``blog_url``
    """
    # we are processing only one id, either an influencer or a brand
    assert brand_id is not None or influencer_id is not None
    assert not (brand_id is not None and influencer_id is not None)

    if brand_id is not None:
        target = models.Brands.objects.get(id=brand_id)
        url = 'http://%s' % target.domain_name
        recorder = platformutils.OpRecorder(operation='content_classification',
                                            brand=target)
    else:
        target = models.Influencer.objects.get(id=influencer_id)
        url = target.blog_url
        recorder = platformutils.OpRecorder(operation='content_classification',
                                            influencer=target)

    with recorder as opr:
        classifier = Classifier()
        result = classifier.classify(url)
        log.info('Classified object %r url %r as %r', target, url, result)
        opr.data = {'result': result}
        target.save_classification(result)