# Module-level imports assumed by the functions below (this section was
# extracted without its module header). Project-internal helpers referenced
# later (platformutils, fetcher, spreadsheet_reader, determine_platform_state,
# unescape) are assumed to be imported from the surrounding project.
import json
import logging
import pprint

from django.db.models import Q
from django.http import HttpResponse

from debra import constants
from debra.models import Influencer, Platform

log = logging.getLogger(__name__)


# NOTE: load() is a method (note the self parameter) of a loader class that
# provides get_fields_data() and a limit attribute; it is shown as extracted.
def load(self, to_save=False):
    from debra.models import (InfluencersGroup, DemographicsLocality,
                              Influencer, InfluencerBrandMapping)

    fields_data = self.get_fields_data()
    if self.limit:
        fields_data = fields_data[:self.limit]

    infs = []
    if to_save:
        tag = InfluencersGroup.objects.get(
            id=constants.R29_CUSTOM_DATA_TAG_ID)
        for data in fields_data:
            inf = Influencer(**data['influencer'])
            demographics_locality = DemographicsLocality.objects.get_or_create(
                **data['demographics'])[0]
            inf.demographics_locality = demographics_locality
            inf.save(bypass_checks=True)
            tag.add_influencer(inf)

            brand_mapping = InfluencerBrandMapping(**data['brand_mapping'])
            brand_mapping.influencer = inf
            brand_mapping.save()

            infs.append(inf)
    return infs
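# A minimal usage sketch for load() above. The enclosing class name and its
# constructor are assumptions (the section only shows the method); any class
# providing get_fields_data() and limit would do.
#
#     loader = R29DataLoader(limit=10)   # hypothetical loader class
#     infs = loader.load(to_save=True)   # saves, tags, and brand-maps each influencer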
def handle_influencer_duplicates(typ):
    assert typ in ('validated', 'non_validated')
    if typ == 'validated':
        infs = Influencer.objects.filter(
            validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_INFORMATIONS
        ).exclude(
            validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_SELF_MODIFIED
        ).exclude(blacklisted=True)
    elif typ == 'non_validated':
        infs = Influencer.objects.exclude(
            validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_INFORMATIONS
        ).exclude(
            validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_SELF_MODIFIED
        ).exclude(blacklisted=True).exclude(source__isnull=True).exclude(
            blog_url__isnull=True).exclude(blog_url='')
    else:
        assert False

    infs_count = infs.count()
    log.info('Handling duplicates for %r infs', infs_count)
    for i, inf in enumerate(infs.iterator()):
        log.info('Processing %d/%d %r', i + 1, infs_count, inf)
        try:
            if Influencer.find_duplicates(inf.blog_url, inf.id):
                inf.handle_duplicates()
        except Exception:
            # Catch Exception rather than a bare except so KeyboardInterrupt
            # can still abort the whole batch.
            log.exception('While handling duplicates for %r, skipping', inf)
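# Usage sketch (e.g. from a Django shell or a management task):
#
#     handle_influencer_duplicates('validated')       # QA-validated influencers
#     handle_influencer_duplicates('non_validated')   # sourced, non-validated set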
def prepare_rawblogurl(raw_blog):
    """
    :param raw_blog: a `debra.models.BlogUrlsRaw` instance

    Check whether an influencer exists with that blog_url; if not, create
    one. Then get or create a blog platform and mark it as INVESTIGATING.
    """
    duplicate_infs = Influencer.find_duplicates(blog_url=raw_blog.blog_url)
    if duplicate_infs:
        inf = duplicate_infs[0]
        inf.handle_duplicates()
    else:
        inf = Influencer.objects.create(source='lookbook',
                                        blog_url=raw_blog.blog_url)
    print raw_blog.site_url
    # lookbook_plat = Platform.objects.get_or_create(url=raw_blog.blog_url, platform_name="lookbook", influencer=inf)  # why?

    # Guess the blog platform from the URL, falling back to 'Custom'.
    if 'wordpress' in raw_blog.blog_url.lower():
        platform_name = 'Wordpress'
    elif 'blogspot' in raw_blog.blog_url.lower():
        platform_name = 'Blogspot'
    else:
        platform_name = 'Custom'

    blog_platform, created = Platform.objects.get_or_create(
        url=raw_blog.blog_url, platform_name=platform_name, influencer=inf)
    blog_platform.platform_state = "INVESTIGATING"
    blog_platform.save()
    determine_platform_state(blog_platform)
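# Usage sketch, assuming BlogUrlsRaw rows were collected elsewhere:
#
#     from debra.models import BlogUrlsRaw
#     for raw_blog in BlogUrlsRaw.objects.all()[:100]:
#         prepare_rawblogurl(raw_blog)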
def run_handle_duplicates_for_influencer(influencer_id):
    influencer = Influencer.objects.get(id=influencer_id)
    with platformutils.OpRecorder(operation='handle_inf_duplicates',
                                  influencer=influencer) as opr:
        dups = Influencer.find_duplicates(influencer.blog_url, influencer.id)
        if dups:
            log.info('Found %d duplicates, running handle_duplicates',
                     len(dups))
            influencer.handle_duplicates()
        else:
            log.info('No duplicates found')
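# Direct invocation sketch; the OpRecorder context manager presumably records
# the 'handle_inf_duplicates' operation against the influencer:
#
#     run_handle_duplicates_for_influencer(12345)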
def handle_influencer_duplicates_with_checks(max_id=999999):
    influencers = Influencer.objects.filter(
        validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_INFORMATIONS
    ).exclude(
        validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_SELF_MODIFIED
    ).exclude(blacklisted=True).filter(id__lte=int(max_id)).order_by('-id')

    for influencer in influencers:
        dups = Influencer.find_duplicates(influencer.blog_url)
        if len(dups) in (0, 1):
            log.info('OK %r', influencer)
            continue
        log.info('%d dups for %r', len(dups), influencer)

        before_with_shelf_user = [
            inf for inf in dups if inf.shelf_user is not None
        ]
        valid_platform_names_in_dups = {
            plat.platform_name
            for inf in dups
            for plat in inf.platform_set.exclude(url_not_found=True)
        }

        # run de-duplication
        selected = influencer.handle_duplicates()
        log.info('Selected: %r', selected)
        not_selected = [inf for inf in dups if inf.id != selected.id]
        assert len(not_selected) == len(dups) - 1

        # refresh old dups objects and selected
        dups = [Influencer.objects.get(id=inf.id) for inf in dups]
        selected = Influencer.objects.get(id=selected.id)

        after_with_shelf_user = [
            inf for inf in dups if inf.shelf_user is not None
        ]
        log.info('before/after with_shelf_user: %d %d',
                 len(before_with_shelf_user), len(after_with_shelf_user))
        assert len(before_with_shelf_user) <= len(after_with_shelf_user)

        valid_platform_names_in_selected = {
            plat.platform_name
            for plat in selected.platform_set.exclude(url_not_found=True)
        }
        log.info('platform_names in dups/selected: %s %s',
                 valid_platform_names_in_dups,
                 valid_platform_names_in_selected)
        assert valid_platform_names_in_dups == valid_platform_names_in_selected

        not_selected_validated = [
            inf for inf in not_selected
            if not inf.is_enabled_for_automated_edits()
        ]
        log.info('Not selected validated: %s', not_selected_validated)
        # if selected is not validated, check if we are not disabling validated
        if selected.is_enabled_for_automated_edits():
            assert not not_selected_validated
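# The asserts above enforce three invariants after each de-duplication run:
#   1. no duplicate that had a shelf_user loses it (the count can only grow),
#   2. the selected influencer retains every platform_name that was valid
#      (url_not_found != True) anywhere in the duplicate set,
#   3. when the selected influencer is open to automated edits, no validated
#      (edit-disabled) duplicate was passed over in its favor.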
def set_source_spreadsheet(filename):
    reader = spreadsheet_reader(filename)
    for row in reader:
        duplicate_infs = Influencer.find_duplicates(blog_url=row['url'])
        if len(duplicate_infs) > 0:
            inf = duplicate_infs[0]
            if inf.source == 'spreadsheet_import':
                print 'Influencer %s has source set to spreadsheet_import' % inf
            else:
                inf.source = 'spreadsheet_import'
                inf.save()
                print 'Updated influencer %s source to spreadsheet_import' % inf
        else:
            print '!!! No influencers for blog_url=%s' % row['url']
def find_influencer_duplicates():
    res = []
    infs = Influencer.objects.filter(
        validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_INFORMATIONS
    ).exclude(
        validated_on__contains=constants.ADMIN_TABLE_INFLUENCER_SELF_MODIFIED
    ).exclude(blacklisted=True)
    for inf in infs:
        dups = Influencer.find_duplicates(inf.blog_url, inf.id)
        if dups:
            log.info('YES_DUP %s %r %r', inf.id, inf, dups)
            res.append(inf)
        else:
            log.info('NO_DUP %s %r', inf.id, inf)
    log.info('Total duplicates: %s', len(res))
    pprint.pprint(res)
    return res
def blogger_autocomplete(request):
    """
    Similar to user_autocomplete, except that it operates on Influencers
    instead of UserProfiles.

    @return json string containing search results
    """
    term = request.GET.get('term')
    influencers = Influencer.raw_influencers_for_search()
    matched_influencers = influencers.filter(name__isnull=False).filter(
        Q(name__icontains=term) |
        Q(email__icontains=term) |
        Q(platform__blogname__icontains=term))
    results = []
    for inf in matched_influencers:
        results.append({
            'id': inf.id,
            'name': unescape(inf.name),
            'img': unescape(inf.name),
        })
    return HttpResponse(status=200,
                        content=json.dumps({'results': results}))
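# Illustrative response shape (values are made up):
#
#     {"results": [{"id": 42, "name": "Jane Doe", "img": "Jane Doe"}]}
#
# Note that 'img' mirrors 'name' in the code above, so clients currently
# receive the influencer name twice rather than an image URL.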
def create_influencers_platforms_from_csv(filename, from_row='1',
                                          to_row='999999'):
    """Works with https://docs.google.com/spreadsheet/ccc?key=0Ai2GPRwzn6lmdEMzWVR0aldXYXJodGplZlVGRVMyQ1E&usp=sharing .

    To download CSV, add output=csv to the link:
    https://docs.google.com/spreadsheet/ccc?key=0Ai2GPRwzn6lmdEMzWVR0aldXYXJodGplZlVGRVMyQ1E&usp=sharing&output=csv
    """
    reader = spreadsheet_reader(filename)
    count = 0
    from_row = int(from_row)
    to_row = int(to_row)
    for row in reader:
        print "\n\nCount: %d" % count
        count += 1
        if count < from_row:
            print 'Skipping row %d' % count
            continue
        if count > to_row:
            print 'Skipping row %d' % count
            continue
        if row['email'] == 'email':
            # First title row
            continue
        if not (row['url'] or '').strip():
            # Empty row
            continue
        print 'Processing row %r' % row

        duplicate_infs = Influencer.find_duplicates(blog_url=row['url'])
        if len(duplicate_infs) > 0:
            inf = duplicate_infs[0]
            inf.handle_duplicates()
            print 'Using already saved influencer: %r' % inf
        else:
            inf = Influencer()
            # update info
            inf.source = 'spreadsheet_import'
            inf.name = row['blogger_name']
            inf.blog_url = row['url']
            inf.email = row['email']
            inf.demographics_location = row['location']
            inf.demographics_gender = row['gender']
            assert False, 'This script requires code update to *_url fields processing'
            if row['Facebook']:
                inf.fb_url = row['Facebook']
            if row['Pinterest']:
                inf.pin_url = row['Pinterest']
            if row['Twitter']:
                inf.tw_url = row['Twitter']
            if row['Instagram']:
                inf.insta_url = row['Instagram']
            if row['Bloglovin']:
                inf.bloglovin_url = row['Bloglovin']
            inf.save()
            print 'Saved new influencer: %r' % inf

        # Try to save blog as platform
        if row['url']:
            blog_pl = Platform.objects.filter(url=row['url'])
            if blog_pl.exists():
                print "Blog already exists for url %s [%s]" % (row['url'],
                                                               blog_pl)
            else:
                discovered_pl, corrected_url = fetcher.try_detect_platform_name(
                    row['url'])
                if discovered_pl:
                    blog_pl = Platform.find_duplicates(inf, row['url'],
                                                       discovered_pl)
                    if blog_pl and len(blog_pl) > 0:
                        blog_pl = blog_pl[0]
                        blog_pl = blog_pl.handle_duplicates()
                    else:
                        blog_pl = Platform()
                        blog_pl.influencer = inf
                        blog_pl.platform_name = discovered_pl
                        blog_pl.url = row['url']
                    blog_pl.blogname = row['blog_name']
                    blog_pl.save()
                    print 'Saved platform from blog data: %r' % blog_pl
                else:
                    print 'No platform discovered for blog url %r' % row['url']

        for platform_name in ('Facebook', 'Twitter', 'Pinterest',
                              'Bloglovin', 'Instagram'):
            if not row[platform_name]:
                print 'No url for platform %r' % platform_name
                continue
            pl = Platform.find_duplicates(inf, row[platform_name],
                                          platform_name)
            if pl and len(pl) > 0:
                pl = pl[0]
                pl = pl.handle_duplicates()
            else:
                pl = Platform()
                pl.influencer = inf
                pl.platform_name = platform_name
                pl.url = row[platform_name]
            pl.save()
            print 'Saved new platform %r' % pl
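# Usage sketch (from_row/to_row are 1-based, inclusive, and passed as strings
# so the function can be driven from a command line):
#
#     create_influencers_platforms_from_csv('/tmp/influencers.csv',
#                                           from_row='2', to_row='500')
#
# Note the `assert False` guard above: the *_url field handling must be
# updated before this script can create new influencers end-to-end.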