def create_single_platform_from_url(url, use_api=False,
                                    platform_name_fallback=False):
    """
    :param url: a url for which to create a platform
    :param use_api: whether to use api calls to detect platform_name (can
        result in an exception)
    :param platform_name_fallback: if platform_name cannot be detected, this
        tells if a platform with platform_name ``None`` should be created.
    :return: a :class:`debra.models.Platform` instance (not saved)
    :raises UnknownPlatformName: when ``platform_name_fallback == False`` and
        platform_name could not be detected
    :raises FetcherException: when ``platform_name_fallback == False`` and
        ``use_api == True`` and there was a fetcher error during platform
        name detection
    :raises requests.RequestException: when url resolving fails
    """
    # This can result in an exception, so we're skipping this
    # url = utils.resolve_http_redirect(url)

    social_pl_name = platformutils.social_platform_name_from_url(None, url)
    if social_pl_name != platformutils.PLATFORM_NAME_DEFAULT:
        return models.Platform(platform_name=social_pl_name, url=url)

    handle = platformutils.url_to_handle(url)
    if handle.endswith('blogspot.com'):
        return models.Platform(platform_name='Blogspot', url=url)
    if handle.endswith('wordpress.com'):
        return models.Platform(platform_name='Wordpress', url=url)
    if handle.endswith('tumblr.com'):
        return models.Platform(platform_name='Tumblr', url=url)

    # Check for the Squarespace platform by looking for the
    # '<!-- This is Squarespace. -->' marker in the page's content.
    try:
        r = requests.get(url=url, timeout=20, headers=REQUESTS_HEADERS)
        r.raise_for_status()
        if '<!-- This is Squarespace. -->' in r.content:
            return models.Platform(platform_name='Squarespace', url=url)
    except Exception:
        pass

    if not use_api:
        if platform_name_fallback:
            return models.Platform(platform_name=None, url=url)
        raise UnknownPlatformName()

    assert use_api
    try:
        fc_name, _ = try_detect_platform_name(url)
        if fc_name:
            return models.Platform(platform_name=fc_name, url=url)
        if platform_name_fallback:
            return models.Platform(platform_name=None, url=url)
        raise UnknownPlatformName()
    except fetcherbase.FetcherException:
        if platform_name_fallback:
            return models.Platform(platform_name=None, url=url)
        raise

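# A minimal usage sketch for create_single_platform_from_url, assuming the
# module-level names used above (models, UnknownPlatformName, fetcherbase)
# are importable as in this module. The URLs are hypothetical and purely
# illustrative; this is not part of the production code path.
def _example_single_platform_usage():
    # Cheap, URL-based detection: the blogspot.com suffix check returns
    # before any network request is made.
    pl = create_single_platform_from_url('http://someblog.blogspot.com')
    assert pl.platform_name == 'Blogspot'

    # With platform_name_fallback=True, an unrecognized url still yields a
    # Platform instance, just with platform_name=None.
    pl = create_single_platform_from_url('http://example.com',
                                         platform_name_fallback=True)

    # With use_api=True and no fallback, detection failures surface as
    # exceptions that the caller must handle.
    try:
        pl = create_single_platform_from_url('http://example.com',
                                             use_api=True)
    except (UnknownPlatformName, fetcherbase.FetcherException):
        pl = None
    return pl
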
def setUp(self):
    self.influencer = models.Influencer()
    self.influencer.calculate_activity_level = Mock()
    self.influencer.save = Mock()

    self.platform = models.Platform()
    self.platform.influencer = self.influencer

def test_new(self):
    # The platform is created via the test subclass; a plain models.Platform()
    # would be immediately discarded.
    platform = self.TestPlatform()
    platform.last_post_date = datetime.today() - timedelta(days=5)
    platform.insert_date = datetime.today() - timedelta(days=1)
    platform.calculate_activity_level()
    self.assertEqual(models.ActivityLevel.ACTIVE_NEW, platform.activity_level)

def influencer_platforms_from_url_fields(influencer):
    res = []
    for platform_name, field_name in \
            models.Influencer.platform_name_to_field.items():
        url_str = getattr(influencer, field_name)
        if not url_str:
            continue
        for url in url_str.split():
            pl = models.Platform(influencer=influencer,
                                 platform_name=platform_name,
                                 url=url)
            res.append(pl)
    return res

def get_platform():
    existing_platforms = models.Platform.objects.filter(
        url=platform_url, platform_name=platform_name)
    if existing_platforms.exists():
        log.info("Follower's platform found in DB: %r", existing_platforms[0])
        return existing_platforms[0]
    log.info('No existing %s platforms found for url %r',
             platform_name, platform_url)

    inf_q = models.Influencer.objects.filter(**influencer_search_kwargs)
    if inf_q.exists():
        inf = inf_q[0]
        log.info('Found existing influencer: %r', inf)
    else:
        inf, inf_created = models.Influencer.objects.get_or_create(
            **influencer_create_kwargs)
        if inf_created:
            log.info('Created new influencer: %r', inf)
            inf.source = 'followers'
            inf.date_created = datetime.datetime.now()
            inf.save()
        else:
            log.info('Found existing influencer using '
                     'influencer_create_kwargs: %r', inf)

    dup_platforms = models.Platform.find_duplicates(inf, platform_url,
                                                    platform_name)
    if dup_platforms:
        log.info('Detected duplicated platforms: %r', dup_platforms)
        return dup_platforms[0]

    pl = models.Platform()
    pl.influencer = inf
    pl.platform_name = platform_name
    pl.url = platform_url
    pl.description = description
    pl.processing_state = models.Platform.PROCESSING_STATE_NEW_FOLLOWERS_PLATFORM
    pl.save()
    log.info('Created new platform: %r', pl)
    return pl

def create_platforms_from_urls(urls, use_api=False,
                               platform_name_fallback=False):
    """
    Returns a list of platforms created using
    :func:`create_single_platform_from_url` for a list of ``urls``.

    Processing stops when an exception happens (only possible if
    ``platform_name_fallback == False``, when a ``FetcherException`` can
    propagate). Exceptions occurring during url resolving cause the
    erroneous urls to be skipped. If ``platform_name_fallback == False``
    and ``UnknownPlatformName`` is raised by
    ``create_single_platform_from_url``, a "Custom" platform_name is used.
    """
    res = []
    for url in urls:
        try:
            plat = create_single_platform_from_url(url, use_api,
                                                   platform_name_fallback)
            res.append(plat)
        except requests.RequestException:
            log.exception('Exception during url resolving, skipping url %r',
                          url)
            continue
        except UnknownPlatformName:
            res.append(models.Platform(platform_name='Custom', url=url))
    return res

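# A hedged example of the bulk helper above; the urls are hypothetical.
# With platform_name_fallback=False (the default), unknown urls come back
# with platform_name 'Custom' rather than raising, and urls that fail to
# resolve are dropped from the result.
def _example_create_platforms():
    urls = ['http://someblog.blogspot.com',
            'http://twitter.com/somehandle',
            'http://unknown-site.example.com']
    platforms = create_platforms_from_urls(urls, use_api=False)
    for pl in platforms:
        log.info('url=%r platform_name=%r', pl.url, pl.platform_name)
    return platforms
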
def test_row(self, row):
    url = row['url']
    log.info('%s Processing', url)
    pl_candidates = models.Platform.objects.filter(url=url,
                                                   platform_name='Blogspot')
    if not pl_candidates.exists():
        log.warn('%s No Platform', url)
        if self.with_posts_only:
            return
        pl = models.Platform()
        pl.platform_name = 'TestData'
        pl.url = row['url']
    else:
        pl = pl_candidates[0]
        if not pl.posts_set.exists():
            log.warn('No posts for %r', pl)
            if self.with_posts_only:
                return

    emailextractor.tlocal._latest_validated = None
    emailextractor.tlocal._latest_not_validated = None
    try:
        extracted = self.extraction_fun(platform_object=pl,
                                        to_save=False,
                                        disable_cleanup=self.procs == 1)
    except:
        log.exception('%s During platform extraction, skipping this row', url)
        self.error_urls.append(row['url'])
        return
    log.info('%s Extracted emails: %r', url, extracted)

    if emailextractor.tlocal._latest_validated is not None:
        for pl, reason in emailextractor.tlocal._latest_validated:
            self.validated[(pl.platform_name, reason)] += 1
    if emailextractor.tlocal._latest_not_validated is not None:
        for pl in emailextractor.tlocal._latest_not_validated:
            self.not_validated[pl.platform_name] += 1
            self.not_validated_pls.append((url, pl))

    valid = row.get('valid_emails', '').split()
    log.info('%s Valid emails: %r', url, valid)
    extracted = [e.lower() for e in extracted]
    valid = [e.lower() for e in valid]
    if (not valid) and (not extracted):
        log.info('%s *** No emails from both sources', url)
    elif set(valid) == set(extracted):
        log.warn('%s +++ Test passed', url)
        self.found += 1
    elif not valid:
        log.warn('%s ??? Test unknown', url)
        self.unknown += 1
    elif valid and (not extracted):
        log.warn('%s --- Test fail', url)
        self.notfound += 1
    elif set(valid) != set(extracted):
        log.warn('%s !!! Test incorrect', url)
        self.incorrect += 1

def import_blogurlsraw_single(blogurlsraw_id, to_save=True):
    m = models.BlogUrlsRaw.objects.get(id=int(blogurlsraw_id))
    if not m.source:
        log.error('No source set for %r', m)
        return
    if not m.blog_url:
        log.error('No blog_url set for %r', m)
        return

    dup_infs = models.Influencer.find_duplicates(m.blog_url,
                                                 exclude_blacklisted=False)
    if helpers.all_blacklisted(dup_infs):
        log.error('All duplicate influencers blacklisted for url %r, '
                  'not importing', m.blog_url)
        return
    if dup_infs:
        inf = helpers.select_valid_influencer(dup_infs)
        log.warn('Existing inf found: %r', inf)
    else:
        inf = models.Influencer(blog_url=m.blog_url, name=m.name,
                                source='blogurlsraw')
        if to_save:
            inf.save()

    blog_pls = fetcher.create_platforms_from_urls([m.blog_url], True)
    if blog_pls:
        blog_pl = blog_pls[0]
        blog_pl.influencer = inf
    else:
        log.warn('Could not create blog platform from blog_url %r, '
                 'using Custom platform_name', m.blog_url)
        blog_pl = models.Platform(platform_name='Custom', url=m.blog_url,
                                  influencer=inf)
    log.info('Blog platform from blog_url: %r', blog_pl)
    if to_save:
        # This handles duplicates
        # set appropriate state
        blog_pl.platform_state = models.Platform.PLATFORM_STATE_STARTED
        blog_pl.save()

    pl = models.Platform(url=m.url, num_followers=m.num_followers,
                         description=m.description, influencer=inf)
    if 'lookbook.nu' in m.source:
        pl.platform_name = 'Lookbook'
    elif 'fashiolista.com' in m.source:
        pl.platform_name = 'Fashiolista'
    else:
        assert False, 'unknown source %r' % m.source
    if to_save:
        # This handles duplicates
        pl.save()

    # site_url can contain additional social handle
    if m.site_url:
        site_pls = fetcher.create_platforms_from_urls([m.site_url], True)
        if site_pls:
            site_pl = site_pls[0]
            site_pl.influencer = inf
            if to_save:
                # This handles duplicates
                site_pl.save()

    m.have_been_processed = True
    if to_save:
        m.save()

def _do_import_from_content(content, opr, to_save,
                            blacklisted_domains=BLACKLISTED_DOMAINS):
    """
    Creates new platforms from the provided content by searching it for urls
    (except those whose domains appear in ``blacklisted_domains``).

    Limitation: it works only for building new 'blog' platforms; it doesn't
    work for creating new social platforms.
    """
    if not content:
        log.warn('No content, doing nothing')
        return
    urls = contentfiltering.find_all_urls(content)
    log.info('Found %d urls: %r', len(urls), urls)
    platforms = []
    for url in urls:
        log.info('Original url: %r', url)
        try:
            url = utils.resolve_http_redirect(url)
        except:
            log.exception('While resolve_http_redirect, skipping')
            continue
        log.info('Redirected url: %r', url)
        vurl = platformutils.url_to_handle(url)
        if not vurl:
            log.info('No handle computed from url %r, skipping', url)
            continue
        domain = utils.domain_from_url(vurl)
        if domain in blacklisted_domains:
            log.info('Domain %r is blacklisted', domain)
            continue
        blog_url = utils.url_without_path(url)
        if domain.endswith('.wordpress.com'):
            platforms.append(models.Platform(platform_name='Wordpress',
                                             url=blog_url))
        elif domain.endswith('.blogspot.com'):
            platforms.append(models.Platform(platform_name='Blogspot',
                                             url=blog_url))
        else:
            # Use a separate name to avoid clobbering the `content` argument.
            page_content = xutils.fetch_url(blog_url)
            if page_content:
                discovered_pname = xutils.contains_blog_metatags(page_content)
                if discovered_pname:
                    platforms.append(models.Platform(
                        platform_name=discovered_pname, url=blog_url))
                    continue
            platforms.append(models.Platform(platform_name='Custom',
                                             url=blog_url))

    influencers = []
    influencers_created = []
    for plat in platforms:
        inf, inf_created = helpers.get_or_create_influencer(
            plat.url, 'comments_content_import', to_save)
        if not inf:
            log.warn('Skipping url %r because influencer with this url '
                     'is blacklisted', plat.url)
            continue
        plat.influencer = inf
        influencers.append(inf)
        if inf_created:
            influencers_created.append(inf)

    if opr:
        opr.data = {
            'influencer_ids': [influencer.id for influencer in influencers],
            'influencer_created_ids': [influencer.id
                                       for influencer in influencers_created],
            'influencer_blog_urls': [influencer.blog_url
                                     for influencer in influencers],
        }

    log.info('Platforms from content: %r', platforms)
    if to_save:
        for plat in platforms:
            # influencer of None means we got a blacklisted influencer
            # when we searched by URL.
            if plat.influencer is not None:
                plat.save()
    return platforms

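# An illustrative (hypothetical) dry-run call of _do_import_from_content:
# to_save=False means no Platform or Influencer rows are written, and opr
# is left as None since no operation record is being tracked. The comment
# body and blog url are made up for the example.
def _example_import_from_content():
    comment_body = ('Love this look! I wrote about something similar at '
                    'http://mystyleblog.blogspot.com/2013/05/look.html')
    platforms = _do_import_from_content(comment_body, opr=None, to_save=False)
    # Each returned platform keeps only the blog root, e.g.
    # 'http://mystyleblog.blogspot.com' with platform_name 'Blogspot'.
    return platforms
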
def _do_import_from_blogger_profile(blogger_profile_url, opr, to_save=True):
    log.info('Processing profile %r', blogger_profile_url)
    r = requests.get(blogger_profile_url, headers=utils.browser_headers(),
                     proxies=get_proxy_config())
    blogurls_names = []
    if utils.domain_from_url(r.url) == 'plus.google.com':
        gplus_user_id = r.url.rstrip('/').split('/')[-1]
        gplus_user = requests.get(
            GOOGLE_PLUS_PEOPLE_TEMPLATE.format(user_id=gplus_user_id)).json()
        log.info('Got gplus data:\n%s', pprint.pformat(gplus_user))
        if not gplus_user.get('urls'):
            log.warn('No gplus urls')
            return
        blog_url = gplus_user['urls'][0]['value']
        name = gplus_user['displayName']
        log.info('Gplus url and name: %r %r', blog_url, name)
        blogurls_names.append((blog_url, name))
    else:
        tree = lxml.html.fromstring(r.content)

        name_els = tree.xpath('//div[@class="vcard"]//h1')
        if not name_els:
            log.warn('No name els')
            name = None
        else:
            name = name_els[0].text.strip()
            if not name:
                log.warn('Empty name')
        log.info('Blogger name: %r', name)

        blog_url_els = tree.xpath('//a[contains(@rel, "contributor-to")]')
        if not blog_url_els:
            log.warn('No blog url')
            utils.write_to_file('/tmp/last_no_blog.html', r.text)
            blog_url = None
            if r.text.strip().lower() == 'proxy authorization required':
                raise Exception('Proxy error')
        else:
            for el in blog_url_els:
                blog_url = el.attrib['href'].strip()
                log.info('Blog url: %r', blog_url)
                blogurls_names.append((blog_url, name))

        if ALSO_CRAWL_OTHER_BLOGS_FOLLOWED:
            observed_els = tree.xpath('//li[@class="sidebar-item"]/a')
            for el in observed_els:
                blogurls_names.append((el.attrib.get('href'), None))

    log.info('Collected blogurls_names: %r', blogurls_names)
    data = {'inf_id_existing': [], 'inf_id_created': []}
    for blog_url, name in blogurls_names:
        if not blog_url:
            continue
        blog_pl_name = fetcher.create_platforms_from_urls(
            [blog_url], True)[0].platform_name
        dup_infs = models.Influencer.find_duplicates(blog_url,
                                                     exclude_blacklisted=False)
        if helpers.all_blacklisted(dup_infs):
            log.error('All duplicate influencers blacklisted for url %r, '
                      'not importing', blog_url)
            continue
        if dup_infs:
            inf = helpers.select_valid_influencer(dup_infs)
            log.warn('Existing inf found: %r', inf)
            data['inf_id_existing'].append(inf.id)
        else:
            inf = models.Influencer(blog_url=blog_url, name=name,
                                    source='comments_import')
            log.info('Created new influencer %r', inf)
            if to_save:
                inf.save()
            # Record the id only after a possible save, so it isn't None.
            data['inf_id_created'].append(inf.id)

        blog_pl_dups = models.Platform.find_duplicates(inf, blog_url,
                                                       blog_pl_name)
        if blog_pl_dups:
            log.warn('Blog platform with url %r is already inserted: %r',
                     blog_url, blog_pl_dups)
            continue
        blog_pl = models.Platform(platform_name=blog_pl_name, url=blog_url,
                                  influencer=inf)
        log.info('Created new platform %r', blog_pl)
        if to_save:
            blog_pl.save()

    opr.data = data
    time.sleep(SLEEP_AFTER_PROCESSING_BLOGGER)

def test_row(self, row):
    url = row['url']
    log.info('%s Processing', url)
    pl_candidates = models.Platform.objects.filter(url=url,
                                                   platform_name='Blogspot')
    if not pl_candidates.exists():
        log.warn('%s No Platform', url)
        if self.with_posts_only:
            return
        pl = models.Platform()
        pl.platform_name = 'Outreach'
        pl.url = row['url']
    else:
        pl = pl_candidates[0]
        if pl.posts_set.count() < 5:
            log.warn('No posts for %r', pl)
            if self.with_posts_only:
                return

    platformextractor.tlocal._latest_validated = None
    platformextractor.tlocal._latest_not_validated = None
    try:
        extracted = self.extraction_fun(platform_object=pl,
                                        to_save=False,
                                        disable_cleanup=self.procs == 1)
    except:
        log.exception('%s During platform extraction, skipping this row', url)
        self.error_urls.append(row['url'])
        return

    if platformextractor.tlocal._latest_validated is not None:
        for pl, reason in platformextractor.tlocal._latest_validated:
            self.validated[(pl.platform_name, reason)] += 1
    if platformextractor.tlocal._latest_not_validated is not None:
        for pl in platformextractor.tlocal._latest_not_validated:
            self.not_validated[pl.platform_name] += 1
            self.not_validated_pls.append((url, pl))

    log.info('%s Extracted platforms: %r', url, extracted)
    extracted_by_platform_name = defaultdict(list)
    for e in extracted:
        extracted_by_platform_name[e.platform_name].append(e)
    log.info('extracted_by_platform_name:\n%s',
             pformat(dict(extracted_by_platform_name)))
    for pname, pls in extracted_by_platform_name.items():
        if len(pls) != 1:
            log.warn('999 %r Multiple platforms for a single '
                     'platform_name %s: %s', url, pname, pls)

    for platform_name in ['Facebook', 'Twitter', 'Pinterest', 'Bloglovin',
                          'Instagram']:
        from_spreadsheet = row[platform_name].strip()
        if from_spreadsheet:
            username_from_spreadsheet = \
                platformextractor.username_from_platform_url(from_spreadsheet)
        else:
            username_from_spreadsheet = ''

        if extracted_by_platform_name[platform_name]:
            from_extracted = extracted_by_platform_name[platform_name][0]
        else:
            from_extracted = ''
        if from_extracted:
            username_extracted = \
                platformextractor.username_from_platform_url(from_extracted.url)
        else:
            username_extracted = ''

        if username_from_spreadsheet:
            username_from_spreadsheet = username_from_spreadsheet.lower()
        if username_extracted:
            username_extracted = username_extracted.lower()

        if (not username_from_spreadsheet) and (not username_extracted):
            log.info('%s *** No usernames from both sources for %s',
                     url, platform_name)
        elif username_from_spreadsheet == username_extracted:
            log.warn('%s +++ Test passed: %s: spreadsheet: %r, extracted: %r',
                     url, platform_name, username_from_spreadsheet,
                     username_extracted)
            self.found[platform_name] += 1
        elif not username_from_spreadsheet:
            log.warn('%s ??? Test unknown: %s: spreadsheet: %r, extracted: %r',
                     url, platform_name, username_from_spreadsheet,
                     username_extracted)
            self.unknown[platform_name] += 1
        elif username_from_spreadsheet and (not username_extracted):
            log.warn('%s --- Test fail: %s: spreadsheet: %r, extracted: %r',
                     url, platform_name, username_from_spreadsheet,
                     username_extracted)
            self.notfound[platform_name] += 1
        elif username_from_spreadsheet != username_extracted:
            log.warn('%s !!! Test incorrect: %s: spreadsheet: %r, '
                     'extracted: %r', url, platform_name,
                     username_from_spreadsheet, username_extracted)
            self.incorrect[platform_name] += 1