def _contains_blog_keyword_canonical_page(self, target):
    """
    This variant of _contains_blog_keyword works with the scriptless version of
    the page, if the site implements one.
    :param target:
    :return:
    """
    # If the page declares <meta name="fragment" content="!">, load the canonical
    # (escaped-fragment) version of the page and check it for blog keywords.
    if len(target.tree.xpath("//meta[@name='fragment'][@content='!']")) > 0:
        log.info('Checking canonical page for tags')
        try:
            url_parsed = urlparse.urlparse(target.url)
            new_url = '%s://%s/?_escaped_fragment_' % (url_parsed.scheme, url_parsed.netloc)
            try:
                r = requests.get(new_url, timeout=10, headers=utils.browser_headers())
            except SSLError:
                r = requests.get(new_url, timeout=10, headers=utils.browser_headers(), verify=False)
            content = r.content.lower()
            for kw in BLOG_KEYWORDS:
                log.info('Checking %s' % kw)
                if kw.lower() in content:
                    log.info('Found blog keyword %r in canonical page', kw)
                    return True
        except Exception as e:
            log.exception(e)
    return False
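# A minimal, hypothetical sketch of the escaped-fragment mapping that the check above relies on.
# Under Google's (now deprecated) AJAX-crawling scheme, a page declaring <meta name="fragment"
# content="!"> serves its pre-rendered HTML at ?_escaped_fragment_= (note the trailing '=').
# The helper below is an assumption for illustration only, not part of the codebase.
def _escaped_fragment_url(url):
    parsed = urlparse.urlparse(url)
    # e.g. http://example.com/#!/posts -> http://example.com/?_escaped_fragment_=/posts
    return '%s://%s/?_escaped_fragment_=%s' % (
        parsed.scheme, parsed.netloc, parsed.fragment.lstrip('!'))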
def get_profile(self, profile_id):
    if not profile_id.isdigit() and not profile_id.startswith('+'):
        profile_id = '+' + profile_id
    url = self.profile_url.format(profile_id)
    try:
        response = requests.get(url, headers=browser_headers())
    except SSLError:
        # retry with verify=False if we encounter an SSLError
        response = requests.get(url, headers=browser_headers(), verify=False)
    return self.parse_profile(response.content.decode('utf-8'))
def get_description(cls, url, xb=None):
    """
    Gets the description field from Youtube. For now, we just collect links to other
    platforms so that we can validate whether this url belongs to the blog.
    """
    # remove query params
    # e.g.: http://www.youtube.com/user/zoella280390?feature=mhee => http://www.youtube.com/user/zoella280390
    url = utils.remove_query_params(url)
    if url.endswith('/'):
        about_page = url + "about"
    else:
        about_page = url + "/about"
    res = set()
    try:
        r = requests.get(about_page, verify=False)
        tree = lxml.html.fromstring(r.content)
        social_links = tree.xpath('//a[contains(@class,"about-channel-link")]/@href')
        for s in social_links:
            res.add(s)
    except SSLError:
        # encountered SSLError - retrying with browser headers and verify=False
        r = requests.get(about_page, headers=utils.browser_headers(), verify=False)
        tree = lxml.html.fromstring(r.content)
        social_links = tree.xpath('//a[contains(@class,"about-channel-link")]/@href')
        for s in social_links:
            res.add(s)
    return '\n'.join(res)
def _get_twitter_page(screen_name):
    twitter_url = 'https://twitter.com/%s' % screen_name
    r = requests.get(twitter_url, headers=utils.browser_headers())
    # Poor man's throttling. Just wait 2 seconds.
    time.sleep(2)
    return r.content
def fetch_social_url(url, timeout=10):
    kwargs = {'url': url, 'timeout': timeout, 'verify': False}
    # TODO: these two lines were previously commented out
    if social_platform_name_from_url(None, url) != 'Facebook':
        kwargs['headers'] = utils.browser_headers()
    r = requests.get(**kwargs)
    return r
def fetch_title_simple(url):
    """This function uses a simpler algorithm based on BeautifulSoup to avoid
    parsing errors in lxml.
    """
    r = requests.get(url, timeout=20, headers=utils.browser_headers(), verify=False)
    soup = BeautifulSoup(r.text)
    return soup.title.string
def _get(self):
    try:
        # setting verify=False to bypass SSL certificate validation for some blogs
        # http://docs.python-requests.org/en/master/user/advanced/#ssl-cert-verification
        r = requests.get(self.url, timeout=feedparsing.FEED_FETCH_TIMEOUT,
                         headers=utils.browser_headers(), verify=False)
        r.raise_for_status()
        self.content = r.content
        self.redirected_url = r.url
    except requests.exceptions.RequestException:
        log.exception('Error in feed resolution fetching %r', self.url)
def fetch(self):
    attempts = 0
    while attempts < 3:
        try:
            try:
                r = requests.get(self.url, timeout=10, headers=utils.browser_headers())
            except SSLError:
                r = requests.get(self.url, timeout=10, headers=utils.browser_headers(), verify=False)
            self.content = r.content
            self.http_status_code = r.status_code
            self.http_headers = r.headers
            # success: force the loop to exit
            attempts = 9999
        except:
            log.exception('While fetching content for classification from: {}'.format(self.url))
            attempts += 1
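# The "retry with verify=False on SSLError" pattern appears in several fetchers above
# (get_profile, get_description, fetch). A hypothetical helper that consolidates it could
# look like the sketch below; it is an assumption, not part of the codebase.
def get_with_ssl_fallback(url, **kwargs):
    try:
        return requests.get(url, **kwargs)
    except SSLError:
        # some blogs have broken certificates; retry without verification
        kwargs['verify'] = False
        return requests.get(url, **kwargs)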
def scrape_pin_source(post_id):
    post = models.Posts.objects.get(id=int(post_id))
    r = requests.get(post.url, headers=utils.browser_headers())
    tree = lxml.html.fromstring(r.text)
    anchor_els = tree.xpath('//div[@class="sourceFlagWrapper"]/a')
    if not anchor_els:
        log.warn('No anchor els')
        return
    href = anchor_els[0].attrib.get('href')
    if not href:
        log.warn('No href')
        return
    post.pin_source = utils.remove_fragment(href)
    post.save()
    log.info('Saved pin source %r', post.pin_source)
def fetch_title(url=None, content=None):
    """This function must be given either a url or downloaded content."""
    if content is None:
        assert url is not None
        r = requests.get(url, timeout=5, headers=utils.browser_headers(), verify=False)
        content = r.text
    tree = lxml.html.fromstring(content)
    title_els = tree.xpath('//title')
    if not title_els:
        return None
    title = (title_els[0].text or '').strip()
    if not title:
        return None
    return title
def read(self):
    # TODO: raising a chardet unicode detection error on 404 feed responses
    # ValueError: Expected a bytes object, not a unicode object
    # e.g. http://www.hautemimi.com/feed/asdfasdfas
    try:
        # We encountered this kind of 'smart' feed: http://feeds.feedblitz.com/freebiefindingmom
        # It renders an html page when it sees a browser user-agent, otherwise it serves an xml Atom feed.
        # So we check the url's domain and behave accordingly.
        # setting verify=False to bypass SSL certificate validation for some blogs
        # http://docs.python-requests.org/en/master/user/advanced/#ssl-cert-verification
        is_feedblitz = urlparse(self.url).netloc == 'feeds.feedblitz.com'
        r = requests.get(
            self.url,
            timeout=FEED_FETCH_TIMEOUT,
            headers=None if is_feedblitz else utils.browser_headers(),
            verify=False)
        # extra check in case we were redirected to feedblitz
        if not is_feedblitz and urlparse(r.url).netloc == 'feeds.feedblitz.com':
            self.url = r.url
            r = requests.get(self.url, timeout=FEED_FETCH_TIMEOUT, verify=False)
        r.raise_for_status()
        self.content = r.content
        self.headers = r.headers
    except requests.RequestException:
        self.content = b''
        self.headers = {}
    return self.content
def is_url_a_blog(self, url=None, profile=None):
    """
    Checks if the url is a blog.
    :param url:
    :return:
    """
    log.info('Checking if url is blog: %s' % url)
    # checking that the url is real and reachable
    try:
        resp = requests.get(url=url, headers=browser_headers(), timeout=15, verify=False)
        if resp.status_code < 400:
            # looks like this url is OK
            url_parsed = urlparse(resp.url)
            dmn = url_parsed.netloc
            if dmn.lower().endswith('.livejournal.com'):
                # looks like we found a LiveJournal blog
                return True, resp.url
            # normalizing domain for blogspot
            if 'blogspot.com' in dmn:
                dmn = '%s.blogspot.com' % dmn.split('.blogspot.com')[0]
            if dmn.endswith('.blogspot.com') or dmn.endswith('.wordpress.com'):
                # a blogspot or wordpress url at this point is considered the best hit
                return True, url_parsed._replace(netloc=dmn).geturl()
            if '<!-- This is Squarespace. -->' in resp.content:
                # looks like we found a Squarespace blog
                return True, resp.url
            # checking if 'blog' is in the root (2nd level) domain
            root_domain = dmn.split('.')[-2] if len(dmn.split('.')) >= 2 else None
            if root_domain is not None and 'blog' in root_domain:
                # high chance that this is a blog
                return True, resp.url
            if profile is not None:
                # if the liketoknow hashtag appears in the profile's description, then it's a blogger for sure
                desc = profile.get_description_from_api()
                if '#liketoknow' in desc.lower():
                    return True, resp.url
            classification = self.url_classifier.classify(url=resp.url)
            if classification == 'blog':
                # our classifier says this is a blog
                return True, resp.url
            # TODO: Unreliable? Maybe use some regexp like */*blog*/* on the path (not for now)?
            # if 'blog' in dmn or '/blog/' in url_parsed.path:
            #     # Looks like it is some blog?
            #     best_result = url_parsed.geturl()
            #     result.add(dmn)
            #     break
            return None, None
        else:
            # skipping urls that do not resolve to a working page
            log.info('url %s returned %s code, skipping it' % (url, resp.status_code))
            return None, None
    except Exception as e:
        log.exception(e)
    return None, None
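# Usage sketch (hedged): is_url_a_blog returns an (is_blog, resolved_url) pair, and both values
# are None when the url is unreachable or not recognized as a blog. The 'checker' instance below
# is an assumption for illustration only.
# is_blog, blog_url = checker.is_url_a_blog(url='http://someblog.blogspot.com/')
# if is_blog:
#     log.info('Detected blog at %s', blog_url)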
def _do_import_from_blogger_profile(blogger_profile_url, opr, to_save=True):
    log.info('Processing profile %r', blogger_profile_url)
    r = requests.get(blogger_profile_url, headers=utils.browser_headers(), proxies=get_proxy_config())
    blogurls_names = []
    if utils.domain_from_url(r.url) == 'plus.google.com':
        gplus_user_id = r.url.rstrip('/').split('/')[-1]
        gplus_user = requests.get(GOOGLE_PLUS_PEOPLE_TEMPLATE.format(user_id=gplus_user_id)).json()
        log.info('Got gplus data:\n%s', pprint.pformat(gplus_user))
        if not gplus_user.get('urls'):
            log.warn('No gplus urls')
            return
        blog_url = gplus_user['urls'][0]['value']
        name = gplus_user['displayName']
        log.info('Gplus url and name: %r %r', blog_url, name)
        blogurls_names.append((blog_url, name))
    else:
        tree = lxml.html.fromstring(r.content)
        name_els = tree.xpath('//div[@class="vcard"]//h1')
        if not name_els:
            log.warn('No name els')
            name = None
        else:
            name = name_els[0].text.strip()
            if not name:
                log.warn('Empty name')
        log.info('Blogger name: %r', name)
        blog_url_els = tree.xpath('//a[contains(@rel, "contributor-to")]')
        if not blog_url_els:
            log.warn('No blog url')
            utils.write_to_file('/tmp/last_no_blog.html', r.text)
            blog_url = None
            if r.text.strip().lower() == 'proxy authorization required':
                raise Exception('Proxy error')
        else:
            for el in blog_url_els:
                blog_url = el.attrib['href'].strip()
                log.info('Blog url: %r', blog_url)
                blogurls_names.append((blog_url, name))
        if ALSO_CRAWL_OTHER_BLOGS_FOLLOWED:
            observed_els = tree.xpath('//li[@class="sidebar-item"]/a')
            for el in observed_els:
                blogurls_names.append((el.attrib.get('href'), None))
    log.info('Collected blogurls_names: %r', blogurls_names)
    data = {'inf_id_existing': [], 'inf_id_created': []}
    for blog_url, name in blogurls_names:
        if not blog_url:
            continue
        blog_pl_name = fetcher.create_platforms_from_urls([blog_url], True)[0].platform_name
        dup_infs = models.Influencer.find_duplicates(blog_url, exclude_blacklisted=False)
        if helpers.all_blacklisted(dup_infs):
            log.error('All duplicate influencers blacklisted for url %r, not importing', blog_url)
            continue
        if dup_infs:
            inf = helpers.select_valid_influencer(dup_infs)
            log.warn('Existing inf found: %r', inf)
            data['inf_id_existing'].append(inf.id)
        else:
            inf = models.Influencer(blog_url=blog_url, name=name, source='comments_import')
            log.info('Created new influencer %r', inf)
            data['inf_id_created'].append(inf.id)
            if to_save:
                inf.save()
        blog_pl_dups = models.Platform.find_duplicates(inf, blog_url, blog_pl_name)
        if blog_pl_dups:
            log.warn('Blog platform with url %r is already inserted: %r', blog_url, blog_pl_dups)
            continue
        blog_pl = models.Platform(platform_name=blog_pl_name, url=blog_url, influencer=inf)
        log.info('Created new platform %r', blog_pl)
        if to_save:
            blog_pl.save()
    opr.data = data
    time.sleep(SLEEP_AFTER_PROCESSING_BLOGGER)
def create_profile(self, url=None, tag=None, category=None, pipeline_class=None, **kwargs):
    """
    Creates a profile from an Instagram post url or a raw content object (in the future, if needed).
    """
    def append_hashtags_mentions_commentors_to_description(profile, hashtags, mentions, commentors):
        if not hashtags:
            hashtags = []
        if not mentions:
            mentions = []
        if not commentors:
            commentors = []
        for h in hashtags:
            log.info("checking hashtags %r" % h)
            if not profile.profile_description:
                # print("Adding hashtag from own comment: %r" % h)
                profile.profile_description = h
            if profile.profile_description and h not in profile.profile_description:
                # print("Adding hashtag from own comment: %r" % h)
                profile.profile_description += ' ' + h
        for m in mentions:
            log.info("checking mentions %r" % m)
            if not profile.profile_description:
                # print("Adding mentions from own comment: %r" % m)
                profile.profile_description = '@' + m
            if profile.profile_description and '@' + m not in profile.profile_description:
                # print("Adding mentions from own comment: %r" % m)
                profile.profile_description += ' @' + m
        # save commentors as well, prefixed with the !*_<username> marker
        for c in commentors:
            log.info("checking commentor %r" % c)
            if not profile.profile_description:
                # print("Adding commentor: %r" % c)
                profile.profile_description = '!*_' + c
            if profile.profile_description and '!*_' + c not in profile.profile_description:
                # print("Adding commentor: %r" % c)
                profile.profile_description += ' !*_' + c
        profile.save()

    def find_hashtags_mentions_commentors_in_comments(post_username, post_user_id, comments):
        hashtags = []
        mentions = []
        commentors = set()
        for j in comments:
            m = j['user']['id']
            commentor = j['user']['username']
            commentors.add(commentor)
            content = ''
            if post_user_id == m:
                content += j['text']
            if len(content) > 0:
                print("Content = %r" % content)
                if xutils.is_html(content):
                    cleaned_content = xutils.strip_html_tags(content)
                    print("Needed to clean it up, it's now: %r" % cleaned_content)
                else:
                    cleaned_content = content
                set1 = find_hashtags(cleaned_content)
                set2 = find_mentions(cleaned_content)
                if set1:
                    hashtags.extend(set1)
                if set2:
                    mentions.extend(set2)
        return hashtags, mentions, commentors

    log.info("Scraping url: %s" % url)
    # getting category from kwargs, getting tag from kwargs, otherwise detecting it from the url
    # category = kwargs.get('category', None)
    # tag = kwargs.get('tag', None)
    if not tag:
        # try to get the tag from the url
        if 'tagged' in url:
            loc = url.find('tagged=') + len('tagged=')
            tag = url[loc:]
            log.info("No tag given, but found tag = %s from url %s" % (tag, url))
    # getting page's content
    r = requests.get(url, headers=utils.browser_headers())
    # Poor man's throttling. Just wait 2 seconds.
    time.sleep(2)
    # TODO: need some check of the requests result
    # getting instagram data, the post's mention and the creator's id
    soup = BeautifulSoup(r.content)
    instagram_data = self.__extract_instagram_data(soup)
    owner_data = instagram_data.get('PostPage')[0].get('media').get('owner')
    mention = owner_data.get('username')
    post_creator_id = owner_data.get('id')
    log.info("In %r found mention: %s and tag: %r and category: %s" % (url, mention, tag, category))
    # creating a pending profile using mention, tag and category
    res, created = self.create_pending_profile(mention, tag)
    log.info('PROFILE_CHECK_01 created=%s id=%s date_created=%s' % (created, res.id, res.date_created))
    if res.friends_count and res.friends_count < MINIMUM_FRIENDS_COUNT:
        log.info("Small number of followers %d (less than %s) for %s, so returning"
                 % (res.friends_count, MINIMUM_FRIENDS_COUNT, res.username))
        return res, None
    # get hashtags & mentions from the caption
    caption = instagram_data.get('PostPage')[0].get('media').get('caption', None)
    hashtags_in_caption = find_hashtags(caption)
    mentions_in_caption = find_mentions(caption)
    append_hashtags_mentions_commentors_to_description(res, hashtags_in_caption, mentions_in_caption, None)
    # get hashtags & mentions from comments made by the author herself (very common)
    comments = instagram_data.get('PostPage')[0].get('media').get('comments').get('nodes')
    hashtags_in_comments, mention_in_comments, commentors = find_hashtags_mentions_commentors_in_comments(
        mention, post_creator_id, comments)
    append_hashtags_mentions_commentors_to_description(res, hashtags_in_comments, mention_in_comments, commentors)
    if category:
        if res.tags and category in res.tags:
            log.info("Category %r already exists in %r, let's not do more analysis" % (category, res))
            return res, commentors
        # save the tag as well as the hashtag
        res.append_tag(category)
    if tag:
        append_hashtags_mentions_commentors_to_description(res, [tag], [], [])
    MentionInPost.objects.filter(platform_name='Instagram', mention=mention).update(influencer_imported=True)
    # If this profile was freshly created, has the required prerequisites (desired number of followers, etc),
    # its hashtags and mentions were set, and a pipeline_class was provided,
    # then we run the Pipeline for this profile.
    if created and pipeline_class is not None:
        try:
            # adding a pipeline tag so we know which pipeline the profile came from
            res.append_tag('PIPELINE_%s' % pipeline_class)
            # getting the pipeline class by its name
            log.info('Loading pipeline %s for profile %s' % (pipeline_class, res.id))
            pipeline_cls = locate('social_discovery.pipelines.%s' % pipeline_class)
            # creating an object of that class
            pipeline = pipeline_cls()
            log.info('Running pipeline %s for profile %s' % (pipeline_class, res.id))
            # calling the required function with appropriate params
            pipeline.run_pipeline(res.id)
        except KeyError:
            log.error('Pipeline %s not found' % pipeline_class)
    log.info('PROFILE_CHECK_02 created=%s id=%s date_created=%s' % (created, res.id, res.date_created))
    return res, commentors
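# find_hashtags / find_mentions are used above but not defined in this section. A minimal
# regex-based sketch of what they might look like (an assumption, not the project's actual
# implementation):
import re

def find_hashtags(text):
    # e.g. '#springlook outfit' -> ['springlook']
    return re.findall(r'#(\w+)', text or '')

def find_mentions(text):
    # e.g. 'shot by @zoella' -> ['zoella']
    return re.findall(r'@(\w+)', text or '')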
def get_youtube_channel_for_url(url=None):
    """
    Returns a youtube channel url for a given video url, if it is a valid youtube video url.

    Channel urls look like:
        https://www.youtube.com/channel/UCIzI6LQzuudmdbtXhuZpBmA/videos
        http://www.youtube.com/user/raechelmyers/videos
        https://www.youtube.com/c/Minimalistbaker

    Video urls look like:
        https://www.youtube.com/watch?v=Fky6hpTlBZU
        https://youtu.be/Fky6hpTlBZU
        http://y2u.be/Fky6hpTlBZU

    :param url:
    :return:
    """
    good_video_urls_regexp = r'(?:.*youtube.com\/watch\?.+|.*youtu.be\/.+|.*y2u.be\/.+)'
    good_channel_urls_regexp = r'(?:.*youtube.com\/channel\/.+|.*youtube.com\/user\/.+|.*youtube.com\/c\/.+)'
    if url is None:
        return None
    elif re.match(good_channel_urls_regexp, url):
        # it is already a channel url
        return url
    elif re.match(good_video_urls_regexp, url):
        # it is a video url, so fetch the channel's url from the video page
        import requests
        import lxml.html
        # need headers={...}/verify=False, otherwise it raises SSLError:
        # bad handshake: Error([('SSL routines', 'SSL3_GET_SERVER_CERTIFICATE', 'certificate verify failed')],)
        # and does not return a usable response.
        r = requests.get(url, headers=utils.browser_headers(), verify=False)
        tree = lxml.html.fromstring(r.content)
        elems = tree.xpath("//div[@class='yt-user-info']/a")
        if elems and len(elems) > 0:
            elem = elems[0]
            v = elem.attrib.get('href')
            if v:
                channel_url = "https://www.youtube.com" + v
                return channel_url
        return None
        # Previous XBrowser-based implementation:
        # with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
        #                        load_no_images=True, timeout=10) as xb:
        #     # setting timeouts on the xb instance
        #     xb.driver.set_script_timeout(5)
        #     xb.driver.implicitly_wait(5)
        #     xb.driver.get(url)
        #     module_time.sleep(2)
        #     channel_node = xb.driver.find_element_by_xpath("//div[@class='yt-user-info']/a")
        #     channel_node.click()
        #     module_time.sleep(2)
        #     channel_url = xb.driver.current_url
        #     return channel_url
    else:
        return None
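# Usage sketch for get_youtube_channel_for_url (hedged; the channel url returned for a video
# depends on the live page markup, so the second result is only illustrative):
# get_youtube_channel_for_url('https://www.youtube.com/c/Minimalistbaker')
#     -> 'https://www.youtube.com/c/Minimalistbaker'  (already a channel url, returned as-is)
# get_youtube_channel_for_url('https://youtu.be/Fky6hpTlBZU')
#     -> channel url scraped from the video page, e.g. a https://www.youtube.com/user/... url
# get_youtube_channel_for_url('https://example.com/not-youtube')
#     -> None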