def extract_emails_from_platform(platform_id=None, platform_object=None, to_save=True, disable_cleanup=False):
    assert platform_id is not None or platform_object is not None
    pl = models.Platform.objects.get(id=int(platform_id)) \
        if platform_id is not None \
        else platform_object
    try:
        with platformutils.OpRecorder('extract_emails_from_platform', platform=pl) as opr:
            with xbrowsermod.XBrowser(
                    headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                    disable_cleanup=disable_cleanup) as xb:
                found_emails = []

                ee1 = FromAboutPagesExtractor(xb, pl)
                ee1.update_influencers_email(to_save=to_save)
                found_emails += ee1.found_emails

                ee2 = FromCommonPostsExtractor(xb, pl)
                ee2.update_influencers_email(to_save=to_save)
                found_emails += ee2.found_emails

                found_emails = filter_emails(found_emails)
                opr.data = {'found_emails': found_emails}
                return found_emails
    except Exception as e:
        log.exception(e, extra={
            'platform_id': platform_id,
            'to_save': to_save,
            'disable_cleanup': disable_cleanup,
        })

def get_blog_url_from_liketoknowit(liketoknowit_url=None, xb=None):
    """
    Extracts the user's blog url from her http://liketoknow.it/<username> page.
    :param liketoknowit_url: url to the liketoknowit page
    :return: blog url
    """
    def get_the_blog_url(xb, liketoknowit_url):
        xb.load_url(liketoknowit_url)
        anchors = WebDriverWait(xb.driver, 10).until(
            lambda _: xb.els_by_xpath('//publisher-header//h5//a'))
        anchors = [a for a in anchors if a.get_attribute('href')]
        urls = utils.unique_sameorder(a.get_attribute('href') for a in anchors)
        return urls[0] if len(urls) > 0 else None

    if liketoknowit_url is None:
        return None
    try:
        if xb is None:
            with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
                return get_the_blog_url(xb, liketoknowit_url)
        else:
            return get_the_blog_url(xb, liketoknowit_url)
    except Exception as e:
        log.exception(e, extra={'url': liketoknowit_url})
        return None

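# Usage sketch (not part of the original module): resolving a liketoknow.it page to a
# blog url. The import path below matches the one referenced elsewhere in this codebase;
# the example username is hypothetical.
#
#   from platformdatafetcher.producturlsextractor import get_blog_url_from_liketoknowit
#   blog_url = get_blog_url_from_liketoknowit('http://liketoknow.it/someusername')
#   if blog_url:
#       print 'resolved blog url: %s' % blog_url
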
def _create_scraper(self, product):
    op_recorder = ScraperOpRecorder(product)
    if self.reuse_xbrowser and getattr(xbrowser_storage, 'xbrowser', None):
        log.info('Reusing xbrowser')
        self.xbrowser = xbrowser_storage.xbrowser
    else:
        log.debug('Creating new xbrowser')
        try:
            self.xbrowser = xbrowser.XBrowser(
                driver=self.driver,
                headless_display=self.headless_display,
                auto_refresh=True if self.reuse_xbrowser else False)
        except:
            op_recorder.exception()
            raise
        if self.reuse_xbrowser:
            xbrowser_storage.xbrowser = self.xbrowser
    self.xbrowser.load_url(product.prod_url)
    if self.sleep_after_load:
        log.debug('Sleeping %d seconds before scraping', self.sleep_after_load)
        time.sleep(self.sleep_after_load)
        log.debug('Finished sleeping')
    if self.reuse_xbrowser:
        log.debug('Reusing xbrowser -- forcing a refresh.')
        self.xbrowser.xrefresh()
        if self.sleep_after_load:
            log.debug('Sleeping %d seconds before scraping', self.sleep_after_load)
            time.sleep(self.sleep_after_load)
            log.debug('Finished sleeping')
    self.scraper = scrapermod.Scraper(self.xbrowser, op_recorder)

def visit_url(url):
    xb = None
    try:
        xb = xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY)
        xb.load_url('http://app.theshelf.com/internal/')
        xb.driver.execute_script("""var a = document.createElement('a');
            a.href='%s';
            a.id = 'blogvisitor_to_click';
            a.innerHTML = 'I will click this';
            document.body.appendChild(a);""" % url.replace("'", "\\'"))
        a = xb.driver.find_element_by_id('blogvisitor_to_click')
        a.click()
        xb.driver.back()
    except:
        log.exception('visit_url(url={}) got an exception'.format(url), extra={'url': url})
    finally:
        try:
            if xb:
                xb.cleanup()
        except:
            log.exception('visit_url(url={}) got an exception while xb.cleanup()'.format(url))

def _update_num_following(self):
    try:
        with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                               load_no_images=True) as xb:
            xb.load_url('https://www.youtube.com/{0}/channels?flow=grid&view=56'.format(
                self.platform.validated_handle))
            while True:  # potentially infinite loop?
                no_break = False
                try:
                    button = WebDriverWait(xb.driver, 10).until(
                        expected_conditions.presence_of_element_located(
                            (By.CLASS_NAME, 'load-more-button')))
                    button.click()
                    no_break = True
                    continue
                finally:
                    if not no_break:
                        break
            self.platform.num_following = len(xb.execute_jsfun(
                '_XPS.evaluateXPath',
                '//li[contains(@class, "channels-content-item")]'))
            self.platform.save()
    except Exception as e:
        log.exception(e)

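# Minimal sketch of the "click load-more until it disappears" loop used above, written
# against plain Selenium (assumes a webdriver instance named `driver`; the class name
# 'load-more-button' is YouTube-specific and may no longer exist on the live site):
#
#   from selenium.webdriver.support.ui import WebDriverWait
#   from selenium.webdriver.support import expected_conditions
#   from selenium.webdriver.common.by import By
#   from selenium.common.exceptions import TimeoutException
#
#   while True:
#       try:
#           button = WebDriverWait(driver, 10).until(
#               expected_conditions.presence_of_element_located(
#                   (By.CLASS_NAME, 'load-more-button')))
#           button.click()
#       except TimeoutException:
#           break  # no more pages to load
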
def extract_links_from_platform_url(platform_id):
    try:
        pl = models.Platform.objects.get(id=int(platform_id))
        xb = xbrowsermod.XBrowser(pl.url)
        le = LinksFromPlatformUrlExtractor(xb, pl)
        le.extract_links()
    except Exception as e:
        log.exception(e, extra={'platform_id': platform_id})

def fetch_for_posts(cls, posts):
    try:
        with xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                                  load_no_images=True) as xb:
            for post in posts:
                with platformutils.OpRecorder(operation='{0}_for_post'.format(cls.__name__.lower()),
                                              post=post):
                    fetcher = cls(xb, post)
                    yield fetcher.fetch_interactions()
    except Exception as e:
        log.exception(e, extra={'posts_len': len(posts)})

def update_single_sponsorship(self, sponsorshipinfo_id):
    try:
        sp = debra.models.SponsorshipInfo.objects.get(id=sponsorshipinfo_id)
        with platformutils.OpRecorder(operation='update_single_sponsorship', post=sp.post) as opr:
            with xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
                f = WIDGET_TYPE_TO_SPONSORSHIP_FETCHER_CLASS[sp.widget_type](xb, sp.post)
                si = f.fetch_sponsorship(True)
                detect_sidebar_sponsorships(si)
    except SoftTimeLimitExceeded as exc:
        self.retry(exc=exc)

def run():
    profiles = set()
    for location in LOCATIONS:
        xb = xbrowsermod.XBrowser(
            headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
            disable_cleanup=False)
        xb.load_url(ANGEL_LIST_INVESTOR_HOME)
        # now find the button to enter location
        print "now trying location %s" % location
        try:
            loc = xb.el_by_xpath('//input[@placeholder="Add Location"]')
            loc.send_keys(location)
            sleep(5)
            loc.send_keys(Keys.RETURN)
            sleep(5)
        except:
            print "oops, error"
            raise
        count = 0
        while count < 50:
            try:
                profile_links = xb.els_by_xpath('//a[@class="profile-link"]')
                print "Got %d links " % len(profile_links)
                for p in profile_links:
                    u = p.get_attribute('href')
                    v = AngelListProfile.objects.get_or_create(url=u)[0]
                    profiles.add(v)
                print "We have now %d AngelListProfiles" % AngelListProfile.objects.count()
                more_link = xb.els_by_xpath(
                    '//div[@id="more_pagination_button_people_items"]/div[@class="wrapper"]')
                print "Got %d more links" % len(more_link)
                if len(more_link) > 0:
                    more_link = more_link[0]
                    more_link.click()
                    print "clicking on the more link"
                count += 1
                sleep(10)
            except:
                break
        try:
            xb.cleanup()
        except:
            pass
    for prof in profiles:
        # we're going to use the API to find this information
        fetch_profile_details(prof)
        # now sleep enough to make sure we're < 1000 API calls/hour
        sleep(10)

def resolve_redirect_using_xbrowser(url, to_sleep=5):
    from xpathscraper import xbrowser as xbrowsermod
    try:
        with xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
            xb.load_url(url)
            time.sleep(to_sleep)
            return xb.driver.current_url
    except:
        log.exception('While resolve_redirect_using_xbrowser for %r', url)
        return url

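# Usage sketch (the shortened url below is hypothetical): the function falls back to
# returning the input url on any failure, so callers can use the result unconditionally.
#
#   final_url = resolve_redirect_using_xbrowser('http://bit.ly/example', to_sleep=3)
#   log.info('redirect target: %r', final_url)
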
def print_navigation_links(url):
    from xpathscraper import xbrowser as xbrowsermod
    with xbrowsermod.XBrowser(headless_display=False, disable_cleanup=True) as xb:
        xb.load_url(url)
        clusters = find_navigation_links_clusters(xb)
        for cluster in clusters:
            print '\n'
            for el in cluster:
                print el.get_attribute('href')

def extract_hire_me_links(platform_id):
    pl = models.Platform.objects.get(id=int(platform_id))
    try:
        with platformutils.OpRecorder(operation='extract_hire_me_links', platform=pl) as opr:
            with xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xbrowser:
                ext = HireMeLinksExtractor(pl, xbrowser)
                lfps = ext.extract_links(to_save=True)
                opr.data = {'extracted': len(lfps)}
    except Exception as e:
        log.exception(e, extra={'platform_id': platform_id})

def sponsorship_from_url(widget_type, url, to_save='0'):
    try:
        xb = xbrowsermod.XBrowser()
        post = debra.models.Posts.objects.filter(url=url)[0]
        rf = WIDGET_TYPE_TO_SPONSORSHIP_FETCHER_CLASS[widget_type](xb, post)
        sp = rf.fetch_sponsorship(int(to_save))
        print sp
    except Exception as e:
        log.exception(e, extra={'widget_type': widget_type, 'url': url, 'to_save': to_save})
        return None

def fetch_urls_and_scroll(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY):
    for url in URLS_TO_FETCH_AND_SCROLL:
        try:
            with xbrowser.XBrowser(headless_display=False,
                                   extra_js_files=['cachewarming.js']) as xb:
                xb.load_url(url)
                xb.execute_jsfun('_CW.scroll')
                time.sleep(120)
        except:
            log.exception('While getting %r', url)
            continue
        log.info('Fetched %r successfully', url)

def visit_page(page_url):
    log.info('* Opening {} with Selenium...'.format(page_url))
    with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
        xb.driver.set_page_load_timeout(60)
        xb.driver.set_script_timeout(60)
        xb.driver.implicitly_wait(10)
        try:
            xb.load_url(page_url)
        except:
            # NOTE: 'pa' (the PostAnalytics record) is expected to be available in the enclosing scope.
            send_admin_email_via_mailsnake(
                "'influencer_tracking_verification' Selenium exception for PostAnalytics={} (url={})".format(
                    pa.id, page_url),
                '<br />'.join(traceback.format_exc().splitlines()))

def extract_product_urls(self, url):
    try:
        with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
            xb.load_url(url)
            anchors = WebDriverWait(xb.driver, 10).until(
                lambda _: xb.els_by_xpath('//div[@class="hoverflow"]//a'))
            anchors = [a for a in anchors if a.get_attribute('href') and
                       utils.domain_from_url(a.get_attribute('href')) == 'rstyle.me']
            urls = utils.unique_sameorder(a.get_attribute('href') for a in anchors)
            return urls
    except Exception as e:
        log.exception(e, extra={'url': url})
        return None

def detect_gender(product_url):
    """Returns 'men', 'women' or 'unknown'"""
    log.info('Detecting gender for %r', product_url)
    texts = []
    with xbrowsermod.XBrowser(url=product_url, headless_display=False, disable_cleanup=True) as xbrowser:
        url = xbrowser.driver.current_url
        log.info('Current url: %r', url)
        texts.append(url)

        scraper = scrapermod.Scraper(xbrowser)
        name_srs = scraper.get_name_xpaths()
        evaluator = scrapingresults.ResultEvaluator(scraper)
        name = evaluator.compute_values(name_srs[0], 'name')
        log.info('Found name: %r', name)
        texts.append(name)

        title = xbrowser.driver.title
        log.info('Found title: %r', title)
        texts.append(title)

        description_els = xbrowser.driver.find_elements_by_xpath('//meta[@name="description"]')
        if description_els:
            description = description_els[0].get_attribute('content')
            log.info('Found description: %r', description)
            texts.append(description)

        keywords_els = xbrowser.driver.find_elements_by_xpath('//meta[@name="keywords"]')
        if keywords_els:
            keywords = keywords_els[0].get_attribute('content')
            log.info('Found keywords: %r', keywords)
            texts.append(keywords)

    kcs = []
    for text in texts:
        kc = count(text)
        log.info('%r %r', kc, text)
        kcs.append(kc)
    res = sum_keyword_counts(kcs)
    log.info('Result: %s', res)
    if res.men > res.women:
        return 'men'
    if res.men == res.women:
        return 'unknown'
    return 'women'

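# Usage sketch (hypothetical product url): detect_gender only ever returns
# 'men', 'women' or 'unknown', so callers can branch on the string directly.
#
#   gender = detect_gender('http://www.example-shop.com/products/12345')
#   if gender == 'unknown':
#       log.info('Could not determine gender from the collected page texts')
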
def _contains_checkout_or_addtocart(self, url):
    try:
        with xbrowser.XBrowser(url=url, headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
            scraper = scrapermod.Scraper(xb)
            if scraper.get_checkoutbutton_xpaths():
                log.info('%r contains checkoutbutton', url)
                return True
            if scraper.get_addtocart_xpaths():
                log.info('%r contains addtocart', url)
                return True
            log.info('%r does not contain addtocart or checkoutbutton', url)
            return False
    except Exception as e:
        log.exception(e, exc_info=1, extra={'url': url})

def collect_urls_from_google(query, pages):
    log.info('Collecting results for query %r', query)
    urls = []
    try:
        with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
            g = GoogleScraper(xb)
            for page_no, results in enumerate(g.search(query, pages)):
                log.info('%d results from page %d', len(results), page_no)
                urls.extend(results)
                time.sleep(random.randrange(1, 5))
    except Exception as e:
        log.exception('While collecting urls, returning what is collected so far: %s' % e,
                      extra={'query': query})
    log.info('Total results for query %r: %d', query, len(urls))
    return urls

def get_twitter_profiles_with_bio(bio_search_query, page=0):
    full_query = (
        "site:twitter.com bio:*%s* "
        "-inurl:status "
        "-inurl:hashtag "
        "-inurl:lists "
        "-inurl:blog.twitter.com "
        "-intitle:google"
    ) % bio_search_query
    with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                           load_no_images=True) as xb:
        searcher = GoogleSearcher(xb)
        searcher.search(full_query)
        if page > 0:
            searcher.goto_page(page)
        results = searcher.get_current_results()
        twitter_profiles = [twitter_utils.screen_name_for_url(result) for result in results]
        return [profile for profile in twitter_profiles if profile is not None]

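# Usage sketch: for bio_search_query='fashion blogger' the constructed Google query is
#   site:twitter.com bio:*fashion blogger* -inurl:status -inurl:hashtag -inurl:lists
#   -inurl:blog.twitter.com -intitle:google
# and the function returns bare screen names (None entries are filtered out):
#
#   screen_names = get_twitter_profiles_with_bio('fashion blogger', page=0)
#   for name in screen_names:
#       print name
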
def search_for_sponsorship(self, post_id):
    res = []
    try:
        post = debra.models.Posts.objects.get(id=post_id)
        with platformutils.OpRecorder(operation='search_for_sponsorship', post=post) as opr:
            with xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
                for f_cls in SPONSORSHIP_FETCHER_CLASSES:
                    f = f_cls(xb, post)
                    try:
                        fres = f.fetch_sponsorship(True)
                        if fres is not None:
                            res.append(fres)
                            detect_sidebar_sponsorships(fres)
                    except:
                        log.exception('While search_for_sponsorship')
    except SoftTimeLimitExceeded as exc:
        self.retry(exc=exc)
    return res

def handle_blog(blog):
    platform = _find_platform(blog['blog_url'])
    if platform is None:
        models.OperationStatus.inc('custom_blog', blog['blog_url'], 'init_platform', 'notfound', None)
        return
    ft = models.FetcherTask.objects.create(
        platform=platform,
        started=datetime.datetime.now(),
        server_ip=utils.get_ip_address(),
        process_pid=str(os.getpid()),
        policy_name='custom_blog',
    )
    xb = None
    counts = {}
    try:
        xb = xbrowser.XBrowser(
            headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
            width=1200, height=800)
        log.info('Processing blog %r', blog)
        counts = _do_handle_blog(blog, xb)
    except Exception as e:
        log.exception('While processing %r', blog)
        try:
            db.close_connection()
        except:
            log.exception('While resetting connection')
        models.OperationStatus.inc('custom_blog', blog.get('blog_url'), 'processing', 'exception',
                                   e.__class__.__name__)
    finally:
        if xb is not None:
            try:
                xb.cleanup()
            except:
                log.exception('While xb.cleanup(), ignoring')
        ft.duration = (datetime.datetime.now() - ft.started).total_seconds()
        ft.posts_saved = counts.get('posts_saved')
        ft.pis_saved = counts.get('pis_saved')
        ft.save()

def get_product_urls(post_id):
    """This method fetches the product URLs contained inside the widgets."""
    post = debra.models.Posts.objects.get(id=post_id)
    if post.platform.is_social:
        log.debug("Post %r is from social platform, so no need to search for iframe based widgets" % post)
        return set()
    search_for_sponsorship(post_id)
    #widgets = debra.models.SponsorshipInfo.objects.filter(post__id=post_id)
    widgets = debra.models.SponsorshipInfo.objects.filter(
        post__id=post_id, widget_type__in=['rstyle', 'shopstyle', 'shopstyle2'])
    widgets = widgets.exclude(sidebar=True)
    url_set = set()
    if widgets.exists():
        for w in widgets:
            xb = None
            try:
                xb = xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY)
                xb.load_url(w.url)
                url_xpath = '//a'
                if w.base_xpath:
                    url_xpath = w.base_xpath + url_xpath
                log.info('Using url xpath %r for widget %r', url_xpath, w)
                url_elements = xb.els_by_xpath(url_xpath)
                for u in url_elements:
                    if u.get_attribute('href'):
                        url_set.add(u.get_attribute('href'))
                xb.cleanup()
            except Exception as e:
                log.exception("Exception occurred while parsing product url: %s" % e,
                              extra={'post_id': post_id, 'url': w.url})
                if xb:
                    try:
                        xb.cleanup()
                    except Exception as e:
                        log.exception(e)
    return url_set

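# Usage sketch (hypothetical post id): aggregating widget product links for one post.
#
#   product_urls = get_product_urls(1234567)
#   log.info('Found %d product urls', len(product_urls))
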
def search_infs_by_giveaways(pages=20):
    brands = models.Brands.objects.filter(supported=True).order_by('id')[12:13]
    for brand in brands:
        for q in GOOGLE_QUERIES:
            q = q.format(brand=brand)
            log.info('Searching: %r', q)
            try:
                with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
                    g = GoogleScraper(xb)
                    it = g.search(q, pages)
                    for results in it:
                        for url in results:
                            try:
                                if utils.domain_from_url(url) in import_from_blog_post.exclude_domains_set:
                                    log.warn('%r is blacklisted', url)
                                    continue
                                dups = models.Influencer.find_duplicates(url)
                                log.info('%r dups: %s', url, dups)
                                if not dups:
                                    log.info('YES_CREATE %r', url)
                                    new_inf = helpers.create_influencer_and_blog_platform(
                                        url, 'google', platform_name_fallback=True)
                                    log.info('Created influencer: %r', new_inf)
                                else:
                                    log.info('NO_CREATE %r', url)
                            except:
                                log.exception('While processing url %r, skipping', url)
            except Exception as e:
                log.exception('For brand %r got exception: %s' % (brand, e), extra={'pages': pages})

def extract_common_links_from_platform(platform_id):
    pl = models.Platform.objects.get(id=int(platform_id))
    with platformutils.OpRecorder(operation='extract_common_links_from_platform', platform=pl) as opr:
        old_links_q = pl.sourcelink_set.filter(kind__startswith='common') | \
            pl.sourcelink_set.filter(kind__startswith='navigation')
        log.info('Deleting %d old links', old_links_q.count())
        old_links_q.delete()
        lfps = []
        ext = CommonLinksExtractor(pl)
        lfps += ext.extract_links(to_save=True)
        try:
            with xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xbrowser:
                xbrowser.load_url(pl.url)
                ext = NavigationLinksExtractor(pl, xbrowser)
                lfps += ext.extract_links(to_save=True)
        except Exception as e:
            log.exception(e, extra={'platform_id': platform_id})
        opr.data = {'extracted': len(lfps)}

def _do_fetch_posts(self, max_pages=None):
    res = []
    try:
        with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                               load_no_images=True) as xb:
            videos_list_url = 'https://www.youtube.com/{0}/videos'.format(
                self.platform.validated_handle)
            xb.load_url(videos_list_url)
            while True:  # potentially infinite loop?
                no_break = False
                try:
                    button = WebDriverWait(xb.driver, 4).until(
                        expected_conditions.presence_of_element_located(
                            (By.CLASS_NAME, 'load-more-button')))
                    button.click()
                    no_break = True
                    continue
                finally:
                    if not no_break:
                        break
            urls = [el.get_attribute('href')
                    for el in xb.execute_jsfun('_XPS.evaluateXPath',
                                               '//a[contains(@href, "watch?v=")]')]
                                               #'//h3[@class="yt-lockup-title"]/a')]

            def count_comments(post, iframe_src):
                r = requests.get(iframe_src, verify=False)
                tree = lxml.html.fromstring(r.content)
                raw_comments_count = tree.xpath('//div[@class="DJa"]/strong')[0].tail.strip()
                post.ext_num_comments = int(raw_comments_count[1:-1])  # chopping off the parenthesis
                post.has_comments = True

            for url in set(urls):
                if not self.policy.should_continue_fetching(self):
                    break
                xb.load_url(url)
                description_button = xb.execute_jsfun(
                    '_XPS.evaluateXPath',
                    '//button[contains(@class, "yt-uix-expander-collapsed-body")]')
                try:
                    if description_button and len(description_button) > 0:
                        description_button[0].click()  # expanding the description
                except:
                    pass
                video_id = xb.execute_jsfun(
                    '_XPS.evaluateXPath',
                    '//meta[@itemprop="videoId"]')[0].get_attribute('content')
                url = 'https://youtube.com/watch?v=' + video_id
                previously_saved = list(Posts.objects.filter(url=url, platform=self.platform))
                if previously_saved:
                    if self.should_update_old_posts():
                        log.debug('Updating existing post for url {}'.format(url))
                        post = previously_saved[0]
                    else:
                        self._inc('posts_skipped')
                        log.debug('Skipping already saved post with url {}'.format(url))
                        if not self.test_run:
                            continue
                else:
                    log.debug('Creating new post for url {}'.format(url))
                    post = Posts(
                        url=url,
                        platform=self.platform,
                        influencer=self.platform.influencer,
                        show_on_search=self.platform.influencer.show_on_search,
                    )
                post.title = xb.execute_jsfun(
                    '_XPS.evaluateXPath',
                    '//*[@id="watch-headline-title"]//span')[0].text
                # post.content = xb.execute_jsfun('_XPS.evaluateXPath', '//meta[@itemprop="description"]')[0].get_attribute('content')
                post.impressions = deformat_int(
                    xb.execute_jsfun('_XPS.evaluateXPath',
                                     '//div[@class="watch-view-count"]')[0].text.split()[0])
                post.post_image = xb.execute_jsfun(
                    '_XPS.evaluateXPath',
                    '//link[@itemprop="thumbnailUrl"]')[0].get_attribute('href')
                post.post_image_width = int(xb.execute_jsfun(
                    '_XPS.evaluateXPath',
                    '//meta[@itemprop="width"]')[0].get_attribute('content'))
                post.post_image_height = int(xb.execute_jsfun(
                    '_XPS.evaluateXPath',
                    '//meta[@itemprop="height"]')[0].get_attribute('content'))
                post.content = xb.execute_jsfun(
                    '_XPS.evaluateXPath',
                    '//p[@id="eow-description"]')[0].text
                create_date_str = xb.execute_jsfun(
                    '_XPS.evaluateXPath',
                    '//div[@id="watch-uploader-info"]')[0].text
                x = create_date_str.find('Published on')
                if x >= 0:
                    x = x + len('Published on ')
                    create_date_str = create_date_str[x:]
                create_date = dateutil.parser.parse(create_date_str)
                post.create_date = create_date
                try:
                    iframe = WebDriverWait(xb.driver, 10).until(
                        expected_conditions.presence_of_element_located((By.TAG_NAME, 'iframe')))
                    iframe_src = iframe.get_attribute('src')
                    for i in range(3):
                        try:
                            count_comments(post, iframe_src)
                        except:
                            pass
                        else:
                            break
                finally:
                    pass
                self.save_post(post)
                res.append(post)
    except Exception as e:
        log.exception(e)
    self.fetch_post_interactions(res)
    return res

def perform_feed(self, tag, num_pages, category, pipeline_class=None, **kwargs):
    """
    This scrapes the instagram tags page for a given tag
    blog_discovery.hashtags[category] = {list of tags}.
    """
    with OpRecorder('instagram_crawl_feed_for_tag'):
        from xpathscraper import xbrowser
        from django.conf import settings

        page_count = 0
        image_urls = set()
        old_image_urls_count = 0
        log.info("Starting scraping for tag %r" % tag)
        with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                               load_no_images=True) as xb:
            url = 'https://instagram.com/explore/tags/%s/' % tag
            xb.load_url(url)
            time.sleep(2)

            # checking the number of posts if it is already in cache
            posts_qty = None
            posts_qty_nodes = xb.driver.find_elements_by_xpath('//header/span/span[@class]')
            if len(posts_qty_nodes) > 0:
                try:
                    posts_qty = posts_qty_nodes[0].text
                    posts_qty = int(posts_qty.strip().replace(',', ''))
                    cached_posts_qty = cache.get('instagram_tag__%s' % tag)
                    if cached_posts_qty is not None and (posts_qty - int(cached_posts_qty)) <= 100:
                        log.info('Cached posts quantity is %s, now it is %s, '
                                 'too few new posts - skipping this feed.' % (cached_posts_qty, posts_qty))
                        return
                    else:
                        log.info('Cached posts quantity is %s, now it is %s, performing this feed.' % (
                            cached_posts_qty, posts_qty))
                except ValueError:
                    log.error('Could not parse posts quantity to number: %s, please check format' % posts_qty)
            else:
                log.info('No posts quantity node detected, possible Instagram page HTML structure changed.')

            # scroll to the bottom before we can find the 'load more pages' button
            xb.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            while page_count < num_pages:
                # find all images on the page so far and add them to our set
                try:
                    # images = xb.driver.find_elements_by_xpath('//div[contains(@class, "PostsGrid__root")]//a')
                    # Instagram structure changed
                    images = xb.driver.find_elements_by_xpath('//article//a')
                except:
                    page_count = num_pages
                    continue
                all_image_urls = set()
                for i in images:
                    all_image_urls.add(i.get_attribute('href'))
                new_image_urls = all_image_urls - image_urls
                image_urls = all_image_urls
                if len(image_urls) == old_image_urls_count:
                    page_count = num_pages
                    continue
                old_image_urls_count = len(image_urls)
                print("new images: %d so far we have %d image urls for tag %r" % (
                    len(new_image_urls), len(image_urls), tag))
                for i in new_image_urls:
                    try:
                        crawler_task.apply_async(
                            kwargs={
                                'klass_name': 'CreatorByInstagramHashtags',
                                'task_type': 'create_profile',
                                'url': i,
                                'tag': tag,
                                'category': category,
                                'pipeline_class': pipeline_class
                            },
                            # Queue where tasks to create new profiles for separate posts in feed are put
                            queue='scrape_instagram_posts_new',
                        )
                    except:
                        print("some error for %s" % i)

                # find the next page button
                # el = xb.driver.find_elements_by_xpath('//div[contains(@class, "moreLoadingIndicator")]//a')
                el = xb.driver.find_elements_by_xpath('//a[contains(text(), "Load more")]')
                if page_count == 0 and len(el) > 0:
                    e = el[0]
                    e.click()
                    log.info("Found next page button for page %s successfully, clicking and waiting." % page_count)
                else:
                    log.info("'Load More Pics' button not found... returning.")
                    #page_count = num_pages

                # scroll to the bottom before we can find the 'load more pages' button
                xb.driver.execute_script("window.scrollTo(0, 50);")
                xb.driver.execute_script("window.scrollTo(0, 1000000);")
                time.sleep(3)
                page_count += 1

            # caching post quantity for this tag
            if tag is not None and isinstance(posts_qty, int):
                cache.set('instagram_tag__%s' % tag, posts_qty)

def fetch_blog_posts_date(url):
    """
    Searches for the Post's date in the current blog post's url.
    :param url: url of the post
    :return: fetched datetime as string, method_name as string, title of the page
    """
    result = {
        'status_code': None,
        'title': None,
        'date_published': None,
        'description': None
    }

    if not url:
        return result

    try:
        # print('performing url: %s' % url)
        response = requests.get(url, timeout=10, headers=requests_headers)
        result['status_code'] = response.status_code
        if response.status_code >= 400:
            result['description'] = 'http_error'
            return result

        try:
            page = fromstring(bytes(response.content.decode("UTF-8", "ignore").encode("UTF-8")))
        except:
            result['description'] = 'xml_malformed'
            return result

        title = ''

        # getting title
        titles = page.xpath("//meta[@property='og:title']/@content")
        if len(titles) > 0:
            title = titles[0]

        # First we get the title from the /feed page if it exists. If not, we try to fetch the /post/comments page.
        # If that does not exist either, we take the title from the current page.
        if len(title) == 0:
            try:
                response = requests.get("%s%sfeed" % (url, '' if url.endswith('/') else '/'),
                                        timeout=5, headers=requests_headers)
                if response.status_code == 200:
                    pg = fromstring(response.content)
                    titles = pg.xpath('//title/text()')
                    if len(titles) > 0 and titles[0].startswith('Comments on: '):
                        title = titles[0].replace('Comments on: ', '', 1)
            except Timeout:
                pass

        if len(title) == 0:
            comments_default_urls = page.xpath(
                "//link[re:test(@href, 'feeds\/\d+\/comments\/default')]/@href",
                namespaces={'re': 'http://exslt.org/regular-expressions'})
            if len(comments_default_urls) > 0:
                try:
                    response = requests.get(comments_default_urls[0], timeout=5, headers=requests_headers)
                    if response.status_code == 200:
                        pg = fromstring(response.content)
                        titles = pg.xpath('//feed/title/text()')
                        if len(titles) > 0 and titles[0].startswith('Comments on '):
                            title = titles[0].replace('Comments on ', '', 1)
                except Timeout:
                    pass

        if len(title) == 0:
            page_title = page.xpath("//title/text()")
            # title = title[0].encode('ascii', 'ignore') if title else ''
            if page_title is not None and len(page_title) > 0:
                title = page_title[0]

        result['title'] = ' '.join(title.split())

        # 1. Fetching metas and fields where date format is standard
        meta_published_time = page.xpath("//meta[@property='article:published_time']/@content")
        meta_sailthru_time = page.xpath("//meta[@name='sailthru.date']/@content")
        time_datepublished = page.xpath("//time[@itemprop='datePublished']/@datetime")
        abbr_datepublished = page.xpath("//abbr[@itemprop='datePublished']/@title")  # recheck
        time_pubdate = page.xpath("//time[@pubdate and @datetime]/@datetime")
        # this one needs to be checked additionally
        time_datetime = page.xpath("//time[@datetime]/@datetime")

        if meta_published_time:
            result['date_published'] = dateutil.parser.parse(meta_published_time[0])
            result['description'] = 'meta_published_time'
            return result
        elif meta_sailthru_time:
            result['date_published'] = dateutil.parser.parse(meta_sailthru_time[0])
            result['description'] = 'meta_sailthru_time'
            return result
        elif time_datepublished:
            result['date_published'] = dateutil.parser.parse(time_datepublished[0])
            result['description'] = 'time_datepublished'
            return result
        elif time_pubdate:
            result['date_published'] = dateutil.parser.parse(time_pubdate[0])
            result['description'] = 'time_pubdate'
            return result
        elif abbr_datepublished:
            result['date_published'] = dateutil.parser.parse(abbr_datepublished[0])
            result['description'] = 'abbr_datepublished'
            return result
        elif time_datetime:
            result['date_published'] = dateutil.parser.parse(time_datetime[0])
            result['description'] = 'time_datetime'
            return result

        # 2. Finding tags containing dates by regexps
        # common for blogspot and derivatives
        json_datepublished = page.xpath(
            ".//*[re:test(text(), '(?i)\"datePublished\"\s*:\s*\"[\d\w\:-]+\"')]/text()",
            namespaces={'re': 'http://exslt.org/regular-expressions'})
        if json_datepublished:
            for txt in json_datepublished:
                datetime_txt_part = re.findall(r'(?i)\"datePublished\"\s*:\s*\"[\d\w\:-]+\"', txt)
                if datetime_txt_part and len(datetime_txt_part) > 0:
                    result['date_published'] = dateutil.parser.parse(datetime_txt_part[0].split(':')[1])
                    result['description'] = 'json_datepublished'
                    return result

        nodes_with_date = page.xpath(".//div[@class='date-outer']//h2[@class='date-header']/text()")
        nodes_with_date = nodes_with_date + page.xpath(".//p[@class='blog-date']//span[@class='date-text']/text()")
        nodes_with_date = nodes_with_date + page.xpath(".//div//span[@class='entry-date']/text()")
        # plain date search by text, like 'January 12, 2015'
        nodes_with_date = nodes_with_date + page.xpath(
            ".//*[re:test(text(), '%s')]/text()" % date_expression_01,
            namespaces={'re': 'http://exslt.org/regular-expressions'})
        if nodes_with_date:
            for txt in nodes_with_date:
                date_regexp_01_part = re.findall(date_expression_01, txt)
                if date_regexp_01_part and len(date_regexp_01_part) > 0:
                    result['date_published'] = dateutil.parser.parse(date_regexp_01_part[0])
                    result['description'] = 'date_regexp_01'
                    return result
                date_regexp_02_part = re.findall(date_expression_02, txt)
                if date_regexp_02_part and len(date_regexp_02_part) > 0:
                    result['date_published'] = dateutil.parser.parse(re.sub('[/]', '.', date_regexp_02_part[0]))
                    result['description'] = 'date_regexp_02'
                    return result
                date_regexp_03_part = re.findall(date_expression_03, txt)
                if date_regexp_03_part and len(date_regexp_03_part) > 0:
                    result['date_published'] = dateutil.parser.parse(date_regexp_03_part[0])
                    result['description'] = 'date_regexp_03'
                    return result

        # if we did not get results this far, trying Google search
        xb = None
        try:
            # Creating webdriver with proxy
            proxy = random.choice(PROXY_CONFIGS)
            xb = xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                                   disable_cleanup=False,
                                   custom_proxy=proxy['http'])

            # setting timeouts to xb instance
            xb.driver.set_page_load_timeout(10)
            xb.driver.set_script_timeout(10)
            xb.driver.implicitly_wait(10)

            sleep(randint(8, 15))

            # opening google search page in browser
            try:
                xb.load_url('http://google.com?hl=en')
            except Exception as e:
                print('Exception while loading google.com page and performing scripts: %s' % e)
                result['description'] = 'google_init_error'
                return result

            # finding the input element, putting the given url there and pressing the search button
            time.sleep(2)
            try:
                input_field = xb.driver.find_elements_by_xpath('//input[@title="Search"]')
                if input_field:
                    input_field = input_field[0]
                    input_field.send_keys(url)
                    time.sleep(1)
                    submit_button = xb.driver.find_elements_by_xpath('//button[@value="Search"]')
                    if submit_button:
                        submit_button[0].click()
            except NoSuchElementException as e:
                print('No input field found on google.com')
                print(e)
                result['description'] = 'google_input_locating_error'
                return result

            # fetching results from the page
            time.sleep(1)

            # Here we are fetching results from the google result page for the search
            try:
                g_divs = xb.driver.find_elements_by_xpath("//div[@class='g']")
                if g_divs:
                    for g_div in g_divs:
                        url_to_site = g_div.find_element_by_xpath(".//a")
                        if url in url_to_site.get_attribute('href'):
                            possible_date_chunk = g_div.find_element_by_xpath(".//span[@class='f']")
                            if possible_date_chunk:
                                if 'ago' in possible_date_chunk.text:
                                    date_chunks = possible_date_chunk.text.split()
                                    if len(date_chunks) > 0 and is_integer(date_chunks[0]):
                                        result['date_published'] = datetime.utcnow() - timedelta(
                                            days=int(date_chunks[0]))
                                        result['description'] = 'google_search'
                                        return result
                                else:
                                    date_chunks = "".join(
                                        possible_date_chunk.text.replace("-", "").split(",")).split()
                                    if len(date_chunks) >= 3 and is_integer(date_chunks[1]) and \
                                            is_integer(date_chunks[2]):
                                        result['date_published'] = dateutil.parser.parse(
                                            possible_date_chunk.text.replace("-", ""))
                                        result['description'] = 'google_search'
                                        return result
                            break
            except NoSuchElementException:
                # did not find elements on a page (divs of results, url in div or date)
                pass
        except Exception as e:
            logger.exception(e, extra={'url': url})
        finally:
            if xb:
                try:
                    xb.driver.quit()
                except:
                    pass

        # PREVIOUS GOOGLE ALGORITHM WITH GOOGLE API
        # if len(result['title']) > 0:
        #     q = ""
        #     parsed_url = urlparse(url)
        #     q += "site:%s " % parsed_url.netloc
        #     q += " ".join(result['title'].split())
        #
        #     google_response = requests.get(
        #         'https://ajax.googleapis.com/ajax/services/search/web',
        #         params={
        #             'v': '1.0',
        #             'hl': 'ru',
        #             'gl': 'ru',
        #             # 'rsz': 8,
        #             # 'safe': 'active',
        #             # 'filter': 0,
        #             'q': url
        #         },
        #         timeout=10,
        #         headers={
        #             'Referer': 'http://theshelf.com',
        #             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/43.0.2357.81 Chrome/43.0.2357.81 Safari/537.36',
        #         }
        #     )
        #
        #     # print('google response status: %s' % google_response.status_code)
        #     if google_response.status_code < 400:
        #         google_json = google_response.json()
        #         print('RESPONSE GOOGLE:')
        #         pprint(google_json)
        #         if google_json:
        #             response_data = google_json.get('responseData', [])
        #             if response_data and 'results' in response_data:
        #                 for entry in response_data['results']:
        #                     if entry['url'] == url:
        #                         possible_date_chunk = entry.get('content', '')[:12]
        #                         if 'ago' in possible_date_chunk:
        #                             date_chunks = possible_date_chunk.split()
        #                             if len(date_chunks) > 0 and is_integer(date_chunks[0]):
        #                                 result['date_published'] = datetime.utcnow() - timedelta(days=int(date_chunks[0]))
        #                                 result['description'] = 'google_search'
        #                                 return result
        #                         else:
        #                             date_chunks = "".join(possible_date_chunk.split(",")).split()
        #                             if len(date_chunks) == 3 and is_integer(date_chunks[1]) and is_integer(date_chunks[2]):
        #                                 result['date_published'] = dateutil.parser.parse(possible_date_chunk)
        #                                 result['description'] = 'google_search'
        #                                 return result

        result['description'] = 'date_not_found'
        return result

    except Timeout:
        result['description'] = 'status_timeout'
        return result
    except ConnectionError:
        result['description'] = 'connection_error'
        return result
    except TooManyRedirects:
        result['description'] = 'too_many_redirects'
        return result
    except DecodeError:
        result['description'] = 'decode_error'
        return result

def detect_influencer(self):
    """
    Detects influencer according to the diagram
    :return: Influencer Id
    """
    self.report_data = dict()

    # checking if this profile has been performed before (if it has any IC_* actual tags)
    tags = self.profile.tags.split()
    if any(t in self.TAGS for t in tags):
        # looks like this profile was already performed, skipping it
        return 'already_preformed'

    # removing existing discovered_influencer if any is present
    present_influencer = self.profile.discovered_influencer
    if present_influencer is not None:
        self.profile.discovered_influencer = None
        if self.save is True:
            self.profile.save()

    # Getting profile's discovered platform ids
    existing_platform_ids = self.profile.get_platform_ids_detected()
    non_social_urls = self.profile.get_non_social_urls_detected()

    log.info('Detecting influencer for InstagramProfile %s ...' % self.profile.id)
    self.report_data['profile_id'] = self.profile.id
    self.report_data['existing_platform_ids_qty'] = len(existing_platform_ids)
    self.report_data['non_social_urls_qty'] = len(non_social_urls)

    if len(existing_platform_ids) >= 1:
        log.info('Found %s platform ids' % len(existing_platform_ids))
        # There is at least 1 discovered existing platform for this Profile.
        # Fetching all platforms except those with url_not_found=True,
        # UPDATE: and then detecting influencers of these platforms. If there is only one influencer - using it.
        active_plats = Platform.objects.filter(id__in=existing_platform_ids).exclude(url_not_found=True)
        active_influencers_ids = set()
        for p in active_plats:
            if p.influencer is not None:
                active_influencers_ids.add(p.influencer.id)
        active_influencers_ids = list(active_influencers_ids)
        self.report_data['active_influencers_ids'] = active_influencers_ids

        log.info('Found %s existing platforms with %s distinctive influencers' % (
            len(existing_platform_ids), len(active_influencers_ids)))

        if len(active_influencers_ids) == 1:
            # Great! Only platforms with one distinctive influencer found, working with it: adding this
            # influencer to the collection, connecting it to the InstagramProfile
            log.info('Found 1 influencer (%s), setting IC_one_inf_found tag, setting '
                     'influencer to InstagramProfile' % active_influencers_ids[0])
            candidate_influencer = Influencer.objects.get(id=active_influencers_ids[0])
            if candidate_influencer.blog_url is not None and candidate_influencer.blog_url.startswith(
                    'http://www.theshelf.com/artificial_blog/'):
                inf = Influencer.objects.get(id=active_influencers_ids[0])
                # TODO: connecting existing artificial influencer?
                self.profile.discovered_influencer = candidate_influencer
                if self.save is True:
                    self.profile.save()
                self.add_influencer_to_discovered_collection(candidate_influencer)
                self.profile.append_mutual_exclusive_tag('IC_one_artificial_inf_found',
                                                         self.TAGS + self.obsolete_tags)
                self.report_data['result'] = 'One existing influencer found (artificial/osos): %s (osos: %s / sos: %s)' % (
                    active_influencers_ids[0],
                    inf.old_show_on_search,
                    inf.show_on_search,
                )
                return 'IC_one_artificial_inf_found'
            else:
                self.profile.discovered_influencer = candidate_influencer
                if self.save is True:
                    self.profile.save()
                self.add_influencer_to_discovered_collection(candidate_influencer)
                self.profile.append_mutual_exclusive_tag('IC_one_inf_found',
                                                         self.TAGS + self.obsolete_tags)
                self.report_data['result'] = 'One existing influencer found and set to ' \
                                             'profile (non-artificial, non-osos): %s (osos: %s / sos: %s)' % (
                    active_influencers_ids[0],
                    candidate_influencer.old_show_on_search,
                    candidate_influencer.show_on_search,
                )
                return 'IC_one_inf_found'

        elif len(active_influencers_ids) > 1:
            # We discovered more than one active platform with more than one distinctive influencer.
            log.info('Found more than 1 platform with more than 1 distinctive '
                     'Influencers, setting tag IC_many_plats_found')
            # self.profile.append_mutual_exclusive_tag('IC_many_infs_found', self.TAGS)
            infs = Influencer.objects.filter(id__in=active_influencers_ids,
                                             old_show_on_search=True).exclude(blacklisted=True)
            if infs.count() == 0:
                # None found, we pick the best with _select_influencer_to_stay(),
                # connect it to the profile and add it to the collection
                active_infs = Influencer.objects.filter(id__in=active_influencers_ids)
                best_one = active_infs[0]._select_influencer_to_stay(list(active_infs))
                self.profile.discovered_influencer = best_one
                if self.save is True:
                    self.profile.save()
                    # self.add_influencer_to_discovered_collection(best_one)
                self.profile.append_mutual_exclusive_tag('IC_best_from_several',
                                                         self.TAGS + self.obsolete_tags)
                several_infs = ["%s (osos: %s / sos: %s)" % (inf.id, inf.old_show_on_search, inf.show_on_search)
                                for inf in active_infs]
                self.report_data['result'] = 'Several existing influencers found (no osos=True): %s , ' \
                                             'taken best of them: %s (osos: %s / sos: %s)' % (
                    several_infs,
                    best_one.id,
                    best_one.old_show_on_search,
                    best_one.show_on_search
                )
                return 'IC_best_from_several'
            elif infs.count() == 1:
                # One Influencer with old_show_on_search=True found, using it
                candidate_influencer = infs[0]
                self.profile.discovered_influencer = candidate_influencer
                if self.save is True:
                    self.profile.save()
                    # self.add_influencer_to_discovered_collection(candidate_influencer)
                self.profile.append_mutual_exclusive_tag('IC_one_from_several',
                                                         self.TAGS + self.obsolete_tags)
                several_infs = ["%s (osos: %s / sos: %s)" % (inf.id, inf.old_show_on_search, inf.show_on_search)
                                for inf in infs]
                self.report_data['result'] = 'Several existing influencers found: %s , taken ' \
                                             'one of them with osos=True: %s (osos: %s / sos: %s)' % (
                    several_infs,
                    candidate_influencer.id,
                    candidate_influencer.old_show_on_search,
                    candidate_influencer.show_on_search,
                )
                return 'IC_one_from_several'
            else:
                # Multiple found - adding these to the collection of duplicates
                if self.save is True:
                    self.add_influencers_to_duplicates_collection(influencers=infs)
                self.profile.append_mutual_exclusive_tag('IC_many_infs_found',
                                                         self.TAGS + self.obsolete_tags)
                self.report_data['result'] = 'Several existing influencers found: %s, taken those with osos=True ' \
                                             'and putting them to duplicates collection.' % [
                    "%s (osos: %s / sos: %s)" % (inf.id, inf.old_show_on_search, inf.show_on_search)
                    for inf in infs
                ]
                return 'IC_many_infs_found'

    # There are 0 discovered platforms, checking with non-social urls
    if len(non_social_urls) == 0:
        # Creating an influencer with an artificial url, adding it to the collection, connecting it to the profile
        log.info('No non-social urls found, creating artificial Influencer and adding it to the profile')
        count_str = '%s' % (int(time.time()))
        blog_url = 'http://www.theshelf.com/artificial_blog/%s.html' % count_str
        inf = create_influencer_and_blog_platform(blog_url,
                                                  influencer_source='discovered_via_instagram',
                                                  to_save=True,
                                                  platform_name_fallback=True)
        self.profile.discovered_influencer = inf
        if self.save is True:
            self.profile.save()
        # TODO: Should we create here an instagram platform too?
        self.add_influencer_to_discovered_collection(inf)
        self.profile.append_mutual_exclusive_tag('IC_artificial_inf_created',
                                                 self.TAGS + self.obsolete_tags)
        log.info('Adding IC_artificial_inf_created tag')
        self.report_data['result'] = 'No social/non-social platforms found - creating ' \
                                     'artificial Influencer: %s (osos: %s / sos: %s).' % (
            inf.id, inf.old_show_on_search, inf.show_on_search)
        return 'IC_artificial_inf_created'
    else:
        # There are some non-social urls -- checking if there are unique non-social urls.
        # Special shortcut: if non-social urls contain a liketoknow.it url. If this url is found, then using it as a
        # blog url for this future influencer.
        from platformdatafetcher.producturlsextractor import get_blog_url_from_liketoknowit

        # NEW logic to check for bloggy urls
        log.info('%s non-social urls found: %s, trying to find unique root domains' % (
            len(non_social_urls), non_social_urls))
        blog_urls_found = []

        from platformdatafetcher.platformextractor import collect_social_urls_from_blog_url, \
            substitute_instagram_post_urls

        # detecting if any of the non-social urls are blogs
        with xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                                  load_no_images=True,
                                  disable_cleanup=False,
                                  timeout=60) as xb:
            # social url chunks: we need to prepare social urls into detectable chunks like 'www-less domain/path'
            social_chunks = []
            for url in self.profile.get_social_urls_detected():
                parsed = urlparse(url)
                chunk = '%s%s' % (parsed.netloc[4:] if parsed.netloc.startswith('www.') else parsed.netloc,
                                  parsed.path)
                chunk = chunk.strip('/')
                if chunk not in social_chunks:
                    social_chunks.append(chunk)
            log.info('Social url fragments for searching: %s' % social_chunks)

            # detecting if any found socials are in there
            non_social_urls = self.profile.get_non_social_urls_detected()
            unique_root_domains = self.get_unique_root_domains(non_social_urls)
            for k in unique_root_domains.keys():
                non_social_url_start = unique_root_domains[k][0]

                # checking if this url is a good liketoknow.it url and a blog url can be retrieved:
                parsed = urlparse(non_social_url_start)
                # checking if domain is liketoknow.it
                if parsed.netloc.lower().strip().replace('www.', '', 1) == 'liketoknow.it' and \
                        parsed.path.lower().strip('/').strip() not in ['', 'login']:
                    log.info('Liketoknow.it url detected: %r , trying to get its blog url' % non_social_url_start)
                    # looks like it is a good liketoknow.it url, getting blog url
                    blog_url = get_blog_url_from_liketoknowit(non_social_url_start, xb)
                    if blog_url is not None:
                        log.info('Blog url detected successfully: %r , considering it a good blog url' % blog_url)
                        # adding it to blog_urls detected
                        if blog_url not in blog_urls_found:
                            blog_urls_found.append(blog_url)
                        else:
                            log.info('Blog url %r is already detected' % blog_url)
                    else:
                        log.info('Blog url was not detected')
                else:
                    is_blog_url, non_social_url = self.is_url_a_blog(non_social_url_start, self.profile)
                    log.info('Checking if %r is a blog:' % non_social_url)
                    if is_blog_url is True and non_social_url is not None:
                        log.info('Perfect, %r is a blog' % non_social_url)
                        socials_detected = []
                        found_soc_urls = defaultdict(list)
                        collect_social_urls_from_blog_url(xb=xb,
                                                          by_pname=found_soc_urls,
                                                          platform=None,
                                                          non_social_url=non_social_url)
                        substitute_instagram_post_urls(found_soc_urls)
                        log.info('SOCIAL URLS COLLECTED: %s' % found_soc_urls)

                        # if no social urls were collected, we check whether this non-social url has
                        # social urls in any form, using regexps over its content and iframes.
                        if len(found_soc_urls) == 0:
                            scraped_social_urls = collect_any_social_urls(xb=xb, non_social_url=non_social_url)
                            log.info('Thorough search found %s candidate social urls '
                                     'to check' % len(scraped_social_urls))
                            found_soc_urls['Bruteforce'] = scraped_social_urls

                        # found_socials is in format {'Instagram': ['url1', 'url2',...], 'Facebook': [...], ...}
                        for social_url_lst in found_soc_urls.values():
                            for social_url in social_url_lst:
                                if any([sc.lower() in social_url.lower() for sc in social_chunks]):
                                    # we found one of the social chunks in a detected social url
                                    if social_url not in socials_detected:
                                        socials_detected.append(social_url)

                        log.info('Positively matched social urls: %s' % socials_detected)

                        # if we found some matching social urls - then it is a blog url, TA-DAAAA!
                        if len(socials_detected) > 0:
                            if non_social_url not in blog_urls_found:
                                # TODO: should we use here self.is_url_a_blog(url, self.profile) for extra blog check?
                                blog_urls_found.append(non_social_url)
                                log.info('Considering url %r to be a blog url for this profile' % non_social_url)
                    else:
                        log.info('Url %r considered as non-blog url or is unreachable' % non_social_url_start)

        if len(blog_urls_found) == 1:
            # we found 1 blog url
            log.info('Looks like it is a new single blog url!')
            self.report_data['unique_root_domain_is_blog'] = True

            # Here we have found 0 existing platforms, but we detected that a single non-social url
            # is a BLOG. So we create a blog platform with this url, create an influencer, connect
            # this blog platform to this influencer and connect the influencer to the profile.

            # creating new blog platform
            inf = create_influencer_and_blog_platform(blog_url=blog_urls_found[0],
                                                      influencer_source='ic_from_insta_profile',
                                                      to_save=self.save,
                                                      platform_name_fallback=True)
            self.profile.discovered_influencer = inf
            log.info('A new influencer has been created: %s' % inf)
            if self.save is True:
                self.profile.save()
            self.add_influencer_to_discovered_collection(inf)
            self.profile.append_mutual_exclusive_tag('IC_new_blog_new_inf',
                                                     self.TAGS + self.obsolete_tags)
            self.report_data['result'] = 'New influencer %s (osos: %s / sos: %s) created by single ' \
                                         'non-social blog platform' % (inf.id, inf.old_show_on_search,
                                                                       inf.show_on_search)
            return 'IC_new_blog_new_inf'
        elif len(blog_urls_found) == 0:
            # if none were found to be a blog
            # => check if the length of the url > 20 chars (typically identifies as a
            #    product) => then this profile needs to be fetched again later
            # => create a new field "date_to_fetch_later" in InstagramProfile and update this field
            #    with today+10 days later
            # => need to create a celery task that checks if today is the day when they should be
            #    re-fetched and then clears up this date_to_fetch_later to None
            # => after fetching the profile, compare the old url and description with the new one, check
            #    if it's different, then pass it to the same pipeline as it was originally part of
            log.info('No blog urls were detected within non_social_urls')
            # TODO: what should we do if this already has date_to_fetch_later != None ?
            long_url = False
            for non_social_url in non_social_urls:
                if len(non_social_url) > 20:
                    self.profile.date_to_fetch_later = datetime.now() + timedelta(days=10)
                    if self.save is True:
                        self.profile.save()
                    long_url = True
                    break
            if long_url is True:
                self.report_data['result'] = 'No blog urls were found, retrying in 10 days'
                return '10_days_later'
            else:
                # TODO: What should we do here, should we create an artificial url?
                if self.save is True:
                    self.profile.append_mutual_exclusive_tag('IC_possible_brand',
                                                             self.TAGS + self.obsolete_tags)
                self.report_data['result'] = 'Profile considered to be possibly a brand.'
                return 'IC_possible_brand'
        else:
            # TODO: Skipping for now...
            log.info('We found many non-social blog domains, setting IC_many_nonsocial_found tag: %s'
                     % blog_urls_found)
            if self.save is True:
                self.profile.append_mutual_exclusive_tag('IC_many_nonsocial_found',
                                                         self.TAGS + self.obsolete_tags)
            self.report_data['result'] = 'Multiple unique root domains found, skipped for now'
            return 'IC_many_nonsocial_found'