def extract_emails_from_platform(platform_id=None, platform_object=None, to_save=True, disable_cleanup=False):
    assert platform_id is not None or platform_object is not None
    pl = models.Platform.objects.get(id=int(platform_id)) \
        if platform_id is not None \
        else platform_object
    try:
        with platformutils.OpRecorder('extract_emails_from_platform', platform=pl) as opr:
            with xbrowsermod.XBrowser(
                    headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                    disable_cleanup=disable_cleanup) as xb:
                found_emails = []

                ee1 = FromAboutPagesExtractor(xb, pl)
                ee1.update_influencers_email(to_save=to_save)
                found_emails += ee1.found_emails

                ee2 = FromCommonPostsExtractor(xb, pl)
                ee2.update_influencers_email(to_save=to_save)
                found_emails += ee2.found_emails

                found_emails = filter_emails(found_emails)
                opr.data = {'found_emails': found_emails}
                return found_emails
    except Exception as e:
        log.exception(e, extra={
            'platform_id': platform_id,
            'to_save': to_save,
            'disable_cleanup': disable_cleanup,
        })

def get_blog_url_from_liketoknowit(liketoknowit_url=None, xb=None):
    """
    Extracts the user's blog url from her http://liketoknow.it/<username> page.
    :param liketoknowit_url: url to the liketoknowit page
    :return: blog url
    """
    def get_the_blog_url(xb, liketoknowit_url):
        xb.load_url(liketoknowit_url)
        anchors = WebDriverWait(xb.driver, 10).until(
            lambda _: xb.els_by_xpath('//publisher-header//h5//a'))
        anchors = [a for a in anchors if a.get_attribute('href')]
        urls = utils.unique_sameorder(a.get_attribute('href') for a in anchors)
        return urls[0] if len(urls) > 0 else None

    if liketoknowit_url is None:
        return None
    try:
        if xb is None:
            with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
                return get_the_blog_url(xb, liketoknowit_url)
        else:
            return get_the_blog_url(xb, liketoknowit_url)
    except Exception as e:
        log.exception(e, extra={'url': liketoknowit_url})
        return None

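# Usage sketch (not part of the original module): resolving a liketoknow.it page to a
# blog url. The import path below matches the one referenced elsewhere in this codebase;
# the example username is hypothetical.
#
#   from platformdatafetcher.producturlsextractor import get_blog_url_from_liketoknowit
#   blog_url = get_blog_url_from_liketoknowit('http://liketoknow.it/someusername')
#   if blog_url:
#       print 'resolved blog url: %s' % blog_url
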
def _create_scraper(self, product):
    op_recorder = ScraperOpRecorder(product)
    if self.reuse_xbrowser and getattr(xbrowser_storage, 'xbrowser', None):
        log.info('Reusing xbrowser')
        self.xbrowser = xbrowser_storage.xbrowser
    else:
        log.debug('Creating new xbrowser')
        try:
            self.xbrowser = xbrowser.XBrowser(
                driver=self.driver,
                headless_display=self.headless_display,
                auto_refresh=True if self.reuse_xbrowser else False)
        except:
            op_recorder.exception()
            raise
        if self.reuse_xbrowser:
            xbrowser_storage.xbrowser = self.xbrowser
    self.xbrowser.load_url(product.prod_url)
    if self.sleep_after_load:
        log.debug('Sleeping %d seconds before scraping', self.sleep_after_load)
        time.sleep(self.sleep_after_load)
        log.debug('Finished sleeping')
    if self.reuse_xbrowser:
        log.debug('Reusing xbrowser -- forcing a refresh.')
        self.xbrowser.xrefresh()
        if self.sleep_after_load:
            log.debug('Sleeping %d seconds before scraping', self.sleep_after_load)
            time.sleep(self.sleep_after_load)
            log.debug('Finished sleeping')
    self.scraper = scrapermod.Scraper(self.xbrowser, op_recorder)

def visit_url(url):
    xb = None
    try:
        xb = xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY)
        xb.load_url('http://app.theshelf.com/internal/')
        xb.driver.execute_script("""var a = document.createElement('a');
            a.href='%s';
            a.id = 'blogvisitor_to_click';
            a.innerHTML = 'I will click this';
            document.body.appendChild(a);""" % url.replace("'", "\\'"))
        a = xb.driver.find_element_by_id('blogvisitor_to_click')
        a.click()
        xb.driver.back()
    except:
        log.exception('visit_url(url={}) got an exception'.format(url), extra={'url': url})
    finally:
        try:
            if xb:
                xb.cleanup()
        except:
            log.exception('visit_url(url={}) got an exception while xb.cleanup()'.format(url))

def _update_num_following(self):
    try:
        with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                               load_no_images=True) as xb:
            xb.load_url('https://www.youtube.com/{0}/channels?flow=grid&view=56'.format(
                self.platform.validated_handle))
            while True:  # potentially infinite loop?
                no_break = False
                try:
                    button = WebDriverWait(xb.driver, 10).until(
                        expected_conditions.presence_of_element_located(
                            (By.CLASS_NAME, 'load-more-button')))
                    button.click()
                    no_break = True
                    continue
                finally:
                    if not no_break:
                        break
            self.platform.num_following = len(xb.execute_jsfun(
                '_XPS.evaluateXPath',
                '//li[contains(@class, "channels-content-item")]'))
            self.platform.save()
    except Exception as e:
        log.exception(e)

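# Minimal sketch of the "click load-more until it disappears" loop used above, written
# against plain Selenium (assumes a webdriver instance named `driver`; the class name
# 'load-more-button' is YouTube-specific and may no longer exist on the live site):
#
#   from selenium.webdriver.support.ui import WebDriverWait
#   from selenium.webdriver.support import expected_conditions
#   from selenium.webdriver.common.by import By
#   from selenium.common.exceptions import TimeoutException
#
#   while True:
#       try:
#           button = WebDriverWait(driver, 10).until(
#               expected_conditions.presence_of_element_located(
#                   (By.CLASS_NAME, 'load-more-button')))
#           button.click()
#       except TimeoutException:
#           break  # no more pages to load
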
def extract_links_from_platform_url(platform_id):
    try:
        pl = models.Platform.objects.get(id=int(platform_id))
        xb = xbrowsermod.XBrowser(pl.url)
        le = LinksFromPlatformUrlExtractor(xb, pl)
        le.extract_links()
    except Exception as e:
        log.exception(e, extra={'platform_id': platform_id})

def fetch_for_posts(cls, posts):
    try:
        with xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                                  load_no_images=True) as xb:
            for post in posts:
                with platformutils.OpRecorder(operation='{0}_for_post'.format(cls.__name__.lower()),
                                              post=post):
                    fetcher = cls(xb, post)
                    yield fetcher.fetch_interactions()
    except Exception as e:
        log.exception(e, extra={'posts_len': len(posts)})

def update_single_sponsorship(self, sponsorshipinfo_id):
    try:
        sp = debra.models.SponsorshipInfo.objects.get(id=sponsorshipinfo_id)
        with platformutils.OpRecorder(operation='update_single_sponsorship', post=sp.post) as opr:
            with xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
                f = WIDGET_TYPE_TO_SPONSORSHIP_FETCHER_CLASS[sp.widget_type](xb, sp.post)
                si = f.fetch_sponsorship(True)
                detect_sidebar_sponsorships(si)
    except SoftTimeLimitExceeded as exc:
        self.retry(exc=exc)

def run():
    profiles = set()
    for location in LOCATIONS:
        xb = xbrowsermod.XBrowser(
            headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
            disable_cleanup=False)
        xb.load_url(ANGEL_LIST_INVESTOR_HOME)
        # now find the button to enter location
        print "now trying location %s" % location
        try:
            loc = xb.el_by_xpath('//input[@placeholder="Add Location"]')
            loc.send_keys(location)
            sleep(5)
            loc.send_keys(Keys.RETURN)
            sleep(5)
        except:
            print "oops, error"
            raise
        count = 0
        while count < 50:
            try:
                profile_links = xb.els_by_xpath('//a[@class="profile-link"]')
                print "Got %d links " % len(profile_links)
                for p in profile_links:
                    u = p.get_attribute('href')
                    v = AngelListProfile.objects.get_or_create(url=u)[0]
                    profiles.add(v)
                print "We have now %d AngelListProfiles" % AngelListProfile.objects.count()
                more_link = xb.els_by_xpath(
                    '//div[@id="more_pagination_button_people_items"]/div[@class="wrapper"]')
                print "Got %d more links" % len(more_link)
                if len(more_link) > 0:
                    more_link = more_link[0]
                    more_link.click()
                    print "clicking on the more link"
                count += 1
                sleep(10)
            except:
                break
        try:
            xb.cleanup()
        except:
            pass
    for prof in profiles:
        # we're going to use the API to find this information
        fetch_profile_details(prof)
        # now sleep enough to make sure we're < 1000 API calls/hour
        sleep(10)

def resolve_redirect_using_xbrowser(url, to_sleep=5):
    from xpathscraper import xbrowser as xbrowsermod
    try:
        with xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
            xb.load_url(url)
            time.sleep(to_sleep)
            return xb.driver.current_url
    except:
        log.exception('While resolve_redirect_using_xbrowser for %r', url)
        return url

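# Usage sketch (the shortened url below is hypothetical): the function falls back to
# returning the input url on any failure, so callers can use the result unconditionally.
#
#   final_url = resolve_redirect_using_xbrowser('http://bit.ly/example', to_sleep=3)
#   log.info('redirect target: %r', final_url)
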
def print_navigation_links(url):
    from xpathscraper import xbrowser as xbrowsermod
    with xbrowsermod.XBrowser(headless_display=False, disable_cleanup=True) as xb:
        xb.load_url(url)
        clusters = find_navigation_links_clusters(xb)
        for cluster in clusters:
            print '\n'
            for el in cluster:
                print el.get_attribute('href')

def extract_hire_me_links(platform_id):
    pl = models.Platform.objects.get(id=int(platform_id))
    try:
        with platformutils.OpRecorder(operation='extract_hire_me_links', platform=pl) as opr:
            with xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xbrowser:
                ext = HireMeLinksExtractor(pl, xbrowser)
                lfps = ext.extract_links(to_save=True)
                opr.data = {'extracted': len(lfps)}
    except Exception as e:
        log.exception(e, extra={'platform_id': platform_id})

def sponsorship_from_url(widget_type, url, to_save='0'):
    try:
        xb = xbrowsermod.XBrowser()
        post = debra.models.Posts.objects.filter(url=url)[0]
        rf = WIDGET_TYPE_TO_SPONSORSHIP_FETCHER_CLASS[widget_type](xb, post)
        sp = rf.fetch_sponsorship(int(to_save))
        print sp
    except Exception as e:
        log.exception(e, extra={'widget_type': widget_type, 'url': url, 'to_save': to_save})
        return None

def fetch_urls_and_scroll(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY):
    for url in URLS_TO_FETCH_AND_SCROLL:
        try:
            with xbrowser.XBrowser(headless_display=False,
                                   extra_js_files=['cachewarming.js']) as xb:
                xb.load_url(url)
                xb.execute_jsfun('_CW.scroll')
                time.sleep(120)
        except:
            log.exception('While getting %r', url)
            continue
        log.info('Fetched %r successfully', url)

def visit_page(page_url):
    log.info('* Opening {} with Selenium...'.format(page_url))
    with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
        xb.driver.set_page_load_timeout(60)
        xb.driver.set_script_timeout(60)
        xb.driver.implicitly_wait(10)
        try:
            xb.load_url(page_url)
        except:
            # NOTE: 'pa' (the PostAnalytics record) is expected to be available in the enclosing scope.
            send_admin_email_via_mailsnake(
                "'influencer_tracking_verification' Selenium exception for PostAnalytics={} (url={})".format(
                    pa.id, page_url),
                '<br />'.join(traceback.format_exc().splitlines()))

def extract_product_urls(self, url):
    try:
        with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
            xb.load_url(url)
            anchors = WebDriverWait(xb.driver, 10).until(
                lambda _: xb.els_by_xpath('//div[@class="hoverflow"]//a'))
            anchors = [a for a in anchors if a.get_attribute('href') and
                       utils.domain_from_url(a.get_attribute('href')) == 'rstyle.me']
            urls = utils.unique_sameorder(a.get_attribute('href') for a in anchors)
            return urls
    except Exception as e:
        log.exception(e, extra={'url': url})
        return None

def detect_gender(product_url):
    """Returns 'men', 'women' or 'unknown'"""
    log.info('Detecting gender for %r', product_url)
    texts = []
    with xbrowsermod.XBrowser(url=product_url, headless_display=False, disable_cleanup=True) as xbrowser:
        url = xbrowser.driver.current_url
        log.info('Current url: %r', url)
        texts.append(url)

        scraper = scrapermod.Scraper(xbrowser)
        name_srs = scraper.get_name_xpaths()
        evaluator = scrapingresults.ResultEvaluator(scraper)
        name = evaluator.compute_values(name_srs[0], 'name')
        log.info('Found name: %r', name)
        texts.append(name)

        title = xbrowser.driver.title
        log.info('Found title: %r', title)
        texts.append(title)

        description_els = xbrowser.driver.find_elements_by_xpath('//meta[@name="description"]')
        if description_els:
            description = description_els[0].get_attribute('content')
            log.info('Found description: %r', description)
            texts.append(description)

        keywords_els = xbrowser.driver.find_elements_by_xpath('//meta[@name="keywords"]')
        if keywords_els:
            keywords = keywords_els[0].get_attribute('content')
            log.info('Found keywords: %r', keywords)
            texts.append(keywords)

    kcs = []
    for text in texts:
        kc = count(text)
        log.info('%r %r', kc, text)
        kcs.append(kc)
    res = sum_keyword_counts(kcs)
    log.info('Result: %s', res)
    if res.men > res.women:
        return 'men'
    if res.men == res.women:
        return 'unknown'
    return 'women'

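# Usage sketch (hypothetical product url): detect_gender only ever returns
# 'men', 'women' or 'unknown', so callers can branch on the string directly.
#
#   gender = detect_gender('http://www.example-shop.com/products/12345')
#   if gender == 'unknown':
#       log.info('Could not determine gender from the collected page texts')
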
def _contains_checkout_or_addtocart(self, url):
    try:
        with xbrowser.XBrowser(url=url, headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
            scraper = scrapermod.Scraper(xb)
            if scraper.get_checkoutbutton_xpaths():
                log.info('%r contains checkoutbutton', url)
                return True
            if scraper.get_addtocart_xpaths():
                log.info('%r contains addtocart', url)
                return True
            log.info('%r does not contain addtocart or checkoutbutton', url)
            return False
    except Exception as e:
        log.exception(e, exc_info=1, extra={'url': url})

def collect_urls_from_google(query, pages):
    log.info('Collecting results for query %r', query)
    urls = []
    try:
        with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
            g = GoogleScraper(xb)
            for page_no, results in enumerate(g.search(query, pages)):
                log.info('%d results from page %d', len(results), page_no)
                urls.extend(results)
                time.sleep(random.randrange(1, 5))
    except Exception as e:
        log.exception('While collecting urls, returning what is collected so far: %s' % e,
                      extra={'query': query})
    log.info('Total results for query %r: %d', query, len(urls))
    return urls

def get_twitter_profiles_with_bio(bio_search_query, page=0):
    full_query = (
        "site:twitter.com bio:*%s* "
        "-inurl:status "
        "-inurl:hashtag "
        "-inurl:lists "
        "-inurl:blog.twitter.com "
        "-intitle:google"
    ) % bio_search_query
    with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                           load_no_images=True) as xb:
        searcher = GoogleSearcher(xb)
        searcher.search(full_query)
        if page > 0:
            searcher.goto_page(page)
        results = searcher.get_current_results()
        twitter_profiles = [twitter_utils.screen_name_for_url(result) for result in results]
        return [profile for profile in twitter_profiles if profile is not None]

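# Usage sketch: for bio_search_query='fashion blogger' the constructed Google query is
#   site:twitter.com bio:*fashion blogger* -inurl:status -inurl:hashtag -inurl:lists
#   -inurl:blog.twitter.com -intitle:google
# and the function returns bare screen names (None entries are filtered out):
#
#   screen_names = get_twitter_profiles_with_bio('fashion blogger', page=0)
#   for name in screen_names:
#       print name
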
def search_for_sponsorship(self, post_id):
    res = []
    try:
        post = debra.models.Posts.objects.get(id=post_id)
        with platformutils.OpRecorder(operation='search_for_sponsorship', post=post) as opr:
            with xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
                for f_cls in SPONSORSHIP_FETCHER_CLASSES:
                    f = f_cls(xb, post)
                    try:
                        fres = f.fetch_sponsorship(True)
                        if fres is not None:
                            res.append(fres)
                            detect_sidebar_sponsorships(fres)
                    except:
                        log.exception('While search_for_sponsorship')
    except SoftTimeLimitExceeded as exc:
        self.retry(exc=exc)
    return res

def handle_blog(blog):
    platform = _find_platform(blog['blog_url'])
    if platform is None:
        models.OperationStatus.inc('custom_blog', blog['blog_url'], 'init_platform', 'notfound', None)
        return
    ft = models.FetcherTask.objects.create(
        platform=platform,
        started=datetime.datetime.now(),
        server_ip=utils.get_ip_address(),
        process_pid=str(os.getpid()),
        policy_name='custom_blog',
    )
    xb = None
    counts = {}
    try:
        xb = xbrowser.XBrowser(
            headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
            width=1200, height=800)
        log.info('Processing blog %r', blog)
        counts = _do_handle_blog(blog, xb)
    except Exception as e:
        log.exception('While processing %r', blog)
        try:
            db.close_connection()
        except:
            log.exception('While resetting connection')
        models.OperationStatus.inc('custom_blog', blog.get('blog_url'), 'processing', 'exception',
                                   e.__class__.__name__)
    finally:
        if xb is not None:
            try:
                xb.cleanup()
            except:
                log.exception('While xb.cleanup(), ignoring')
        ft.duration = (datetime.datetime.now() - ft.started).total_seconds()
        ft.posts_saved = counts.get('posts_saved')
        ft.pis_saved = counts.get('pis_saved')
        ft.save()

def get_product_urls(post_id):
    """This method fetches the product URLs contained inside the widgets."""
    post = debra.models.Posts.objects.get(id=post_id)
    if post.platform.is_social:
        log.debug("Post %r is from social platform, so no need to search for iframe based widgets" % post)
        return set()
    search_for_sponsorship(post_id)
    #widgets = debra.models.SponsorshipInfo.objects.filter(post__id=post_id)
    widgets = debra.models.SponsorshipInfo.objects.filter(
        post__id=post_id, widget_type__in=['rstyle', 'shopstyle', 'shopstyle2'])
    widgets = widgets.exclude(sidebar=True)
    url_set = set()
    if widgets.exists():
        for w in widgets:
            xb = None
            try:
                xb = xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY)
                xb.load_url(w.url)
                url_xpath = '//a'
                if w.base_xpath:
                    url_xpath = w.base_xpath + url_xpath
                log.info('Using url xpath %r for widget %r', url_xpath, w)
                url_elements = xb.els_by_xpath(url_xpath)
                for u in url_elements:
                    if u.get_attribute('href'):
                        url_set.add(u.get_attribute('href'))
                xb.cleanup()
            except Exception as e:
                log.exception("Exception occurred while parsing product url: %s" % e,
                              extra={'post_id': post_id, 'url': w.url})
                if xb:
                    try:
                        xb.cleanup()
                    except Exception as e:
                        log.exception(e)
    return url_set

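# Usage sketch (hypothetical post id): aggregating widget product links for one post.
#
#   product_urls = get_product_urls(1234567)
#   log.info('Found %d product urls', len(product_urls))
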
def search_infs_by_giveaways(pages=20):
    brands = models.Brands.objects.filter(supported=True).order_by('id')[12:13]
    for brand in brands:
        for q in GOOGLE_QUERIES:
            q = q.format(brand=brand)
            log.info('Searching: %r', q)
            try:
                with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
                    g = GoogleScraper(xb)
                    it = g.search(q, pages)
                    for results in it:
                        for url in results:
                            try:
                                if utils.domain_from_url(url) in import_from_blog_post.exclude_domains_set:
                                    log.warn('%r is blacklisted', url)
                                    continue
                                dups = models.Influencer.find_duplicates(url)
                                log.info('%r dups: %s', url, dups)
                                if not dups:
                                    log.info('YES_CREATE %r', url)
                                    new_inf = helpers.create_influencer_and_blog_platform(
                                        url, 'google', platform_name_fallback=True)
                                    log.info('Created influencer: %r', new_inf)
                                else:
                                    log.info('NO_CREATE %r', url)
                            except:
                                log.exception('While processing url %r, skipping', url)
            except Exception as e:
                log.exception('For brand %r got exception: %s' % (brand, e), extra={'pages': pages})

def extract_common_links_from_platform(platform_id):
    pl = models.Platform.objects.get(id=int(platform_id))
    with platformutils.OpRecorder(operation='extract_common_links_from_platform', platform=pl) as opr:
        old_links_q = pl.sourcelink_set.filter(kind__startswith='common') | \
            pl.sourcelink_set.filter(kind__startswith='navigation')
        log.info('Deleting %d old links', old_links_q.count())
        old_links_q.delete()
        lfps = []
        ext = CommonLinksExtractor(pl)
        lfps += ext.extract_links(to_save=True)
        try:
            with xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xbrowser:
                xbrowser.load_url(pl.url)
                ext = NavigationLinksExtractor(pl, xbrowser)
                lfps += ext.extract_links(to_save=True)
        except Exception as e:
            log.exception(e, extra={'platform_id': platform_id})
        opr.data = {'extracted': len(lfps)}

def _do_fetch_posts(self, max_pages=None):
    res = []
    try:
        with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                               load_no_images=True) as xb:
            videos_list_url = 'https://www.youtube.com/{0}/videos'.format(
                self.platform.validated_handle)
            xb.load_url(videos_list_url)
            while True:  # potentially infinite loop?
                no_break = False
                try:
                    button = WebDriverWait(xb.driver, 4).until(
                        expected_conditions.presence_of_element_located(
                            (By.CLASS_NAME, 'load-more-button')))
                    button.click()
                    no_break = True
                    continue
                finally:
                    if not no_break:
                        break
            urls = [el.get_attribute('href')
                    for el in xb.execute_jsfun('_XPS.evaluateXPath',
                                               '//a[contains(@href, "watch?v=")]')]
                                               #'//h3[@class="yt-lockup-title"]/a')]

            def count_comments(post, iframe_src):
                r = requests.get(iframe_src, verify=False)
                tree = lxml.html.fromstring(r.content)
                raw_comments_count = tree.xpath('//div[@class="DJa"]/strong')[0].tail.strip()
                post.ext_num_comments = int(raw_comments_count[1:-1])  # chopping off the parenthesis
                post.has_comments = True

            for url in set(urls):
                if not self.policy.should_continue_fetching(self):
                    break
                xb.load_url(url)
                description_button = xb.execute_jsfun(
                    '_XPS.evaluateXPath',
                    '//button[contains(@class, "yt-uix-expander-collapsed-body")]')
                try:
                    if description_button and len(description_button) > 0:
                        description_button[0].click()  # expanding the description
                except:
                    pass
                video_id = xb.execute_jsfun(
                    '_XPS.evaluateXPath',
                    '//meta[@itemprop="videoId"]')[0].get_attribute('content')
                url = 'https://youtube.com/watch?v=' + video_id
                previously_saved = list(Posts.objects.filter(url=url, platform=self.platform))
                if previously_saved:
                    if self.should_update_old_posts():
                        log.debug('Updating existing post for url {}'.format(url))
                        post = previously_saved[0]
                    else:
                        self._inc('posts_skipped')
                        log.debug('Skipping already saved post with url {}'.format(url))
                        if not self.test_run:
                            continue
                else:
                    log.debug('Creating new post for url {}'.format(url))
                    post = Posts(
                        url=url,
                        platform=self.platform,
                        influencer=self.platform.influencer,
                        show_on_search=self.platform.influencer.show_on_search,
                    )
                post.title = xb.execute_jsfun(
                    '_XPS.evaluateXPath',
                    '//*[@id="watch-headline-title"]//span')[0].text
                # post.content = xb.execute_jsfun('_XPS.evaluateXPath', '//meta[@itemprop="description"]')[0].get_attribute('content')
                post.impressions = deformat_int(
                    xb.execute_jsfun('_XPS.evaluateXPath',
                                     '//div[@class="watch-view-count"]')[0].text.split()[0])
                post.post_image = xb.execute_jsfun(
                    '_XPS.evaluateXPath',
                    '//link[@itemprop="thumbnailUrl"]')[0].get_attribute('href')
                post.post_image_width = int(xb.execute_jsfun(
                    '_XPS.evaluateXPath',
                    '//meta[@itemprop="width"]')[0].get_attribute('content'))
                post.post_image_height = int(xb.execute_jsfun(
                    '_XPS.evaluateXPath',
                    '//meta[@itemprop="height"]')[0].get_attribute('content'))
                post.content = xb.execute_jsfun(
                    '_XPS.evaluateXPath',
                    '//p[@id="eow-description"]')[0].text
                create_date_str = xb.execute_jsfun(
                    '_XPS.evaluateXPath',
                    '//div[@id="watch-uploader-info"]')[0].text
                x = create_date_str.find('Published on')
                if x >= 0:
                    x = x + len('Published on ')
                    create_date_str = create_date_str[x:]
                create_date = dateutil.parser.parse(create_date_str)
                post.create_date = create_date
                try:
                    iframe = WebDriverWait(xb.driver, 10).until(
                        expected_conditions.presence_of_element_located((By.TAG_NAME, 'iframe')))
                    iframe_src = iframe.get_attribute('src')
                    for i in range(3):
                        try:
                            count_comments(post, iframe_src)
                        except:
                            pass
                        else:
                            break
                finally:
                    pass
                self.save_post(post)
                res.append(post)
    except Exception as e:
        log.exception(e)
    self.fetch_post_interactions(res)
    return res

def perform_feed(self, tag, num_pages, category, pipeline_class=None, **kwargs):
    """
    This scrapes the instagram tags page for a given tag
    blog_discovery.hashtags[category] = {list of tags}.
    """
    with OpRecorder('instagram_crawl_feed_for_tag'):
        from xpathscraper import xbrowser
        from django.conf import settings

        page_count = 0
        image_urls = set()
        old_image_urls_count = 0
        log.info("Starting scraping for tag %r" % tag)
        with xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                               load_no_images=True) as xb:
            url = 'https://instagram.com/explore/tags/%s/' % tag
            xb.load_url(url)
            time.sleep(2)

            # checking the number of posts if it is already in cache
            posts_qty = None
            posts_qty_nodes = xb.driver.find_elements_by_xpath('//header/span/span[@class]')
            if len(posts_qty_nodes) > 0:
                try:
                    posts_qty = posts_qty_nodes[0].text
                    posts_qty = int(posts_qty.strip().replace(',', ''))
                    cached_posts_qty = cache.get('instagram_tag__%s' % tag)
                    if cached_posts_qty is not None and (posts_qty - int(cached_posts_qty)) <= 100:
                        log.info('Cached posts quantity is %s, now it is %s, '
                                 'too few new posts - skipping this feed.' % (cached_posts_qty, posts_qty))
                        return
                    else:
                        log.info('Cached posts quantity is %s, now it is %s, performing this feed.' % (
                            cached_posts_qty, posts_qty))
                except ValueError:
                    log.error('Could not parse posts quantity to number: %s, please check format' % posts_qty)
            else:
                log.info('No posts quantity node detected, possible Instagram page HTML structure changed.')

            # scroll to the bottom before we can find the 'load more pages' button
            xb.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            while page_count < num_pages:
                # find all images on the page so far and add them to our set
                try:
                    # images = xb.driver.find_elements_by_xpath('//div[contains(@class, "PostsGrid__root")]//a')
                    # Instagram structure changed
                    images = xb.driver.find_elements_by_xpath('//article//a')
                except:
                    page_count = num_pages
                    continue
                all_image_urls = set()
                for i in images:
                    all_image_urls.add(i.get_attribute('href'))
                new_image_urls = all_image_urls - image_urls
                image_urls = all_image_urls
                if len(image_urls) == old_image_urls_count:
                    page_count = num_pages
                    continue
                old_image_urls_count = len(image_urls)
                print("new images: %d so far we have %d image urls for tag %r" % (
                    len(new_image_urls), len(image_urls), tag))
                for i in new_image_urls:
                    try:
                        crawler_task.apply_async(
                            kwargs={
                                'klass_name': 'CreatorByInstagramHashtags',
                                'task_type': 'create_profile',
                                'url': i,
                                'tag': tag,
                                'category': category,
                                'pipeline_class': pipeline_class
                            },
                            # Queue where tasks to create new profiles for separate posts in feed are put
                            queue='scrape_instagram_posts_new',
                        )
                    except:
                        print("some error for %s" % i)

                # find the next page button
                # el = xb.driver.find_elements_by_xpath('//div[contains(@class, "moreLoadingIndicator")]//a')
                el = xb.driver.find_elements_by_xpath('//a[contains(text(), "Load more")]')
                if page_count == 0 and len(el) > 0:
                    e = el[0]
                    e.click()
                    log.info("Found next page button for page %s successfully, clicking and waiting." % page_count)
                else:
                    log.info("'Load More Pics' button not found... returning.")
                    #page_count = num_pages

                # scroll to the bottom before we can find the 'load more pages' button
                xb.driver.execute_script("window.scrollTo(0, 50);")
                xb.driver.execute_script("window.scrollTo(0, 1000000);")
                time.sleep(3)
                page_count += 1

            # caching post quantity for this tag
            if tag is not None and isinstance(posts_qty, int):
                cache.set('instagram_tag__%s' % tag, posts_qty)

def fetch_blog_posts_date(url):
    """
    Searches for the Post's date in the current blog post's url.
    :param url: url of the post
    :return: fetched datetime as string, method_name as string, title of the page
    """
    result = {
        'status_code': None,
        'title': None,
        'date_published': None,
        'description': None
    }

    if not url:
        return result

    try:
        # print('performing url: %s' % url)
        response = requests.get(url, timeout=10, headers=requests_headers)
        result['status_code'] = response.status_code
        if response.status_code >= 400:
            result['description'] = 'http_error'
            return result

        try:
            page = fromstring(bytes(response.content.decode("UTF-8", "ignore").encode("UTF-8")))
        except:
            result['description'] = 'xml_malformed'
            return result

        title = ''

        # getting title
        titles = page.xpath("//meta[@property='og:title']/@content")
        if len(titles) > 0:
            title = titles[0]

        # First we get the title from the /feed page if it exists. If not, we try to fetch the /post/comments page.
        # If that does not exist either, we take the title from the current page.
        if len(title) == 0:
            try:
                response = requests.get("%s%sfeed" % (url, '' if url.endswith('/') else '/'),
                                        timeout=5, headers=requests_headers)
                if response.status_code == 200:
                    pg = fromstring(response.content)
                    titles = pg.xpath('//title/text()')
                    if len(titles) > 0 and titles[0].startswith('Comments on: '):
                        title = titles[0].replace('Comments on: ', '', 1)
            except Timeout:
                pass

        if len(title) == 0:
            comments_default_urls = page.xpath(
                "//link[re:test(@href, 'feeds\/\d+\/comments\/default')]/@href",
                namespaces={'re': 'http://exslt.org/regular-expressions'})
            if len(comments_default_urls) > 0:
                try:
                    response = requests.get(comments_default_urls[0], timeout=5, headers=requests_headers)
                    if response.status_code == 200:
                        pg = fromstring(response.content)
                        titles = pg.xpath('//feed/title/text()')
                        if len(titles) > 0 and titles[0].startswith('Comments on '):
                            title = titles[0].replace('Comments on ', '', 1)
                except Timeout:
                    pass

        if len(title) == 0:
            page_title = page.xpath("//title/text()")
            # title = title[0].encode('ascii', 'ignore') if title else ''
            if page_title is not None and len(page_title) > 0:
                title = page_title[0]

        result['title'] = ' '.join(title.split())

        # 1. Fetching metas and fields where date format is standard
        meta_published_time = page.xpath("//meta[@property='article:published_time']/@content")
        meta_sailthru_time = page.xpath("//meta[@name='sailthru.date']/@content")
        time_datepublished = page.xpath("//time[@itemprop='datePublished']/@datetime")
        abbr_datepublished = page.xpath("//abbr[@itemprop='datePublished']/@title")  # recheck
        time_pubdate = page.xpath("//time[@pubdate and @datetime]/@datetime")
        # this one needs to be checked additionally
        time_datetime = page.xpath("//time[@datetime]/@datetime")

        if meta_published_time:
            result['date_published'] = dateutil.parser.parse(meta_published_time[0])
            result['description'] = 'meta_published_time'
            return result
        elif meta_sailthru_time:
            result['date_published'] = dateutil.parser.parse(meta_sailthru_time[0])
            result['description'] = 'meta_sailthru_time'
            return result
        elif time_datepublished:
            result['date_published'] = dateutil.parser.parse(time_datepublished[0])
            result['description'] = 'time_datepublished'
            return result
        elif time_pubdate:
            result['date_published'] = dateutil.parser.parse(time_pubdate[0])
            result['description'] = 'time_pubdate'
            return result
        elif abbr_datepublished:
            result['date_published'] = dateutil.parser.parse(abbr_datepublished[0])
            result['description'] = 'abbr_datepublished'
            return result
        elif time_datetime:
            result['date_published'] = dateutil.parser.parse(time_datetime[0])
            result['description'] = 'time_datetime'
            return result

        # 2. Finding tags containing dates by regexps
        # common for blogspot and derivatives
        json_datepublished = page.xpath(
            ".//*[re:test(text(), '(?i)\"datePublished\"\s*:\s*\"[\d\w\:-]+\"')]/text()",
            namespaces={'re': 'http://exslt.org/regular-expressions'})
        if json_datepublished:
            for txt in json_datepublished:
                datetime_txt_part = re.findall(r'(?i)\"datePublished\"\s*:\s*\"[\d\w\:-]+\"', txt)
                if datetime_txt_part and len(datetime_txt_part) > 0:
                    result['date_published'] = dateutil.parser.parse(datetime_txt_part[0].split(':')[1])
                    result['description'] = 'json_datepublished'
                    return result

        nodes_with_date = page.xpath(".//div[@class='date-outer']//h2[@class='date-header']/text()")
        nodes_with_date = nodes_with_date + page.xpath(".//p[@class='blog-date']//span[@class='date-text']/text()")
        nodes_with_date = nodes_with_date + page.xpath(".//div//span[@class='entry-date']/text()")
        # plain date search by text, like 'January 12, 2015'
        nodes_with_date = nodes_with_date + page.xpath(
            ".//*[re:test(text(), '%s')]/text()" % date_expression_01,
            namespaces={'re': 'http://exslt.org/regular-expressions'})
        if nodes_with_date:
            for txt in nodes_with_date:
                date_regexp_01_part = re.findall(date_expression_01, txt)
                if date_regexp_01_part and len(date_regexp_01_part) > 0:
                    result['date_published'] = dateutil.parser.parse(date_regexp_01_part[0])
                    result['description'] = 'date_regexp_01'
                    return result
                date_regexp_02_part = re.findall(date_expression_02, txt)
                if date_regexp_02_part and len(date_regexp_02_part) > 0:
                    result['date_published'] = dateutil.parser.parse(re.sub('[/]', '.', date_regexp_02_part[0]))
                    result['description'] = 'date_regexp_02'
                    return result
                date_regexp_03_part = re.findall(date_expression_03, txt)
                if date_regexp_03_part and len(date_regexp_03_part) > 0:
                    result['date_published'] = dateutil.parser.parse(date_regexp_03_part[0])
                    result['description'] = 'date_regexp_03'
                    return result

        # if we did not get results this far, trying Google search
        xb = None
        try:
            # Creating webdriver with proxy
            proxy = random.choice(PROXY_CONFIGS)
            xb = xbrowser.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                                   disable_cleanup=False,
                                   custom_proxy=proxy['http'])

            # setting timeouts to xb instance
            xb.driver.set_page_load_timeout(10)
            xb.driver.set_script_timeout(10)
            xb.driver.implicitly_wait(10)

            sleep(randint(8, 15))

            # opening google search page in browser
            try:
                xb.load_url('http://google.com?hl=en')
            except Exception as e:
                print('Exception while loading google.com page and performing scripts: %s' % e)
                result['description'] = 'google_init_error'
                return result

            # finding the input element, putting the given url there and pressing the search button
            time.sleep(2)
            try:
                input_field = xb.driver.find_elements_by_xpath('//input[@title="Search"]')
                if input_field:
                    input_field = input_field[0]
                    input_field.send_keys(url)
                    time.sleep(1)
                    submit_button = xb.driver.find_elements_by_xpath('//button[@value="Search"]')
                    if submit_button:
                        submit_button[0].click()
            except NoSuchElementException as e:
                print('No input field found on google.com')
                print(e)
                result['description'] = 'google_input_locating_error'
                return result

            # fetching results from the page
            time.sleep(1)

            # Here we are fetching results from the google result page for the search
            try:
                g_divs = xb.driver.find_elements_by_xpath("//div[@class='g']")
                if g_divs:
                    for g_div in g_divs:
                        url_to_site = g_div.find_element_by_xpath(".//a")
                        if url in url_to_site.get_attribute('href'):
                            possible_date_chunk = g_div.find_element_by_xpath(".//span[@class='f']")
                            if possible_date_chunk:
                                if 'ago' in possible_date_chunk.text:
                                    date_chunks = possible_date_chunk.text.split()
                                    if len(date_chunks) > 0 and is_integer(date_chunks[0]):
                                        result['date_published'] = datetime.utcnow() - timedelta(
                                            days=int(date_chunks[0]))
                                        result['description'] = 'google_search'
                                        return result
                                else:
                                    date_chunks = "".join(
                                        possible_date_chunk.text.replace("-", "").split(",")).split()
                                    if len(date_chunks) >= 3 and is_integer(date_chunks[1]) and \
                                            is_integer(date_chunks[2]):
                                        result['date_published'] = dateutil.parser.parse(
                                            possible_date_chunk.text.replace("-", ""))
                                        result['description'] = 'google_search'
                                        return result
                            break
            except NoSuchElementException:
                # did not find elements on a page (divs of results, url in div or date)
                pass
        except Exception as e:
            logger.exception(e, extra={'url': url})
        finally:
            if xb:
                try:
                    xb.driver.quit()
                except:
                    pass

        # PREVIOUS GOOGLE ALGORITHM WITH GOOGLE API
        # if len(result['title']) > 0:
        #     q = ""
        #     parsed_url = urlparse(url)
        #     q += "site:%s " % parsed_url.netloc
        #     q += " ".join(result['title'].split())
        #
        #     google_response = requests.get(
        #         'https://ajax.googleapis.com/ajax/services/search/web',
        #         params={
        #             'v': '1.0',
        #             'hl': 'ru',
        #             'gl': 'ru',
        #             # 'rsz': 8,
        #             # 'safe': 'active',
        #             # 'filter': 0,
        #             'q': url
        #         },
        #         timeout=10,
        #         headers={
        #             'Referer': 'http://theshelf.com',
        #             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/43.0.2357.81 Chrome/43.0.2357.81 Safari/537.36',
        #         }
        #     )
        #
        #     # print('google response status: %s' % google_response.status_code)
        #     if google_response.status_code < 400:
        #         google_json = google_response.json()
        #         print('RESPONSE GOOGLE:')
        #         pprint(google_json)
        #         if google_json:
        #             response_data = google_json.get('responseData', [])
        #             if response_data and 'results' in response_data:
        #                 for entry in response_data['results']:
        #                     if entry['url'] == url:
        #                         possible_date_chunk = entry.get('content', '')[:12]
        #                         if 'ago' in possible_date_chunk:
        #                             date_chunks = possible_date_chunk.split()
        #                             if len(date_chunks) > 0 and is_integer(date_chunks[0]):
        #                                 result['date_published'] = datetime.utcnow() - timedelta(days=int(date_chunks[0]))
        #                                 result['description'] = 'google_search'
        #                                 return result
        #                         else:
        #                             date_chunks = "".join(possible_date_chunk.split(",")).split()
        #                             if len(date_chunks) == 3 and is_integer(date_chunks[1]) and is_integer(date_chunks[2]):
        #                                 result['date_published'] = dateutil.parser.parse(possible_date_chunk)
        #                                 result['description'] = 'google_search'
        #                                 return result

        result['description'] = 'date_not_found'
        return result

    except Timeout:
        result['description'] = 'status_timeout'
        return result
    except ConnectionError:
        result['description'] = 'connection_error'
        return result
    except TooManyRedirects:
        result['description'] = 'too_many_redirects'
        return result
    except DecodeError:
        result['description'] = 'decode_error'
        return result

def detect_influencer(self):
    """
    Detects influencer according to the diagram
    :return: Influencer Id
    """
    self.report_data = dict()

    # checking if this profile has been performed before (if it has any IC_* actual tags)
    tags = self.profile.tags.split()
    if any(t in self.TAGS for t in tags):
        # looks like this profile was already performed, skipping it
        return 'already_preformed'

    # removing existing discovered_influencer if any is present
    present_influencer = self.profile.discovered_influencer
    if present_influencer is not None:
        self.profile.discovered_influencer = None
        if self.save is True:
            self.profile.save()

    # Getting profile's discovered platform ids
    existing_platform_ids = self.profile.get_platform_ids_detected()
    non_social_urls = self.profile.get_non_social_urls_detected()

    log.info('Detecting influencer for InstagramProfile %s ...' % self.profile.id)
    self.report_data['profile_id'] = self.profile.id
    self.report_data['existing_platform_ids_qty'] = len(existing_platform_ids)
    self.report_data['non_social_urls_qty'] = len(non_social_urls)

    if len(existing_platform_ids) >= 1:
        log.info('Found %s platform ids' % len(existing_platform_ids))
        # There is at least 1 discovered existing platform for this Profile.
        # Fetching all platforms except those with url_not_found=True,
        # UPDATE: and then detecting influencers of these platforms. If there is only one influencer - using it.
        active_plats = Platform.objects.filter(id__in=existing_platform_ids).exclude(url_not_found=True)
        active_influencers_ids = set()
        for p in active_plats:
            if p.influencer is not None:
                active_influencers_ids.add(p.influencer.id)
        active_influencers_ids = list(active_influencers_ids)
        self.report_data['active_influencers_ids'] = active_influencers_ids

        log.info('Found %s existing platforms with %s distinctive influencers' % (
            len(existing_platform_ids), len(active_influencers_ids)))

        if len(active_influencers_ids) == 1:
            # Great! Only platforms with one distinctive influencer found, working with it: adding this
            # influencer to the collection, connecting it to the InstagramProfile
            log.info('Found 1 influencer (%s), setting IC_one_inf_found tag, setting '
                     'influencer to InstagramProfile' % active_influencers_ids[0])
            candidate_influencer = Influencer.objects.get(id=active_influencers_ids[0])
            if candidate_influencer.blog_url is not None and candidate_influencer.blog_url.startswith(
                    'http://www.theshelf.com/artificial_blog/'):
                inf = Influencer.objects.get(id=active_influencers_ids[0])
                # TODO: connecting existing artificial influencer?
                self.profile.discovered_influencer = candidate_influencer
                if self.save is True:
                    self.profile.save()
                self.add_influencer_to_discovered_collection(candidate_influencer)
                self.profile.append_mutual_exclusive_tag('IC_one_artificial_inf_found',
                                                         self.TAGS + self.obsolete_tags)
                self.report_data['result'] = 'One existing influencer found (artificial/osos): %s (osos: %s / sos: %s)' % (
                    active_influencers_ids[0],
                    inf.old_show_on_search,
                    inf.show_on_search,
                )
                return 'IC_one_artificial_inf_found'
            else:
                self.profile.discovered_influencer = candidate_influencer
                if self.save is True:
                    self.profile.save()
                self.add_influencer_to_discovered_collection(candidate_influencer)
                self.profile.append_mutual_exclusive_tag('IC_one_inf_found',
                                                         self.TAGS + self.obsolete_tags)
                self.report_data['result'] = 'One existing influencer found and set to ' \
                                             'profile (non-artificial, non-osos): %s (osos: %s / sos: %s)' % (
                    active_influencers_ids[0],
                    candidate_influencer.old_show_on_search,
                    candidate_influencer.show_on_search,
                )
                return 'IC_one_inf_found'

        elif len(active_influencers_ids) > 1:
            # We discovered more than one active platform with more than one distinctive influencer.
            log.info('Found more than 1 platform with more than 1 distinctive '
                     'Influencers, setting tag IC_many_plats_found')
            # self.profile.append_mutual_exclusive_tag('IC_many_infs_found', self.TAGS)
            infs = Influencer.objects.filter(id__in=active_influencers_ids,
                                             old_show_on_search=True).exclude(blacklisted=True)
            if infs.count() == 0:
                # None found, we pick the best with _select_influencer_to_stay(),
                # connect it to the profile and add it to the collection
                active_infs = Influencer.objects.filter(id__in=active_influencers_ids)
                best_one = active_infs[0]._select_influencer_to_stay(list(active_infs))
                self.profile.discovered_influencer = best_one
                if self.save is True:
                    self.profile.save()
                    # self.add_influencer_to_discovered_collection(best_one)
                self.profile.append_mutual_exclusive_tag('IC_best_from_several',
                                                         self.TAGS + self.obsolete_tags)
                several_infs = ["%s (osos: %s / sos: %s)" % (inf.id, inf.old_show_on_search, inf.show_on_search)
                                for inf in active_infs]
                self.report_data['result'] = 'Several existing influencers found (no osos=True): %s , ' \
                                             'taken best of them: %s (osos: %s / sos: %s)' % (
                    several_infs,
                    best_one.id,
                    best_one.old_show_on_search,
                    best_one.show_on_search
                )
                return 'IC_best_from_several'
            elif infs.count() == 1:
                # One Influencer with old_show_on_search=True found, using it
                candidate_influencer = infs[0]
                self.profile.discovered_influencer = candidate_influencer
                if self.save is True:
                    self.profile.save()
                    # self.add_influencer_to_discovered_collection(candidate_influencer)
                self.profile.append_mutual_exclusive_tag('IC_one_from_several',
                                                         self.TAGS + self.obsolete_tags)
                several_infs = ["%s (osos: %s / sos: %s)" % (inf.id, inf.old_show_on_search, inf.show_on_search)
                                for inf in infs]
                self.report_data['result'] = 'Several existing influencers found: %s , taken ' \
                                             'one of them with osos=True: %s (osos: %s / sos: %s)' % (
                    several_infs,
                    candidate_influencer.id,
                    candidate_influencer.old_show_on_search,
                    candidate_influencer.show_on_search,
                )
                return 'IC_one_from_several'
            else:
                # Multiple found - adding these to the collection of duplicates
                if self.save is True:
                    self.add_influencers_to_duplicates_collection(influencers=infs)
                self.profile.append_mutual_exclusive_tag('IC_many_infs_found',
                                                         self.TAGS + self.obsolete_tags)
                self.report_data['result'] = 'Several existing influencers found: %s, taken those with osos=True ' \
                                             'and putting them to duplicates collection.' % [
                    "%s (osos: %s / sos: %s)" % (inf.id, inf.old_show_on_search, inf.show_on_search)
                    for inf in infs
                ]
                return 'IC_many_infs_found'

    # There are 0 discovered platforms, checking with non-social urls
    if len(non_social_urls) == 0:
        # Creating an influencer with an artificial url, adding it to the collection, connecting it to the profile
        log.info('No non-social urls found, creating artificial Influencer and adding it to the profile')
        count_str = '%s' % (int(time.time()))
        blog_url = 'http://www.theshelf.com/artificial_blog/%s.html' % count_str
        inf = create_influencer_and_blog_platform(blog_url,
                                                  influencer_source='discovered_via_instagram',
                                                  to_save=True,
                                                  platform_name_fallback=True)
        self.profile.discovered_influencer = inf
        if self.save is True:
            self.profile.save()
        # TODO: Should we create here an instagram platform too?
        self.add_influencer_to_discovered_collection(inf)
        self.profile.append_mutual_exclusive_tag('IC_artificial_inf_created',
                                                 self.TAGS + self.obsolete_tags)
        log.info('Adding IC_artificial_inf_created tag')
        self.report_data['result'] = 'No social/non-social platforms found - creating ' \
                                     'artificial Influencer: %s (osos: %s / sos: %s).' % (
            inf.id, inf.old_show_on_search, inf.show_on_search)
        return 'IC_artificial_inf_created'
    else:
        # There are some non-social urls -- checking if there are unique non-social urls.
        # Special shortcut: if non-social urls contain a liketoknow.it url. If this url is found, then using it as a
        # blog url for this future influencer.
        from platformdatafetcher.producturlsextractor import get_blog_url_from_liketoknowit

        # NEW logic to check for bloggy urls
        log.info('%s non-social urls found: %s, trying to find unique root domains' % (
            len(non_social_urls), non_social_urls))
        blog_urls_found = []

        from platformdatafetcher.platformextractor import collect_social_urls_from_blog_url, \
            substitute_instagram_post_urls

        # detecting if any of the non-social urls are blogs
        with xbrowsermod.XBrowser(headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY,
                                  load_no_images=True,
                                  disable_cleanup=False,
                                  timeout=60) as xb:
            # social url chunks: we need to prepare social urls into detectable chunks like 'www-less domain/path'
            social_chunks = []
            for url in self.profile.get_social_urls_detected():
                parsed = urlparse(url)
                chunk = '%s%s' % (parsed.netloc[4:] if parsed.netloc.startswith('www.') else parsed.netloc,
                                  parsed.path)
                chunk = chunk.strip('/')
                if chunk not in social_chunks:
                    social_chunks.append(chunk)
            log.info('Social url fragments for searching: %s' % social_chunks)

            # detecting if any found socials are in there
            non_social_urls = self.profile.get_non_social_urls_detected()
            unique_root_domains = self.get_unique_root_domains(non_social_urls)
            for k in unique_root_domains.keys():
                non_social_url_start = unique_root_domains[k][0]

                # checking if this url is a good liketoknow.it url and a blog url can be retrieved:
                parsed = urlparse(non_social_url_start)
                # checking if domain is liketoknow.it
                if parsed.netloc.lower().strip().replace('www.', '', 1) == 'liketoknow.it' and \
                        parsed.path.lower().strip('/').strip() not in ['', 'login']:
                    log.info('Liketoknow.it url detected: %r , trying to get its blog url' % non_social_url_start)
                    # looks like it is a good liketoknow.it url, getting blog url
                    blog_url = get_blog_url_from_liketoknowit(non_social_url_start, xb)
                    if blog_url is not None:
                        log.info('Blog url detected successfully: %r , considering it a good blog url' % blog_url)
                        # adding it to blog_urls detected
                        if blog_url not in blog_urls_found:
                            blog_urls_found.append(blog_url)
                        else:
                            log.info('Blog url %r is already detected' % blog_url)
                    else:
                        log.info('Blog url was not detected')
                else:
                    is_blog_url, non_social_url = self.is_url_a_blog(non_social_url_start, self.profile)
                    log.info('Checking if %r is a blog:' % non_social_url)
                    if is_blog_url is True and non_social_url is not None:
                        log.info('Perfect, %r is a blog' % non_social_url)
                        socials_detected = []
                        found_soc_urls = defaultdict(list)
                        collect_social_urls_from_blog_url(xb=xb,
                                                          by_pname=found_soc_urls,
                                                          platform=None,
                                                          non_social_url=non_social_url)
                        substitute_instagram_post_urls(found_soc_urls)
                        log.info('SOCIAL URLS COLLECTED: %s' % found_soc_urls)

                        # if no social urls were collected, we check whether this non-social url has
                        # social urls in any form, using regexps over its content and iframes.
                        if len(found_soc_urls) == 0:
                            scraped_social_urls = collect_any_social_urls(xb=xb, non_social_url=non_social_url)
                            log.info('Thorough search found %s candidate social urls '
                                     'to check' % len(scraped_social_urls))
                            found_soc_urls['Bruteforce'] = scraped_social_urls

                        # found_socials is in format {'Instagram': ['url1', 'url2',...], 'Facebook': [...], ...}
                        for social_url_lst in found_soc_urls.values():
                            for social_url in social_url_lst:
                                if any([sc.lower() in social_url.lower() for sc in social_chunks]):
                                    # we found one of the social chunks in a detected social url
                                    if social_url not in socials_detected:
                                        socials_detected.append(social_url)

                        log.info('Positively matched social urls: %s' % socials_detected)

                        # if we found some matching social urls - then it is a blog url, TA-DAAAA!
                        if len(socials_detected) > 0:
                            if non_social_url not in blog_urls_found:
                                # TODO: should we use here self.is_url_a_blog(url, self.profile) for extra blog check?
                                blog_urls_found.append(non_social_url)
                                log.info('Considering url %r to be a blog url for this profile' % non_social_url)
                    else:
                        log.info('Url %r considered as non-blog url or is unreachable' % non_social_url_start)

        if len(blog_urls_found) == 1:
            # we found 1 blog url
            log.info('Looks like it is a new single blog url!')
            self.report_data['unique_root_domain_is_blog'] = True

            # Here we have found 0 existing platforms, but we detected that a single non-social url
            # is a BLOG. So we create a blog platform with this url, create an influencer, connect
            # this blog platform to this influencer and connect the influencer to the profile.

            # creating new blog platform
            inf = create_influencer_and_blog_platform(blog_url=blog_urls_found[0],
                                                      influencer_source='ic_from_insta_profile',
                                                      to_save=self.save,
                                                      platform_name_fallback=True)
            self.profile.discovered_influencer = inf
            log.info('A new influencer has been created: %s' % inf)
            if self.save is True:
                self.profile.save()
            self.add_influencer_to_discovered_collection(inf)
            self.profile.append_mutual_exclusive_tag('IC_new_blog_new_inf',
                                                     self.TAGS + self.obsolete_tags)
            self.report_data['result'] = 'New influencer %s (osos: %s / sos: %s) created by single ' \
                                         'non-social blog platform' % (inf.id, inf.old_show_on_search,
                                                                       inf.show_on_search)
            return 'IC_new_blog_new_inf'
        elif len(blog_urls_found) == 0:
            # if none were found to be a blog
            # => check if the length of the url > 20 chars (typically identifies as a
            #    product) => then this profile needs to be fetched again later
            # => create a new field "date_to_fetch_later" in InstagramProfile and update this field
            #    with today+10 days later
            # => need to create a celery task that checks if today is the day when they should be
            #    re-fetched and then clears up this date_to_fetch_later to None
            # => after fetching the profile, compare the old url and description with the new one, check
            #    if it's different, then pass it to the same pipeline as it was originally part of
            log.info('No blog urls were detected within non_social_urls')
            # TODO: what should we do if this already has date_to_fetch_later != None ?
            long_url = False
            for non_social_url in non_social_urls:
                if len(non_social_url) > 20:
                    self.profile.date_to_fetch_later = datetime.now() + timedelta(days=10)
                    if self.save is True:
                        self.profile.save()
                    long_url = True
                    break
            if long_url is True:
                self.report_data['result'] = 'No blog urls were found, retrying in 10 days'
                return '10_days_later'
            else:
                # TODO: What should we do here, should we create an artificial url?
                if self.save is True:
                    self.profile.append_mutual_exclusive_tag('IC_possible_brand',
                                                             self.TAGS + self.obsolete_tags)
                self.report_data['result'] = 'Profile considered to be possibly a brand.'
                return 'IC_possible_brand'
        else:
            # TODO: Skipping for now...
            log.info('We found many non-social blog domains, setting IC_many_nonsocial_found tag: %s'
                     % blog_urls_found)
            if self.save is True:
                self.profile.append_mutual_exclusive_tag('IC_many_nonsocial_found',
                                                         self.TAGS + self.obsolete_tags)
            self.report_data['result'] = 'Multiple unique root domains found, skipped for now'
            return 'IC_many_nonsocial_found'