示例#1
0
def petester_single(url):
    """Run PETester over the blogger-outreach rows whose url matches *url*'s domain.

    Raises AssertionError when no rows match.
    """
    # Hoisted: the target domain is invariant across rows.
    target_domain = utils.domain_from_url(url)
    rows = [row for row in blogger_outreach_data()
            if utils.domain_from_url(row['url']) == target_domain]
    # print(x) with a single argument behaves identically on Python 2 and 3.
    print(rows)
    assert rows
    pet = PETester(rows)
    pet.test()
示例#2
0
    def update_influencers_email(self, to_save=False, max_visited_links=20):
        """Crawl contact-like pages of the source platform and update the
        influencer's email field from any page that contains one.

        :param to_save: passed through to _update_from_current_page
        :param max_visited_links: cap on the number of candidate pages visited
        """
        log.info('Initial email field value for %r: %r',
                 self.source_platform.influencer,
                 self.source_platform.influencer.email)

        self.xbrowser.load_url(self.source_platform.url)
        urls = self.xbrowser.execute_jsfun_safe(
            [], '_XPS.visibleLinksWithTexts',
            ['contact', 'about', 'social', 'media', 'follow'], 40)
        # Keep only same-domain links; the domain lookup for the current page
        # is loop-invariant, so compute it once.
        own_domain = utils.domain_from_url(self.xbrowser.driver.current_url)
        urls = [u for u in urls if utils.domain_from_url(u) == own_domain]
        # Drop links to the site root (empty path after stripping slashes).
        urls = [u for u in urls if urlparse.urlsplit(u).path.rstrip('/')]
        urls = utils.unique_sameorder(urls)
        log.info('Urls to visit in search for emails: %r', urls)

        for page_url in urls[:max_visited_links]:
            try:
                self.xbrowser.load_url(page_url)
                updated = self._update_from_current_page(to_save)
                if updated:
                    log.info('Current page contained email')
            except Exception:
                # Narrowed from a bare except: one broken page must not abort
                # the crawl, but SystemExit/KeyboardInterrupt should propagate.
                log.exception('While processing %r, skipping', page_url)
        log.info('Final email field value for %r: %r',
                 self.source_platform.influencer,
                 self.source_platform.influencer.email)
示例#3
0
def _find_platform(blog_url):
    """Return the Platform whose url shares *blog_url*'s domain, or None."""
    blog_domain = utils.domain_from_url(blog_url)
    candidates = models.Platform.objects.filter(url__contains=blog_domain)
    # First exact-domain match wins; None when no candidate matches.
    return next((candidate for candidate in candidates
                 if utils.domain_from_url(candidate.url) == blog_domain),
                None)
示例#4
0
def get_about_page_links(xbrowser):
    """Collect unique same-domain 'about'/'contact'-style links visible on
    the currently loaded page."""
    candidates = xbrowser.execute_jsfun_safe(
        [], '_XPS.visibleLinksWithTexts',
        ['contact', 'about', 'social', 'media', 'follow'], 40)
    same_domain = []
    for link in candidates:
        if utils.domain_from_url(link) == utils.domain_from_url(
                xbrowser.driver.current_url):
            same_domain.append(link)
    return utils.unique_sameorder(same_domain)
示例#5
0
def submit_blog_task_by_url(url):
    """Find the stored blog whose domain matches *url* and hand it to
    handle_blog; no-op when nothing matches."""
    # Hoisted: the queried url's domain is invariant across blog rows.
    target_domain = utils.domain_from_url(url)
    blogs = _read_blogs()
    matching = [b for b in blogs
                if utils.domain_from_url(b['blog_url']) == target_domain]
    log.info('Found matching blogs: %r', matching)
    if not matching:
        return
    handle_blog(matching[0])
示例#6
0
    def extract_links(self, to_save=False):
        """Find (url, text) links common to the platform's recent posts and
        save them, split into 'common_external' / 'common_internal' kinds.

        A link is "common" when it appears in at least two posts' non-empty
        link sets. Returns the list produced by save_links for all kinds.
        """
        posts_data = list(self.platform.posts_set.all()
                          .order_by('-create_date')
                          .values('url')[:COMMON_LINKS_POSTS])
        if not posts_data:
            log.warn('No posts for common links search')
            return []
        posts_urls = [d['url'] for d in posts_data]

        log.info('posts_urls: %r', posts_urls)

        # Maps link kind to a dictionary mapping a post url to the set of
        # (url, text) pairs found on that post's page (including iframes).
        by_kind = defaultdict(dict)
        for url in posts_urls:
            log.info('Fetching content from %r', url)
            by_kind['common_external'][url] = set()
            by_kind['common_internal'][url] = set()
            html_it = iter(utils.fetch_iframes(url))
            while True:
                try:
                    # next() builtin instead of the Python-2-only .next()
                    html = next(html_it)
                except StopIteration:
                    break
                except Exception:
                    # Narrowed from a bare except: fetching is best-effort.
                    log.exception('While fetching html, skipping this url')
                    continue
                links_texts = contentfiltering.find_links_with_texts(html)
                links_texts = [(u, t) for (u, t) in links_texts
                               if not u.endswith(UNWANTED_EXTS)]
                links_texts = [(u, t) for (u, t) in links_texts
                               if not any(ss in u
                                          for ss in BLACKLISTED_URL_SUBSTRINGS)]
                # External = link handle's domain differs from our own handle.
                by_kind['common_external'][url].update(
                    [(u, t) for (u, t) in links_texts
                     if utils.domain_from_url(platformutils.url_to_handle(u))
                     != self.source_handle])
                by_kind['common_internal'][url].update(
                    [(u, t) for (u, t) in links_texts
                     if utils.domain_from_url(platformutils.url_to_handle(u))
                     == self.source_handle])
        common = defaultdict(dict)
        for kind, links_texts_by_url in by_kind.items():
            nonempty_sets = [s for s in links_texts_by_url.values() if s]
            if len(nonempty_sets) < 2:
                log.warn('Not enough nonempty sets of links from posts for %s',
                         kind)
                common[kind] = set()
                continue
            common[kind] = sorted(set.intersection(*nonempty_sets))
            common[kind] = filter_links_texts(common[kind])
            log.info('Common links of kind %r (%d):\n%s', kind,
                     len(common[kind]), pformat(common[kind]))

        res = []
        for kind, common_links_texts in common.items():
            res += save_links(self.platform, kind, common_links_texts, to_save)
        return res
示例#7
0
 def extract_links(self, to_save=False):
     """Save deduplicated same-domain navigation links (with a path) found
     in the page's navigation clusters."""
     clusters = xutils.find_navigation_links_clusters(self.xbrowser)
     # Flatten the clusters into a single list of anchor elements.
     anchors = [anchor for cluster in clusters for anchor in cluster]
     links_texts = [(anchor.get_attribute('href'), anchor.text)
                    for anchor in anchors if anchor.get_attribute('href')]
     log.debug('links_texts: %r', links_texts)
     links_texts = utils.unique_sameorder(links_texts, key=lambda lt: lt[0])
     kept = []
     for link, text in links_texts:
         same_domain = (utils.domain_from_url(link) ==
                        utils.domain_from_url(
                            self.xbrowser.driver.current_url))
         if same_domain and utils.url_contains_path(link):
             kept.append((link, text))
     return save_links(self.platform, 'navigation', kept, to_save)
示例#8
0
def filter_urls(urls, exclude_domains_from_urls):
    """Return the urls whose domain does not match any excluded url's domain.

    :param urls: candidate urls
    :param exclude_domains_from_urls: urls (or bare domains; a missing scheme
        gets 'http://' prepended) whose domains — with and without a 'www.'
        prefix — are excluded
    """
    excluded_domains = set()
    for eurl in exclude_domains_from_urls:
        if not eurl.startswith('http'):
            eurl = 'http://%s' % eurl
        # Compute the domain once (the original called domain_from_url twice).
        domain = utils.domain_from_url(eurl)
        excluded_domains.add(domain)
        excluded_domains.add('www.%s' % domain)
    return [url for url in urls
            if utils.domain_from_url(url) not in excluded_domains]
示例#9
0
    def search(self, query, pages):
        """Type *query* into the page's search box and yield one list of
        result urls per results page, clicking 'next' between pages.

        NOTE: this is a generator — nothing happens until it is iterated.
        Each yielded list contains 'http://<domain>' urls built from the
        domains of the current page's results.
        """
        input_el = self.xb.driver.find_element_by_xpath(
            '//input[@type="text"]')
        input_el.send_keys(query)
        time.sleep(1)
        self._find_search_button().click()
        # Fixed sleeps give the results page time to render before scraping.
        time.sleep(5)
        #self._ensure_more_results()
        self.block_if_captcha()

        for page_no in xrange(pages):
            current_domains = [
                utils.domain_from_url(u) for u in self._current_results()
            ]
            # Keep only the first whitespace-separated token — results
            # sometimes carry trailing text after the domain.
            current_domains = [cd.split(' ', 1)[0] for cd in current_domains]
            current_urls = ['http://%s' % u for u in current_domains]
            log.info('Current google results: %s', current_urls)
            yield current_urls

            self._sleep_before_clicking_next()
            # 'pnnext' is Google's id for the next-page link; raises
            # NoSuchElementException on the last page, ending the generator.
            next_el = self.xb.driver.find_element_by_id('pnnext')
            next_el.click()
            time.sleep(5)
            #self._ensure_more_results()
            self.block_if_captcha()
示例#10
0
def mark_brand_signup(**kw):
    """Best-effort: record a brand signup as a box in the Streak CRM pipeline.

    Any failure is swallowed deliberately — CRM bookkeeping must never break
    the signup flow. Expects brand_signedup_* keys plus referer_tag in kw.
    """
    try:
        streak = Streak()
        pipeline = streak.get_pipeline_by_name('2016')
        stage = pipeline.get_stage_by_name(
            'TESTING' if settings.DEBUG else 'New Leads')
        box = stage.create_box(
            utils.domain_from_url(kw.get('brand_signedup_url')))
        box.update_fields({
            'Brand Name': kw.get('brand_signedup_brand_name'),
            'Brand URL': kw.get('brand_signedup_url'),
            # Streak expects milliseconds since the epoch.
            'Created': int(
                time.mktime(datetime.datetime.now().timetuple()) * 1000),
            'Email': kw.get('brand_signedup_email'),
            'Person': '{} {}'.format(kw.get('brand_signedup_first_name'),
                                     kw.get('brand_signedup_last_name')),
            'Marketing Signup Page': kw.get('referer_tag'),
        })
    except Exception:
        # Narrowed from a bare except so SystemExit/KeyboardInterrupt still
        # propagate; everything else is intentionally ignored (best-effort).
        pass
示例#11
0
 def _has_mostly_valid_products(self, url):
     """Heuristic: does *url*'s domain belong to a brand where at least 60%
     of its products (minimum 5 products total) carry valid prices?

     Returns False when no brand exists for the domain or when there are
     too few products to estimate.
     """
     domain = utils.domain_from_url(url)
     brand_q = models.Brands.objects.filter(domain_name=domain)
     if not brand_q.exists():
         log.info('No brands for domain %r', domain)
         return False
     brand = brand_q[0]
     if brand.supported:
         log.info('Brand is supported, so it must be valid')
         # return True
         # NOTE(review): the early return above is commented out, so even
         # supported brands currently fall through to the product-count
         # check — confirm this is intentional.
     # A price of -11 appears to be a sentinel for "invalid price" —
     # TODO confirm against the product import code.
     valid_products = brand.productmodel_set.\
         filter(price__isnull=False).\
         exclude(price=-11).\
         count()
     invalid_products = (brand.productmodel_set.filter(price__isnull=True) |
                         brand.productmodel_set.filter(price=-11)).\
         count()
     log.info('Brand %r has %d valid and %d invalid products', brand,
              valid_products, invalid_products)
     if valid_products + invalid_products < 5:
         log.info(
             'The number of products is too small to make an estimation')
         return False
     # 60% must be valid
     if float(valid_products) / float(valid_products +
                                      invalid_products) > 0.60:
         log.info(
             'Large number of products have valid prices, assuming it is a brand'
         )
         return True
     log.info('The number of valid products is too small')
     return False
示例#12
0
def create_influencer_from_bad_brands(brand, to_save=True):
    '''
    This method creates influencers from Brands whose domains contain blogger urls.
    Example:
        blogspot = Brands.objects.filter(domain_name__icontains='blogspot.")
        blogspot.update(blacklisted=True)
        for b in blogspot:
          create_influencer_from_bad_brands(b, True)


        Double checks:
            this function should be called only for those Brands that have not been passed through this function
            we shouldn't run this for brands with domain_name in 'tumblr.com', because these influencer could have
                a separate blog (say on blogspot.com) and then we will have duplicates

    '''
    with platformutils.OpRecorder(operation='import_from_bad_brand',
                                  brand=brand) as opr:
        url = brand.domain_name
        domain = utils.domain_from_url(url)
        # Guard clause: never import from a blacklisted domain.
        if domain in BLACKLISTED_DOMAINS:
            log.info('Domain %r is blacklisted', domain)
            return
        inf = helpers.create_influencer_and_blog_platform(
            url,
            'discovered_from_brands',
            to_save,
            platform_name_fallback=True)
        if not inf:
            log.error('Blacklisted url: %r', url)
        # Record the outcome on the operation recorder.
        opr.data = ({'inf_id_created': [inf.id]}
                    if inf and inf.id is not None
                    else {'inf_cnt_skipped': 1})
示例#13
0
def import_from_post_content(post_id, to_save=True):
    """Import influencers from a post's content, skipping blacklisted and
    popular-brand domains (the latter cached at module level on first use)."""
    global _DOMAINS_OF_POPULAR_BRANDS

    if _DOMAINS_OF_POPULAR_BRANDS is None:
        # Lazily build the cache of the 100 most-productive brand domains.
        log.info('Starting loading _DOMAINS_OF_POPULAR_BRANDS')
        top_brands = (models.Brands.objects
                      .filter(blacklisted=False)
                      .filter(num_items_shelved__gte=5)
                      .exclude(name='www')
                      .annotate(num_products=Count('productmodel'))
                      .order_by('-num_products')[:100])
        _DOMAINS_OF_POPULAR_BRANDS = [
            utils.domain_from_url(b.domain_name) for b in top_brands
        ]
        log.info('Finished loading _DOMAINS_OF_POPULAR_BRANDS')

    post = models.Posts.objects.get(id=int(post_id))
    with platformutils.OpRecorder(operation='import_from_post_content',
                                  post=post) as opr:
        log.info('import_from_post_content for %r', post)
        blacklist = (BLACKLISTED_DOMAINS +
                     _DOMAINS_OF_POPULAR_BRANDS +
                     estimation.URL_FRAGMENTS_NO_RESOLVING +
                     estimation.URL_FRAGMENTS_REQUIRING_RESOLVING +
                     estimation.URL_FRAGMENTS_IN_IFRAMES)
        _do_import_from_content(post.content, opr, to_save,
                                blacklisted_domains=blacklist)
示例#14
0
def search_infs_using_preloaded_urls(queries, pages=20):
    """For each query, collect google result urls and create influencers for
    non-blacklisted, non-duplicate urls.

    NOTE(review): the original contained a leftover debug ``print``/``return``
    pair that made the entire per-url loop unreachable; it has been removed so
    the function performs its documented work. Bare excepts narrowed.
    """
    for q in queries:
        try:
            urls = collect_urls_from_google(q, pages)
        except Exception:
            log.exception(
                'While collect_urls_from_google(%r), going to the next query',
                q)
            continue
        log.info('Got urls: %s', urls)
        for url in urls:
            try:
                if utils.domain_from_url(
                        url) in import_from_blog_post.exclude_domains_set:
                    log.warn('%r is blacklisted', url)
                    continue
                dups = models.Influencer.find_duplicates(url)
                log.info('%r dups: %s', url, dups)
                if not dups:
                    log.info('YES_CREATE %r', url)
                    new_inf = helpers.create_influencer_and_blog_platform(
                        url, 'google', platform_name_fallback=True)
                    log.info('Created influencer: %r', new_inf)
                else:
                    log.info('NO_CREATE %r', url)
            except Exception:
                log.exception('While processing url %r, skipping', url)
示例#15
0
def filter_links_texts(links_texts):
    """Drop (url, text) pairs whose url's domain is blacklisted."""
    return [(url, text) for url, text in links_texts
            if utils.domain_from_url(url) not in BLACKLISTED_DOMAINS]
示例#16
0
def get_or_create_brand(url):
    """Fetch the Brands row for *url*'s domain, creating it (with a profile
    and the domain as its name) when it does not exist yet."""
    domain = utils.domain_from_url(url)
    brand, was_created = debra.models.Brands.objects.get_or_create(
        domain_name=domain)
    if was_created:
        brand.name = domain
        brand.save()
        brand_helpers.create_profile_for_brand(brand)
    return brand
示例#17
0
def meaningful_domain_fragment(url):
    """Return the distinctive part of *url*'s domain, or None.

    None is returned for recognized social-platform urls, and when nothing
    remains after stripping the TLD and hosting-provider parts.
    """
    url = url.lower()
    # Social-platform urls carry no meaningful own-domain fragment.
    if social_platform_name_from_url(None, url) != PLATFORM_NAME_DEFAULT:
        return None
    domain = utils.strip_last_domain_component(utils.domain_from_url(url))
    meaningful = [part for part in domain.split('.')
                  if part not in ['blogspot', 'wordpress']]
    return ''.join(meaningful) or None
示例#18
0
    def clean(self):
        cleaned_data = super(BloggerRegistrationForm, self).clean()
        cleaned_data["email"] = cleaned_data["email"].lower()
        email = cleaned_data.get("email")
        entered_blog_url = cleaned_data["blog_url"].lower()
        if not entered_blog_url.startswith(
                "http://") and not entered_blog_url.startswith("https://"):
            cleaned_data["blog_url"] = "http://" + cleaned_data["blog_url"]

        def is_valid_url(url):
            s = socket.socket()
            try:
                s.connect((url, 80))
            except Exception:
                print "Bad url", cleaned_data["blog_url"]
                return False
            else:
                return True

        domains = [
            utils.domain_from_url(cleaned_data["blog_url"],
                                  preserve_www=False),
            utils.domain_from_url(cleaned_data["blog_url"], preserve_www=True)
        ]

        if not any(map(is_valid_url, domains)):
            raise forms.ValidationError(
                _(u'Your blog url seems to be invalid. Please double check it.'
                  ))

        # make sure another user with the given email doesnt already exist
        try:
            user = User.objects.get(username__iexact=email)
            if cleaned_data['influenity_signup']:
                pass
                # if not user.check_password(cleaned_data['password']):
                #     raise forms.ValidationError(_(u'Wrong password'))
            else:
                raise forms.ValidationError(
                    _(u'Another user with the given email already exists'))
        except User.DoesNotExist:
            pass

        return cleaned_data
示例#19
0
def brands_signup_postprocess(user_profile, form, distinct_id=None):
    """Post-signup bookkeeping for a brand account.

    Looks up (or creates) the Brands row for the signup's domain, stores the
    domain on the user profile, registers the user in Intercom with
    appropriate tags, and fires a 'brand-signed-up' Intercom event.

    :param user_profile: the just-created profile to annotate
    :param form: the signup form (cleaned_data must contain brand_url and
        brand_name; referer_tag / from_admin drive Intercom tagging)
    :param distinct_id: unused here — presumably analytics-related; confirm
        against callers before removing
    """
    #site = Site.objects.get(id=settings.SITE_ID)

    from debra.models import Brands

    domain_name = utils.domain_from_url(form.cleaned_data['brand_url'])
    print "DOMAIN_NAME: %s" % domain_name

    # Manual get-or-create: filter first so duplicate domain rows (if any)
    # don't raise; create only when nothing matched.
    brands = Brands.objects.filter(domain_name=domain_name)
    if brands.exists():
        brand = brands[0]
        created = False
    else:
        brand = Brands.objects.create(domain_name=domain_name)
        created = True
    print "created: %s " % created
    print "brand: %s" % brand

    user_profile.temp_brand_domain = domain_name
    user_profile.save()
    user_profile.create_in_intercom()

    # Admin-created accounts are excluded from customer email automation.
    if form.data.get('from_admin') == 'true':
        user_profile.intercom_tag_add('dont-send-intro-email')
        user_profile.intercom_tag_add('customer_ignore')

    if form.referer_tag:
        user_profile.intercom_tag_add(form.referer_tag)
    # referer_page = urlparse.urlparse(form.referer).path.strip('/').split('/')[0]
    # print '* REFERER:', referer_page
    # try:
    #     tag = {
    #         '': 'home',
    #         'blogger-outreach': 'newbie',
    #         'influencer-marketing': 'expert',
    #         'agencies': 'agency',
    #         'blogger-campaign-services': 'services',
    #         'coverage': 'coverage',
    #         'the-blog': 'blog',
    #         'blogger-roundups': 'roundups',
    #     }[referer_page]
    # except KeyError:
    #     pass
    # else:
    #     user_profile.intercom_tag_add(tag)

    # if this is a new brand we know the user signing up is the brand manager. Otherwise, users have to claim the brand from us
    if created:
        brand.name = form.cleaned_data['brand_name']
        brand.save()
        brand_helpers.create_profile_for_brand(brand)

    intercom_track_event(None, 'brand-signed-up', {
        'user_email': user_profile.user.email,
        'brand_url': domain_name
    }, user_profile.user)
示例#20
0
def domain_to_platform(domain):
    """Resolve *domain* to its Platform via a lazily built module-level
    domain->id cache; returns None for unknown domains."""
    global _DOMAIN_TO_PLATFORM_ID
    if _DOMAIN_TO_PLATFORM_ID is None:
        # One-time cache build over all platforms.
        log.info('Start fetching platform data')
        _DOMAIN_TO_PLATFORM_ID = {
            utils.domain_from_url(row['url']): row['id']
            for row in models.Platform.objects.all().values('id', 'url')
        }
        log.info('Finished')
    platform_id = _DOMAIN_TO_PLATFORM_ID.get(domain)
    if platform_id is None:
        return None
    return models.Platform.objects.get(id=platform_id)
示例#21
0
 def extract_links(self, to_save=False):
     """Detect links from the current page to other known platforms and
     return them as LinkFromPlatform rows (saved when to_save is True)."""
     html = self.xbrowser.driver.execute_script(
         'return document.body.innerHTML')
     domain = utils.domain_from_url(self.xbrowser.driver.current_url)
     important = contentfiltering.find_important_urls(
         html, [domain, 'www.' + domain])
     log.info('important urls (%s): %s', len(important), important)
     found = []
     for url in important:
         dest = domain_to_platform(utils.domain_from_url(url))
         # Skip unknown destinations and self-links.
         if dest is None or dest.id == self.platform.id:
             continue
         log.info('detected link from <<%s>> to <<%s>> url <<%s>>',
                  self.platform, dest, url)
         link = models.LinkFromPlatform(source_platform=self.platform,
                                        dest_platform=dest,
                                        dest_url=url)
         if to_save:
             link.save()
         found.append(link)
     return found
示例#22
0
 def _get_brand(prod_url):
     """Get or create the Brands row for the product url's domain, deriving
     a readable name from the domain when the name is the 'Nil' placeholder."""
     log.debug("_get_brand for %s" % prod_url)
     domain = utils.domain_from_url(prod_url)
     brand, created = Brands.objects.get_or_create(domain_name=domain)
     if brand.name == 'Nil':
         # Strip boilerplate pieces from the domain to form a name.
         name = domain.replace('www.', '')
         name = name.replace('.com', '')
         brand.name = name.replace('/', '')
         brand.save()
     log.debug("Created: %s Brand: %s Domain: %s" %
               (created, brand, domain))
     return brand
示例#23
0
def find_common_links(xbrowser, urls):
    """Return the set of same-domain links visible on every one of *urls*.

    All urls must share a single domain (asserted).
    """
    domains = [utils.domain_from_url(u) for u in urls]
    assert len(set(domains)) == 1, \
        'urls are not for the same domain: %s' % domains
    domain = domains[0]
    links_by_url = {}
    for page_url in urls:
        xbrowser.load_url(page_url)
        raw_links = xbrowser.execute_jsfun('_XPS.visibleLinksToDomains',
                                           [domain], True)
        links_by_url[page_url] = [link.strip() for link in raw_links]
    # Intersect the per-page link sets.
    return set.intersection(*(set(links) for links in links_by_url.values()))
示例#24
0
def import_network_bloggers(filename):
    """Import influencers from a blogger-network CSV file.

    The first line of the file (header) is skipped; the blogger type is
    derived from the filename, expected as '<something> - <type>.<ext>'.
    Rows with missing/invalid url or source are skipped, as are rows whose
    influencer is blacklisted or not enabled for automated edits.
    """
    with open(filename, 'rb') as f:
        lines = f.readlines()[1:]  # skip the header row
    reader = csv.DictReader(lines,
                            ('unusual', 'blog_name', 'url', 'persons_name',
                             'location', 'source', 'description'))
    blogger_type = os.path.basename(filename).split('.')[0].split(' - ')[1]
    log.info('blogger_type: %r', blogger_type)
    for row in reader:
        try:
            log.info('row: %r', row)
            if not row['url'].startswith('http'):
                log.warn('Skipping row with invalid url %r', row['url'])
                continue
            source = utils.domain_from_url(row['source'])
            if not source.strip():
                log.warn('Skipping row with no source')
                continue
            if not row['url'].strip():
                log.warn('Skipping row with no url')
                continue
            inf = helpers.create_influencer_and_blog_platform(
                row['url'], source, to_save=True, platform_name_fallback=True)
            if not inf:
                log.warn('Skipping blacklisted url')
                continue
            if not inf.is_enabled_for_automated_edits():
                log.warn(
                    'Influencer is not enabled for automated edits, skipping')
                continue
            inf.blogname = row['blog_name']
            inf.blogger_type = blogger_type
            inf.name = row['persons_name']
            inf.demographics_location = row['location']
            inf.description = row['description']
            log.info(
                'source, blogname, name, location, description: %r, %r, %r, %r, %r',
                inf.source, inf.blogname, inf.name, inf.demographics_location,
                inf.description[:100])
            inf.save()

            # update blogname for blog platform
            blog_pl_q = inf.platform_set.filter(url=row['url'])
            if blog_pl_q.exists():
                blog_pl = blog_pl_q[0]
                log.info('Updating blogname of %r', blog_pl)
                blog_pl.blogname = row['blog_name']
                blog_pl.save()
        except Exception:
            # Narrowed from a bare except: one bad row must not abort the
            # import, but SystemExit/KeyboardInterrupt should propagate.
            log.exception('While processing %s, skipping', row)
示例#25
0
def do_extract_product_urls(url):
    """Run every extractor class supporting *url*'s domain and return the
    deduplicated union of their extracted product urls."""
    domain = utils.domain_from_url(url)
    res = []
    for cls in CLASSES:
        if domain not in cls.supported_domains:
            continue
        extractor = cls()
        extracted = extractor.extract_product_urls(url)
        log.info('%r extracted product urls: %r', extractor, extracted)
        res += extracted
    res = utils.unique_sameorder(res)
    log.info('All product urls extracted from %r: %r', url, res)
    return res
示例#26
0
 def extract_product_urls(self, url):
     """Collect unique rstyle.me hrefs from 'hoverflow' anchors on *url*.

     Returns a list of urls, or None when anything fails (logged).
     """
     try:
         with xbrowser.XBrowser(headless_display=settings.
                                AUTOCREATE_HEADLESS_DISPLAY) as xb:
             xb.load_url(url)
             anchors = WebDriverWait(xb.driver, 10).until(
                 lambda _: xb.els_by_xpath('//div[@class="hoverflow"]//a'))
             # get_attribute('href') is a webdriver round-trip; the original
             # called it up to three times per anchor — fetch it once.
             hrefs = [a.get_attribute('href') for a in anchors]
             rstyle_hrefs = [
                 h for h in hrefs
                 if h and utils.domain_from_url(h) == 'rstyle.me'
             ]
             return utils.unique_sameorder(rstyle_hrefs)
     except Exception as e:
         log.exception(e, extra={'url': url})
         return None
示例#27
0
def _get_product_urls(post):
    """Collect the set of product urls associated with *post*.

    Combines: urls found in the post content, urls from sponsorship widgets,
    the pin source (for pins), and product urls extracted from supported
    embedded urls.

    Side effect: calls post.test_and_set_sponsored_flag().
    Relies on module-level exclude_domains.
    """
    product_urls_in_post = post.product_urls(exclude_domains)

    # add urls from text links for non-blog platforms
    if not post.platform.platform_name_is_blog:
        content = platformutils.iterate_resolve_shortened_urls(post.content)
        product_urls_in_post.update(
            contentfiltering.filter_urls(
                contentfiltering.find_all_urls(content), exclude_domains))

    log.debug("We have %d product urls in the post content: %s" %
              (len(product_urls_in_post), product_urls_in_post))
    post.test_and_set_sponsored_flag()

    product_urls_in_widgets = sponsorshipfetcher.get_product_urls(post.id)
    log.debug("Products in widgets: %s" % product_urls_in_widgets)
    log.debug("We have %d product urls in the widget " %
              len(product_urls_in_widgets))

    additional_product_url_candidates = []
    if post.pin_source:
        additional_product_url_candidates.append(post.pin_source)

    # Exclude the influencer's own blog urls from the pin-source candidates.
    influencer_blog_platforms = post.influencer.platform_set.filter(
        platform_name__in=Platform.BLOG_PLATFORMS)
    additional_product_urls = contentfiltering.filter_urls(
        additional_product_url_candidates,
        exclude_domains + [plat.url for plat in influencer_blog_platforms])

    product_urls = product_urls_in_post.union(product_urls_in_widgets).union(
        additional_product_urls)

    # extract product urls from embedded urls
    urls_for_urls_extraction = [
        u for u in product_urls if utils.domain_from_url(u) in
        producturlsextractor.ALL_SUPPORTED_DOMAINS
    ]
    products_urls_extracted = []
    for url in urls_for_urls_extraction:
        products_urls_extracted += producturlsextractor.do_extract_product_urls(
            url)
    log.info('All products_urls_extracted: %r', products_urls_extracted)
    product_urls.update(products_urls_extracted)

    return product_urls
示例#28
0
def influencers_without_blog_platform_by_domain():
    infs = Influencer.objects.filter(source='spreadsheet_import',
                                     blog_url__isnull=False)
    invalid_urls = []
    for i, inf in enumerate(infs):
        print i
        if not inf.platform_set.filter(
                platform_name__in=['Custom', 'Blogspot', 'Wordpress'
                                   ]).exists():
            invalid_urls.append(inf.blog_url)

    print 'Got %s invalid urls: %r' % (len(invalid_urls), invalid_urls)
    by_domain = defaultdict(list)
    for url in invalid_urls:
        by_domain[_master_domain(utils.domain_from_url(url))].append(url)
    by_domain_items = sorted(by_domain.items(),
                             key=lambda (domain, urls): len(urls),
                             reverse=True)
    pprint.pprint(by_domain_items)
示例#29
0
def create_influencers_from_blacklisted_brands():
    blogspot_brands = models.Brands.objects.filter(
        domain_name__icontains='blogspot')
    print "Got %d brands with blacklisted" % blogspot_brands.count()

    good_urls = []
    for i, b in enumerate(blogspot_brands):
        print "%d %r" % (i, b)
        url = b.domain_name.lower()
        if utils.domain_from_url(
                url) in import_from_blog_post.exclude_domains_set:
            log.warn('%r is blacklisted', url)
            continue
        dups = models.Influencer.find_duplicates(url)
        log.info('%r dups: %s', url, dups)
        if not dups:
            print "Can create a new influencer for %s" % url
            good_urls.append(url)
        print "Good urls so far: %d" % len(good_urls)
示例#30
0
    def _get_own_frames(self, url, tree):
        """Return up to DISCOVERY_FRAME_LIMIT frame/iframe src urls that look
        like they belong to *url*'s own site (domain contains 'blogspot' or
        the url's meaningful domain fragment)."""
        if not url or tree is None:
            return []

        frames = tree.xpath('//iframe') + tree.xpath('//frame')
        fragments = [
            frag for frag in
            ('blogspot', platformutils.meaningful_domain_fragment(url))
            if frag
        ]
        own_srcs = []
        for frame in frames:
            src = frame.attrib.get('src')
            if not src:
                continue
            frame_domain = utils.domain_from_url(src)
            if any(fragment in frame_domain for fragment in fragments):
                own_srcs.append(src)
        log.info('srcs_to_check: %r', own_srcs)
        return own_srcs[:self.DISCOVERY_FRAME_LIMIT]