Example #1
 def _add_nested(self, k, el):
     """Parse nested element by its children."""
     el = Pq(el)
     tagname = Pq(el)[0].tag
     if tagname in self.invalid_tags:
         return
     id = self._format_id(el.attr('id'))
     classes = self._format_classes(el.attr('class'))
     selector = self._format_selector(el, id, classes)
     children = Pq(el).children()
     if not self._is_root_body_node(el):
         return
     # Add for single nodes only
     if not children:
         self.selectors.add(selector)
     # Build nested css by traversing all child nodes and getting
     # their attributes.
     while children:
         for child in children:
             # 1. Add current
             self.selectors.add(selector)
             # 2. Add child
             child = Pq(child)
             selector += self._add_id_and_classes(child)
             self.selectors.add(selector)
             # 3. Move to the next set of children
             children = child.children()
Example #2
        def fixLinks(text, parser):
            d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
            for element in d('a, link'):
                e = PyQuery(element)
                href = e.attr('href')

                if href is None:
                    continue

                print '// Drop queryString in included src'
                print 'from: ', href
                result = urlparse(href)

                if result.scheme == 'https':
                    href = href
                elif result.scheme == '':
                    href = result.path + (('#' + result.fragment) if result.fragment != '' else '')
                print 'to: ', href
  
                new_href = re.sub(r'(rss/index\.html)|(rss/?)$', 'rss/index.rss', href)
                if not abs_url_regex.search(href):
                    new_href = re.sub(r'/index\.html$', '/', new_href)

                if href != new_href:
                    e.attr('href', new_href)
                    print "\t", href, "=>", new_href

            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
Example #3
def parseProductPage(product, need_img_urls=False):
    """进入商品详情页, 抓取四个新字段
       delivery reviews star total_sales
    """
    if product['product_url']:
       content = fetchContent(product['product_url'], False)
       doc=PyQuery(content)
       #product['delivery'] = doc("div.cost-entries-type > p > em.value").text()  # shipping cost is rendered dynamically by JS, can't be scraped
       product['reviews'] = doc('p.satisfaction-number > a > em.value').text()
       product['star'] = doc('p.star-level > i').attr("class")
       product['total_sales'] = doc('p.bargain-number > a > em.value').text()
       if need_img_urls:
           url_list = get_img_urls(content)
           product['img_urls'] = ', '.join(url_list)
       else:
           product['img_urls'] = ''
       product['color'], product['size'] = '', ''
       for index, td in enumerate(doc('div.obj-content > table > tbody > tr > td')):
            tdQ = PyQuery(td)
            if tdQ.attr('class') =='de-feature' and tdQ.text().strip() == u'颜色':
                product['color'] = PyQuery(doc('div.obj-content > table > tbody > tr > td')[index+1]).text()
            if tdQ.attr('class') =='de-feature' and tdQ.text().strip() == u'尺寸':
                product['size'] = PyQuery(doc('div.obj-content > table > tbody > tr > td')[index+1]).text()
       product['MOQ'] = extractNum(doc('tr.amount > td.ladder-1-1 > span.value').text().replace(u"≥", ""))
       if not product['MOQ'] or product['MOQ'] == 0:
           product['MOQ'] = extractNum(PyQuery(doc('tr.amount').remove('td.amount-title').children('td').eq(0))('span.value').text())
       if product['MOQ'] == 1:
           #print product['product_url']
           product['sku_size'] = PyQuery(doc('div.unit-detail-spec-operator').eq(0))('span.text').text()
           product['sku_color'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.name').text()
           product['sku_price'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.price').text()
           product['sku_amount'] = PyQuery(doc('table.table-sku > tr').eq(0))('td.count > span > em.value').text()
           print product['sku_id'], '\t', product['sku_size'], "\t", product['sku_color'], "\t", product['sku_price'], "\t", product['sku_amount']
    return product
Example #4
def scrape(slug, url, name, title=None):
    f = urlopen(url)
    doc = f.read()

    doc, errs = tidy_document(
        doc,
        options={
            "output-html": 1,
            #'indent':1,
            "clean": 1,
            "drop-font-tags": 1,
        },
    )
    if errs:
        # raise Exception, errs
        print errs

    doc = html5lib.parse(doc, treebuilder="lxml")  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    td = jQuery("td#content")
    assert len(td) == 1

    for img in td("img"):
        # print 'img:', PyQuery (img)
        img = PyQuery(img)
        src = img.attr("src")
        # alt = img.attr('alt')

        # if src.startswith ('/image'):
        rslt = getimage(src, slug.split("/")[0])
        img.attr("src", rslt)
        if trace:
            print rslt

    # td =
    # no_fonts (td)

    # need to fix links here

    content = PyQuery(td[0])
    # content = content.html()
    content = no_namespaces(content.html())

    print slug, content[:60]  # .html()  # [:60]

    if dbteeth:
        # q, created = QuickPage.objects.get_or_create (

        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
                # defaults = dict (sortorder = sortorder),
            ),
        )
Example #5
    def test_calendar_tag_rendering(self, timezone_mock):
        timezone_mock.now.return_value = tz_datetime(2015, 1, 10, 12)
        page_with_apphook = self.create_base_pages()
        other_config = EventsConfig.objects.create(namespace='other')
        self.create_event(
            title='ev1',
            start_date=tz_datetime(2015, 1, 13),
            publish_at=tz_datetime(2015, 1, 10)
        )
        self.create_event(
            title='ev2',
            start_date=tz_datetime(2015, 1, 15),
            publish_at=tz_datetime(2015, 1, 10)
        )
        self.create_event(
            de=dict(
                title='ev3',
                start_date=tz_datetime(2015, 1, 16),
                publish_at=tz_datetime(2015, 1, 10)
            )
        )
        self.create_event(
            title='ev4',
            start_date=tz_datetime(2015, 1, 18),
            publish_at=tz_datetime(2015, 1, 10),
            app_config=other_config
        )
        self.create_event(
            title='ev5',
            start_date=tz_datetime(2015, 1, 22),
            end_date=tz_datetime(2015, 1, 27),
            publish_at=tz_datetime(2015, 1, 10)
        )
        self.create_event(
            title='ev6',
            start_date=tz_datetime(2015, 1, 25),
        )
        # make use of default tests self.app_config namespace, instead of
        # hard coding it
        template_str = """
        {%% load aldryn_events %%}
        {%% calendar 2015 1 'en' '%s' %%}
        """ % self.app_config.namespace
        t = Template(template_str)
        with override('en'):
            html = t.render(SekizaiContext({}))
            table = PyQuery(html)('table.table-calendar')
            page_url_en = page_with_apphook.get_absolute_url()
        links = table.find('td.events, td.multiday-events').find('a')

        # test if tag rendered important elements
        self.assertEqual('1', table.attr('data-month-numeric'), )
        self.assertEqual('2015', table.attr('data-year'))
        self.assertEqual('10', table.find('td.today').text())
        self.assertEqual(8, links.length)  # 13, 15, 22, 23, 24, 25, 26, 27
        expected_days = (13, 15, 22, 23, 24, 25, 26, 27)
        for position, day in enumerate(expected_days):
            event_url = '{0}2015/1/{1}/'.format(page_url_en, day)
            rendered_url = links[position].attrib['href']
            self.assertEqual(event_url, rendered_url)
Example #6
	def getTweets(tweetCriteria):
		refreshCursor = ''
	
		results = []
	
		while True:
			json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor)
			refreshCursor = json['min_position']
			try:
				tweets = PyQuery(json['items_html'])('div.js-stream-tweet')
			except Exception, e:
				print e
				# There was either an error in the request or nothing returned
				return results
			
			
			if len(tweets) == 0:
				break
			
			for tweetHTML in tweets:
				tweetPQ = PyQuery(tweetHTML)
				tweet = models.Tweet()

				# print tweetPQ("p.js-tweet-text").text()
				
				usernameTweet = tweetPQ("span.username.js-action-profile-name b").text()
				txt = re.sub(r"[^\x00-\x7F]", "", tweetPQ("p.js-tweet-text").text()) \
					.replace('# ', '#') \
					.replace('@ ', '@') \
					.replace('www. ', 'www.') \
					.replace('/ ', '/')

				retweets = int(tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
				favorites = int(tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
				dateSec = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time"))
				id = tweetPQ.attr("data-tweet-id")
				permalink = tweetPQ.attr("data-permalink-path")
				
				geo = ''
				geoSpan = tweetPQ('span.Tweet-geo')
				if len(geoSpan) > 0:
					geo = geoSpan.attr('title')
				
				tweet.id = id
				tweet.permalink = 'https://twitter.com' + permalink
				tweet.username = usernameTweet
				tweet.text = txt
				tweet.date = datetime.datetime.fromtimestamp(dateSec)
				tweet.retweets = retweets
				tweet.favorites = favorites
				tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text))
				tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text))
				tweet.geo = geo
				
				results.append(tweet)
				
				if tweetCriteria.maxTweets > 0 and len(results) >= tweetCriteria.maxTweets:
					return results
Example #7
 def _add(self, k, el):
     """Parse element, without considering children."""
     el = Pq(el)
     id, classes = el.attr('id'), el.attr('class')
     if id is not None:
         self.selectors['ids'].add(id)
     if classes is not None:
         for _class in classes.split(' '):
             self.selectors['classes'].add(_class.strip())
Example #8
    def _absoluteurl(x):
        q = PyQuery(this)
        href = q.attr('href')
        if href and (href.startswith('#') or href.startswith('http') or
            href.startswith('ftp')):
            return

        if href:
            q.attr('href','/' + href)
Example #9
    def __processInstagramTag(self, i, e):
        obj = PyQuery(e)
        url = obj('a').attr('href')
        shortCode = re.match("http://.*/p/(.*)/", url).group(1)
        imageUrl = self.getInstagramImageUrl(shortCode)

        newObj = PyQuery("<img />")
        newObj.attr('src', imageUrl)
        obj.replaceWith(newObj)
Example #10
 def replace_img(index, node):
     node = PyQuery(node)
     if not node.attr('src'):
         return node
     try:
         node.attr('src', urljoin_rfc(base_url, node.attr('src')))
     except:
         pass
     return node
Example #11
 def fixLinks(text):
     d = PyQuery(text, parser='html')
     for element in d('a'):
         e = PyQuery(element)
         href = e.attr('href')
         if not abs_url_regex.search(href):
             new_href = re.sub(r'/index\.html$', '/', href)
             e.attr('href', new_href)
             print "\t", href, "=>", new_href
     return d.__unicode__().encode('utf8')
Example #12
def scrape_category (url, title):
    category_slug = slugify (title)

    try:
        f = urlopen (url)
    except ValueError:
        if trace: print 'Retrying:', url
        url = 'http://eracks.com' + url.replace (' ','%20')
        if trace: print 'As:', url
        f = urlopen (url)

    doc = html5lib.parse(f, treebuilder='lxml', namespaceHTMLElements=False)  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html (doc)
    jQuery = PyQuery([doc])

    page_title =  jQuery ('title').text()

    if page_title.startswith ("eRacks Open Source Systems: "):
        page_title = page_title.partition ("eRacks Open Source Systems: ") [-1]

    if page_title.startswith ("eRacks "):
        page_title = page_title.partition ("eRacks ") [-1]

    content = jQuery ('td#content')
    links = content ('a')
    images = content ('img')

    for link in links:
        a = PyQuery (link)
        href = a.attr('href')
        skus = find_sku.findall (href)

        if skus:
            sku = skus [0]
            #a.attr ('href', '/%s/%s/' % (category_slug, slugify (sku)))
            a.attr ('href', '/products/%s/%s/' % (category_slug, sku))
        elif href.startswith ('/Legacy'):
            sku = slugify (href.split ('/') [-1])
            #a.attr ('href', '/%s/%s/' % (category_slug, slugify (sku)))
            a.attr ('href', '/products/%s/%s/' % (category_slug, sku))

        print 'link:', a.attr('href')

    for image in images:
        img = PyQuery (image)
        src = img.attr('src')
        newsrc = getimage (src, 'categories/' + category_slug)
        img.attr ('src', newsrc)
        print 'image:', newsrc

    description = content.html()
    if trace: print description

    if dbteeth:
        cat = Categories.objects.get (name=title)
        cat.comments = cat.comments + '\n\nScraped from Zope as of ' + str(datetime.date.today())
        cat.description = description
        cat.title = page_title
        cat.save()
        print '..saved.'
Example #13
def make_possible_feed(link_element):
    """ Visits each <link rel="alternate" href="http://..." /> element """
    link = PyQuery(link_element)
    title = 'Unknown'
    if link.attr('title'):
        title = link.attr('title')
    if link.attr('href'):        
        return {'feed_url': link.attr('href'), 'feed_title': title}
    else:
        log.info("Skipping malformed link element for feed, missing href")
        return False
Example #14
        def replace_link(index, node):
            node = PyQuery(node)
            if not node.attr('href'):
                return node

            link = node.attr('href').strip()
            if regex.match(link):
                try:
                    node.attr('href', urljoin_rfc(base_url, link))
                except:
                    pass
            return node
Example #15
 def _append_contents(struct, par):
     tag = struct['tag']
     _node = PyQuery('<%s />' % tag)
     if 'attributes' in struct:
         for key in struct['attributes'].keys():
             _node.attr(key, struct['attributes'][key])
     if 'text' in struct:
         _node.text(struct['text'])
     elif 'children' in struct:
         for (ugh, child) in struct['children'].iteritems():
             _append_contents(child, _node)
     par.append(_node)
Example #16
 def fixLinks(text, parser):
     d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
     for element in d('a'):
         e = PyQuery(element)
         href = e.attr('href')
         if not abs_url_regex.search(href):
             new_href = re.sub(r'/index\.html$', '/', href)
             new_href = re.sub(r'index.html', '/', new_href)
             e.attr('href', new_href)
             print "\t", href, "=>", new_href
     if parser == 'html':
         return d.html(method='html').encode('utf8')
     return d.__unicode__().encode('utf8')
Example #17
    def parseProductDetails(self, product_page_content, product_info):
        doc = PyQuery(product_page_content)
        product_info['reviews'] = doc('a.Rating_div > b').text()
#         product_info['facebook_likes'] = doc('span#u_0_2 > span.pluginCountTextDisconnected').text().strip()
#         product_info['tweet_share'] = doc('div#c > a#count').text().strip()
        product_info['likes'] = self.crawler.fetchSocialLikes(product_info['product_url'])
        imgNodeList = doc('div.other_Imgs > a > div.otheImg_li > img') #div.otheImg_li contains 2 img tags (duplicates)
        urls = []
        for imgNode in imgNodeList:
            imgNodeQ = PyQuery(imgNode)
            if imgNodeQ.attr('bigimg'):
                urls.append(imgNodeQ.attr('bigimg'))
        product_info['img_urls'] = ', '.join(urls)
Example #18
	def replace_image(self, target, image_name):
		elements = self.html_obj('*').filter('[dzid="' + target + '"]')  
		location = self.location + urllib.quote_plus(image_name)

		for e in elements:
			pq = PyQuery(e)
			if pq.eq(0).is_('img'):
				pq.attr('src', location)
			else:
				pq.css('background-image', 'url("' + location + '");')

			return location

		return None
Example #19
def process_place(link, marker):
#    print link
    response = urllib2.urlopen(link)
    page = PyQuery(response.read())

    post_body = page('.post_body')
    marker['title'] = post_body('h1').text().encode('utf-8')
    marker['icon'] = post_body('img:first').attr('src')
    addresses = post_body('td:eq(1)').html().split('<br/>')
    marker['address'] = PyQuery(addresses[0]).text().encode('utf-8')[len(ADDRESS):]
    process_position(PyQuery(addresses[1]).text().encode('utf-8'), marker)
    marker['objects'][0]['phone'] = post_body('td:eq(2)').text().encode('utf-8')[len(PHONES):]
    marker['objects'][0]['time'] = post_body('td:eq(3)').text().encode('utf-8')[len(WORK_TIME):]
    try:
        site = post_body('a.inv').attr('href')
        if URL in site:
            site = site[len(URL):]
        marker['site'] = site
    except Exception as ex:
        marker['site'] = ''
        print 'Error on site getting: %s' % link

    coment_details = page('.coment_details')

    beers = []
    for element in coment_details('.coment_content>div:eq(0)')('td:odd')('a'):
        beer = PyQuery(element)
        beer_name = beer.text().encode('utf-8')
        beer_link = beer.attr('href')
        beers.append({'name': beer_name, 'link': beer_link})
    marker['objects'][0]['beers'] = beers

    beer_countries = []
    for element in coment_details('.coment_content>div:eq(1)')('td:odd')('a'):
        beer_country = PyQuery(element)
        beer_country_name = beer_country.text().encode('utf-8')
        beer_country_link = beer_country.attr('href')
        beer_countries.append({'name': beer_country_name, 'link': beer_country_link})
    marker['objects'][0]['beerCountries'] = beer_countries

    beer_sorts = []
    for element in coment_details('.coment_content>p:eq(2)')('a'):
        beer_sort = PyQuery(element)
        beer_sort_name = beer_sort.text().encode('utf-8')
        beer_sort_link = beer_sort.attr('href')
        beer_sorts.append({'name': beer_sort_name, 'link': beer_sort_link})
    marker['objects'][0]['beerSorts'] = beer_sorts

    return marker
Example #20
 def fix_share_links(text,parser):
     td_regex = re.compile(target_domain + '|' )
     
     assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>"
     d = PyQuery(bytes(bytearray(text, encoding='utf-8')), parser=parser)
     for share_class in ['.icon-twitter','.icon-facebook','.icon-google-plus']:
         for element in d(share_class):
             e = PyQuery(element)
             href = e.attr('href')
             new_href = re.sub(domain, target_domain, href)
             e.attr('href', new_href)
             print "\t", href, "=>", new_href
     if parser == 'html':
         return d.html(method='html').encode('utf8')
     return d.__unicode__().encode('utf8')
Example #21
def get_old_fashion_comments(answer_url):
  aid = comment_list_id(answer_url)
  comment_box_link = 'http://www.zhihu.com/node/AnswerCommentBoxV2?params=%7B%22answer_id%22%3A%22{}%22%2C%22load_all%22%3Atrue%7D'.format(aid)  # | log
  # log('comments: ' + comment_box_link)
  r = old_client._session.get(comment_box_link)
  # print(str(r.content))
  doc = PyQuery(str(r.content, encoding='utf-8'))
  comments = []
  for div in doc.find('div.zm-item-comment'):
    div = PyQuery(div)
    cid = div.attr('data-id')
    vote_count = int(div.find('span.like-num').find('em').text())
    content = div.find('div.zm-comment-content').html()
    author_text = div.find('div.zm-comment-hd').text().replace('\n', ' ')
    if ' 回复 ' in author_text:
      author, reply_to = author_text.split(' 回复 ')
    else:
      author, reply_to = author_text, None

    comment = OldFashionComment(cid=cid,
                                vote_count=vote_count,
                                content=content,
                                author=OldFashionAuthor(author),
                                reply_to=OldFashionAuthor(reply_to) if reply_to else None)
    comments.append(comment)
  return comments
Example #22
 def parseProductsByCategory(self, category_page_content, category_info):
     self.num_idx = 0
     if self.current_category == category_info:
         self.page_idx = self.page_idx + 1
     else:
         self.current_category = category_info
         self.page_idx = 1
     doc = PyQuery(category_page_content)
     productNodeList = doc('div#productsContent1_goods > div')
     productList = []
     for productNode in productNodeList:
         productNodeQ = PyQuery(productNode)
         self.num_idx = self.num_idx + 1
         productInfo = self.newProduct()
         productInfo['sku_id'] = productNodeQ.attr('alt1')
         productInfo['name'] = productNodeQ('div.goods_mz > a').text().strip()
         productInfo['product_url'] = productNodeQ('div.goods_mz > a').attr('href')
         productInfo['img_url'] = productNodeQ('div.goods_aImg > a > img').attr('src')
         productInfo['price'] = productNodeQ('div#cat-product-list_USD > span.special_price').attr('price')
         productInfo['original_price'] = productNodeQ('div#cat-product-list_USD > span.shop_price').attr('price')
         productInfo['page_idx'] = str(self.page_idx)
         productInfo['num_idx'] = str(self.num_idx)
         productInfo.set_categories(category_info)
         productList.append(productInfo)
     return productList
Example #23
    def next(self):
        if self.i == self.categories_iter.length:
            raise StopIteration

        link = self.categories_iter[self.i]

        py_link = PyQuery(link)
        href = py_link.attr('href')
        html_class = href.split('/')[-1:][0]
        title = py_link.text()
        thumbnail_url = self.crawler.baseurl + PyQuery(link).find('img').attr('src')
        url = self.crawler.category_url % href

        category = Category(title, url, html_class, thumbnail_url)
        shows = Shows(self.crawler, url)

        tmp = list()
        tmp.append(shows)

        if title == 'Nyheter':
            news_url = self.crawler.news_url % href
            news_shows = Shows(self.crawler, news_url)
            tmp.append(news_shows)

        category.shows = itertools.chain(*tmp)

        self.i += 1
        return category
Example #24
    def get_subforums_infos(self, html):
        """
        Get information (description, number of topics and posts, ...) about
        the forums listed on a page
        """
        document = PyQuery(html)

        idpattern = re.compile(r"/([fc]\d+)-.*")

        for element in document("a.forumlink"):
            e = PyQuery(element)

            match = idpattern.fullmatch(clean_url(e.attr("href")))
            if not match:
                continue

            oldid = match.group(1)

            row = e.closest("tr")

            # Get forum status
            alt = row("td:nth-of-type(1) img").eq(0).attr("alt")
            self.forums[oldid].status = 1 if "verrouillé" in alt else 0

            # Get subforum description
            self.forums[oldid].description = row("td:nth-of-type(2) span").eq(1).html() or ""

            # TODO : Get subforum icon

            # Get subforum numbers of topics and posts
            self.forums[oldid].num_topics = int(row("td").eq(2).text())
            self.forums[oldid].num_posts = int(row("td").eq(3).text())
Example #25
def find_external_links(url):
    '''Looks for links to files in a web page and returns them as a set.
    '''
    links = set()
    try:
        response = get(url)
        if response.status_code != 200:
            app.logger.warning('Error while getting proxy info for: %s'
                               'Errors details: %s', url,
                               response.text)
        else:
            if response.content:
                p = PyQuery(response.content)
                for anchor in p("a"):
                    panchor = PyQuery(anchor)
                    href = panchor.attr("href")
                    if url_is_egg_file(href):
                        # href points to a filename
                        href = get_absolute_url(href, url)
                        links.add('<a href="%s">%s</a>' % (href, panchor.text()))
    except:
        # something happened when looking for external links: 
        #       timeout, HTML parser error, etc.
        # we must not fail and only log the error
        app.logger.exception('')
    return links
Example #26
def _main():
    # u'<title>':'<url>' sets
    mt_pages = {}
    wp_pages = {}

    # MT
    request = requests.get(MT_ARCHIVES_URL)
    document = PyQuery(request.content);
    archive_list = document('#pagebody .archive-list a')
    for archive in archive_list:
        archive = PyQuery(archive)
        mt_pages[archive.text()] = archive.attr('href')

    # WP
    fh = open(WP_ARCHIVES_FILE_PATH, 'r')
    document = PyQuery(fh.read(), parser='xml');
    items = document('channel item')
    for item in items:
        item = PyQuery(item)
        wp_pages[item('title').text()] = item('link').text()

    # Create .htaccess
    fh = open(BASE_DIR + '/tmp/.htaccess', 'a')
    for title, href in mt_pages.items():
        if title in wp_pages:
            fh.write('Redirect permanent %s %s\n' % (
                re.sub(r'http://kjirou\.sakura\.ne\.jp', '', href),
                wp_pages[title],
            ))
    fh.write('Redirect permanent /mt/index.xml http://blog.kjirou.net/feed\n')
    fh.write('Redirect permanent /mt/atom.xml http://blog.kjirou.net/feed\n')
    fh.write('Redirect permanent /mt/archives.html http://blog.kjirou.net\n')
    fh.write('Redirect permanent /mt http://blog.kjirou.net\n')
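Example #27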
def getPageLinkIfValid(element, currentPageNumber):
    pyElement = PyQuery(element)
    pageNumberText = pyElement.find('span').text()

    if pageNumberText.isdigit() and int(pageNumberText) > currentPageNumber:
        return 'https://www.youtube.com' + pyElement.attr('href')
    return None
Example #28
def parse(html):
    '''return a list of dictionaries describing the stories on the front page'''
    elements = []
    p = PyQuery(html)
    # 90s markup woohoo!
    anchors = p('.title:nth-child(3) a:nth-child(1)')
    for a in anchors:
        # have to re-wrap here, because PyQuery just exposes internal lxml objects upon getting iterated
        a = PyQuery(a)
        subtext = a.closest('tr').next().find('.subtext')
        if not subtext:
            # More link
            continue
        children = map(PyQuery, subtext.children())
        try:
            span, submitted, comments = children[0], children[1], children[-1]
        except IndexError:
            # filter out ads
            continue
        comments = comments.text().rpartition(' ')[0]
        comments = int(comments) if comments else 0
        url = a.attr('href')
        elements.append({
                      'pos': len(elements) + 1,
                    'title': a.text(),
                      'url': url,
                   'domain': urlparse(url).netloc.rpartition('www.')[2],
                 'comments': comments,
                'submitter': submitted.text(),
                   'points': int(span.text().split()[0]),
                       'id': int(span.attr('id').split('_', 1)[1]),
                      'ago': submitted[0].tail.split('ago')[0].strip(),
                })
    logging.warning('parsed %s elements', len(elements))
    return elements
Example #29
    def __extract(self, html):
        pq = PyQuery(html).find("main#main #mainArea table")

        selector_ = "thead tr:eq(0) th"
        date_order = [PyQuery(v).text().split('\n')[0] for v in PyQuery(pq).find(selector_)][3:]
        result = {d: {} for d in date_order}

        index = 0
        total = len(PyQuery(pq).find("tbody tr"))
        while index < total:
            td = PyQuery(pq).find("tbody tr:eq(%d) td:eq(0)" % index)

            room_type = td.text().split()[0]
            rowspan = int(td.attr('rowspan'))

            for i in xrange(index, index + rowspan):
                row = PyQuery(pq).find("tbody tr:eq(%d)" % i)

                # smoking or not
                smoking = PyQuery(row).find("td.alC.alM > img").attr("alt")

                room = "%s (%s)" % (room_type, smoking)

                if row.hasClass('clubCardCell'):
                    member_type = 'member'
                else:
                    member_type = 'guest'

                for i, v in enumerate(self.__extract_price_remain(row)):
                    if room not in result[date_order[i]]:
                        result[date_order[i]][room] = {}
                    result[date_order[i]][room][member_type] = v

            index += rowspan
        return result
Example #30
def parsePage(content):
    doc = PyQuery(content)
    productNodeList = doc("ul#sm-offer-list > li")
    productList = []
    for node in productNodeList:
        nodeQ = PyQuery(node)
        p = Product()
        p["product_name"] = nodeQ('a[offer-stat="title"]').text()
        url = nodeQ('a[offer-stat="title"]').attr("href")
        if url.find("http") == 0:
            p["product_url"] = url
        else:
            p["product_url"] = "http:" + url
        p["product_price"] = nodeQ("span.sm-offer-priceNum").text()
        p["img_url"] = nodeQ('a[offer-stat="pic"] > img').attr("src")
        p["sku_id"] = nodeQ.attr("t-offer-id")

        p["store_name"] = nodeQ("a.sm-offer-companyName").text()
        p["store_url"] = nodeQ("a.sm-offer-companyName").attr("href")
        print p["store_url"]
        p["tags"] = []
        aList = nodeQ("div.sm-offer-subicon > a")
        for a in aList:
            s = PyQuery(a).attr("class")
            if s:
                p["tags"].append(s)
        p["tags"] = ", ".join(p["tags"])
    #         parseProductPage(p, True)
    #         parseStorePage(p)
    #         productList.append(p)
    # return productList  # for testing
    return productList
Example #31
def extract_links(page):
	d = PyQuery(page)
	links = d('.newstitle>a')
	entries = []

	for link in links:
		link = PyQuery(link)
		# get title and link from html
		title = link.text().encode("utf-8")
		link = link.attr('href')
		entries.append((title, link))

	return entries
Example #32
        def fixLinks(text, parser):
            d = PyQuery(bytes(bytearray(text, encoding='utf-8')),
                        parser=parser)
            for element in d('a, link'):
                e = PyQuery(element)
                href = e.attr('href')

                if href is None:
                    continue

                new_href = re.sub(r'(rss/index\.html)|((?<!\.)rss/?)$',
                                  'rss/index.rss', href)
                if not abs_url_regex.search(href):
                    new_href = re.sub(r'/index\.html$', '/', new_href)

                if href != new_href:
                    e.attr('href', new_href)
                    print "\t", href, "=>", new_href

            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
Example #33
 def run(self):
     # Fetch the product detail URLs
     try:
         pq = helper.get(self.url, myHeaders=self.headers)
         for span in pq('span.product_title'):
             a = PyQuery(span).parents('a')
             self.q.put(a.attr('href'))
     except:
         helper.log('[ERROR] => ' + self.url, 'eastbay')
         self.error_page_url_queue.put({
             'url': self.url,
             'gender': self.gender
         })
Example #34
        def fix_share_links(text, parser):
            filetext = text.decode('utf8')
            td_regex = re.compile(target_domain + '|')

            assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>"
            d = PyQuery(bytes(bytearray(filetext, encoding='utf-8')),
                        parser=parser)
            for share_class in ['.share_links a']:
                print "share_class : ", share_class
                for element in d(share_class):
                    e = PyQuery(element)
                    print "element : ", e
                    href = e.attr('href')
                    print "href : ", href
                    print "domain : ", domain
                    print "target_domain : ", target_domain
                    new_href = re.sub(domain, target_domain, href)
                    e.attr('href', new_href)
                    print "\t", href, "=>", new_href
            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
Example #35
        def fix_meta_image_links(text, parser):
            filetext = text.decode('utf8')
            td_regex = re.compile(target_domain + '|')

            assert target_domain, "target domain must be specified --target_domain=<http://your-host-url>"
            d = PyQuery(bytes(bytearray(filetext, encoding='utf-8')),
                        parser=parser)
            for share_class in [
                    'meta[property="og:image"]', 'meta[name="twitter:image"]'
            ]:
                print "share_class : ", share_class
                for element in d(share_class):
                    e = PyQuery(element)
                    href = e.attr('content')
                    content_target_domain = target_domain.replace(
                        "/static", "")
                    new_href = re.sub(domain, content_target_domain, href)
                    e.attr('content', new_href)
                    print "\t fix image link", href, "=>", new_href
            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
Example #36
        def fixLinks(text, parser):
            #extremely lazy implementation - beware.
            text = text.replace('pngg', 'png')
            text = text.replace('pngng', 'png')
            text = text.replace('pngpng', 'png')

            text = text.replace('PNGG', 'PNG')
            text = text.replace('PNGNG', 'PNG')
            text = text.replace('PNGPNG', 'PNG')

            text = text.replace('jpgg', 'jpg')
            text = text.replace('jpgpg', 'jpg')
            text = text.replace('jpgjpg', 'jpg')

            text = text.replace('jpegg', 'jpeg')
            text = text.replace('jpegeg', 'jpeg')
            text = text.replace('jpegpeg', 'jpeg')

            text = text.replace('http://localhost:2368/',
                                'https://blog.lucaperic.com/')
            text = text.replace(
                'https://feedly.com/i/subscription/feed/https://blog.lucaperic.com/rss/',
                'https://feedly.com/i/subscription/feed/https://blog.lucaperic.com/rss/index.rss'
            )
            text = text.replace('/author/luca/rss/', '/rss/index.rss')
            d = PyQuery(bytes(bytearray(text, encoding='utf-8')),
                        parser=parser)
            for element in d('a'):
                e = PyQuery(element)
                href = e.attr('href')
                if not abs_url_regex.search(href):
                    new_href = re.sub(r'rss/index\.html$', 'rss/index.rss',
                                      href)
                    new_href = re.sub(r'/index\.html$', '/', new_href)
                    e.attr('href', new_href)
                    print "\t", href, "=>", new_href
            if parser == 'html':
                return d.html(method='html').encode('utf8')
            return d.__unicode__().encode('utf8')
Example #37
    def _process(self, data, pid):
        if '/product/' not in data._resp.effective_url:
            return
        props = dict([
            PQ(i).text().split(u':') for i in data('.detail-tab-pro-info li')
        ])
        print props

        try:
            categories = [{
                re.findall('\d+',
                           PQ(i).attr('href'))[0]:
                PQ(i).text()
            } for i in data('.breadcrumbs a')[1:]]
        except:
            categories = []

        ret = {
            'title':
            data('#sec_productTitle').text(),
            'category_id':
            data('#hid_categoryId').attr('value'),
            'category_tree':
            categories,
            'keywords':
            data('meta[name=Keywords]').attr('content').split(u','),
            'property':
            props,
            'image':
            'http://www.carrefour.cn%s' % data('li.select img').attr('bimg')
        }

        # Brand
        if u'品牌' in props:
            brand = props[u'品牌']

            left_columns = data('.middle-left01')
            target = None
            for column in left_columns:
                if u'相关品牌' == PQ(column)('p.left-title').text():
                    target = PQ(column)

            if target:
                foo = PQ(target('a')[0])
                url = foo.attr('href')
                brand_id = re.findall('b=(\d+)', url)[0]
                brand_text = foo.text()
                if brand_text == brand:
                    ret['brand'] = {'brand_name': brand, 'brand_id': brand_id}

        self.save(ret)
Example #38
        def fixLinks(text, parser):
            d = PyQuery(bytes(bytearray(text, encoding='utf-8')),
                        parser=parser)
            for element in d('a, link'):
                e = PyQuery(element)
                href = e.attr('href')

                if href is None:
                    continue
                if (not abs_url_regex.search(href)) or ('/rss/' in href):
                    new_href = re.sub(r'rss/$', 'feed.rss', href)
                    new_href = re.sub(r'index\.html$', '', new_href)
                    new_href = re.sub(r'index\.html\#$', '', new_href)
                    e.attr('href', new_href)
                    print "\t", href, "=>", new_href
            if parser == 'html':
                return "<!DOCTYPE html>\n<html>" + d.html(
                    method='html').encode('utf8') + "</html>"
            elif parser == 'xml':
                return "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + d.__unicode__(
                ).encode('utf8')
            return "<!DOCTYPE html>\n<html>" + d.__unicode__().encode(
                'utf8') + "</html>"
Example #39
    def test_css_classes(self):
        viewlet = self.get_viewlet(self.portal)

        registry = getUtility(IRegistry)
        proxy = registry.forInterface(IFooterSettings)

        proxy.columns_count = 2

        doc = PyQuery(viewlet.render())
        footer = doc('#ftw-footer')
        child = footer.children()[0]
        child = PyQuery(child)

        if IS_PLONE_5:
            self.assertEqual(child.attr('class'), 'col-lg-6')
        else:
            self.assertEqual(child.attr('class'),
                             'column cell position-0 width-8')

            child = footer.children()[1]
            child = PyQuery(child)
            self.assertEqual(child.attr('class'),
                             'column cell position-8 width-8')
Example #40
 def parseNextPageUrl(self, category_page_content):
     doc = PyQuery(category_page_content)
     nodeList = doc('p.listspan').children('span > a') #structure observed while debugging
     if not nodeList:
         nodeList = doc('p.listspan').children('a') #pages saved by fw come in this format
     url = None
     for node in nodeList:
         nodeQ = PyQuery(node)
         if nodeQ.text().strip() == '>':
             url = nodeQ.attr('href')
             break
     if url:
         print self.merchant.filteruri(url)
         return self.merchant.filteruri(url)
Example #41
    def _lead_art_element(self):
        art_elements = self.element.find('layout').find('storytext').children()

        if not len(art_elements):
            return None

        el = PyQuery(art_elements[0])

        if el[0].tag != 'image':
            return None

        image_id = el.attr('refId')

        return PyQuery(self.element.children('image[id="%s"]' % image_id))
Example #42
def build_dict_from_sane_json(elem: PyQuery, already_wrapped=False) -> dict:
    # Find if has children
    elem = PyQuery(elem)
    children = list(elem.contents())
    has_children = len(elem.children()) > 0

    contents = []
    if has_children:
        # Fix unwrapped children
        if not already_wrapped:
            children = fix_unwrapped_text(elem).contents()

        for child in children:
            child_dict = build_dict_from_sane_json(child, already_wrapped=True)
            if child_dict:
                contents.append(child_dict)
    else:
        contents = elem.html()

    extra = {}

    # Only tables need the HTML (to use later for extraction of relevant data)
    if elem.is_("table"):
        extra = {'original_html': str(elem)}

    if 'src' in elem[0].attrib:
        extra['src'] = elem.attr('src')
    if 'href' in elem[0].attrib:
        extra['href'] = elem.attr('href')

    return {
        'type': list(elem)[0].tag,
        'attrs': [],
        'layout': {},
        'contents': contents,
        'extra': extra
    }
Example #43
def scrape_category(url, title):
    category_slug = slugify(title)

    try:
        f = urlopen(url)
    except ValueError:
        if trace: print 'Retrying:', url
        url = 'http://eracks.com' + url.replace(' ', '%20')
        if trace: print 'As:', url
        f = urlopen(url)

    doc = html5lib.parse(
        f, treebuilder='lxml'
    )  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    content = jQuery('td#content')
    #description = jQuery ('td#content').html()
    links = content('a')
    images = content('img')

    for link in links:
        a = PyQuery(link)
        href = a.attr('href')
        skus = find_sku.findall(href)

        if skus:
            sku = skus[0]
            a.attr('href', '/%s/%s/' % (category_slug, slugify(sku)))
        elif href.startswith('Legacy'):
            sku = slugify(href.split('/')[-1])

        print 'link:', a.attr('href')

    for image in images:
        img = PyQuery(image)
        src = img.attr('src')
        newsrc = getimage(src, 'categories/' + category_slug)
        img.attr('src', newsrc)
        print 'image:', newsrc

    description = content.html()
    if trace: print description

    if dbteeth:
        cat = Categories.objects.get(name=title)
        cat.comments = cat.comments + '\n\nScraped from Zope as of ' + str(
            datetime.date.today())
        cat.description = description
        cat.save()
        print '..saved.'
Example #44
def get_one_page_album(account_id, page_count):
    # http://bcy.net/u/50220/post/cos?&p=1
    album_pagination_url = "http://bcy.net/u/%s/post/cos" % account_id
    query_data = {"p": page_count}
    album_pagination_response = net.http_request(album_pagination_url, method="GET", fields=query_data)
    result = {
        "album_info_list": [],  # 全部作品信息
        "coser_id": None,  # coser id
        "is_over": False,  # 是不是最后一页作品
    }
    if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(album_pagination_response.status))
    if page_count == 1 and album_pagination_response.data.find("<h2>用户不存在</h2>") >= 0:
        raise crawler.CrawlerException("账号不存在")
    # Extract the coser id
    coser_id_find = re.findall('<a href="/coser/detail/([\d]+)/\$\{post.rp_id\}', album_pagination_response.data)
    if len(coser_id_find) != 1:
        raise crawler.CrawlerException("页面截取coser id失败\n%s" % album_pagination_response.data)
    if not crawler.is_integer(coser_id_find[0]):
        raise crawler.CrawlerException("页面截取coser id类型不正确\n%s" % album_pagination_response.data)
    result["coser_id"] = coser_id_find[0]
    # Extract the info for each album
    album_list_selector = PQ(album_pagination_response.data.decode("UTF-8")).find("ul.l-grid__inner li.l-grid__item")
    for album_index in range(0, album_list_selector.size()):
        album_selector = album_list_selector.eq(album_index)
        result_album_info = {
            "album_id": None,  # 作品id
            "album_title": None,  # 作品标题
        }
        # Extract the album id
        album_url = album_selector.find(".postWorkCard__img a.postWorkCard__link").attr("href")
        if not album_url:
            raise crawler.CrawlerException("作品信息截取作品地址失败\n%s" % album_selector.html().encode("UTF-8"))
        album_id = str(album_url).split("/")[-1]
        if not crawler.is_integer(album_id):
            raise crawler.CrawlerException("作品地址 %s 截取作品id失败\n%s" % (album_url, album_selector.html().encode("UTF-8")))
        result_album_info['album_id'] = album_id
        # Extract the album title
        album_title = album_selector.find(".postWorkCard__img img").attr("alt")
        result_album_info["album_title"] = str(album_title.encode("UTF-8"))
        result["album_info_list"].append(result_album_info)
    # Determine whether this is the last page
    last_pagination_selector = PQ(album_pagination_response.data).find("#js-showPagination ul.pager li:last a")
    if last_pagination_selector.size() == 1:
        max_page_count = int(last_pagination_selector.attr("href").strip().split("&p=")[-1])
        result["is_over"] = page_count >= max_page_count
    else:
        result["is_over"] = True
    return result
Example #45
    def getHashtagsAndMentions(tweetPQ):
        """Given a PyQuery instance of a tweet (tweetPQ) getHashtagsAndMentions
        gets the hashtags and mentions from a tweet using the tweet's
        anchor tags rather than parsing a tweet's text for words beginning
        with '#'s and '@'s. All hashtags are wrapped in anchor tags with an href
        attribute of the form '/hashtag/{hashtag name}?...' and all mentions are
        wrapped in anchor tags with an href attribute of the form '/{mentioned username}'.
        """
        anchorTags = tweetPQ("p.js-tweet-text")("a")
        hashtags = []
        mentions = []
        isFirstHT = True
        firstHT = ''
        for tag in anchorTags:
            tagPQ = PyQuery(tag)
            url = tagPQ.attr("href")
            if url is None or len(url) == 0 or url[0] != "/":
                continue

            # Mention anchor tags have a data-mentioned-user-id
            # attribute.
            if not tagPQ.attr("data-mentioned-user-id") is None:
                mentions.append("@" + url[1:])
                continue

            hashtagMatch = re.match('/hashtag/\w+', url)
            if hashtagMatch is None:
                continue

            hashtag = hashtagMatch.group().replace("/hashtag/", "#")
            if isFirstHT:
                firstHT = hashtag
                isFirstHT = False
            hashtags.append(hashtag)

        return (" ".join(hashtags), " ".join(mentions), firstHT)
Example #46
 def parseProductDetails(self, product_page_content, product_info):
     doc = PyQuery(product_page_content)
     product_info['price'] = re.sub(
         '\s', '',
         doc('span[class="product_price emphasis "]').text())
     #the selector below only returns the small thumbnails of the description images
     #imgNodeList = doc('div[class="js-carousel-content car_content"] > div > img')
     imgNodeList = doc('div.js-slider-container > div > a > img')
     results = []
     for node in imgNodeList:
         nodeQ = PyQuery(node)
         url = nodeQ.attr('src')
         if url:
             results.append(url)
     product_info['img_urls'] = ', '.join(results)
Example #47
def tweetPaser(tweets_html):
  tweetslist = []
  if tweets_html.strip() != '':
    scraped_tweets = PyQuery(tweets_html)
    scraped_tweets.remove('div.withheld-tweet')
    tweets = scraped_tweets('div.js-stream-tweet')
    if len(tweets) != 0:
      for tweet_html in tweets:
        t = {}
        tweetPQ = PyQuery(tweet_html)
        t['user'] = tweetPQ("span:first.username.u-dir b").text()
        txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text())
        txt = txt.replace('# ', '#')
        txt = txt.replace('@ ', '@')
        t['tweet'] = txt
        t['id'] = tweetPQ.attr("data-tweet-id")
        t['retweets'] = int(tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
        t['favorites'] = int(tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", ""))
        t['link'] = 'https://twitter.com' + tweetPQ.attr("data-permalink-path")
        t['mentions'] = re.compile('(@\\w+)').findall(t['tweet'])
        t['hashtags'] = re.compile('(#\\w+)').findall(t['tweet'])
        t['timestamp'] = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time"))
        tweetslist.append(t)
  return tweetslist
Example #48
 def test_calendar_tag_rendering_en_and_de(self, timezone_mock):
     timezone_mock.now.return_value = tz_datetime(2015, 1, 10, 12)
     page_with_apphook = self.create_base_pages(multilang=True)
     # make use of default tests self.app_config namespace, instead of
     # hard coding it
     t = self.get_template(self.app_config.namespace)
     context = self.get_context(page_with_apphook)
     with override('en'):
         html = t.render(SekizaiContext(context))
         table = PyQuery(html)('table.js-calendar-table')
         page_with_apphook.get_absolute_url()
     links = table.find('td.events, td.multiday-events').find('a')
     # test if tag rendered important elements
     self.assertEqual('1', table.attr('data-month-numeric'), )
     self.assertEqual('2015', table.attr('data-year'))
     self.assertEqual('10', table.find('td.today').text())
     # should include DE only event as well
     expected_days = (13, 14, 15, 16, 17, 22, 23, 24, 25, 26, 27)
     for position, day in enumerate(expected_days):
         # page url may vary depending on fallback settings, check only
         # against the date.
         event_url = '/2015/1/{0}/'.format(day)
         rendered_url = links[position].attrib['href']
         self.assertGreater(rendered_url.find(event_url), -1)
Example #49
 def parseProductsByCategory(self, category_page_content, category_info):
     doc = PyQuery(category_page_content)
     productNodeList = doc('ul.ws-product-list:first > li.hproduct')
     productList = []
     for node in productNodeList:
         nodeQ = PyQuery(node)
         productInfo = self.newProduct()
         productInfo['name'] = nodeQ('h4.ws-product-title').text()
         productInfo['sku_id'] = nodeQ.attr('data-context-sku')
         productInfo['product_url'] = nodeQ('h4').parent('a').attr('href')
         productInfo['img_url'] = nodeQ('div.kor-product-photo > a > img').attr('src')
         productInfo['price'] = nodeQ('div.kor-product-sale-price > span.kor-product-sale-price-value').text()
         productInfo['likes'] = self.crawler.fetchSocialLikes(productInfo['product_url'])
         productInfo.set_categories(category_info)
         productList.append(productInfo)
     return productList
Example #50
    def __processImageTag(self, i, e):
        obj = PyQuery(e)
        style = obj.attr('style')

        if style != None and style.find('display: none') != -1:
            obj.remove()
            return

        newObj = PyQuery("<img />")
        newObj.attr('src', obj.attr('rel:bf_image_src'))
        newObj.attr('style', obj.attr('style'))
        newObj.width(obj.width())
        newObj.height(obj.height())
        obj.replaceWith(newObj)
Example #51
def getSonglist(playlistId):
    f = opener.open(urllib.request.Request(url_base.format(playlistId)))
    html = f.read().decode('utf-8')
    doc = PyQuery(html)
    songs = doc('#song-list-pre-cache ul li a')
    song_arr = []
    for song in songs:
        el = PyQuery(song)
        parser = urlparse(el.attr('href'))
        id = parse_qs(parser.query).get('id')[0]
        song_arr.append({
            'id': id,
            'url': url_download.format(id),
            'title': el.text()
        })
    return song_arr
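Example #52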
    def get_tweet_ids(self, term, get_info):
        """
        Given a search term or search phrase, find all the IDs of the result
        tweets.
        """
        tweet_ids = []
        full_tweets = []

        refreshCursor = ''

        while True:

            response = self.getJsonReponse(term, refreshCursor)
            refreshCursor = response['min_position']

            try:
                tweets = PyQuery(response['items_html'])('div.js-stream-tweet')
            except Exception:
                break

            # Exit when no more tweets loaded
            if len(tweets) == 0:
                break

            for tweetHTML in tweets:
                tweetPQ = PyQuery(tweetHTML)
                tweet_id = tweetPQ.attr("data-tweet-id")
                tweet_ids.append(tweet_id)
                if get_info:
                    tweet_info = dict()
                    tweet_info['id'] = tweet_id
                    tweet_info['username'] = tweetPQ(
                        "span.username.js-action-profile-name b").text()
                    tweet_info['text'] = self.text_format(
                        re.sub(r"[^\x00-\x7F]", "",
                               tweetPQ("p.js-tweet-text").text()).replace(
                                   '# ', '#').replace('@ ', '@'))
                    tweet_info['date'] = int(
                        tweetPQ("small.time span.js-short-timestamp").attr(
                            "data-time"))
                    tweet_info['date'] = datetime.datetime.fromtimestamp(
                        tweet_info['date'])
                    full_tweets.append(tweet_info)
            if len(tweet_ids) > 700:
                break

        return tweet_ids, full_tweets
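Example #53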
def scrape_top(fragment_str):
    global v1
    v1.append(fragment_str)
    # parse the content of the '.athing' class
    # getting the fields id,title,uri,rank
    s = PyQuery(fragment_str)
    post_id = s.attr('id')
    title = s(".storylink").text()[:256]
    title = "None" if not title else title
    uri = sanitize_url(URL, s(".storylink").attr("href"))
    try:
        rank = int(s(".rank").text()[:-1])
    except:
        rank = 0
    global v
    v.append([post_id, title, uri, rank])
    return post_id, title, uri, rank
Example #54
 def fetch_note_comments(self, url, dom, douban_id):
     comments = []
     strip_username = lambda el: re.findall(
         r'^http(?:s?)://www\.douban\.com/people/(.+)/$', el.attr('href')
     ).pop(0)
     while True:
         comment_items = dom('#comments .comment-item')
         for comment_item in comment_items:
             item_div = PyQuery(comment_item)
             quote_user_link = item_div('.content>.reply-quote>.pubdate>a')
             if quote_user_link:
                 quote_user_name = quote_user_link.text()
                 quote_user_id = strip_username(quote_user_link)
                 quote_text = item_div('.content>.reply-quote>.all').text()
                 blockquote = '{0}({1}):{2}'.format(quote_user_name,
                                                    quote_user_id,
                                                    quote_text)
             else:
                 blockquote = None
             comments.append({
                 'douban_id':
                 item_div.attr('data-cid'),
                 'content':
                 item_div.outer_html(),
                 'target_type':
                 'note',
                 'target_douban_id':
                 douban_id,
                 'user':
                 self.fetch_user(strip_username(item_div('.pic>a'))),
                 'text':
                 item_div('.content>p').text(),
                 'created':
                 item_div('.content>.author>span').text(),
                 'quote':
                 blockquote,
             })
         next_page = dom('#comments>.paginator>.next>a')
         if next_page:
             url = next_page.attr('href')
         else:
             break
         response = self.fetch_url_content(url)
         dom = PyQuery(response.text)
     return comments
Example #55
def parsecsdn(i):
    url = "https://blog.csdn.net/weiqifa0/article/list/" + str(i)
    result = requests.post(url)

    content = result.text
    # Convert this content into a PyQuery object
    datas = PyQuery(content)
    items = datas(".article-list a")

    for item in items:
        obj = {}
        # print(item)
        lineObj = PyQuery(item)
        # print(lineObj)
        title = lineObj.text()
        link = lineObj.attr("href")

        print(title, link)
Example #56
def generate_sitemap(sitemap_file_name, html_text, print_html):
    pq = PyQuery(html_text)
    sitemap_text = '<?xml version="1.0" encoding="utf-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
    for x in pq.find('a'):
        x = PyQuery(x)
        site_url = x.attr('href')
        sitemap_text += f'<url><loc>{site_url}</loc></url>\n'
    sitemap_text += '</urlset>'

    if print_html:
        print(f'--- {sitemap_file_name} ---')
        print(sitemap_text)

    file = open(sitemap_file_name, 'w', encoding='UTF-8')
    file.write(sitemap_text)
    file.close()

    return
Example #57
    def next(self):
        if self.i == self.categories_iter.length:
            raise StopIteration

        link = self.categories_iter[self.i]

        py_link = PyQuery(link)
        href = py_link.attr('href')
        html_class = href.split('/')[-1:][0]
        title = py_link.text()
        # thumbnail_url = self.crawler.baseurl + PyQuery(link).find('img').attr('src')
        url = href

        show = Show(title, url, html_class)
        show.clips = Episodes(self.crawler, url)

        self.i += 1
        return show
Example #58
 def parseCategories(self, homepage_content):
     categoryList = []
     doc = PyQuery(homepage_content)
     #only grab the expanded panel of the first top-level category, Camp & Hike
     node = doc(
         'div[class="mega-menu-container container js-mega-menus-target"] > div > div > section'
     ).eq(0)
     nodeList = PyQuery(node).find(
         "div.row > div.col-xs-2 > a")  #a > h4是品类名  Accessories没有<a>忽略
     for node in nodeList:
         nodeQ = PyQuery(node)
         categoryInfo = self.newCategory()
         categoryInfo.name = nodeQ.text()
         categoryInfo.url = nodeQ.attr('href')
         self.process_url(categoryInfo)
         if categoryInfo.name and categoryInfo.url:
             categoryInfo.parent_categories = ['Camp & Hike']
             categoryList.append(categoryInfo.formalize())
     return categoryList
Example #59
 def parseCategories(self, homepage_content):
     doc = PyQuery(homepage_content)
     nodeList = doc('ul#header-navigation-menu > li.menu-container')
     categoryList = []
      #drop the first three and last two miscellaneous menu entries
     validNodeList = nodeList[3:10]
     for node in validNodeList:
         nodeQ = PyQuery(node)
         level1Name = nodeQ.children('a').text()
         level2NodeList = nodeQ.children(
             'div > ul:first > li.indent-child > span')
         for level2Node in level2NodeList:
             level2NodeQ = PyQuery(level2Node)
             categoryInfo = self.newCategory()
             categoryInfo.name = level2NodeQ.text()
             categoryInfo.url = level2NodeQ.attr('href')
             categoryInfo.parent_categories = [level1Name]
             categoryList.append(categoryInfo.formalize())
     return categoryList
Example #60
def get_doc_hyperlinking(doc: PyQuery,
                         base_url: str) -> List[HyperLinkingInPage]:
    """
    获取网页的超链接列表

    Parameters
    ----------
    doc : PyQuery
        整个文档的 pyquery 对象

    base_url : str
        网页的地址信息,用于将相对地址转换成绝对地址
    """
    rlt = []
    doc.make_links_absolute(base_url=base_url)
    all_href = doc("a")
    body_text = get_pq_object_inner_text(doc)
    ls_href_to_query = []
    for link in all_href:
        link_obj = PyQuery(link)
        url = str(link_obj.attr("href"))
        if not url.startswith("http"):
            continue
        ls_href_to_query.append(link_obj)
    ls_start_pos = batch_get_dom_node_start_pos(doc, ls_href_to_query)
    for ui_ele, start_pos in zip(ls_href_to_query, ls_start_pos):
        if start_pos < 0:
            logger.error(f"Can't find ui object '{ui_ele}'")
        text = get_pq_object_inner_text(ui_ele)
        if text != body_text[start_pos:start_pos + len(text)]:
            logger.error(
                f"inner text is not equal with doc body '{text}' ?= '{body_text[start_pos:start_pos+len(text)]}'"
            )
        url = str(ui_ele.attr("href"))
        hyperlinking_in_page = HyperLinkingInPage(start_pos=start_pos,
                                                  end_pos=start_pos +
                                                  len(text),
                                                  text=text,
                                                  url=url,
                                                  query_obj=ui_ele)
        rlt.append(hyperlinking_in_page)
    return rlt