    def get_sbl_items(self):
        try:
            page = self.s.get('http://spamhaus.org/sbl/latest/', headers=self.h)
            document = leaf.parse(page.text)
            items = document.xpath('body/div/table[2]/tr[3]/td[2]/table')
            if not items:
                c = document.xpath('body/div/table[2]/tr/td[2]/h1')[0]
                if c:
                    raise Exception(c)
                raise Exception('table items not found')
            for i in items:
                try:
                    self.get_sbl_item(i)
                except Exception as e:
                    sbl_parser_log(e)
                # break
        except Exception as e:
            sbl_parser_log(e)
            reload_tor()
            self.get_sbl_items()

        for item in self.sbl_items:
            obj = SblItem.objects.get_or_create(ref_name=item['ref_name'])[0]
            for key in item:
                setattr(obj, key, item[key])
            obj.save()

        sbl_parser_log('added %s sbl items' % len(self.sbl_items))
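
The handler above retries by calling get_sbl_items() again from inside its own except clause, so a persistently blocked exit node recurses without bound. A minimal sketch of a bounded-retry variant, assuming the same reload_tor() and sbl_parser_log() helpers from this module (the max_retries parameter is illustrative):

    def get_sbl_items_bounded(self, max_retries=3):
        # Hypothetical variant of get_sbl_items(): retry a fixed number of times
        # instead of recursing from the exception handler.
        for attempt in range(max_retries):
            try:
                page = self.s.get('http://spamhaus.org/sbl/latest/', headers=self.h)
                document = leaf.parse(page.text)
                items = document.xpath('body/div/table[2]/tr[3]/td[2]/table')
                if not items:
                    raise Exception('table items not found')
                for i in items:
                    try:
                        self.get_sbl_item(i)
                    except Exception as e:
                        sbl_parser_log(e)
                break  # parsed successfully, stop retrying
            except Exception as e:
                sbl_parser_log('attempt %s failed: %s' % (attempt + 1, e))
                reload_tor()  # rotate the Tor circuit before the next attempt
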
Example #2
File: nyc.py Project: maire/john-pybot
  def run(self):

    self.session.get(thread_url)

    while True:
      try:
        sleep(10)
        poll = self.session.get(thread_url)

        if '#lastpost' in poll.url:
          continue

        if self.dead >= dead_threshold - 1:
          continue
        else:
          self.dead += 1

        new_index = int(get_new_post_number.findall(poll.url)[0]) - 1

        page = leaf.parse(poll.text)

        self.callback(self.channel, 'Forums {0} {1} made {2} new post in the NYC thread! {3}'.format(
          choice(user_adjectives) if random() < chance_of_using_a_funny_adjective else 'poster',
          page('.author')[new_index].text,
          choice(post_adjectives) if random() < chance_of_using_a_funny_adjective else 'a',
          poll.url
        ))

      except Exception as e:
        print e
Example #3
	def get_elements_by_css(url, css_selector):
		f = urllib.urlopen(url)
		html = f.read()
		if html:
			doc = leaf.parse(html)
			elements = doc(css_selector)
			return elements
		return []
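
A quick usage sketch for the helper above, assuming it is callable as a plain function or staticmethod (the URL and selector are placeholders):

links = get_elements_by_css('http://example.com/', 'div#menu a')
for link in links:
    print link.text, link.href  # leaf exposes tag attributes (href, id, ...) as properties
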
Example #4
def test_bbcode():
    document = leaf.parse(sample)
    bbcode = document.parse(bbcode_formatter, 'http://example.com/')
    bbcode = leaf.strip_spaces(bbcode)
    bbcode = leaf.strip_symbols(bbcode)
    bbcode = leaf.strip_linebreaks(bbcode)

    assert bbcode == leaf.to_unicode(sample_result), "Sample bbcode formatter"
Example #5
def test_selectors():
    document = leaf.parse(sample)
    links = document('div#menu a')
    assert links[-1].text == 'Contacts', "Access by id and element type"
    links2 = document('div#menu li a')
    assert links2[-1].text == ' Test link 5', "Access by id and element type 2"
    assert len(document('a')) == 9
    assert document('li.active_link a')[0].text == ' Test link 5', "Access by class"
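
For orientation, a minimal illustrative fragment of the kind of markup these selectors expect; this is a sketch, not the actual sample fixture used by the tests:

import leaf

html = '''
<div id="menu">
  <ul>
    <li id="first_link" onclick="alert('test')"><a href="#1"> Test link 1</a></li>
    <li class="active_link"><a href="#5"> Test link 5</a></li>
  </ul>
</div>
'''
document = leaf.parse(html)
assert document('div#menu li a')[-1].text == ' Test link 5'
assert document('li.active_link a')[0].text == ' Test link 5'
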
Example #6
def test_selectors():
    document = leaf.parse(sample)
    links = document('div#menu a')
    assert links[-1].text == 'Contacts', "Access by id and element type"
    links2 = document('div#menu li a')
    assert links2[-1].text == ' Test link 5', "Access by id and element type 2"
    assert len(document('a')) == 9
    assert document(
        'li.active_link a')[0].text == ' Test link 5', "Access by class"
Example #7
def getSteamDeals():
  deals = []
  page = requests.get('http://store.steampowered.com/search/?sort_by=Metascore&sort_order=DESC&specials=1')
  temp = leaf.parse(page.text)
  links = temp('.search_result_row.even') + temp('.search_result_row.odd')
  for link in links:
    deals.append(Deal(link('.col.search_name.ellipsis')[0]('h4')[0].text,
                       link('.col.search_price')[0]('br')[0].tail,
                       link('.col.search_price')[0]('strike')[0].text))
  return deals
Example #8
File: leaf.py Project: 31H0B1eV/grab
    def leaf(self):
        """
        Return body parsed by leaf
        """
        import leaf

        if not self._leaf:
            self._leaf = leaf.parse(self.response.unicode_body(),
                    encoding=self.charset)
        return self._leaf
Example #9
    def leaf(self):
        """
        Return body parsed by leaf
        """
        import leaf

        if not self._leaf:
            self._leaf = leaf.parse(self.response.unicode_body(),
                                    encoding=self.charset)
        return self._leaf
Example #10
def GetImageUrl(page):
    doc = leaf.parse(page)
    data = doc(".issue img")
    logging.info(data)

    logging.info("Found img links" + str(len(data)))

    if len(data) == 1:
        img_block = data[0]
        if hasattr(img_block, "src"):
            logging.info(data[0].src)
            return data[0].src
Example #11
def test_attribs():
    document = leaf.parse(sample)
    first_link = document.get('div#menu li')
    assert document.get('div#menu a', 4).text == ' Test link 5', "Get element by index"
    assert document.get('div#menu a', 99, default='blah') == 'blah', "Custom default value for get"
    assert bool(document.get('div#menu li')) == True, "Node bool"
    assert bool(document.get('div#menu_test li')) == False, "Node bool"
    assert isinstance(first_link, leaf.Parser), "Get first element"
    assert first_link.id == 'first_link', "Id attrib"
    assert first_link.onclick == "alert('test')", "Onclick attrib"
    first_link.onclick = 'blah()'
    assert first_link.onclick == 'blah()', "Attribute modification"
Example #12
    def on_new_comment(self, io, data, *ex_prms):
        try:
            username = data['lastpostuser']
            user_id = data['lastpostuserid']
            if user_id == self.user.user_id:
                return
            '''
			r = self.user.sess.get('https://www.fxp.co.il/showthread.php', params={
				't': data['id'],
				'page': data['pages'],
				'web_fast_fxp': 1
			})
			# comment = document.xpath(f'//div[@class="user-pic-holder user_pic_{user_id}"]/../../../../..')[-1]
			'''

            # new way
            r = self.user.sess.get('https://www.fxp.co.il/showthread.php',
                                   params={
                                       't': data['id'],
                                       'pp': 1,
                                       'page': data['posts'] + 1,
                                       'web_fast_fxp': 1
                                   })

            forum_id = int(
                re.search(r'FORUM_ID_FXP\s*=\s*"(.+?)"', r.text).group(1))

            document = leaf.parse(r.text)

            comment = document.xpath(f'//ol[@id="posts"]//li')[0]

            comment_content = comment.xpath(
                './/blockquote[@class="postcontent restore "]')[0]
            comment_id = int(comment.id.replace('post_', ''))
            parsed_content = comment_content.parse(
                self.bbcode_formatter).strip()

            quoted_me = self.is_quoted_me(comment_content)

            self.events.emit(
                FxpComment,
                FxpComment(username=username,
                           user_id=user_id,
                           id=int(comment_id),
                           content=parsed_content,
                           thread_id=int(data['id']),
                           thread_title=data['title'],
                           posts_number=int(data['posts']),
                           forum_id=forum_id,
                           quoted_me=quoted_me))
        except Exception as e:
            # raise
            pass
Example #13
File: test_parser.py Project: jean/leaf
def test_attribs():
    document = leaf.parse(sample)
    first_link = document.get('div#menu li')
    assert document.get('div#menu a', 4).text == ' Test link 5', "Get element by index"
    assert document.get('div#menu a', 4).__unicode__() == ' Test link 5', "Unicode test on python2"
    assert document.get('div#menu a', 99, default='blah') == 'blah', "Custom default value for get"
    assert document.find('body/div[4]').tag == 'div', "ETree find"
    assert bool(document.get('div#menu li')), "Node bool"
    assert bool(document.get('div#menu_test li')) is False, "Node bool"
    assert isinstance(first_link, leaf.Parser), "Get first element"
    assert first_link.id == 'first_link', "Id attrib"
    assert first_link.onclick == "alert('test')", "Onclick attrib"
    first_link.onclick = 'blah()'
    assert first_link.onclick == 'blah()', "Attribute modification"
Example #14
    def processObject(self, o):
        if not Post.objects.filter(body__iexact=unicode_fix(o.post_text)).exists():
            author = self._getAuthor(o, o.poster)
            created = datetime.datetime.fromtimestamp(o.post_time)
            try:
                topic = Topic.objects.get(name=unicode_fix(o.topic.topic_title))
            except Topic.DoesNotExist:
                topic = Topic.objects.get(name__icontains=unicode_fix(o.topic.topic_title))

            text = self._process_text(unicode_fix(o.post_text))
            doc = leaf.parse(text)
            text = doc.parse(bbcode_formatter, self._originalAddress)

            Post(topic=topic, body=text, user_ip=o.poster_ip, user=author, created=created).save()
Example #15
def test_attribs():
    document = leaf.parse(sample)
    first_link = document.get('div#menu li')
    assert document.get('div#menu a',
                        4).text == ' Test link 5', "Get element by index"
    assert document.get(
        'div#menu a', 99,
        default='blah') == 'blah', "Custom default value for get"
    assert bool(document.get('div#menu li')) == True, "Node bool"
    assert bool(document.get('div#menu_test li')) == False, "Node bool"
    assert isinstance(first_link, leaf.Parser), "Get first element"
    assert first_link.id == 'first_link', "Id attrib"
    assert first_link.onclick == "alert('test')", "Onclick attrib"
    first_link.onclick = 'blah()'
    assert first_link.onclick == 'blah()', "Attribute modification"
Example #16
def get_schedule():
    html = urllib2.urlopen("http://codefest.ru/program/2011-03/").read()
    doc = leaf.parse(html)
    calendar = get_calendar()

    programs = doc("table.program tbody")

    section = 0
    day = 0

    for program in programs:
        talks = program("tr")
        
        for talk in talks:
            time_tag = talk("td")[0]
            about_tag = talk("td")[-1]
            topic_tag = talk.get("td a")

            start = (None, None)
            end = (None, None)
            topic = None
            topic_about = None
            speaker = None
            name = None

            if time_tag is not None:
                time = time_tag.text
                r = re.search(u".*(\d\d):(\d\d).*(\d\d):(\d\d).*", time) 
                if r is not None:
                    start = (int(r.group(1)), int(r.group(2)))
                    end = (int(r.group(3)), int(r.group(4)))


            if topic_tag is not None:
                topic = topic_tag.text
                topic_about = topic_tag.href

            if about_tag is not None:
                name = about_tag.text

            print time_tag.text, topic, topic_about, DAYS[day][section]
            add_event(calendar, start, end, name, topic, topic_about, day, DAYS[day][section])

        section += 1 

        if len(DAYS[day]) == section:
            day += 1
            section = 0
Example #17
def test_attribs():
    document = leaf.parse(sample)
    first_link = document.get('div#menu li')
    assert document.get('div#menu a',
                        4).text == ' Test link 5', "Get element by index"
    assert document.get(
        'div#menu a',
        4).__unicode__() == ' Test link 5', "Unicode test on python2"
    assert document.get(
        'div#menu a', 99,
        default='blah') == 'blah', "Custom default value for get"
    assert document.find('body/div[4]').tag == 'div', "ETree find"
    assert bool(document.get('div#menu li')), "Node bool"
    assert bool(document.get('div#menu_test li')) is False, "Node bool"
    assert isinstance(first_link, leaf.Parser), "Get first element"
    assert first_link.id == 'first_link', "Id attrib"
    assert first_link.onclick == "alert('test')", "Onclick attrib"
    first_link.onclick = 'blah()'
    assert first_link.onclick == 'blah()', "Attribute modification"
Example #18
    def processObject(self, o):
        if not Post.objects.filter(
                body__iexact=unicode_fix(o.post_text)).exists():
            author = self._getAuthor(o, o.poster)
            created = datetime.datetime.fromtimestamp(o.post_time)
            try:
                topic = Topic.objects.get(
                    name=unicode_fix(o.topic.topic_title))
            except Topic.DoesNotExist:
                topic = Topic.objects.get(
                    name__icontains=unicode_fix(o.topic.topic_title))

            text = self._process_text(unicode_fix(o.post_text))
            doc = leaf.parse(text)
            text = doc.parse(bbcode_formatter, self._originalAddress)

            Post(topic=topic,
                 body=text,
                 user_ip=o.poster_ip,
                 user=author,
                 created=created).save()
Example #19
    def get_forum_threads(self, forum_id, page=1, post_per_page=25):
        """Get list of the threads in the forum
		Args:
			forum_id (int): Forum id.
			page (int): Page number.
			post_per_page (int): Posts per page (MAX=200).

		Returns:
			list: List of ids.
		"""
        r = self.sess.get('https://www.fxp.co.il/forumdisplay.php',
                          params={
                              'f': forum_id,
                              'page': page,
                              'pp': post_per_page,
                              'web_fast_fxp': 1
                          })
        return [
            int(thread_id.replace('thread_', '')) for thread_id in leaf.parse(
                r.text).xpath(f'//ul[@id="threads"]//li/@id')
        ]
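
A usage sketch for the method above; the client name and forum id are placeholders for whatever object holds this session:

thread_ids = client.get_forum_threads(forum_id=21, page=1, post_per_page=50)
print(thread_ids)  # list of int thread ids scraped from the forum page
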
Example #20
    def on_new_thread(self, io, data, *ex_prms):
        try:
            if data['poster'] == self.user.user_id:
                return
            r = self.user.sess.get('https://www.fxp.co.il/showthread.php',
                                   params={
                                       't': data['id'],
                                       'web_fast_fxp': 1
                                   })

            forum_id = int(
                re.search(r'FORUM_ID_FXP\s*=\s*"(.+?)"', r.text).group(1))

            document = leaf.parse(r.text)
            thread_content = document.xpath(
                './/blockquote[@class="postcontent restore simple"]')[0]
            comment_id = int(thread_content.getparent().id.replace(
                'post_message_', ''))

            quoted_me = self.is_quoted_me(thread_content)
            parsed_content = thread_content.parse(
                self.bbcode_formatter).strip()

            self.events.emit(
                FxpThread,
                FxpThread(username=data['username'],
                          user_id=data['poster'],
                          id=data['id'],
                          title=data['title'],
                          content=parsed_content,
                          comment_id=comment_id,
                          prefix=data['prefix'],
                          forum_id=forum_id,
                          quoted_me=quoted_me))

        except Exception as e:
            # print(e)
            pass
Example #21
    def get_sbl_item(self, i):
        sbl_item = {}

        status = i.xpath('tr/td[1]/img')[0].src.replace('/images/', '').replace('.gif', '')
        if status == 'spacer':
            status = i.xpath('tr/td[last()]/div/img')[0].src.replace('/images/', '').replace('.gif', '')
        sbl_item.update({'status': status})

        date = i.xpath('tr[2]/td[1]/span')[0].text
        sbl_item.update({'date': date})

        ref = i.xpath('tr/td[2]/span')[0].get('a')
        if ref:
            sbl_item.update({'ref_href': ref.href})
            page = self.s.get('http://spamhaus.org%s' % sbl_item['ref_href'], headers=self.h)
            document = leaf.parse(page.text)
            detail_text = document.xpath('body/div/table[2]/tr[2]/td[2]')[0]
            # detail_text_data = document.xpath('body/div/table[2]/tr[2]/td[2]/table/tr[3]/td')[0]
            # sbl_item.update({'date': detail_text_data.get('span').text.replace('|', '').strip()})
            sbl_item.update({'ref_detail_text': detail_text.inner_html()})
            ref_name = ref.get('b').text
        else:
            sbl_item.update({'ref_href': None})
            ref_name = i.xpath('tr/td[2]/span/b/font')[0].text
        sbl_item.update({'ref_name': ref_name})

        network = i.xpath('tr/td[3]/span')[0].text
        sbl_item.update({'network': network})

        domen = i.xpath('tr/td[4]/span')[0].get('a').text
        sbl_item.update({'domen': domen})

        ptext = i.xpath('tr[2]/td[2]/span')[0].text
        sbl_item.update({'ptext': ptext})

        self.sbl_items.append(sbl_item)
Example #22
    def like(self, comment_id):
        """Like comment
		Args:
			comment_id (str/int): The id of the comment.

		Returns:
			bool: True for success, False otherwise.
		"""

        r = self.sess.post('https://www.fxp.co.il/ajax.php',
                           data={
                               'do': 'add_like',
                               'postid': comment_id,
                               'securitytoken': self.securitytoken
                           })

        r = self.sess.get(
            f'https://www.fxp.co.il/showthread.php#post{comment_id}',
            params={'p': comment_id})

        # FIXME: this will also return True if the comment doesn't exist

        return leaf.parse(
            r.text).xpath(f'//span[@id="{comment_id}_removelike"]') == []
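
As the inline note admits, the missing removelike span also makes this return True when the comment does not exist at all. A hedged sketch of a stricter check, reusing the post_<id> list-item ids seen in the comment handler earlier (an illustration, not the project's actual fix):

    def _like_succeeded(self, comment_id, page_html):
        # Hypothetical helper: require the post itself to be present on the page
        # before treating the absent removelike span as a successful like.
        document = leaf.parse(page_html)
        post_exists = document.xpath(f'//li[@id="post_{comment_id}"]') != []
        no_removelike = document.xpath(f'//span[@id="{comment_id}_removelike"]') == []
        return post_exists and no_removelike
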
Example #23
def test_inner_html():
    html = '''<div>xxx <!-- comment --> yyy <p>foo</p> zzz</div>'''
    dom = leaf.parse(html)
    assert dom.inner_html() == 'xxx <p>foo</p> zzz'
Example #24
def test_inner_methods():
    document = leaf.parse(sample)
    link = document.xpath('body/div/ul/li[@class="active_link"]')[0]
    assert link.get('a').text == ' Test link 5', 'XPath by inner lxml method'
Example #25
def test_html():
    document = leaf.parse(sample)
    link = document.get('div#content li.link2')
    assert link.html() == '<li class="link2"><a href="#3"> Test link3</a></li>\n\t\t', "Convert element to html code"
Example #26
def test_inner_html():
    html = '''<div>xxx <!-- comment --> yyy <p>foo</p> zzz</div>'''
    dom = leaf.parse(html)
    assert dom.inner_html() == 'xxx <p>foo</p> zzz'
Example #27
def get_document(raw_html_schedule):
    return leaf.parse(leaf.strip_symbols(leaf.strip_accents(show(raw_html_schedule))))
Example #28
def get_document(raw_html_schedule):
    return leaf.parse(
        leaf.strip_symbols(leaf.strip_accents(show(raw_html_schedule))))
Example #29
def test_html():
    document = leaf.parse(sample)
    link = document.get('div#content li.link2')
    assert link.html(
    ) == '<li class="link2"><a href="#3"> Test link3</a></li>\n\t\t', "Convert element to html code"
Example #30
File: leaf.py Project: 31H0B1eV/grab
 def leaf(self):
     if not getattr(self, '_leaf', None):
         self._leaf = parse(self.body, encoding=self.charset)
     return self._leaf
Example #31
	def load(self, store_data = True, date_limit=None, run_agent=False):
		for data_src in self.data_sources:
			print "Loading data from: %s" % data_src

			# init variables from the data source
			url = data_src.src_id
			source_node = data_src
			parameters = data_src.get_parameters()
			username = parameters.get('username','*****@*****.**')
			psw = parameters.get('password','choirpassword')
			article_css_selector = parameters.get('article-css-selector','')
			fetch_limit = parameters.get('fetch-limit',None)

			auth = ClientAuthMethod(username,psw)
			
			reader = GoogleReader(auth)
			if reader.buildSubscriptionList():
				feeds = reader.getSubscriptionList()
				new_tag = DataTag.objects.get(name='new')
				new_datas = []

				fetch_count = 0

				# loop through and store feeds we already have RawData for


				for feed in feeds:
					if not fetch_limit:
						fetch_limit = feed.unread
					read_items = []
					print "Reading " + feed.title + " (%s unread)" % feed.unread
					print "===================================================="
					print
					print "Loading items"
					print
					feed.loadItems()
					print "Loaded %s items" % (len(feed.items),)
					print
					index = 0
					for item in feed.items:
						# make sure it doesn't already exist
						title = item.title
						url = item.url
						index+=1

						if index + 1 >= len(feed.items) and fetch_count < fetch_limit:
							print "Loading more items...."
							print
							feed.loadMoreItems()

						f = urllib.urlopen(url)
						html = f.read()
						doc = leaf.parse(html)
						elements = doc(article_css_selector)
						for element in elements:
							# print
							article_html = element.html()
							new_data = RawData()
							new_data.title = title
							new_data.source = source_node
							new_data.data = strip_tags(article_html)
							new_data.data_id = item.id
							new_data.link = item.url

							try:
								new_data.occurred_at = datetime.datetime.fromtimestamp(feed.lastUpdated)
							except ValueError:
								# print "Error, could not parse timestamp: %s" % feed.lastUpdated
								new_data.occurred_at = datetime.datetime.now()

							# patching in date limit thing Parris wanted --------------------------
							# if date_limit is None:
							#	date_limit = datetime.date.today() - datetime.timedelta(week=1)
							#
							# if new_data.occured_at < date_limit:
							# 	# we should skip this item .... it is too old
							# 	continue
							#
							# end patch -----------------------------------------------------------
							# Abandoning this idea for now ... I think it's best to patch the map view and not mess with this for now

								
							# if it does not already exist, save it
							if not new_data.exists():
								print " + Saving article: %s" % new_data.title
								new_data.save()
								new_data.tags.add(new_tag)
								new_datas.append(new_data)
								fetch_count +=1

							read_items.append(item)


					# print "All done.\n %s items fetched, our limit is %s. There are %s feeds. We stopped at index %s" % (fetch_count, self.fetch_limit, len(feed.items),index)

			if new_datas and run_agent:
				gra = GoogleReaderAgent()
				gra.search(raw_data_set = new_datas)
			return new_datas
		return None
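
Stripped of the Google Reader plumbing, the inner loop above is a fetch/parse/select/strip pipeline. A compact sketch of just that step, assuming the same strip_tags helper and the Python 2 urllib used in the original:

import urllib
import leaf

def extract_articles(url, article_css_selector):
    # Hypothetical condensation of the loop body above: fetch the page,
    # parse it with leaf, and return the tag-stripped text of every match.
    html = urllib.urlopen(url).read()
    doc = leaf.parse(html)
    return [strip_tags(element.html()) for element in doc(article_css_selector)]
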
Example #32
 def leaf(self):
     if not getattr(self, '_leaf', None):
         self._leaf = parse(self.body, encoding=self.charset)
     return self._leaf
Example #33
def test_inner_methods():
    document = leaf.parse(sample)
    link = document.xpath('body/div/ul/li[@class="active_link"]')[0]
    assert link.get('a').text == ' Test link 5', 'XPath by inner lxml method'