Example 1
def parse_wikicode(input_str, outputformat='vcard', language='english'):
    try:
        language = getattr(__import__('translation.' + language), language)
    except ImportError:
        language = translation.english
    input_str = html_decode(input_str)
    found = []
    for line in input_str.split('\n*'):
        line = '*' + line
        for cls in [Tag, Vcard, Untagged]:
            try:
                found += cls.parse(line, language)
            except ValueError:
                # Parse errors are not recoverable here; re-raise them unchanged.
                raise

    if not found:
        for line in input_str.split('\n*'):
            found += Untagged.parse(line, language, restrictive=False)

    if outputformat == 'raw':
        return found
    elif outputformat == 'json':
        return [json.dumps(l) for l in found]
    elif outputformat == 'tag':
        return [Tag.tostring(l, language) for l in found]
    elif outputformat == 'vcard':
        return [Vcard.tostring(l, language) for l in found]
    else:
        raise ValueError('Invalid outputformat: %s' % outputformat)
Example 2
def get_from_link(input_str):
    input_str = input_str.strip()
    if (input_str.count('\n') <= 1 and input_str.startswith('http://')
            and 'action=edit' in input_str and 'wikivoyage' in input_str):
        input_str = fake_agent_readurl(input_str)
        t = ElementSoup.parse(StringIO(input_str))
        if sys.version_info[:2] < (2, 7):
            # Xpath too stupid for bracket syntax,
            # fortunately there seems to be only one
            input_str = t.find(".//textarea").text
        else:
            input_str = t.find(".//textarea[@id='wpTextbox1']").text
        return html_decode(input_str)
    return input_str
Example 3
    def __init__(self, url):
        self.url = url
        html = self.get_html(url)

        # Add the category name
        self.xml_str += '<category name="%s">' % self.get_name(html)

        # Add the XML for each subcategory
        for url_and_image in self.get_subcategory_urls_and_images(html):
            prefix = 'http://www.air-gun.ru/'
            url = prefix + html_decode(url_and_image[0])
            image = prefix + url_and_image[1]
            subcategory = Subcategory(url, image)
            self.xml_str += subcategory.get_xml_str()

        self.xml_str += '</category>'
Example 4
def main():
	while True:
		rss_urls = ["http://www.otakubot.org/feed/",
		 "http://www.otakubot.org/feed/?paged=2",
		 "http://www.otakubot.org/feed/?paged=3"]
		d = []
		for url in rss_urls:
			d.extend(feedparser.parse(url).entries)
		try:
			already_used = cPickle.load(open('used_links.pkl', 'r'))
		except:
			already_used = []

		rss_count = 0
		for a in d:
			try:
				skip = False
				summary_html = ""
				post_id = a.guid
				html = ""
				if post_id in already_used:
					rss_count += 1
					continue
				if DEBUG:
					already_used.append(post_id)
					cPickle.dump(already_used, open("used_links.pkl", 'w'))
					continue
	
				try:
					video_rez = utils.html_decode(re.findall('Video: (.*?)\<br />', \
						a.content[0]['value'])[0]).split(',')[2].split('×')[1].lstrip()
				except:
					video_rez = "NONE"
				filename = utils.html_decode(re.findall('Release name: (.*?)\<br />', \
					a.content[0]['value'])[0])
				magnet_link = re.findall('(magnet:\?xt=[^\"<]*)', \
					a.content[0]['value'])
				download_urls = re.findall('<a href="?\'?([^"\'>]*)', \
					a.content[0]['value'])
				# re.findall returns a list of matches, so merge it into the flat
				# list of URL strings instead of nesting a list inside it
				download_urls.extend(magnet_link)
				if "otakubot" in download_urls[0] or "zupimages" in download_urls[0]:
					download_urls.pop(0)
				if "otakubot" in download_urls[0] or "zupimages" in download_urls[0]:
					download_urls.pop(0)
	
				count = 0
				for url in download_urls:
					if "Go4UP" in url[20:]:
						download_urls[count] = url.replace("Go4UP", "")
					elif "Hugefiles" in url[20:]:
						download_urls[count] = url.replace("Hugefiles", "")
					elif "Uploaded" in url[20:]:
						download_urls[count] = url.replace("Uploaded", "")
					elif "Torrent" in url[20:]:
						download_urls[count] = url.replace("Torrent", "")
					count += 1
	
				episode_number = utils.get_episode_number(filename)
				series_name = utils.get_new_name(utils.get_series_name(filename, episode_number))
				if series_name == "SKIP":
					continue
				episode_number = episode_number + utils.get_remove_ep(series_name)
	
				if episode_number == utils.get_last_ep(series_name):
					# Is last episode
					post_title = "{0} Episode {1} Final".format(series_name, episode_number)
				elif not episode_number:
					# Is movie/ova
					post_title = "{0}".format(series_name)
				else:
					# Is normal episode
					post_title = "{0} Episode {1}".format(series_name, episode_number)
				# CHANGE TO 1
				if episode_number <= 1 or not episode_number:
					# New series
					if not utils.get_if_stored(series_name):
						utils.get_series_info(series_name)
	
				html = utils.html_download_div(series_name, episode_number, video_rez, \
						filename, download_urls)
	
				print "New Post:"
				print post_title
				print
				print "HTML:"
				print html
				already_used.append(post_id)
				break
			except Exception:
				print("~@~@~@~@~@~@error@~@~@~@~@~@~")

		cPickle.dump(already_used, open("used_links.pkl", 'w'))
		time.sleep(15)
Example 5
 def clean_key(self):
     if 'keyword' in self.cleaned_data:
         keyword = self.cleaned_data['keyword']
         return html_decode(keyword)
     else:
         return ''
Example 6
 def clean_key(self):
     if 'key' in self.cleaned_data:
         key = self.cleaned_data['key']
         return html_decode(key)
     else:
         return ''
Example 7
 def __init__(self, id, text, sound, stat=0):
     self._id = id
     self.text = html_decode(text)
     self.sound = sound
     self.stat = stat
Example 8
 def runTest(self):
     res = html_decode(self.test_data)
     self.assertGreater(len(res), len(self.test_data) / 2)
     self.assertNotIn('&lt;', res)
     self.assertEqual(res.count('<see name="'), 4)
     self.assertEqual(res.count('</see>'), 4)
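
Every example above relies on an html_decode helper that the snippets themselves do not define. Judging from the test in Example 8, where entities such as &lt; disappear from the decoded text and literal markup like <see name="..."> reappears, it most likely just unescapes HTML entities. A minimal sketch under that assumption (the real helper in each project may do more, e.g. handle encodings or site-specific cleanup):

# Hypothetical sketch of the html_decode helper assumed by the examples above.
# Assumption: it only turns HTML entities (&lt;, &amp;, &quot;, &#39;, ...) back
# into literal characters, which matches the assertions in Example 8.
try:
    from html import unescape              # Python 3.4+
except ImportError:                         # Python 2 fallback
    from HTMLParser import HTMLParser
    unescape = HTMLParser().unescape

def html_decode(text):
    """Return text with HTML entities replaced by the characters they encode."""
    return unescape(text)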