def find_video_links(self, html_message):
    """Return every <embed> tag in *html_message*, each serialised as a
    db.Text value."""
    parsed = BeautifulSoup(html_message)
    return [db.Text(str(embed)) for embed in parsed('embed')]
def updateprojectlist(): print "updating the projects list" conn = httplib.HTTPConnection("android.git.kernel.org") conn.request("GET", "/") res = conn.getresponse() if res.status == httplib.OK: data = res.read() #print data conn.close() soup = BeautifulSoup(data) table = soup.body.table #print soup.body.table # filter tags = table.findAll('a', attrs={ 'class': 'list', 'title': None, 'href': re.compile('^/\?p') }) #print tags projectlist = [] for tag in tags: projectlist.append(tag.string) file = open(currentdir + "/" + listfilename, "w") #writelines won't add the '\n' file.writelines(map(lambda x: x.strip() + "\n", projectlist)) file.close() else: print "fail to download the page: ", res.status, res.reason
def find_image_links(self, html_message):
    """Collect the src attribute of every <img> in *html_message* as
    db.Link objects."""
    parsed = BeautifulSoup(html_message)
    return [db.Link(image['src']) for image in parsed('img')]
def hyphenate_html(html, language='en-us', hyphenator=None, blacklist_tags=(
        'code', 'tt', 'pre', 'head', 'title', 'script', 'style', 'meta',
        'object', 'embed', 'samp', 'var', 'math', 'select', 'option',
        'input', 'textarea')):
    r"""Hyphenate a fragment of HTML and return it as a unicode string.

    Tags listed in *blacklist_tags* are left untouched; *hyphenator*
    defaults to the one registered for *language*.

    >>> hyphenate_html('<p>It is <em>beautiful</em> outside today!</p>')
    u'<p>It is <em>beau­ti­ful</em> out­side today!</p>'

    >>> hyphenate_html('O paralelepipedo atrevessou a rua', 'pt-br')
    u'O pa­ra­le­le­pi­pe­do atre­ves­sou a rua'

    Content inside <code>, <tt>, and <pre> blocks is not hyphenated

    >>> hyphenate_html('Document: <code>document + page_status</code>')
    u'Doc­u­ment: <code>document + page_status</code>'

    Short words are not hyphenated

    >>> hyphenate_html("<p>The brave men, living and dead.</p>")
    u'<p>The brave men, liv­ing and dead.</p>'
    """
    # Fall back to the language's hyphenator when none was supplied.
    hyphenator = hyphenator or get_hyphenator_for_language(language)
    # Parse, hyphenate each element in place, then serialise back out.
    soup = BeautifulSoup(html)
    hyphenate_element(soup, hyphenator, blacklist_tags)
    return unicode(soup)
def render(self): content = cache.get(self.content_url) # If the page is not cached, retrieve it if content == None: opener = urllib2.build_opener() content = opener.open(self.content_url, timeout=5).read() # Save the page in cache cache.set(self.content_url, content) soup = BeautifulSoup(content) # Make links absolute, quoted from http://stackoverflow.com/a/4468467: for tag in soup.findAll('a', href=True): tag['href'] = urlparse.urljoin(self.content_url, tag['href']) # If there's no element specified, use the BODY. # Otherwise find the element with given id. if self.element_id == "": html = soup.find("body").renderContents() else: html = str(soup.find(id=self.element_id)) return html
def parse_summary(self, summary, link):
    """Clean up one article body.  (Original docstring: "process article".)

    Drops spans hidden with style="display: none;", strips the
    attributes in self.remove_attributes and the tags in
    self.remove_tags, discards ad/feed-proxy images, and rewrites the
    surviving <img> src to a locally downloaded copy via
    self.down_image (capped at self.max_image_number; a negative cap
    means unlimited).

    NOTE(review): no return statement is visible in this chunk — the
    cleaned soup may be returned further down; confirm against callers.
    """
    soup = BeautifulSoup(summary)
    for span in list(soup.findAll(attrs={"style": "display: none;"})):
        span.extract()
    for attr in self.remove_attributes:
        for x in soup.findAll(attrs={attr: True}):
            del x[attr]
    for tag in soup.findAll(self.remove_tags):
        tag.extract()
    img_count = 0
    for img in list(soup.findAll('img')):
        # Drop the image when over the cap, when it has no src, or when
        # the src points at known ad / feed-proxy hosts.
        if (self.max_image_number >= 0 and img_count >= self.max_image_number) \
                or img.has_key('src') is False \
                or img['src'].startswith("http://union.vancl.com/") \
                or img['src'].startswith("http://www1.feedsky.com/") \
                or img['src'].startswith("http://feed.feedsky.com/~flare/"):
            img.extract()
        else:
            try:
                # down_image returns the local path, or a falsy value on
                # failure, in which case the tag is removed entirely.
                localimage = self.down_image(img['src'], link)
                if localimage:
                    img['src'] = localimage
                    img_count = img_count + 1
                else:
                    img.extract()
            except Exception, e:
                print e
                img.extract()
def get_organic_data(html_data):
    """Return the organic-result <li class="g"> nodes found under
    div#ires in *html_data*, or None when that container is missing."""
    document = BeautifulSoup(str(html_data))
    container = document.find('div', {'id': 'ires'})
    if not container:
        return None
    return container.findAll('li', {'class': 'g'})
def get_script_urls(self, url, html):
    """Absolute URLs of every <script src=...> tag found in *html*."""
    # Parse only <script> tags; has_key() is the BS3 attribute test
    # ('in' would search tag contents instead).
    scripts = BeautifulSoup(html, parseOnlyThese=SoupStrainer('script'))
    return [self.get_absolute_url(url, tag['src'])
            for tag in scripts if tag.has_key('src')]
def clawdata(data):
    """POST *data* to powerball.com's number-history page and print one
    dict per draw row: {"issueDate": "M/D/YYYY", "luckNum": [ints]}.

    Rows of interest carry valign="middle"; within a row, the cell
    matching a date pattern becomes issueDate and every other non-blank
    cell is parsed as a drawn number.
    """
    data = urllib.urlencode(data)
    url = "http://www.powerball.com/powerball/pb_nbr_history.asp"
    response = urllib2.urlopen(url, data)
    soup = BeautifulSoup(response)
    for tag in soup.findAll(valign="middle"):
        # Re-parse the row in isolation so csoup.tr is this row itself.
        csoup = BeautifulSoup(str(tag))
        dictIssue = dict()
        dictIssue["issueDate"] = ""
        dictIssue["luckNum"] = []
        if csoup.tr != None:
            for tag in csoup.tr.findAll('td'):
                if re.search("[0-9]+\/[0-9]+\/[0-9]{4}", str(tag.text)):
                    dictIssue["issueDate"] = str(tag.text)
                elif str(tag.text) != " ":
                    dictIssue["luckNum"].append(int(tag.text))
        print dictIssue
def fetchSong(url, viewCount):
    """Download one lyrics page and dump its data as UTF-8 XML.

    The song ids (sid/aid/lid) are parsed from *url*'s query string.
    Lyrics, author, album and title are scraped from the page and
    written together with *viewCount* to lyricsDbPath/<lid>.txt.

    Any failure is silently swallowed — deliberate best-effort batch
    behaviour, so one bad page does not stop the run.
    """
    try:
        # Get song info from url: every query parameter is a numeric id.
        songInfo = {}
        _get = url.split('?')[1]
        tokens = _get.split('&')
        for token in tokens:
            toks = token.split('=')
            songInfo[toks[0]] = int(toks[1])
        # fetch the html
        lyricsWeb = urllib2.urlopen(url)
        webContent = lyricsWeb.read()
        lyricsWeb.close()
        soup = BeautifulSoup(webContent)
        lyrics = soup.findAll(id="mylrc")[0].contents
        # The link_hb anchors are, in order: author, album, title.
        # (album is extracted but not written to the XML below.)
        author = soup.findAll(attrs={'class' : 'link_hb'})[0].contents[0]
        album = soup.findAll(attrs={'class' : 'link_hb'})[1].contents[0]
        title = soup.findAll(attrs={'class' : 'link_hb'})[2].contents[0]
        #print lyrics
        # Flatten the lyric lines into a single text blob.
        lyricsText = ''
        for line in lyrics:
            for t in line:
                lyricsText += t
        # Construct the xml
        root = ET.Element("xml")
        doc = ET.SubElement(root, "doc")
        sidNode = ET.SubElement(doc, "sid")
        sidNode.text = str(songInfo[u'sid'])
        aidNode = ET.SubElement(doc, "aid")
        aidNode.text = str(songInfo[u'aid'])
        lidNode = ET.SubElement(doc, "lid")
        lidNode.text = str(songInfo[u'lid'])
        titleNode = ET.SubElement(doc, "title")
        titleNode.text = title
        authorNode = ET.SubElement(doc, "author")
        authorNode.text = author
        viewCountNode = ET.SubElement(doc, "viewCount")
        viewCountNode.text = str(viewCount)
        lyricsNode = ET.SubElement(doc, "lyrics")
        lyricsNode.text = lyricsText
        # Construct the tree
        tree = ET.ElementTree(root)
        filename = lyricsDbPath + str(songInfo['lid']) + ".txt"
        tree.write(filename, "utf-8")
    except:
        pass
def parse_summary(self, summary, ref):
    """Clean an article body and queue its images for download.
    (Original docstring: "process article content, strip excess tags
    and fix image addresses".)

    Hidden spans, the attributes in self.remove_attributes and the tags
    in self.remove_tags are removed.  Images are kept only up to
    krconfig.max_image_per_article (negative = unlimited), must have a
    src ending in a recognised image extension, and are rewritten to a
    local path via self.parse_image; files not yet on disk are recorded
    in *images* with *ref* as the referer for the downloader.

    NOTE(review): no return statement is visible in this chunk, but the
    caller (makelocal) unpacks (content, images) — the function likely
    continues beyond this view; confirm.
    """
    soup = BeautifulSoup(summary)
    for span in list(soup.findAll(attrs={"style": "display: none;"})):
        span.extract()
    for attr in self.remove_attributes:
        for x in soup.findAll(attrs={attr: True}):
            del x[attr]
    for tag in soup.findAll(self.remove_tags):
        tag.extract()
    img_count = 0
    images = []
    for img in list(soup.findAll('img')):
        if (krconfig.max_image_per_article >= 0 and img_count >= krconfig.max_image_per_article) \
                or img.has_key('src') is False :
            img.extract()
        else:
            try:
                if img['src'].encode('utf-8').lower().endswith(
                        ('jpg', 'jpeg', 'gif', 'png', 'bmp')):
                    localimage, fullname = self.parse_image(img['src'])
                    # Require an image suffix before downloading, to avoid
                    # fetching non-image files (e.g. fake tracking pixels).
                    if os.path.isfile(fullname) is False:
                        images.append({
                            'url': img['src'],
                            'filename': fullname,
                            'referer': ref
                        })
                    if localimage:
                        img['src'] = localimage
                        img_count = img_count + 1
                    else:
                        img.extract()
                else:
                    img.extract()
            except Exception, e:
                logging.info("error: %s" % e)
                img.extract()
def parse_organic_contents(raw_content, organic_pos):
    """Describe one organic search result as a dict.

    Keys: position, title, display_url, url (percent-decoded), text.
    Uses the module-level regex *p* to strip markup and *ul* to unquote.
    """
    parsed = BeautifulSoup(raw_content)
    data = {'position': organic_pos}
    data['title'] = p.sub('', str(parsed.find('a')))
    data['display_url'] = parse_display_url(str(raw_content))
    data['url'] = ul.unquote(str(parsed.find('a', href=True)['href']))
    raw_snippet = p.sub('', str(parsed.findAll('div', {'class': 's'})))
    data['text'] = raw_snippet.replace(']', '').replace('[', '')
    return data
def getViewCount(songTitle):
    """Ask the YouTube GData API for *songTitle* and return the view
    count of the first hit; 0 on any failure."""
    try:
        feed_url = ('http://gdata.youtube.com/feeds/api/videos'
                    '?v=2&max-results=1&q=') + songTitle
        web = urllib2.urlopen(feed_url)
        content = web.read()
        web.close()
        stats = BeautifulSoup(content).findAll('yt:statistics')
        return int(stats[0]['viewcount'])
    except:
        return 0
def parse_response(self):
    """Parse self.response (an exercise HTML page) into instance state.

    Grading metadata is read from <head> <meta> tags (max-points,
    status, DC.Title, DC.Description, points); the exercise HTML is
    taken from div#exercise, falling back to the whole <body>.

    Fix vs. original: ``is not None`` instead of ``!= None``.
    """
    soup = BeautifulSoup(self.response)
    head = soup.find("head")
    self.max_points = int(
        _get_value_from_soup(head, "meta", "value", {"name": "max-points"}, 0))
    if _get_value_from_soup(head, "meta", "value",
                            {"name": "status"}) == "accepted":
        self.is_accepted = True
    # Prefer the DC.Title meta; fall back to the <title> element.
    meta_title = _get_value_from_soup(head, "meta", "content",
                                      {"name": "DC.Title"})
    if meta_title:
        self.meta["title"] = meta_title
    else:
        title = soup.find("title")
        if title:
            self.meta["title"] = title.contents
    self.meta["description"] = _get_value_from_soup(
        head, "meta", "content", {"name": "DC.Description"}, "")
    # A points meta implies the submission was graded and accepted.
    points = _get_value_from_soup(head, "meta", "value", {"name": "points"})
    if points is not None:
        self.points = int(points)
        self.is_graded = True
        self.is_accepted = True
    exercise_div = soup.body.find("div", {"id": "exercise"})
    if exercise_div is not None:
        self.content = exercise_div.renderContents()
    else:
        self.content = soup.body.renderContents()
def GEN(book=None, prov=None):
    """Search a libgen mirror for *book* and return (results, errmsg).

    Supports both site layouts (index.php and search.php style result
    tables), pages through results until MAX_PAGES, resolves each hit's
    ads page to a direct /get.php or /download/book link, and returns a
    list of result dicts plus the last error message (empty on success).
    """
    errmsg = ''
    provider = "libgen.io"
    if prov is None:
        prov = 'GEN'
    host = lazylibrarian.CONFIG[prov + '_HOST']
    if not host.startswith('http'):
        host = 'http://' + host
    # Normalise the configured search page down to index.php/search.php.
    search = lazylibrarian.CONFIG[prov + '_SEARCH']
    if not search or not search.endswith('.php'):
        search = 'search.php'
    if 'index.php' not in search and 'search.php' not in search:
        search = 'search.php'
    if search[0] == '/':
        search = search[1:]
    page = 1
    results = []
    next_page = True
    while next_page:
        # The two site layouts take different query parameters.
        if 'index.php' in search:
            params = {
                "s": book['searchterm'],
                "f_lang": "All",
                "f_columns": 0,
                "f_ext": "All"
            }
        else:
            params = {
                "view": "simple",
                "open": 0,
                "phrase": 0,
                "column": "def",
                "res": 100,
                "req": book['searchterm']
            }
        if page > 1:
            params['page'] = page
        providerurl = url_fix(host + "/%s" % search)
        searchURL = providerurl + "?%s" % urllib.urlencode(params)
        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug(u"No results found from %s for %s" %
                             (provider, book['searchterm']))
            elif '111' in result:
                # looks like libgen has ip based access limits
                logger.error(
                    'Access forbidden. Please wait a while before trying %s again.'
                    % provider)
                errmsg = result
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching page data from %s: %s' %
                             (provider, result))
                errmsg = result
            result = False
        if result:
            logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                         (searchURL, provider))
            try:
                soup = BeautifulSoup(result)
                try:
                    table = soup.findAll('table')[2]  # un-named table
                    if table:
                        rows = table.findAll('tr')
                except IndexError:  # no results table in result page
                    rows = []
                # search.php layout has a header row to skip.
                if 'search.php' in search and len(rows) > 1:
                    rows = rows[1:]
                for row in rows:
                    author = ''
                    title = ''
                    size = ''
                    extn = ''
                    link = ''
                    td = row.findAll('td')
                    if 'index.php' in search and len(td) > 3:
                        # index.php layout: author in td[0], title in
                        # td[2], ext/size/link embedded in td[4] markup.
                        try:
                            res = str(
                                BeautifulStoneSoup(
                                    td[0].text,
                                    convertEntities=BeautifulStoneSoup.
                                    HTML_ENTITIES))
                            author = formatAuthorName(res)
                            title = str(
                                BeautifulStoneSoup(
                                    td[2].text,
                                    convertEntities=BeautifulStoneSoup.
                                    HTML_ENTITIES))
                            temp = str(td[4])
                            temp = temp.split('onmouseout')[1]
                            extn = temp.split('">')[1].split('(')[0]
                            size = temp.split('">')[1].split('(')[1].split(
                                ')')[0]
                            size = size.upper()
                            link = temp.split('href=')[1].split('"')[1]
                        except IndexError as e:
                            logger.debug(
                                'Error parsing libgen index.php results: %s'
                                % str(e))
                    elif 'search.php' in search and len(td) > 8:
                        # search.php layout: author td[1], title/link
                        # td[2], size td[7], extension td[8].
                        try:
                            res = str(
                                BeautifulStoneSoup(
                                    td[1].text,
                                    convertEntities=BeautifulStoneSoup.
                                    HTML_ENTITIES))
                            author = formatAuthorName(res)
                            title = str(
                                td[2]).split('>')[2].split('<')[0].strip()
                            title = str(
                                BeautifulStoneSoup(
                                    title,
                                    convertEntities=BeautifulStoneSoup.
                                    HTML_ENTITIES))
                            link = str(td[2]).split('href="')[1].split(
                                '?')[1].split('"')[0]
                            size = unaccented(td[7].text).upper()
                            extn = td[8].text
                        except IndexError as e:
                            logger.debug(
                                'Error parsing libgen search.php results; %s'
                                % str(e))
                    # Convert the human-readable size (e.g. "3 MB") to bytes.
                    if not size:
                        size = 0
                    else:
                        try:
                            mult = 1
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0
                    if link and title:
                        if author:
                            title = author.strip() + ' ' + title.strip()
                        if extn:
                            title = title + '.' + extn
                        # Build the ads page URL for this hit.
                        if not link.startswith('http'):
                            if "/ads.php?" in link:
                                url = url_fix(host + link)
                            else:
                                url = url_fix(host + "/ads.php?" + link)
                        else:
                            url = redirect_url(host, link)
                        bookresult, success = fetchURL(url)
                        if not success:
                            # may return 404 if no results, not really an error
                            if '404' in bookresult:
                                logger.debug(
                                    u"No results found from %s for %s" %
                                    (provider, book['searchterm']))
                            else:
                                logger.debug(url)
                                logger.debug(
                                    'Error fetching link data from %s: %s' %
                                    (provider, bookresult))
                                errmsg = bookresult
                            bookresult = False
                        if bookresult:
                            # Scan the ads page for the first direct
                            # download anchor.
                            url = None
                            try:
                                new_soup = BeautifulSoup(bookresult)
                                for link in new_soup.findAll('a'):
                                    output = link.get('href')
                                    if output:
                                        if output.startswith(
                                                'http') and '/get.php' in output:
                                            url = output
                                            break
                                        elif '/get.php' in output:
                                            url = '/get.php' + output.split(
                                                '/get.php')[1]
                                            break
                                        elif '/download/book' in output:
                                            url = '/download/book' + output.split(
                                                '/download/book')[1]
                                            break
                                if url and not url.startswith('http'):
                                    url = url_fix(host + url)
                                else:
                                    url = redirect_url(host, url)
                            except Exception as e:
                                logger.debug(
                                    'Error parsing bookresult for %s: %s' %
                                    (link, str(e)))
                                url = None
                            if url:
                                results.append({
                                    'bookid': book['bookid'],
                                    'tor_prov': provider + '/' + search,
                                    'tor_title': title,
                                    'tor_url': url,
                                    'tor_size': str(size),
                                    'tor_type': 'direct',
                                    'priority':
                                    lazylibrarian.CONFIG[prov + '_DLPRIORITY']
                                })
                                logger.debug('Found %s, Size %s' %
                                             (title, size))
                                # At least one hit on this page: try the next.
                                next_page = True
            except Exception as e:
                logger.error(u"An error occurred in the %s parser: %s" %
                             (provider, str(e)))
                logger.debug('%s: %s' % (provider, traceback.format_exc()))
        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn(
                'Maximum results page search reached, still more results available'
            )
            next_page = False
    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results, errmsg
def KAT(book=None):
    """Search Kickass Torrents for *book* and return a list of result
    dicts (bookid/tor_prov/tor_title/tor_url/tor_size/tor_type),
    filtered by the configured minimum seeder count."""
    provider = "KAT"
    host = lazylibrarian.CONFIG['KAT_HOST']
    if not str(host)[:4] == "http":
        host = 'http://' + host
    providerurl = url_fix(host + "/usearch/" + book['searchterm'])
    params = {"category": "books", "field": "seeders", "sorder": "desc"}
    searchURL = providerurl + "/?%s" % urllib.urlencode(params)
    result, success = fetchURL(searchURL)
    if not success:
        # seems KAT returns 404 if no results, not really an error
        if '404' in result:
            logger.debug(u"No results found from %s for %s" %
                         (provider, book['searchterm']))
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' %
                         (provider, result))
        result = False
    results = []
    if result:
        logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
        soup = BeautifulSoup(result)
        try:
            table = soup.findAll('table')[1]
            rows = table.findAll('tr')
        except Exception:  # no results = no table in result page
            rows = []
        # Collect the cells of interest: td[0] link/title, td[1] size,
        # td[3] seeders.  First row is the header.
        c0 = []
        c1 = []
        c3 = []
        if len(rows) > 1:
            for row in rows[1:]:
                if len(row.findAll('td')) > 3:
                    c0.append(row.findAll('td')[0])
                    c1.append(row.findAll('td')[1])
                    c3.append(row.findAll('td')[3])
        for col0, col1, col3 in zip(c0, c1, c3):
            try:
                title = unaccented(
                    str(col0).split('cellMainLink">')[1].split('<')[0])
                # kat can return magnet or torrent or both.
                magnet = ''
                url = ''
                mode = 'torrent'
                try:
                    magnet = 'magnet' + str(col0).split(
                        'href="magnet')[1].split('"')[0]
                    mode = 'magnet'
                except IndexError:
                    pass
                try:
                    url = 'http' + str(col0).split('href="http')[1].split(
                        '.torrent?')[0] + '.torrent'
                    mode = 'torrent'
                except IndexError:
                    pass
                # Prefer the magnet when no .torrent link, or when the
                # user configured PREFER_MAGNET and both exist.
                if not url or (magnet and url and
                               lazylibrarian.CONFIG['PREFER_MAGNET']):
                    url = magnet
                    mode = 'magnet'
                # Convert human-readable size to bytes.
                try:
                    size = str(col1.text).replace(' ', '').upper()
                    mult = 1
                    if 'K' in size:
                        size = size.split('K')[0]
                        mult = 1024
                    elif 'M' in size:
                        size = size.split('M')[0]
                        mult = 1024 * 1024
                    size = int(float(size) * mult)
                except (ValueError, IndexError):
                    size = 0
                try:
                    seeders = int(col3.text)
                except ValueError:
                    seeders = 0
                if not url or not title:
                    logger.debug('Missing url or title')
                elif minimumseeders < seeders:
                    results.append({
                        'bookid': book['bookid'],
                        'tor_prov': provider,
                        'tor_title': title,
                        'tor_url': url,
                        'tor_size': str(size),
                        'tor_type': mode
                    })
                    logger.debug('Found %s. Size: %s' % (title, size))
                else:
                    logger.debug('Found %s but %s seeder%s' %
                                 (title, seeders, plural(seeders)))
            except Exception as e:
                logger.error(u"An error occurred in the %s parser: %s" %
                             (provider, str(e)))
    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results
def TPB(book=None):
    """Search The Pirate Bay for *book* (ebook category 601) and return
    a list of magnet-link result dicts, filtered by the configured
    minimum seeder count.  The magnet URI is scraped from each detail
    page."""
    provider = "TPB"
    host = lazylibrarian.CONFIG['TPB_HOST']
    if not str(host)[:4] == "http":
        host = 'http://' + host
    providerurl = url_fix(host + "/s/?q=" + book['searchterm'])
    params = {"category": "601", "page": "0", "orderby": "99"}
    searchURL = providerurl + "&%s" % urllib.urlencode(params)
    result, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in result:
            logger.debug(u"No results found from %s for %s" %
                         (provider, book['searchterm']))
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' %
                         (provider, result))
        result = False
    results = []
    if result:
        logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
        soup = BeautifulSoup(result)
        try:
            table = soup.findAll('table')[0]
            rows = table.findAll('tr')
        except Exception:  # no results = no table in result page
            rows = []
        # td[1] holds link/title/size markup, td[2] the seeder count;
        # first row is the header.
        c1 = []
        c2 = []
        if len(rows) > 1:
            for row in rows[1:]:
                if len(row.findAll('td')) > 2:
                    c1.append(row.findAll('td')[1])
                    c2.append(row.findAll('td')[2])
        for col1, col2 in zip(c1, c2):
            try:
                title = unaccented(
                    str(col1).split('title=')[1].split('>')[1].split('<')[0])
                magnet = str(col1).split('href="')[1].split('"')[0]
                size = unaccented(col1.text.split(', Size ')[1].split('iB')[0])
                # Convert human-readable size to bytes.
                mult = 1
                try:
                    if 'K' in size:
                        size = size.split('K')[0]
                        mult = 1024
                    elif 'M' in size:
                        size = size.split('M')[0]
                        mult = 1024 * 1024
                    size = int(float(size) * mult)
                except (ValueError, IndexError):
                    size = 0
                try:
                    seeders = int(col2.text)
                except ValueError:
                    seeders = 0
                if minimumseeders < seeders:
                    # no point in asking for magnet link if not enough seeders
                    magurl = '%s/%s' % (host, magnet)
                    result, success = fetchURL(magurl)
                    if not success:
                        logger.debug('Error fetching url %s, %s' %
                                     (magurl, result))
                    else:
                        # Replace the listing href with the real magnet
                        # URI from the detail page.
                        magnet = None
                        new_soup = BeautifulSoup(result)
                        for link in new_soup.findAll('a'):
                            output = link.get('href')
                            if output and output.startswith('magnet'):
                                magnet = output
                                break
                    if not magnet or not title:
                        logger.debug('Missing magnet or title')
                    else:
                        if minimumseeders < seeders:
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': magnet,
                                'tor_size': str(size),
                                'tor_type': 'magnet'
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                        else:
                            logger.debug('Found %s but %s seeder%s' %
                                         (title, seeders, plural(seeders)))
                else:
                    logger.debug('Found %s but %s seeder%s' %
                                 (title, seeders, plural(seeders)))
            except Exception as e:
                logger.error(u"An error occurred in the %s parser: %s" %
                             (provider, str(e)))
    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results
def makelocal(self, feed_data, feed_idx, force_full_text=0):
    '''Build a local summary of one parsed feed and publish it.
    (Original docstring: "generate parse results".)

    Walks feed_data.entries (newest first) up to
    krconfig.max_items_number items and no older than
    krconfig.max_old_date, cleans each entry via parse_summary /
    force_full_text, queues its images on the global imgq, and appends
    the resulting dict to the global updated_feeds list under feedlock.
    Any failure is logged per feed.
    '''
    global updated_feeds
    global feedlock
    try:
        local = {
            'idx': feed_idx,
            'entries': [],
            'title': feed_data.feed['title'],
        }
        item_idx = 1
        for entry in feed_data.entries:
            if item_idx > krconfig.max_items_number:
                break
            # Prefer the pre-parsed date; fall back to our own parser.
            try:
                published_datetime = datetime(*entry.published_parsed[0:6])
            except:
                published_datetime = self.parsetime(entry.published)
            # Entries are assumed newest-first: stop at the first one
            # that is too old.
            if datetime.utcnow() - published_datetime > krconfig.max_old_date:
                break
            try:
                local_author = entry.author
            except:
                local_author = "null"
            local_entry = {
                'idx': item_idx,
                'title': entry.title,
                'published': (published_datetime +
                              krconfig.timezone).strftime("%Y-%m-%d %H:%M:%S"),
                'url': entry.link,
                'author': local_author,
            }
            if force_full_text:
                local_entry['content'], images = self.force_full_text(
                    entry.link)
            else:
                # Prefer full content; fall back to the summary field.
                try:
                    local_entry['content'], images = self.parse_summary(
                        entry.content[0].value, entry.link)
                except:
                    local_entry['content'], images = self.parse_summary(
                        entry.summary, entry.link)
            # Plain-text preview: first 200 chars of the de-tagged content.
            local_entry['stripped'] = ''.join(
                BeautifulSoup(
                    local_entry['content'],
                    convertEntities=BeautifulSoup.HTML_ENTITIES).findAll(
                        text=True))[:200]
            local['entries'].append(local_entry)
            for i in images:
                imgq.put(i)
            item_idx += 1
        if len(local['entries']) > 0:
            # Publish under the lock shared with the writer thread(s).
            if feedlock.acquire():
                updated_feeds.append(local)
                feedlock.release()
            else:
                feedlock.release()
            logging.info("from feed{} update {} items.".format(
                feed_idx, len(local['entries'])))
        else:
            logging.info("feed{} has no update.".format(feed_idx))
    except Exception, e:
        logging.error("fail(feed{}): {}".format(feed_idx, e))
def GEN(book=None):
    """Search a libgen mirror (search.php layout only) for *book* and
    return a list of direct-download result dicts.  Each hit's ads.php
    page is fetched to resolve the /get.php download link."""
    provider = "libgen"
    host = lazylibrarian.CONFIG['GEN_HOST']
    if not str(host)[:4] == "http":
        host = 'http://' + host
    searchURL = url_fix(
        host +
        "/search.php?view=simple&open=0&phrase=0&column=def&res=100&req=" +
        book['searchterm'])
    result, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in result:
            logger.debug(u"No results found from %s for %s" %
                         (provider, book['searchterm']))
        elif '111' in result:
            # looks like libgen has ip based access limits
            logger.error(
                'Access forbidden. Please wait a while before trying %s again.'
                % provider)
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' %
                         (provider, result))
        result = False
    results = []
    if result:
        logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        soup = BeautifulSoup(result)
        try:
            table = soup.findAll('table')[2]
            rows = table.findAll('tr')
        except Exception:  # no results = no table in result page
            rows = []
        # Cells of interest: td[1] author, td[2] title/link, td[7] size,
        # td[8] extension.  First row is the header.
        c1 = []
        c2 = []
        c7 = []
        c8 = []
        if len(rows) > 1:
            for row in rows[1:]:
                if len(row.findAll('td')) > 8:
                    c1.append(row.findAll('td')[1])
                    c2.append(row.findAll('td')[2])
                    c7.append(row.findAll('td')[7])
                    c8.append(row.findAll('td')[8])
        for col1, col2, col7, col8 in zip(c1, c2, c7, c8):
            try:
                author = unaccented(col1.text)
                title = unaccented(
                    str(col2).split('>')[2].split('<')[0].strip())
                link = str(col2).split('href="')[1].split('?')[1].split('"')[0]
                size = unaccented(col7.text).upper()
                extn = col8.text
                # Convert human-readable size to bytes.
                try:
                    mult = 1
                    if 'K' in size:
                        size = size.split('K')[0]
                        mult = 1024
                    elif 'M' in size:
                        size = size.split('M')[0]
                        mult = 1024 * 1024
                    size = int(float(size) * mult)
                except (ValueError, IndexError):
                    size = 0
                if link and title:
                    if author:
                        title = author.strip() + ' ' + title.strip()
                    if extn:
                        title = title + '.' + extn
                    # Fetch the ads page to find the direct download link.
                    bookURL = url_fix(host + "/ads.php?" + link)
                    bookresult, success = fetchURL(bookURL)
                    if not success:
                        # may return 404 if no results, not really an error
                        if '404' in bookresult:
                            logger.debug(u"No results found from %s for %s" %
                                         (provider, book['searchterm']))
                        else:
                            logger.debug(bookURL)
                            logger.debug('Error fetching data from %s: %s' %
                                         (provider, bookresult))
                        bookresult = False
                    if bookresult:
                        url = None
                        new_soup = BeautifulSoup(bookresult)
                        for link in new_soup.findAll('a'):
                            output = link.get('href')
                            if output and output.startswith('/get.php'):
                                url = output
                                break
                        if url:
                            url = url_fix(host + url)
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                                'tor_type': 'direct'
                            })
                            logger.debug('Found %s, Size %s' % (title, size))
            except Exception as e:
                logger.error(u"An error occurred in the %s parser: %s" %
                             (provider, str(e)))
    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results
def view():
    """Kodi plugin entry point: dispatch on the plugin URL's query args.

    - link given: resolve (for site-hosted pages, scrape the zplayer
      embed's flashvars) and play the video.
    - no cat: list the top-level catalogues.
    - cat given: list one page of that catalogue plus a "Next" item.

    NOTE(review): branch nesting reconstructed from a collapsed paste —
    confirm the play/return placement against the original plugin.
    """
    addon_handle = int(sys.argv[1])
    addon = xbmcaddon.Addon()
    addonname = addon.getAddonInfo('name')
    args = urlparse.parse_qs(sys.argv[2][1:])
    xbmcplugin.setContent(addon_handle, 'movies')
    cat = args.get('cat', None)
    page = args.get('page', None)
    link = args.get('link', None)
    # First label is hex-escaped UTF-8 for a Vietnamese "New videos" title.
    catalogues = [{'label': '\x56\x69\x64\x65\x6F\x20\x4D\xE1\xBB\x9B\x69'.decode('utf-8'), 'id': 'video/new/'},
                  {'label': 'Video Hot', 'id': 'video/hot/'}]
    # play link
    if link != None:
        link_video = link[0]
        if link_video.startswith(web_url):
            # Site-hosted page: pull the real stream URL out of the
            # zplayer embed's flashvars.
            r = requests.get(link[0])
            html = r.text
            #xbmc.log(html.encode('utf-8'))
            soup = BeautifulSoup(html)
            video_src = soup.find('embed', attrs={'id': 'zplayer'})
            video_flashvars = video_src.get('flashvars')
            args_video = urlparse.parse_qs(video_flashvars)
            link_video = args_video['file'][0]
        xbmc.Player().play(link_video)
        return
    # Load the catalogue list
    if cat == None:
        for cat in catalogues:
            li = xbmcgui.ListItem(cat['label'])
            urlList = CMDTools.build_url(base_url, {'web': get_Web_Name(),
                                                    'cat': cat['id']})
            xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList,
                                        listitem=li, isFolder=True)
        xbmc.executebuiltin('Container.SetViewMode(501)')
        xbmcplugin.endOfDirectory(addon_handle)
        return
    # Load the contents of one catalogue page
    if cat != None:
        if page == None:
            page = 1
        else:
            page = int(page[0])
        r = requests.get(web_url + cat[0] + str(page))
        html = r.text
        xbmc.log(html.encode('utf-8'))
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
        data_List = soup.findAll('a', attrs={'class': 'play'})
        # load item menu
        for item in data_List:
            link_item = web_url + item.get('href')
            # Entries with a youtube id play through the youtube plugin.
            if item.get('data-youtubeid') != '':
                link_item = ("plugin://plugin.video.youtube/play/?video_id="
                             + item.get('data-youtubeid'))
            img_item = item.find('img')
            img_src = img_item.get('src')
            img_alt = img_item.get('alt')
            li = xbmcgui.ListItem(img_alt)
            li.setThumbnailImage(img_src)
            li.setInfo(type='image', infoLabels="")
            urlList = CMDTools.build_url(base_url, {'web': get_Web_Name(),
                                                    'link': link_item,
                                                    'type': cat[0]})
            xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList,
                                        listitem=li)
        # Create the "Next" page button
        li = xbmcgui.ListItem("Next")
        urlList = CMDTools.build_url(base_url, {'web': web_name,
                                                'cat': cat[0],
                                                'page': page + 1});
        xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList,
                                    listitem=li, isFolder=True)
        xbmc.executebuiltin('Container.SetViewMode(501)')
        #xbmc.executebuiltin("ClearSlideshow")
        #xbmc.executebuiltin("SlideShow(,,notrandom)")
        xbmcplugin.endOfDirectory(addon_handle)
        return
    xbmcplugin.endOfDirectory(addon_handle)
def soup(string, **kwargs):
    """Create a BeautifulSoup parse tree from *string*.

    Keyword arguments are forwarded untouched to the BeautifulSoup
    constructor.
    """
    from lib.BeautifulSoup import BeautifulSoup
    parsed = BeautifulSoup(string, **kwargs)
    return parsed
def TDL(book=None, test=False):
    """Search torrentdownloads' RSS feed for *book*.

    Returns (results, errmsg) — or, when *test* is true, just the
    success flag of the initial fetch.  Each qualifying entry's relative
    link is fetched to scrape the real magnet URI.
    """
    errmsg = ''
    provider = "torrentdownloads"
    host = lazylibrarian.CONFIG['TDL_HOST']
    if not host.startswith('http'):
        host = 'http://' + host
    providerurl = url_fix(host)
    params = {"type": "search", "cid": "2", "search": book['searchterm']}
    searchURL = providerurl + "/rss.xml?%s" % urllib.urlencode(params)
    sterm = makeUnicode(book['searchterm'])
    data, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in data:
            logger.debug("No results found from %s for %s" % (provider, sterm))
            success = True
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' % (provider, data))
            errmsg = data
        data = False
    if test:
        return success
    results = []
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    if data:
        logger.debug('Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        d = feedparser.parse(data)
        if len(d.entries):
            for item in d.entries:
                try:
                    title = item['title']
                    seeders = int(item['seeders'])
                    link = item['link']
                    size = int(item['size'])
                    url = None
                    if link and minimumseeders < int(seeders):
                        # no point requesting the magnet link if not enough seeders
                        # TDL gives us a relative link
                        result, success = fetchURL(providerurl + link)
                        if success:
                            new_soup = BeautifulSoup(result)
                            for link in new_soup.findAll('a'):
                                output = link.get('href')
                                if output and output.startswith('magnet'):
                                    url = output
                                    break
                        if not url or not title:
                            logger.debug('Missing url or title')
                        else:
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                                'tor_type': 'magnet',
                                'priority':
                                lazylibrarian.CONFIG['TDL_DLPRIORITY']
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' %
                                     (title, seeders, plural(seeders)))
                except Exception as e:
                    logger.error("An error occurred in the %s parser: %s" %
                                 (provider, str(e)))
                    logger.debug('%s: %s' % (provider, traceback.format_exc()))
    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
def TPB(book=None, test=False):
    """Search The Pirate Bay for *book*, paging until MAX_PAGES.

    The category is chosen from book['library'] (601=ebooks,
    102=audiobooks, 0=everything — no magazine category).  Returns
    (results, errmsg) — or, when *test* is true, just the success flag
    of the first fetch.  Magnet URIs are scraped from each detail page.
    """
    errmsg = ''
    provider = "TPB"
    host = lazylibrarian.CONFIG['TPB_HOST']
    if not host.startswith('http'):
        host = 'http://' + host
    providerurl = url_fix(host + "/s/?")
    cat = 0  # 601=ebooks, 102=audiobooks, 0=all, no mag category
    if 'library' in book:
        if book['library'] == 'AudioBook':
            cat = 102
        elif book['library'] == 'eBook':
            cat = 601
        elif book['library'] == 'magazine':
            cat = 0
    sterm = makeUnicode(book['searchterm'])
    page = 0
    results = []
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    next_page = True
    while next_page:
        params = {
            "q": book['searchterm'],
            "category": cat,
            "page": page,
            "orderby": "99"
        }
        searchURL = providerurl + "?%s" % urllib.urlencode(params)
        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" %
                             (provider, sterm))
                success = True
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching data from %s: %s' %
                             (provider, result))
                errmsg = result
            result = False
        if test:
            return success
        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' %
                         (searchURL, provider))
            soup = BeautifulSoup(result)
            # tpb uses a named table
            table = soup.find('table', id='searchResult')
            if table:
                rows = table.findAll('tr')
            else:
                rows = []
            if len(rows) > 1:
                rows = rows[1:]  # first row is headers
            for row in rows:
                td = row.findAll('td')
                if len(td) > 2:
                    try:
                        title = unaccented(
                            str(td[1]).split('title=')[1].split('>')[1].split(
                                '<')[0])
                        magnet = str(td[1]).split('href="')[1].split('"')[0]
                        size = unaccented(
                            td[1].text.split(', Size ')[1].split('iB')[0])
                        size = size.replace('&nbsp;', '')
                        # Convert human-readable size to bytes.
                        mult = 1
                        try:
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0
                        try:
                            seeders = int(td[2].text)
                        except ValueError:
                            seeders = 0
                        if minimumseeders < int(seeders):
                            # no point in asking for magnet link if not enough seeders
                            magurl = '%s/%s' % (host, magnet)
                            result, success = fetchURL(magurl)
                            if not success:
                                logger.debug('Error fetching url %s, %s' %
                                             (magurl, result))
                            else:
                                # Replace the listing href with the real
                                # magnet URI from the detail page.
                                magnet = None
                                new_soup = BeautifulSoup(result)
                                for link in new_soup.findAll('a'):
                                    output = link.get('href')
                                    if output and output.startswith('magnet'):
                                        magnet = output
                                        break
                            if not magnet or not title:
                                logger.debug('Missing magnet or title')
                            else:
                                results.append({
                                    'bookid': book['bookid'],
                                    'tor_prov': provider,
                                    'tor_title': title,
                                    'tor_url': magnet,
                                    'tor_size': str(size),
                                    'tor_type': 'magnet',
                                    'priority':
                                    lazylibrarian.CONFIG['TPB_DLPRIORITY']
                                })
                                logger.debug('Found %s. Size: %s' %
                                             (title, size))
                                # At least one hit: try the next page.
                                next_page = True
                        else:
                            logger.debug('Found %s but %s seeder%s' %
                                         (title, seeders, plural(seeders)))
                    except Exception as e:
                        logger.error("An error occurred in the %s parser: %s" %
                                     (provider, str(e)))
                        logger.debug('%s: %s' %
                                     (provider, traceback.format_exc()))
        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn(
                'Maximum results page search reached, still more results available'
            )
            next_page = False
    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
def WWT(book=None, test=False):
    """Search WorldWideTorrents for torrents matching a book search.

    book -- dict with at least 'searchterm' and 'bookid'; an optional
            'library' key ('AudioBook'/'eBook'/'magazine') selects the
            site category.
    test -- when True, return only the boolean success of the first
            page fetch (provider connection test).

    Returns a tuple (results, errmsg): results is a list of dicts
    (bookid/tor_prov/tor_title/tor_url/tor_size/tor_type/priority);
    errmsg is the last fetch error text, or '' if none.
    """
    errmsg = ''
    provider = "WorldWideTorrents"
    host = lazylibrarian.CONFIG['WWT_HOST']
    # Allow the host to be configured with or without a scheme.
    if not host.startswith('http'):
        host = 'http://' + host
    providerurl = url_fix(host + "/torrents-search.php")
    sterm = makeUnicode(book['searchterm'])
    cat = 0  # 0=all, 36=ebooks, 52=mags, 56=audiobooks
    if 'library' in book:
        if book['library'] == 'AudioBook':
            cat = 56
        elif book['library'] == 'eBook':
            cat = 36
        elif book['library'] == 'magazine':
            cat = 52
    page = 0
    results = []
    # NUMBEROFSEEDERS - 1 so "minimumseeders < seeders" accepts torrents
    # with at least NUMBEROFSEEDERS seeders.
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    next_page = True
    while next_page:
        params = {"search": book['searchterm'], "page": page, "cat": cat}
        searchURL = providerurl + "/?%s" % urllib.urlencode(params)
        # Assume last page unless a usable result is found below.
        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # might return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" % (provider, sterm))
                success = True
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching data from %s: %s' % (provider, result))
                errmsg = result
            result = False
        if test:
            # Connection-test mode: report fetch success only.
            return success
        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
            soup = BeautifulSoup(result)
            try:
                tables = soup.findAll('table')  # un-named table
                table = tables[2]
                # NOTE(review): if tables[2] exists but is falsy, `rows` is
                # never assigned and the len(rows) below raises NameError --
                # presumably never happens in practice; confirm.
                if table:
                    rows = table.findAll('tr')
            except IndexError:  # no results table in result page
                rows = []
            if len(rows) > 1:
                rows = rows[1:]  # first row is headers
            for row in rows:
                td = row.findAll('td')
                if len(td) > 3:
                    try:
                        title = unaccented(
                            str(td[0]).split('title="')[1].split('"')[0])
                        # can return magnet or torrent or both.
                        magnet = ''
                        url = ''
                        mode = 'torrent'
                        try:
                            magnet = 'magnet' + str(td[0]).split('href="magnet')[1].split('"')[0]
                            mode = 'magnet'
                        except IndexError:
                            pass
                        try:
                            url = url_fix(host + '/download.php') + \
                                str(td[0]).split('href="download.php')[1].split('.torrent"')[0] + '.torrent'
                            mode = 'torrent'
                        except IndexError:
                            pass
                        # Fall back to the magnet link when there is no
                        # .torrent, or prefer it when configured to.
                        if not url or (magnet and url and lazylibrarian.CONFIG['PREFER_MAGNET']):
                            url = magnet
                            mode = 'magnet'
                        # Convert "n K/M/G" display size to integer bytes.
                        try:
                            size = str(td[1].text).replace(' ', '').upper()
                            mult = 1
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0
                        try:
                            seeders = int(td[2].text)
                        except ValueError:
                            seeders = 0
                        if not url or not title:
                            logger.debug('Missing url or title')
                        elif minimumseeders < int(seeders):
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                                'tor_type': mode,
                                'priority': lazylibrarian.CONFIG['WWT_DLPRIORITY']
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                            # Got a usable hit, so try the next page too.
                            next_page = True
                        else:
                            logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders)))
                    except Exception as e:
                        # Never let one malformed row abort the whole search.
                        logger.error("An error occurred in the %s parser: %s" % (provider, str(e)))
                        logger.debug('%s: %s' % (provider, traceback.format_exc()))
        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn('Maximum results page search reached, still more results available')
            next_page = False
    logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
def KAT(book=None, test=False):
    """Search KickAssTorrents for torrents matching a book search.

    Unlike TPB/WWT this provider fetches a single results page (sorted
    by seeders, descending) rather than paging.

    book -- dict with at least 'searchterm' and 'bookid'.
    test -- when True, return only the boolean success of the fetch
            (provider connection test).

    Returns a tuple (results, errmsg): results is a list of dicts
    (bookid/tor_prov/tor_title/tor_url/tor_size/tor_type/priority);
    errmsg is the last fetch error text, or '' if none.
    """
    errmsg = ''
    provider = "KAT"
    host = lazylibrarian.CONFIG['KAT_HOST']
    # Allow the host to be configured with or without a scheme.
    if not host.startswith('http'):
        host = 'http://' + host
    providerurl = url_fix(host + "/usearch/" + urllib.quote(book['searchterm']))
    params = {"category": "books", "field": "seeders", "sorder": "desc"}
    searchURL = providerurl + "/?%s" % urllib.urlencode(params)
    sterm = makeUnicode(book['searchterm'])
    result, success = fetchURL(searchURL)
    if not success:
        # seems KAT returns 404 if no results, not really an error
        if '404' in result:
            logger.debug("No results found from %s for %s" % (provider, sterm))
            success = True
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' % (provider, result))
            errmsg = result
        result = False
    if test:
        # Connection-test mode: report fetch success only.
        return success
    results = []
    if result:
        logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
        # NUMBEROFSEEDERS - 1 so "minimumseeders < seeders" accepts
        # torrents with at least NUMBEROFSEEDERS seeders.
        minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
        soup = BeautifulSoup(result)
        rows = []
        try:
            table = soup.findAll('table')[1]  # un-named table
            if table:
                rows = table.findAll('tr')
        except IndexError:  # no results table in result page
            rows = []
        if len(rows) > 1:
            rows = rows[1:]  # first row is headers
        for row in rows:
            td = row.findAll('td')
            if len(td) > 3:
                try:
                    title = unaccented(
                        str(td[0]).split('cellMainLink">')[1].split('<')[0])
                    # kat can return magnet or torrent or both.
                    magnet = ''
                    url = ''
                    mode = 'torrent'
                    try:
                        magnet = 'magnet' + str(td[0]).split('href="magnet')[1].split('"')[0]
                        mode = 'magnet'
                    except IndexError:
                        pass
                    try:
                        url = 'http' + str(td[0]).split('href="http')[1].split('.torrent?')[0] + '.torrent'
                        mode = 'torrent'
                    except IndexError:
                        pass
                    # Fall back to the magnet link when there is no
                    # .torrent, or prefer it when configured to.
                    if not url or (magnet and url and lazylibrarian.CONFIG['PREFER_MAGNET']):
                        url = magnet
                        mode = 'magnet'
                    # Convert "n K/M/G" display size to integer bytes.
                    try:
                        size = str(td[1].text).replace(' ', '').upper()
                        mult = 1
                        if 'K' in size:
                            size = size.split('K')[0]
                            mult = 1024
                        elif 'M' in size:
                            size = size.split('M')[0]
                            mult = 1024 * 1024
                        elif 'G' in size:
                            size = size.split('G')[0]
                            mult = 1024 * 1024 * 1024
                        size = int(float(size) * mult)
                    except (ValueError, IndexError):
                        size = 0
                    try:
                        seeders = int(td[3].text)
                    except ValueError:
                        seeders = 0
                    if not url or not title:
                        logger.debug('Missing url or title')
                    elif minimumseeders < int(seeders):
                        results.append({
                            'bookid': book['bookid'],
                            'tor_prov': provider,
                            'tor_title': title,
                            'tor_url': url,
                            'tor_size': str(size),
                            'tor_type': mode,
                            'priority': lazylibrarian.CONFIG['KAT_DLPRIORITY']
                        })
                        logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders)))
                except Exception as e:
                    # Never let one malformed row abort the whole search.
                    logger.error("An error occurred in the %s parser: %s" % (provider, str(e)))
                    logger.debug('%s: %s' % (provider, traceback.format_exc()))
    logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
def TDL(book=None):
    """Search torrentdownloads' RSS feed for torrents matching a book.

    book -- dict with at least 'searchterm' and 'bookid'.

    Returns a list of result dicts (bookid/tor_prov/tor_title/tor_url/
    tor_size).  Note this provider (unlike TPB/WWT/KAT here) returns
    only the list, not an (results, errmsg) tuple, and reads the older
    lazylibrarian.TDL_HOST/NUMBEROFSEEDERS attributes rather than
    lazylibrarian.CONFIG.
    """
    provider = "torrentdownloads"
    host = lazylibrarian.TDL_HOST
    # Allow the host to be configured with or without a scheme.
    if not str(host)[:4] == "http":
        host = 'http://' + host
    providerurl = url_fix(host)
    params = {"type": "search", "cid": "2", "search": book['searchterm']}
    searchURL = providerurl + "/rss.xml?%s" % urllib.urlencode(params)
    try:
        request = urllib2.Request(searchURL)
        if lazylibrarian.PROXY_HOST:
            request.set_proxy(lazylibrarian.PROXY_HOST, lazylibrarian.PROXY_TYPE)
        request.add_header('User-Agent', USER_AGENT)
        data = urllib2.urlopen(request, timeout=90)
    except (socket.timeout) as e:
        logger.debug('Timeout fetching data from %s' % provider)
        data = False
    except (urllib2.HTTPError, urllib2.URLError, ssl.SSLError) as e:
        # may return 404 if no results, not really an error
        if hasattr(e, 'code') and e.code == 404:
            logger.debug(searchURL)
            logger.debug(u"No results found from %s for %s" % (provider, book['searchterm']))
        else:
            logger.debug(searchURL)
            if hasattr(e, 'reason'):
                errmsg = e.reason
            else:
                errmsg = str(e)
            logger.debug('Error fetching data from %s: %s' % (provider, errmsg))
        data = False
    results = []
    # NUMBEROFSEEDERS - 1 so "minimumseeders < seeders" accepts torrents
    # with at least NUMBEROFSEEDERS seeders.
    minimumseeders = int(lazylibrarian.NUMBEROFSEEDERS) - 1
    if data:
        logger.debug(u'Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
        # feedparser accepts the open urllib2 response object directly.
        d = feedparser.parse(data)
        if len(d.entries):
            for item in d.entries:
                try:
                    title = item['title']
                    seeders = int(item['seeders'])
                    link = item['link']
                    size = int(item['size'])
                    url = None
                    if link and minimumseeders < seeders:
                        # no point requesting the magnet link if not enough seeders
                        request = urllib2.Request(link)
                        if lazylibrarian.PROXY_HOST:
                            request.set_proxy(lazylibrarian.PROXY_HOST, lazylibrarian.PROXY_TYPE)
                        request.add_header('User-Agent', USER_AGENT)
                        conn = urllib2.urlopen(request, timeout=90)
                        result = conn.read()
                        # NOTE(review): `url = None` is redundant (set above),
                        # and the loop variable below shadows `link` -- both
                        # harmless here since neither is reused afterwards.
                        url = None
                        new_soup = BeautifulSoup(result)
                        # First magnet: link on the detail page wins.
                        for link in new_soup.findAll('a'):
                            output = link.get('href')
                            if output and output.startswith('magnet'):
                                url = output
                                break
                    if minimumseeders < int(seeders):
                        if not url or not title:
                            logger.debug('Missing url or title')
                        else:
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders)))
                except Exception as e:
                    # Never let one malformed feed entry abort the search.
                    logger.error(u"An error occurred in the %s parser: %s" % (provider, str(e)))
    logger.debug(
        u"Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, book['searchterm']))
    return results
# Scrape the gitweb front page of android.git.kernel.org and write the
# names of every listed project repository to list.txt, one per line.
prefixurl = "https://android.git.kernel.org/"  # the git:// scheme easily times out, so keep https
# Directory containing this script: used as working directory and output location.
currentdir = os.path.abspath(os.path.dirname(sys.argv[0]))
repositorydir = ".git"
os.chdir(currentdir)

conn = httplib.HTTPConnection("android.git.kernel.org")
conn.request("GET", "/")
res = conn.getresponse()
if res.status == httplib.OK:
    data = res.read()
    conn.close()
    soup = BeautifulSoup(data)
    # The project index is the first table in the page body.
    table = soup.body.table
    # Project links carry class="list", no title, and an href like "/?p=...".
    tags = table.findAll('a', attrs={'class': 'list', 'title': None, 'href': re.compile('^/\?p')})
    projectlist = [tag.string for tag in tags]
    # writelines() inserts no separators, so append the newline ourselves.
    file = open(currentdir + "/list.txt", "w")
    file.writelines([x.strip() + "\n" for x in projectlist])
    file.close()
# NOTE(review): this fragment starts with a dangling `except:` whose matching
# `try:` is not visible above it in this file, and the loop body is cut off
# after `songNum = 0` -- the snippet appears to have been pasted incompletely.
except: pass
########################start of main###################################
# Walk singer pages on lyrics.oiktv.com for each singer id in the range.
for i in range(startId, endId):  # startId/endId defined elsewhere -- TODO confirm
    url = "http://lyrics.oiktv.com/singer.php?sid=" + str(i)
    #lyricsWeb = urllib2.urlopen("http://lyrics.oiktv.com/singer.php?sid=51")
    lyricsWeb = urllib2.urlopen(url)
    webContent = lyricsWeb.read()
    lyricsWeb.close()
    soup = BeautifulSoup(webContent)
    pages = soup.findAll('a')
    # Collect the pagination links ("...&page=N") for this singer.
    wantedPages = []
    for page in pages:
        if re.search("&page=", page['href']):
            #print page['href']
            url = page['href']
            wantedPages.append(url)
    if len(wantedPages) > 1:
        #find those who has more than 20 albums
        maxPageNum = 1  #Max 1 page for each singer
        pageNum = 0
        maxSongNum = 250
        songNum = 0
def determine_min_sdk():
    """
    Determine the minimum SDK version supported by the application under test.

    Order of attempts:
      1. android:minSdkVersion in the manifest's <uses-sdk> element;
      2. for source input (common.source_or_apk == 2), the gradle files
         via find_gradle();
      3. interactively: scrape the Google Play Store page for the package
         ('p'), or prompt the user to type a version ('m').

    No return value -- the result is stored in common.minSdkVersion
    (left at the worst-case fallback 7 if the user declines to enter one).
    """
    #determine minimum supported versions
    common.minSdkVersion = 0
    common.sdk = common.xmldoc.getElementsByTagName("uses-sdk")
    determineSdk = ''
    if len(common.sdk) > 0:
        if 'android:minSdkVersion' in common.sdk[0].attributes.keys():
            try:
                common.minSdkVersion = common.sdk[0].attributes['android:minSdkVersion'].value
                logger.info(common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion))
            except Exception as e:
                common.logger.error(
                    "Something went wrong trying to determine the version from the manifest: " + str(e))
    if common.minSdkVersion == 0:
        if common.source_or_apk == 2:
            # Source input: try the gradle build files next.
            common.minSdkVersion = find_gradle()
            if common.minSdkVersion == 0:
                common.logger.info("We were unable to find the minimum SDK version in your source.")
                # Force the manual-entry path below.
                determineSdk = 'm'
            else:
                logger.info(common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion))
        else:
            common.compare(common.sdk.length, 1, common.config.get('qarkhelper', 'USESDK_MISS'), 'false')
            print common.config.get('qarkhelper', 'GEN_OUTPUT_WARN')
            # Ask the user whether to query the Play Store or enter manually.
            # NOTE(review): a valid answer to the inner re-prompt is only
            # honoured on the next pass of the loop (it asks again).
            while True:
                determineSdk = raw_input("Which option would you prefer? (P)lay, (M)anual")
                if determineSdk.lower() in ('p', 'm'):
                    break
                else:
                    determineSdk = raw_input("Please enter either (p) or (m):")
        if determineSdk.lower() == 'p':
            #get package name from manifest if possible
            #make call to Play store
            #determine API version from https://play.google.com/store/apps/details?id=<package name>
            # will need to adjust the sdk[0] value for the checks below
            for a in common.xmldoc.getElementsByTagName('manifest'):
                if 'package' in a.attributes.keys():
                    print common.config.get('qarkhelper', 'PACK_FOUND')
                    package_name = a.attributes['package'].value
                    print package_name
                else:
                    package_name = raw_input(common.config.get('qarkhelper', 'NO_PACK_NAME'))
            try:
                logger.info(common.config.get('qarkhelper', 'DETERMINING_SDK_VERSION'))
                play_url = "https://play.google.com/store/apps/details?id="
                play_url += package_name
                print play_url
                # Scrape the "Requires Android x.y" value from the store page.
                page = urllib2.urlopen(play_url)
                html = BeautifulSoup(page.read())
                play_version = html.find(itemprop="operatingSystems")
                plat_version = re.findall('\d+.\d+', play_version.contents[0])
                if plat_version:
                    plat_version = [str(item) for item in plat_version]
                    # Map Android platform versions to API levels.
                    api_plat_map = []
                    api_plat_map.append(['1', '1.0'])
                    api_plat_map.append(['2', '1.1'])
                    api_plat_map.append(['3', '1.5'])
                    api_plat_map.append(['4', '1.6'])
                    api_plat_map.append(['5', '2.0'])
                    api_plat_map.append(['6', '2.0.1'])
                    api_plat_map.append(['7', '2.1'])
                    api_plat_map.append(['8', '2.2'])
                    api_plat_map.append(['9', '2.3'])
                    api_plat_map.append(['10', '2.3.3'])
                    api_plat_map.append(['11', '3.0'])
                    api_plat_map.append(['12', '3.1'])
                    api_plat_map.append(['13', '3.2'])
                    api_plat_map.append(['14', '4.0'])
                    api_plat_map.append(['15', '4.0.3'])
                    api_plat_map.append(['16', '4.1'])
                    api_plat_map.append(['17', '4.2'])
                    api_plat_map.append(['18', '4.3'])  #Webviews have critical vuln, no more patches from Google
                    api_plat_map.append(['19', '4.4'])
                    api_plat_map.append(['20', '4.4'])  # This is actually 4.4W, a wearable only build, I'm assuming it is the same as 4.4 for our purposes
                    api_plat_map.append(['21', '5.0'])
                    api_plat_map.append(['22', '5.1'])  # This is latest version, we'll assume this for newer, until update
                    #TODO - double check this, adding 5.1 may have broken it
                    # Pick the highest API level whose platform version is
                    # <= the scraped value (loop keeps overwriting).
                    for a in api_plat_map:
                        if StrictVersion(str(plat_version[0])) >= StrictVersion(str(a[1])):
                            common.minSdkVersion = a[0]
                    logger.info(common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion))
                    manual = raw_input(common.config.get('qarkhelper', 'SDK_VALUE_MANUAL'))
                else:
                    print common.config.get('qarkhelper', 'CANT_DET_PLAY')
                #BUG - not processing the cases of wanting to enter if manually, if the retrieval of the play version is broken
            except HTTPError, e:
                print str(e)
                logger.error(common.config.get('qarkhelper', 'MIN_SDK_PLAY_STORE_FAILED'))
        elif (determineSdk.lower() == 'm' or common.minSdkVersion == 0):
            #does not actually become 1, just needs a value, since it wasn't found, so we assume worst case
            print common.term.cyan + common.term.bold + str(
                common.config.get('qarkhelper', 'NO_MIN_SDK')).decode('string-escape').format(t=common.term)
            enterSdk = raw_input(common.config.get('qarkhelper', 'PROMPT_MIN_SDK'))
            if enterSdk.lower() == 'y':
                sdkinput = 0
                # Keep prompting until the value is within (0, MAX_API_VERSION].
                while True:
                    sdkinput = int(
                        raw_input(
                            common.config.get('qarkhelper', 'PROMPT_VER') +
                            common.config.get('qarkhelper', 'MAX_API_VERSION') +
                            common.config.get('qarkhelper', 'PROMPT_VER2')))
                    if 0 < int(sdkinput) <= int(common.config.get('qarkhelper', 'MAX_API_VERSION')):
                        common.minSdkVersion = int(sdkinput)
                        break
            else:
                # Worst-case fallback when the user declines to enter a value.
                common.minSdkVersion = 7
def get_charset_from_html(self, html):
    """Return the character encoding BeautifulSoup detected for *html*."""
    soup = BeautifulSoup(html)
    return soup.originalEncoding