def find_video_links(self, html_message):
    """Return every <embed> tag in *html_message*, each serialised as a
    db.Text value."""
    parsed = BeautifulSoup(html_message)
    return [db.Text(str(embed)) for embed in parsed('embed')]
def updateprojectlist(): print "updating the projects list" conn = httplib.HTTPConnection("android.git.kernel.org") conn.request("GET", "/") res = conn.getresponse() if res.status == httplib.OK: data = res.read() #print data conn.close() soup = BeautifulSoup(data) table = soup.body.table #print soup.body.table # filter tags = table.findAll('a', attrs={ 'class': 'list', 'title': None, 'href': re.compile('^/\?p') }) #print tags projectlist = [] for tag in tags: projectlist.append(tag.string) file = open(currentdir + "/" + listfilename, "w") #writelines won't add the '\n' file.writelines(map(lambda x: x.strip() + "\n", projectlist)) file.close() else: print "fail to download the page: ", res.status, res.reason
def find_image_links(self, html_message):
    """Collect the src attribute of every <img> in *html_message* as
    db.Link objects."""
    parsed = BeautifulSoup(html_message)
    return [db.Link(image['src']) for image in parsed('img')]
def hyphenate_html(html, language='en-us', hyphenator=None, blacklist_tags=(
        'code', 'tt', 'pre', 'head', 'title', 'script', 'style', 'meta',
        'object', 'embed', 'samp', 'var', 'math', 'select', 'option',
        'input', 'textarea')):
    r"""Hyphenate a fragment of HTML and return it as a unicode string.

    Tags listed in *blacklist_tags* are left untouched; *hyphenator*
    defaults to the one registered for *language*.

    >>> hyphenate_html('<p>It is <em>beautiful</em> outside today!</p>')
    u'<p>It is <em>beau­ti­ful</em> out­side today!</p>'

    >>> hyphenate_html('O paralelepipedo atrevessou a rua', 'pt-br')
    u'O pa­ra­le­le­pi­pe­do atre­ves­sou a rua'

    Content inside <code>, <tt>, and <pre> blocks is not hyphenated

    >>> hyphenate_html('Document: <code>document + page_status</code>')
    u'Doc­u­ment: <code>document + page_status</code>'

    Short words are not hyphenated

    >>> hyphenate_html("<p>The brave men, living and dead.</p>")
    u'<p>The brave men, liv­ing and dead.</p>'
    """
    # Fall back to the language's hyphenator when none was supplied.
    hyphenator = hyphenator or get_hyphenator_for_language(language)
    # Parse, hyphenate each element in place, then serialise back out.
    soup = BeautifulSoup(html)
    hyphenate_element(soup, hyphenator, blacklist_tags)
    return unicode(soup)
def render(self): content = cache.get(self.content_url) # If the page is not cached, retrieve it if content == None: opener = urllib2.build_opener() content = opener.open(self.content_url, timeout=5).read() # Save the page in cache cache.set(self.content_url, content) soup = BeautifulSoup(content) # Make links absolute, quoted from http://stackoverflow.com/a/4468467: for tag in soup.findAll('a', href=True): tag['href'] = urlparse.urljoin(self.content_url, tag['href']) # If there's no element specified, use the BODY. # Otherwise find the element with given id. if self.element_id == "": html = soup.find("body").renderContents() else: html = str(soup.find(id=self.element_id)) return html
def parse_summary(self, summary, link):
    """Clean up one article body.  (Original docstring: "process article".)

    Drops spans hidden with style="display: none;", strips the
    attributes in self.remove_attributes and the tags in
    self.remove_tags, discards ad/feed-proxy images, and rewrites the
    surviving <img> src to a locally downloaded copy via
    self.down_image (capped at self.max_image_number; a negative cap
    means unlimited).

    NOTE(review): no return statement is visible in this chunk — the
    cleaned soup may be returned further down; confirm against callers.
    """
    soup = BeautifulSoup(summary)
    for span in list(soup.findAll(attrs={"style": "display: none;"})):
        span.extract()
    for attr in self.remove_attributes:
        for x in soup.findAll(attrs={attr: True}):
            del x[attr]
    for tag in soup.findAll(self.remove_tags):
        tag.extract()
    img_count = 0
    for img in list(soup.findAll('img')):
        # Drop the image when over the cap, when it has no src, or when
        # the src points at known ad / feed-proxy hosts.
        if (self.max_image_number >= 0 and img_count >= self.max_image_number) \
                or img.has_key('src') is False \
                or img['src'].startswith("http://union.vancl.com/") \
                or img['src'].startswith("http://www1.feedsky.com/") \
                or img['src'].startswith("http://feed.feedsky.com/~flare/"):
            img.extract()
        else:
            try:
                # down_image returns the local path, or a falsy value on
                # failure, in which case the tag is removed entirely.
                localimage = self.down_image(img['src'], link)
                if localimage:
                    img['src'] = localimage
                    img_count = img_count + 1
                else:
                    img.extract()
            except Exception, e:
                print e
                img.extract()
def get_organic_data(html_data):
    """Return the organic-result <li class="g"> nodes found under
    div#ires in *html_data*, or None when that container is missing."""
    document = BeautifulSoup(str(html_data))
    container = document.find('div', {'id': 'ires'})
    if not container:
        return None
    return container.findAll('li', {'class': 'g'})
def get_script_urls(self, url, html):
    """Absolute URLs of every <script src=...> tag found in *html*."""
    # Parse only <script> tags; has_key() is the BS3 attribute test
    # ('in' would search tag contents instead).
    scripts = BeautifulSoup(html, parseOnlyThese=SoupStrainer('script'))
    return [self.get_absolute_url(url, tag['src'])
            for tag in scripts if tag.has_key('src')]
def clawdata(data):
    """POST *data* to powerball.com's number-history page and print one
    dict per draw row: {"issueDate": "M/D/YYYY", "luckNum": [ints]}.

    Rows of interest carry valign="middle"; within a row, the cell
    matching a date pattern becomes issueDate and every other non-blank
    cell is parsed as a drawn number.
    """
    data = urllib.urlencode(data)
    url = "http://www.powerball.com/powerball/pb_nbr_history.asp"
    response = urllib2.urlopen(url, data)
    soup = BeautifulSoup(response)
    for tag in soup.findAll(valign="middle"):
        # Re-parse the row in isolation so csoup.tr is this row itself.
        csoup = BeautifulSoup(str(tag))
        dictIssue = dict()
        dictIssue["issueDate"] = ""
        dictIssue["luckNum"] = []
        if csoup.tr != None:
            for tag in csoup.tr.findAll('td'):
                if re.search("[0-9]+\/[0-9]+\/[0-9]{4}", str(tag.text)):
                    dictIssue["issueDate"] = str(tag.text)
                elif str(tag.text) != " ":
                    dictIssue["luckNum"].append(int(tag.text))
        print dictIssue
def fetchSong(url, viewCount):
    """Download one lyrics page and dump its data as UTF-8 XML.

    The song ids (sid/aid/lid) are parsed from *url*'s query string.
    Lyrics, author, album and title are scraped from the page and
    written together with *viewCount* to lyricsDbPath/<lid>.txt.

    Any failure is silently swallowed — deliberate best-effort batch
    behaviour, so one bad page does not stop the run.
    """
    try:
        # Get song info from url: every query parameter is a numeric id.
        songInfo = {}
        _get = url.split('?')[1]
        tokens = _get.split('&')
        for token in tokens:
            toks = token.split('=')
            songInfo[toks[0]] = int(toks[1])
        # fetch the html
        lyricsWeb = urllib2.urlopen(url)
        webContent = lyricsWeb.read()
        lyricsWeb.close()
        soup = BeautifulSoup(webContent)
        lyrics = soup.findAll(id="mylrc")[0].contents
        # The link_hb anchors are, in order: author, album, title.
        # (album is extracted but not written to the XML below.)
        author = soup.findAll(attrs={'class' : 'link_hb'})[0].contents[0]
        album = soup.findAll(attrs={'class' : 'link_hb'})[1].contents[0]
        title = soup.findAll(attrs={'class' : 'link_hb'})[2].contents[0]
        #print lyrics
        # Flatten the lyric lines into a single text blob.
        lyricsText = ''
        for line in lyrics:
            for t in line:
                lyricsText += t
        # Construct the xml
        root = ET.Element("xml")
        doc = ET.SubElement(root, "doc")
        sidNode = ET.SubElement(doc, "sid")
        sidNode.text = str(songInfo[u'sid'])
        aidNode = ET.SubElement(doc, "aid")
        aidNode.text = str(songInfo[u'aid'])
        lidNode = ET.SubElement(doc, "lid")
        lidNode.text = str(songInfo[u'lid'])
        titleNode = ET.SubElement(doc, "title")
        titleNode.text = title
        authorNode = ET.SubElement(doc, "author")
        authorNode.text = author
        viewCountNode = ET.SubElement(doc, "viewCount")
        viewCountNode.text = str(viewCount)
        lyricsNode = ET.SubElement(doc, "lyrics")
        lyricsNode.text = lyricsText
        # Construct the tree
        tree = ET.ElementTree(root)
        filename = lyricsDbPath + str(songInfo['lid']) + ".txt"
        tree.write(filename, "utf-8")
    except:
        pass
def parse_summary(self, summary, ref):
    """Clean an article body and queue its images for download.
    (Original docstring: "process article content, strip excess tags
    and fix image addresses".)

    Hidden spans, the attributes in self.remove_attributes and the tags
    in self.remove_tags are removed.  Images are kept only up to
    krconfig.max_image_per_article (negative = unlimited), must have a
    src ending in a recognised image extension, and are rewritten to a
    local path via self.parse_image; files not yet on disk are recorded
    in *images* with *ref* as the referer for the downloader.

    NOTE(review): no return statement is visible in this chunk, but the
    caller (makelocal) unpacks (content, images) — the function likely
    continues beyond this view; confirm.
    """
    soup = BeautifulSoup(summary)
    for span in list(soup.findAll(attrs={"style": "display: none;"})):
        span.extract()
    for attr in self.remove_attributes:
        for x in soup.findAll(attrs={attr: True}):
            del x[attr]
    for tag in soup.findAll(self.remove_tags):
        tag.extract()
    img_count = 0
    images = []
    for img in list(soup.findAll('img')):
        if (krconfig.max_image_per_article >= 0 and img_count >= krconfig.max_image_per_article) \
                or img.has_key('src') is False :
            img.extract()
        else:
            try:
                if img['src'].encode('utf-8').lower().endswith(
                        ('jpg', 'jpeg', 'gif', 'png', 'bmp')):
                    localimage, fullname = self.parse_image(img['src'])
                    # Require an image suffix before downloading, to avoid
                    # fetching non-image files (e.g. fake tracking pixels).
                    if os.path.isfile(fullname) is False:
                        images.append({
                            'url': img['src'],
                            'filename': fullname,
                            'referer': ref
                        })
                    if localimage:
                        img['src'] = localimage
                        img_count = img_count + 1
                    else:
                        img.extract()
                else:
                    img.extract()
            except Exception, e:
                logging.info("error: %s" % e)
                img.extract()
def parse_organic_contents(raw_content, organic_pos):
    """Describe one organic search result as a dict.

    Keys: position, title, display_url, url (percent-decoded), text.
    Uses the module-level regex *p* to strip markup and *ul* to unquote.
    """
    parsed = BeautifulSoup(raw_content)
    data = {'position': organic_pos}
    data['title'] = p.sub('', str(parsed.find('a')))
    data['display_url'] = parse_display_url(str(raw_content))
    data['url'] = ul.unquote(str(parsed.find('a', href=True)['href']))
    raw_snippet = p.sub('', str(parsed.findAll('div', {'class': 's'})))
    data['text'] = raw_snippet.replace(']', '').replace('[', '')
    return data
def getViewCount(songTitle):
    """Ask the YouTube GData API for *songTitle* and return the view
    count of the first hit; 0 on any failure."""
    try:
        feed_url = ('http://gdata.youtube.com/feeds/api/videos'
                    '?v=2&max-results=1&q=') + songTitle
        web = urllib2.urlopen(feed_url)
        content = web.read()
        web.close()
        stats = BeautifulSoup(content).findAll('yt:statistics')
        return int(stats[0]['viewcount'])
    except:
        return 0
def parse_response(self):
    """Parse self.response (an exercise HTML page) into instance state.

    Grading metadata is read from <head> <meta> tags (max-points,
    status, DC.Title, DC.Description, points); the exercise HTML is
    taken from div#exercise, falling back to the whole <body>.

    Fix vs. original: ``is not None`` instead of ``!= None``.
    """
    soup = BeautifulSoup(self.response)
    head = soup.find("head")
    self.max_points = int(
        _get_value_from_soup(head, "meta", "value", {"name": "max-points"}, 0))
    if _get_value_from_soup(head, "meta", "value",
                            {"name": "status"}) == "accepted":
        self.is_accepted = True
    # Prefer the DC.Title meta; fall back to the <title> element.
    meta_title = _get_value_from_soup(head, "meta", "content",
                                      {"name": "DC.Title"})
    if meta_title:
        self.meta["title"] = meta_title
    else:
        title = soup.find("title")
        if title:
            self.meta["title"] = title.contents
    self.meta["description"] = _get_value_from_soup(
        head, "meta", "content", {"name": "DC.Description"}, "")
    # A points meta implies the submission was graded and accepted.
    points = _get_value_from_soup(head, "meta", "value", {"name": "points"})
    if points is not None:
        self.points = int(points)
        self.is_graded = True
        self.is_accepted = True
    exercise_div = soup.body.find("div", {"id": "exercise"})
    if exercise_div is not None:
        self.content = exercise_div.renderContents()
    else:
        self.content = soup.body.renderContents()
def GEN(book=None, prov=None):
    """Search a libgen mirror for *book* and return (results, errmsg).

    Supports both site layouts (index.php and search.php style result
    tables), pages through results until MAX_PAGES, resolves each hit's
    ads page to a direct /get.php or /download/book link, and returns a
    list of result dicts plus the last error message (empty on success).
    """
    errmsg = ''
    provider = "libgen.io"
    if prov is None:
        prov = 'GEN'
    host = lazylibrarian.CONFIG[prov + '_HOST']
    if not host.startswith('http'):
        host = 'http://' + host
    # Normalise the configured search page down to index.php/search.php.
    search = lazylibrarian.CONFIG[prov + '_SEARCH']
    if not search or not search.endswith('.php'):
        search = 'search.php'
    if 'index.php' not in search and 'search.php' not in search:
        search = 'search.php'
    if search[0] == '/':
        search = search[1:]
    page = 1
    results = []
    next_page = True
    while next_page:
        # The two site layouts take different query parameters.
        if 'index.php' in search:
            params = {
                "s": book['searchterm'],
                "f_lang": "All",
                "f_columns": 0,
                "f_ext": "All"
            }
        else:
            params = {
                "view": "simple",
                "open": 0,
                "phrase": 0,
                "column": "def",
                "res": 100,
                "req": book['searchterm']
            }
        if page > 1:
            params['page'] = page
        providerurl = url_fix(host + "/%s" % search)
        searchURL = providerurl + "?%s" % urllib.urlencode(params)
        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug(u"No results found from %s for %s" %
                             (provider, book['searchterm']))
            elif '111' in result:
                # looks like libgen has ip based access limits
                logger.error(
                    'Access forbidden. Please wait a while before trying %s again.'
                    % provider)
                errmsg = result
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching page data from %s: %s' %
                             (provider, result))
                errmsg = result
            result = False
        if result:
            logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                         (searchURL, provider))
            try:
                soup = BeautifulSoup(result)
                try:
                    table = soup.findAll('table')[2]  # un-named table
                    if table:
                        rows = table.findAll('tr')
                except IndexError:  # no results table in result page
                    rows = []
                # search.php layout has a header row to skip.
                if 'search.php' in search and len(rows) > 1:
                    rows = rows[1:]
                for row in rows:
                    author = ''
                    title = ''
                    size = ''
                    extn = ''
                    link = ''
                    td = row.findAll('td')
                    if 'index.php' in search and len(td) > 3:
                        # index.php layout: author in td[0], title in
                        # td[2], ext/size/link embedded in td[4] markup.
                        try:
                            res = str(
                                BeautifulStoneSoup(
                                    td[0].text,
                                    convertEntities=BeautifulStoneSoup.
                                    HTML_ENTITIES))
                            author = formatAuthorName(res)
                            title = str(
                                BeautifulStoneSoup(
                                    td[2].text,
                                    convertEntities=BeautifulStoneSoup.
                                    HTML_ENTITIES))
                            temp = str(td[4])
                            temp = temp.split('onmouseout')[1]
                            extn = temp.split('">')[1].split('(')[0]
                            size = temp.split('">')[1].split('(')[1].split(
                                ')')[0]
                            size = size.upper()
                            link = temp.split('href=')[1].split('"')[1]
                        except IndexError as e:
                            logger.debug(
                                'Error parsing libgen index.php results: %s'
                                % str(e))
                    elif 'search.php' in search and len(td) > 8:
                        # search.php layout: author td[1], title/link
                        # td[2], size td[7], extension td[8].
                        try:
                            res = str(
                                BeautifulStoneSoup(
                                    td[1].text,
                                    convertEntities=BeautifulStoneSoup.
                                    HTML_ENTITIES))
                            author = formatAuthorName(res)
                            title = str(
                                td[2]).split('>')[2].split('<')[0].strip()
                            title = str(
                                BeautifulStoneSoup(
                                    title,
                                    convertEntities=BeautifulStoneSoup.
                                    HTML_ENTITIES))
                            link = str(td[2]).split('href="')[1].split(
                                '?')[1].split('"')[0]
                            size = unaccented(td[7].text).upper()
                            extn = td[8].text
                        except IndexError as e:
                            logger.debug(
                                'Error parsing libgen search.php results; %s'
                                % str(e))
                    # Convert the human-readable size (e.g. "3 MB") to bytes.
                    if not size:
                        size = 0
                    else:
                        try:
                            mult = 1
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0
                    if link and title:
                        if author:
                            title = author.strip() + ' ' + title.strip()
                        if extn:
                            title = title + '.' + extn
                        # Build the ads page URL for this hit.
                        if not link.startswith('http'):
                            if "/ads.php?" in link:
                                url = url_fix(host + link)
                            else:
                                url = url_fix(host + "/ads.php?" + link)
                        else:
                            url = redirect_url(host, link)
                        bookresult, success = fetchURL(url)
                        if not success:
                            # may return 404 if no results, not really an error
                            if '404' in bookresult:
                                logger.debug(
                                    u"No results found from %s for %s" %
                                    (provider, book['searchterm']))
                            else:
                                logger.debug(url)
                                logger.debug(
                                    'Error fetching link data from %s: %s' %
                                    (provider, bookresult))
                                errmsg = bookresult
                            bookresult = False
                        if bookresult:
                            # Scan the ads page for the first direct
                            # download anchor.
                            url = None
                            try:
                                new_soup = BeautifulSoup(bookresult)
                                for link in new_soup.findAll('a'):
                                    output = link.get('href')
                                    if output:
                                        if output.startswith(
                                                'http') and '/get.php' in output:
                                            url = output
                                            break
                                        elif '/get.php' in output:
                                            url = '/get.php' + output.split(
                                                '/get.php')[1]
                                            break
                                        elif '/download/book' in output:
                                            url = '/download/book' + output.split(
                                                '/download/book')[1]
                                            break
                                if url and not url.startswith('http'):
                                    url = url_fix(host + url)
                                else:
                                    url = redirect_url(host, url)
                            except Exception as e:
                                logger.debug(
                                    'Error parsing bookresult for %s: %s' %
                                    (link, str(e)))
                                url = None
                            if url:
                                results.append({
                                    'bookid': book['bookid'],
                                    'tor_prov': provider + '/' + search,
                                    'tor_title': title,
                                    'tor_url': url,
                                    'tor_size': str(size),
                                    'tor_type': 'direct',
                                    'priority':
                                    lazylibrarian.CONFIG[prov + '_DLPRIORITY']
                                })
                                logger.debug('Found %s, Size %s' %
                                             (title, size))
                                # At least one hit on this page: try the next.
                                next_page = True
            except Exception as e:
                logger.error(u"An error occurred in the %s parser: %s" %
                             (provider, str(e)))
                logger.debug('%s: %s' % (provider, traceback.format_exc()))
        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn(
                'Maximum results page search reached, still more results available'
            )
            next_page = False
    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results, errmsg
def KAT(book=None):
    """Search Kickass Torrents for *book* and return a list of result
    dicts (bookid/tor_prov/tor_title/tor_url/tor_size/tor_type),
    filtered by the configured minimum seeder count."""
    provider = "KAT"
    host = lazylibrarian.CONFIG['KAT_HOST']
    if not str(host)[:4] == "http":
        host = 'http://' + host
    providerurl = url_fix(host + "/usearch/" + book['searchterm'])
    params = {"category": "books", "field": "seeders", "sorder": "desc"}
    searchURL = providerurl + "/?%s" % urllib.urlencode(params)
    result, success = fetchURL(searchURL)
    if not success:
        # seems KAT returns 404 if no results, not really an error
        if '404' in result:
            logger.debug(u"No results found from %s for %s" %
                         (provider, book['searchterm']))
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' %
                         (provider, result))
        result = False
    results = []
    if result:
        logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
        soup = BeautifulSoup(result)
        try:
            table = soup.findAll('table')[1]
            rows = table.findAll('tr')
        except Exception:  # no results = no table in result page
            rows = []
        # Collect the cells of interest: td[0] link/title, td[1] size,
        # td[3] seeders.  First row is the header.
        c0 = []
        c1 = []
        c3 = []
        if len(rows) > 1:
            for row in rows[1:]:
                if len(row.findAll('td')) > 3:
                    c0.append(row.findAll('td')[0])
                    c1.append(row.findAll('td')[1])
                    c3.append(row.findAll('td')[3])
        for col0, col1, col3 in zip(c0, c1, c3):
            try:
                title = unaccented(
                    str(col0).split('cellMainLink">')[1].split('<')[0])
                # kat can return magnet or torrent or both.
                magnet = ''
                url = ''
                mode = 'torrent'
                try:
                    magnet = 'magnet' + str(col0).split(
                        'href="magnet')[1].split('"')[0]
                    mode = 'magnet'
                except IndexError:
                    pass
                try:
                    url = 'http' + str(col0).split('href="http')[1].split(
                        '.torrent?')[0] + '.torrent'
                    mode = 'torrent'
                except IndexError:
                    pass
                # Prefer the magnet when no .torrent link, or when the
                # user configured PREFER_MAGNET and both exist.
                if not url or (magnet and url and
                               lazylibrarian.CONFIG['PREFER_MAGNET']):
                    url = magnet
                    mode = 'magnet'
                # Convert human-readable size to bytes.
                try:
                    size = str(col1.text).replace(' ', '').upper()
                    mult = 1
                    if 'K' in size:
                        size = size.split('K')[0]
                        mult = 1024
                    elif 'M' in size:
                        size = size.split('M')[0]
                        mult = 1024 * 1024
                    size = int(float(size) * mult)
                except (ValueError, IndexError):
                    size = 0
                try:
                    seeders = int(col3.text)
                except ValueError:
                    seeders = 0
                if not url or not title:
                    logger.debug('Missing url or title')
                elif minimumseeders < seeders:
                    results.append({
                        'bookid': book['bookid'],
                        'tor_prov': provider,
                        'tor_title': title,
                        'tor_url': url,
                        'tor_size': str(size),
                        'tor_type': mode
                    })
                    logger.debug('Found %s. Size: %s' % (title, size))
                else:
                    logger.debug('Found %s but %s seeder%s' %
                                 (title, seeders, plural(seeders)))
            except Exception as e:
                logger.error(u"An error occurred in the %s parser: %s" %
                             (provider, str(e)))
    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results
def TPB(book=None):
    """Search The Pirate Bay for *book* (ebook category 601) and return
    a list of magnet-link result dicts, filtered by the configured
    minimum seeder count.  The magnet URI is scraped from each detail
    page."""
    provider = "TPB"
    host = lazylibrarian.CONFIG['TPB_HOST']
    if not str(host)[:4] == "http":
        host = 'http://' + host
    providerurl = url_fix(host + "/s/?q=" + book['searchterm'])
    params = {"category": "601", "page": "0", "orderby": "99"}
    searchURL = providerurl + "&%s" % urllib.urlencode(params)
    result, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in result:
            logger.debug(u"No results found from %s for %s" %
                         (provider, book['searchterm']))
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' %
                         (provider, result))
        result = False
    results = []
    if result:
        logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
        soup = BeautifulSoup(result)
        try:
            table = soup.findAll('table')[0]
            rows = table.findAll('tr')
        except Exception:  # no results = no table in result page
            rows = []
        # td[1] holds link/title/size markup, td[2] the seeder count;
        # first row is the header.
        c1 = []
        c2 = []
        if len(rows) > 1:
            for row in rows[1:]:
                if len(row.findAll('td')) > 2:
                    c1.append(row.findAll('td')[1])
                    c2.append(row.findAll('td')[2])
        for col1, col2 in zip(c1, c2):
            try:
                title = unaccented(
                    str(col1).split('title=')[1].split('>')[1].split('<')[0])
                magnet = str(col1).split('href="')[1].split('"')[0]
                size = unaccented(col1.text.split(', Size ')[1].split('iB')[0])
                # Convert human-readable size to bytes.
                mult = 1
                try:
                    if 'K' in size:
                        size = size.split('K')[0]
                        mult = 1024
                    elif 'M' in size:
                        size = size.split('M')[0]
                        mult = 1024 * 1024
                    size = int(float(size) * mult)
                except (ValueError, IndexError):
                    size = 0
                try:
                    seeders = int(col2.text)
                except ValueError:
                    seeders = 0
                if minimumseeders < seeders:
                    # no point in asking for magnet link if not enough seeders
                    magurl = '%s/%s' % (host, magnet)
                    result, success = fetchURL(magurl)
                    if not success:
                        logger.debug('Error fetching url %s, %s' %
                                     (magurl, result))
                    else:
                        # Replace the listing href with the real magnet
                        # URI from the detail page.
                        magnet = None
                        new_soup = BeautifulSoup(result)
                        for link in new_soup.findAll('a'):
                            output = link.get('href')
                            if output and output.startswith('magnet'):
                                magnet = output
                                break
                    if not magnet or not title:
                        logger.debug('Missing magnet or title')
                    else:
                        if minimumseeders < seeders:
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': magnet,
                                'tor_size': str(size),
                                'tor_type': 'magnet'
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                        else:
                            logger.debug('Found %s but %s seeder%s' %
                                         (title, seeders, plural(seeders)))
                else:
                    logger.debug('Found %s but %s seeder%s' %
                                 (title, seeders, plural(seeders)))
            except Exception as e:
                logger.error(u"An error occurred in the %s parser: %s" %
                             (provider, str(e)))
    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results
def makelocal(self, feed_data, feed_idx, force_full_text=0):
    '''Build a local summary of one parsed feed and publish it.
    (Original docstring: "generate parse results".)

    Walks feed_data.entries (newest first) up to
    krconfig.max_items_number items and no older than
    krconfig.max_old_date, cleans each entry via parse_summary /
    force_full_text, queues its images on the global imgq, and appends
    the resulting dict to the global updated_feeds list under feedlock.
    Any failure is logged per feed.
    '''
    global updated_feeds
    global feedlock
    try:
        local = {
            'idx': feed_idx,
            'entries': [],
            'title': feed_data.feed['title'],
        }
        item_idx = 1
        for entry in feed_data.entries:
            if item_idx > krconfig.max_items_number:
                break
            # Prefer the pre-parsed date; fall back to our own parser.
            try:
                published_datetime = datetime(*entry.published_parsed[0:6])
            except:
                published_datetime = self.parsetime(entry.published)
            # Entries are assumed newest-first: stop at the first one
            # that is too old.
            if datetime.utcnow() - published_datetime > krconfig.max_old_date:
                break
            try:
                local_author = entry.author
            except:
                local_author = "null"
            local_entry = {
                'idx': item_idx,
                'title': entry.title,
                'published': (published_datetime +
                              krconfig.timezone).strftime("%Y-%m-%d %H:%M:%S"),
                'url': entry.link,
                'author': local_author,
            }
            if force_full_text:
                local_entry['content'], images = self.force_full_text(
                    entry.link)
            else:
                # Prefer full content; fall back to the summary field.
                try:
                    local_entry['content'], images = self.parse_summary(
                        entry.content[0].value, entry.link)
                except:
                    local_entry['content'], images = self.parse_summary(
                        entry.summary, entry.link)
            # Plain-text preview: first 200 chars of the de-tagged content.
            local_entry['stripped'] = ''.join(
                BeautifulSoup(
                    local_entry['content'],
                    convertEntities=BeautifulSoup.HTML_ENTITIES).findAll(
                        text=True))[:200]
            local['entries'].append(local_entry)
            for i in images:
                imgq.put(i)
            item_idx += 1
        if len(local['entries']) > 0:
            # Publish under the lock shared with the writer thread(s).
            if feedlock.acquire():
                updated_feeds.append(local)
                feedlock.release()
            else:
                feedlock.release()
            logging.info("from feed{} update {} items.".format(
                feed_idx, len(local['entries'])))
        else:
            logging.info("feed{} has no update.".format(feed_idx))
    except Exception, e:
        logging.error("fail(feed{}): {}".format(feed_idx, e))
def GEN(book=None):
    """Search a libgen mirror (search.php layout only) for *book* and
    return a list of direct-download result dicts.  Each hit's ads.php
    page is fetched to resolve the /get.php download link."""
    provider = "libgen"
    host = lazylibrarian.CONFIG['GEN_HOST']
    if not str(host)[:4] == "http":
        host = 'http://' + host
    searchURL = url_fix(
        host +
        "/search.php?view=simple&open=0&phrase=0&column=def&res=100&req=" +
        book['searchterm'])
    result, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in result:
            logger.debug(u"No results found from %s for %s" %
                         (provider, book['searchterm']))
        elif '111' in result:
            # looks like libgen has ip based access limits
            logger.error(
                'Access forbidden. Please wait a while before trying %s again.'
                % provider)
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' %
                         (provider, result))
        result = False
    results = []
    if result:
        logger.debug(u'Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        soup = BeautifulSoup(result)
        try:
            table = soup.findAll('table')[2]
            rows = table.findAll('tr')
        except Exception:  # no results = no table in result page
            rows = []
        # Cells of interest: td[1] author, td[2] title/link, td[7] size,
        # td[8] extension.  First row is the header.
        c1 = []
        c2 = []
        c7 = []
        c8 = []
        if len(rows) > 1:
            for row in rows[1:]:
                if len(row.findAll('td')) > 8:
                    c1.append(row.findAll('td')[1])
                    c2.append(row.findAll('td')[2])
                    c7.append(row.findAll('td')[7])
                    c8.append(row.findAll('td')[8])
        for col1, col2, col7, col8 in zip(c1, c2, c7, c8):
            try:
                author = unaccented(col1.text)
                title = unaccented(
                    str(col2).split('>')[2].split('<')[0].strip())
                link = str(col2).split('href="')[1].split('?')[1].split('"')[0]
                size = unaccented(col7.text).upper()
                extn = col8.text
                # Convert human-readable size to bytes.
                try:
                    mult = 1
                    if 'K' in size:
                        size = size.split('K')[0]
                        mult = 1024
                    elif 'M' in size:
                        size = size.split('M')[0]
                        mult = 1024 * 1024
                    size = int(float(size) * mult)
                except (ValueError, IndexError):
                    size = 0
                if link and title:
                    if author:
                        title = author.strip() + ' ' + title.strip()
                    if extn:
                        title = title + '.' + extn
                    # Fetch the ads page to find the direct download link.
                    bookURL = url_fix(host + "/ads.php?" + link)
                    bookresult, success = fetchURL(bookURL)
                    if not success:
                        # may return 404 if no results, not really an error
                        if '404' in bookresult:
                            logger.debug(u"No results found from %s for %s" %
                                         (provider, book['searchterm']))
                        else:
                            logger.debug(bookURL)
                            logger.debug('Error fetching data from %s: %s' %
                                         (provider, bookresult))
                        bookresult = False
                    if bookresult:
                        url = None
                        new_soup = BeautifulSoup(bookresult)
                        for link in new_soup.findAll('a'):
                            output = link.get('href')
                            if output and output.startswith('/get.php'):
                                url = output
                                break
                        if url:
                            url = url_fix(host + url)
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                                'tor_type': 'direct'
                            })
                            logger.debug('Found %s, Size %s' % (title, size))
            except Exception as e:
                logger.error(u"An error occurred in the %s parser: %s" %
                             (provider, str(e)))
    logger.debug(
        u"Found %i result%s from %s for %s" %
        (len(results), plural(len(results)), provider, book['searchterm']))
    return results
def view():
    """Kodi plugin entry point: dispatch on the plugin URL's query args.

    - link given: resolve (for site-hosted pages, scrape the zplayer
      embed's flashvars) and play the video.
    - no cat: list the top-level catalogues.
    - cat given: list one page of that catalogue plus a "Next" item.

    NOTE(review): branch nesting reconstructed from a collapsed paste —
    confirm the play/return placement against the original plugin.
    """
    addon_handle = int(sys.argv[1])
    addon = xbmcaddon.Addon()
    addonname = addon.getAddonInfo('name')
    args = urlparse.parse_qs(sys.argv[2][1:])
    xbmcplugin.setContent(addon_handle, 'movies')
    cat = args.get('cat', None)
    page = args.get('page', None)
    link = args.get('link', None)
    # First label is hex-escaped UTF-8 for a Vietnamese "New videos" title.
    catalogues = [{'label': '\x56\x69\x64\x65\x6F\x20\x4D\xE1\xBB\x9B\x69'.decode('utf-8'), 'id': 'video/new/'},
                  {'label': 'Video Hot', 'id': 'video/hot/'}]
    # play link
    if link != None:
        link_video = link[0]
        if link_video.startswith(web_url):
            # Site-hosted page: pull the real stream URL out of the
            # zplayer embed's flashvars.
            r = requests.get(link[0])
            html = r.text
            #xbmc.log(html.encode('utf-8'))
            soup = BeautifulSoup(html)
            video_src = soup.find('embed', attrs={'id': 'zplayer'})
            video_flashvars = video_src.get('flashvars')
            args_video = urlparse.parse_qs(video_flashvars)
            link_video = args_video['file'][0]
        xbmc.Player().play(link_video)
        return
    # Load the catalogue list
    if cat == None:
        for cat in catalogues:
            li = xbmcgui.ListItem(cat['label'])
            urlList = CMDTools.build_url(base_url, {'web': get_Web_Name(),
                                                    'cat': cat['id']})
            xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList,
                                        listitem=li, isFolder=True)
        xbmc.executebuiltin('Container.SetViewMode(501)')
        xbmcplugin.endOfDirectory(addon_handle)
        return
    # Load the contents of one catalogue page
    if cat != None:
        if page == None:
            page = 1
        else:
            page = int(page[0])
        r = requests.get(web_url + cat[0] + str(page))
        html = r.text
        xbmc.log(html.encode('utf-8'))
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
        data_List = soup.findAll('a', attrs={'class': 'play'})
        # load item menu
        for item in data_List:
            link_item = web_url + item.get('href')
            # Entries with a youtube id play through the youtube plugin.
            if item.get('data-youtubeid') != '':
                link_item = ("plugin://plugin.video.youtube/play/?video_id="
                             + item.get('data-youtubeid'))
            img_item = item.find('img')
            img_src = img_item.get('src')
            img_alt = img_item.get('alt')
            li = xbmcgui.ListItem(img_alt)
            li.setThumbnailImage(img_src)
            li.setInfo(type='image', infoLabels="")
            urlList = CMDTools.build_url(base_url, {'web': get_Web_Name(),
                                                    'link': link_item,
                                                    'type': cat[0]})
            xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList,
                                        listitem=li)
        # Create the "Next" page button
        li = xbmcgui.ListItem("Next")
        urlList = CMDTools.build_url(base_url, {'web': web_name,
                                                'cat': cat[0],
                                                'page': page + 1});
        xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList,
                                    listitem=li, isFolder=True)
        xbmc.executebuiltin('Container.SetViewMode(501)')
        #xbmc.executebuiltin("ClearSlideshow")
        #xbmc.executebuiltin("SlideShow(,,notrandom)")
        xbmcplugin.endOfDirectory(addon_handle)
        return
    xbmcplugin.endOfDirectory(addon_handle)
def soup(string, **kwargs):
    """Create a BeautifulSoup parse tree from *string*.

    Keyword arguments are forwarded untouched to the BeautifulSoup
    constructor.
    """
    from lib.BeautifulSoup import BeautifulSoup
    parsed = BeautifulSoup(string, **kwargs)
    return parsed
def TDL(book=None, test=False):
    """Search torrentdownloads' RSS feed for *book*.

    Returns (results, errmsg) — or, when *test* is true, just the
    success flag of the initial fetch.  Each qualifying entry's relative
    link is fetched to scrape the real magnet URI.
    """
    errmsg = ''
    provider = "torrentdownloads"
    host = lazylibrarian.CONFIG['TDL_HOST']
    if not host.startswith('http'):
        host = 'http://' + host
    providerurl = url_fix(host)
    params = {"type": "search", "cid": "2", "search": book['searchterm']}
    searchURL = providerurl + "/rss.xml?%s" % urllib.urlencode(params)
    sterm = makeUnicode(book['searchterm'])
    data, success = fetchURL(searchURL)
    if not success:
        # may return 404 if no results, not really an error
        if '404' in data:
            logger.debug("No results found from %s for %s" % (provider, sterm))
            success = True
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' % (provider, data))
            errmsg = data
        data = False
    if test:
        return success
    results = []
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    if data:
        logger.debug('Parsing results from <a href="%s">%s</a>' %
                     (searchURL, provider))
        d = feedparser.parse(data)
        if len(d.entries):
            for item in d.entries:
                try:
                    title = item['title']
                    seeders = int(item['seeders'])
                    link = item['link']
                    size = int(item['size'])
                    url = None
                    if link and minimumseeders < int(seeders):
                        # no point requesting the magnet link if not enough seeders
                        # TDL gives us a relative link
                        result, success = fetchURL(providerurl + link)
                        if success:
                            new_soup = BeautifulSoup(result)
                            for link in new_soup.findAll('a'):
                                output = link.get('href')
                                if output and output.startswith('magnet'):
                                    url = output
                                    break
                        if not url or not title:
                            logger.debug('Missing url or title')
                        else:
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                                'tor_type': 'magnet',
                                'priority':
                                lazylibrarian.CONFIG['TDL_DLPRIORITY']
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' %
                                     (title, seeders, plural(seeders)))
                except Exception as e:
                    logger.error("An error occurred in the %s parser: %s" %
                                 (provider, str(e)))
                    logger.debug('%s: %s' % (provider, traceback.format_exc()))
    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
def TPB(book=None, test=False):
    """Search The Pirate Bay for *book*, paging until MAX_PAGES.

    The category is chosen from book['library'] (601=ebooks,
    102=audiobooks, 0=everything — no magazine category).  Returns
    (results, errmsg) — or, when *test* is true, just the success flag
    of the first fetch.  Magnet URIs are scraped from each detail page.
    """
    errmsg = ''
    provider = "TPB"
    host = lazylibrarian.CONFIG['TPB_HOST']
    if not host.startswith('http'):
        host = 'http://' + host
    providerurl = url_fix(host + "/s/?")
    cat = 0  # 601=ebooks, 102=audiobooks, 0=all, no mag category
    if 'library' in book:
        if book['library'] == 'AudioBook':
            cat = 102
        elif book['library'] == 'eBook':
            cat = 601
        elif book['library'] == 'magazine':
            cat = 0
    sterm = makeUnicode(book['searchterm'])
    page = 0
    results = []
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    next_page = True
    while next_page:
        params = {
            "q": book['searchterm'],
            "category": cat,
            "page": page,
            "orderby": "99"
        }
        searchURL = providerurl + "?%s" % urllib.urlencode(params)
        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" %
                             (provider, sterm))
                success = True
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching data from %s: %s' %
                             (provider, result))
                errmsg = result
            result = False
        if test:
            return success
        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' %
                         (searchURL, provider))
            soup = BeautifulSoup(result)
            # tpb uses a named table
            table = soup.find('table', id='searchResult')
            if table:
                rows = table.findAll('tr')
            else:
                rows = []
            if len(rows) > 1:
                rows = rows[1:]  # first row is headers
            for row in rows:
                td = row.findAll('td')
                if len(td) > 2:
                    try:
                        title = unaccented(
                            str(td[1]).split('title=')[1].split('>')[1].split(
                                '<')[0])
                        magnet = str(td[1]).split('href="')[1].split('"')[0]
                        size = unaccented(
                            td[1].text.split(', Size ')[1].split('iB')[0])
                        size = size.replace('&nbsp;', '')
                        # Convert human-readable size to bytes.
                        mult = 1
                        try:
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0
                        try:
                            seeders = int(td[2].text)
                        except ValueError:
                            seeders = 0
                        if minimumseeders < int(seeders):
                            # no point in asking for magnet link if not enough seeders
                            magurl = '%s/%s' % (host, magnet)
                            result, success = fetchURL(magurl)
                            if not success:
                                logger.debug('Error fetching url %s, %s' %
                                             (magurl, result))
                            else:
                                # Replace the listing href with the real
                                # magnet URI from the detail page.
                                magnet = None
                                new_soup = BeautifulSoup(result)
                                for link in new_soup.findAll('a'):
                                    output = link.get('href')
                                    if output and output.startswith('magnet'):
                                        magnet = output
                                        break
                            if not magnet or not title:
                                logger.debug('Missing magnet or title')
                            else:
                                results.append({
                                    'bookid': book['bookid'],
                                    'tor_prov': provider,
                                    'tor_title': title,
                                    'tor_url': magnet,
                                    'tor_size': str(size),
                                    'tor_type': 'magnet',
                                    'priority':
                                    lazylibrarian.CONFIG['TPB_DLPRIORITY']
                                })
                                logger.debug('Found %s. Size: %s' %
                                             (title, size))
                                # At least one hit: try the next page.
                                next_page = True
                        else:
                            logger.debug('Found %s but %s seeder%s' %
                                         (title, seeders, plural(seeders)))
                    except Exception as e:
                        logger.error("An error occurred in the %s parser: %s" %
                                     (provider, str(e)))
                        logger.debug('%s: %s' %
                                     (provider, traceback.format_exc()))
        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn(
                'Maximum results page search reached, still more results available'
            )
            next_page = False
    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
def WWT(book=None, test=False):
    """Search WorldWideTorrents for torrents matching a book search.

    book -- dict with at least 'searchterm' and 'bookid'; an optional
            'library' key ('AudioBook'/'eBook'/'magazine') selects the
            site category.
    test -- when True, return only the boolean success of the first
            page fetch (provider connection test).

    Returns a tuple (results, errmsg): results is a list of dicts
    (bookid/tor_prov/tor_title/tor_url/tor_size/tor_type/priority);
    errmsg is the last fetch error text, or '' if none.
    """
    errmsg = ''
    provider = "WorldWideTorrents"
    host = lazylibrarian.CONFIG['WWT_HOST']
    # Allow the host to be configured with or without a scheme.
    if not host.startswith('http'):
        host = 'http://' + host
    providerurl = url_fix(host + "/torrents-search.php")
    sterm = makeUnicode(book['searchterm'])
    cat = 0  # 0=all, 36=ebooks, 52=mags, 56=audiobooks
    if 'library' in book:
        if book['library'] == 'AudioBook':
            cat = 56
        elif book['library'] == 'eBook':
            cat = 36
        elif book['library'] == 'magazine':
            cat = 52
    page = 0
    results = []
    # NUMBEROFSEEDERS - 1 so "minimumseeders < seeders" accepts torrents
    # with at least NUMBEROFSEEDERS seeders.
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    next_page = True
    while next_page:
        params = {"search": book['searchterm'], "page": page, "cat": cat}
        searchURL = providerurl + "/?%s" % urllib.urlencode(params)
        # Assume last page unless a usable result is found below.
        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # might return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" % (provider, sterm))
                success = True
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching data from %s: %s' % (provider, result))
                errmsg = result
            result = False
        if test:
            # Connection-test mode: report fetch success only.
            return success
        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
            soup = BeautifulSoup(result)
            try:
                tables = soup.findAll('table')  # un-named table
                table = tables[2]
                # NOTE(review): if tables[2] exists but is falsy, `rows` is
                # never assigned and the len(rows) below raises NameError --
                # presumably never happens in practice; confirm.
                if table:
                    rows = table.findAll('tr')
            except IndexError:  # no results table in result page
                rows = []
            if len(rows) > 1:
                rows = rows[1:]  # first row is headers
            for row in rows:
                td = row.findAll('td')
                if len(td) > 3:
                    try:
                        title = unaccented(
                            str(td[0]).split('title="')[1].split('"')[0])
                        # can return magnet or torrent or both.
                        magnet = ''
                        url = ''
                        mode = 'torrent'
                        try:
                            magnet = 'magnet' + str(td[0]).split('href="magnet')[1].split('"')[0]
                            mode = 'magnet'
                        except IndexError:
                            pass
                        try:
                            url = url_fix(host + '/download.php') + \
                                str(td[0]).split('href="download.php')[1].split('.torrent"')[0] + '.torrent'
                            mode = 'torrent'
                        except IndexError:
                            pass
                        # Fall back to the magnet link when there is no
                        # .torrent, or prefer it when configured to.
                        if not url or (magnet and url and lazylibrarian.CONFIG['PREFER_MAGNET']):
                            url = magnet
                            mode = 'magnet'
                        # Convert "n K/M/G" display size to integer bytes.
                        try:
                            size = str(td[1].text).replace(' ', '').upper()
                            mult = 1
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0
                        try:
                            seeders = int(td[2].text)
                        except ValueError:
                            seeders = 0
                        if not url or not title:
                            logger.debug('Missing url or title')
                        elif minimumseeders < int(seeders):
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                                'tor_type': mode,
                                'priority': lazylibrarian.CONFIG['WWT_DLPRIORITY']
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                            # Got a usable hit, so try the next page too.
                            next_page = True
                        else:
                            logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders)))
                    except Exception as e:
                        # Never let one malformed row abort the whole search.
                        logger.error("An error occurred in the %s parser: %s" % (provider, str(e)))
                        logger.debug('%s: %s' % (provider, traceback.format_exc()))
        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn('Maximum results page search reached, still more results available')
            next_page = False
    logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
def KAT(book=None, test=False):
    """Search KickAssTorrents for torrents matching a book search.

    Unlike TPB/WWT this provider fetches a single results page (sorted
    by seeders, descending) rather than paging.

    book -- dict with at least 'searchterm' and 'bookid'.
    test -- when True, return only the boolean success of the fetch
            (provider connection test).

    Returns a tuple (results, errmsg): results is a list of dicts
    (bookid/tor_prov/tor_title/tor_url/tor_size/tor_type/priority);
    errmsg is the last fetch error text, or '' if none.
    """
    errmsg = ''
    provider = "KAT"
    host = lazylibrarian.CONFIG['KAT_HOST']
    # Allow the host to be configured with or without a scheme.
    if not host.startswith('http'):
        host = 'http://' + host
    providerurl = url_fix(host + "/usearch/" + urllib.quote(book['searchterm']))
    params = {"category": "books", "field": "seeders", "sorder": "desc"}
    searchURL = providerurl + "/?%s" % urllib.urlencode(params)
    sterm = makeUnicode(book['searchterm'])
    result, success = fetchURL(searchURL)
    if not success:
        # seems KAT returns 404 if no results, not really an error
        if '404' in result:
            logger.debug("No results found from %s for %s" % (provider, sterm))
            success = True
        else:
            logger.debug(searchURL)
            logger.debug('Error fetching data from %s: %s' % (provider, result))
            errmsg = result
        result = False
    if test:
        # Connection-test mode: report fetch success only.
        return success
    results = []
    if result:
        logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
        # NUMBEROFSEEDERS - 1 so "minimumseeders < seeders" accepts
        # torrents with at least NUMBEROFSEEDERS seeders.
        minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
        soup = BeautifulSoup(result)
        rows = []
        try:
            table = soup.findAll('table')[1]  # un-named table
            if table:
                rows = table.findAll('tr')
        except IndexError:  # no results table in result page
            rows = []
        if len(rows) > 1:
            rows = rows[1:]  # first row is headers
        for row in rows:
            td = row.findAll('td')
            if len(td) > 3:
                try:
                    title = unaccented(
                        str(td[0]).split('cellMainLink">')[1].split('<')[0])
                    # kat can return magnet or torrent or both.
                    magnet = ''
                    url = ''
                    mode = 'torrent'
                    try:
                        magnet = 'magnet' + str(td[0]).split('href="magnet')[1].split('"')[0]
                        mode = 'magnet'
                    except IndexError:
                        pass
                    try:
                        url = 'http' + str(td[0]).split('href="http')[1].split('.torrent?')[0] + '.torrent'
                        mode = 'torrent'
                    except IndexError:
                        pass
                    # Fall back to the magnet link when there is no
                    # .torrent, or prefer it when configured to.
                    if not url or (magnet and url and lazylibrarian.CONFIG['PREFER_MAGNET']):
                        url = magnet
                        mode = 'magnet'
                    # Convert "n K/M/G" display size to integer bytes.
                    try:
                        size = str(td[1].text).replace(' ', '').upper()
                        mult = 1
                        if 'K' in size:
                            size = size.split('K')[0]
                            mult = 1024
                        elif 'M' in size:
                            size = size.split('M')[0]
                            mult = 1024 * 1024
                        elif 'G' in size:
                            size = size.split('G')[0]
                            mult = 1024 * 1024 * 1024
                        size = int(float(size) * mult)
                    except (ValueError, IndexError):
                        size = 0
                    try:
                        seeders = int(td[3].text)
                    except ValueError:
                        seeders = 0
                    if not url or not title:
                        logger.debug('Missing url or title')
                    elif minimumseeders < int(seeders):
                        results.append({
                            'bookid': book['bookid'],
                            'tor_prov': provider,
                            'tor_title': title,
                            'tor_url': url,
                            'tor_size': str(size),
                            'tor_type': mode,
                            'priority': lazylibrarian.CONFIG['KAT_DLPRIORITY']
                        })
                        logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders)))
                except Exception as e:
                    # Never let one malformed row abort the whole search.
                    logger.error("An error occurred in the %s parser: %s" % (provider, str(e)))
                    logger.debug('%s: %s' % (provider, traceback.format_exc()))
    logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
def TDL(book=None):
    """Search torrentdownloads' RSS feed for torrents matching a book.

    book -- dict with at least 'searchterm' and 'bookid'.

    Returns a list of result dicts (bookid/tor_prov/tor_title/tor_url/
    tor_size).  Note this provider (unlike TPB/WWT/KAT here) returns
    only the list, not an (results, errmsg) tuple, and reads the older
    lazylibrarian.TDL_HOST/NUMBEROFSEEDERS attributes rather than
    lazylibrarian.CONFIG.
    """
    provider = "torrentdownloads"
    host = lazylibrarian.TDL_HOST
    # Allow the host to be configured with or without a scheme.
    if not str(host)[:4] == "http":
        host = 'http://' + host
    providerurl = url_fix(host)
    params = {"type": "search", "cid": "2", "search": book['searchterm']}
    searchURL = providerurl + "/rss.xml?%s" % urllib.urlencode(params)
    try:
        request = urllib2.Request(searchURL)
        if lazylibrarian.PROXY_HOST:
            request.set_proxy(lazylibrarian.PROXY_HOST, lazylibrarian.PROXY_TYPE)
        request.add_header('User-Agent', USER_AGENT)
        data = urllib2.urlopen(request, timeout=90)
    except (socket.timeout) as e:
        logger.debug('Timeout fetching data from %s' % provider)
        data = False
    except (urllib2.HTTPError, urllib2.URLError, ssl.SSLError) as e:
        # may return 404 if no results, not really an error
        if hasattr(e, 'code') and e.code == 404:
            logger.debug(searchURL)
            logger.debug(u"No results found from %s for %s" % (provider, book['searchterm']))
        else:
            logger.debug(searchURL)
            if hasattr(e, 'reason'):
                errmsg = e.reason
            else:
                errmsg = str(e)
            logger.debug('Error fetching data from %s: %s' % (provider, errmsg))
        data = False
    results = []
    # NUMBEROFSEEDERS - 1 so "minimumseeders < seeders" accepts torrents
    # with at least NUMBEROFSEEDERS seeders.
    minimumseeders = int(lazylibrarian.NUMBEROFSEEDERS) - 1
    if data:
        logger.debug(u'Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
        # feedparser accepts the open urllib2 response object directly.
        d = feedparser.parse(data)
        if len(d.entries):
            for item in d.entries:
                try:
                    title = item['title']
                    seeders = int(item['seeders'])
                    link = item['link']
                    size = int(item['size'])
                    url = None
                    if link and minimumseeders < seeders:
                        # no point requesting the magnet link if not enough seeders
                        request = urllib2.Request(link)
                        if lazylibrarian.PROXY_HOST:
                            request.set_proxy(lazylibrarian.PROXY_HOST, lazylibrarian.PROXY_TYPE)
                        request.add_header('User-Agent', USER_AGENT)
                        conn = urllib2.urlopen(request, timeout=90)
                        result = conn.read()
                        # NOTE(review): `url = None` is redundant (set above),
                        # and the loop variable below shadows `link` -- both
                        # harmless here since neither is reused afterwards.
                        url = None
                        new_soup = BeautifulSoup(result)
                        # First magnet: link on the detail page wins.
                        for link in new_soup.findAll('a'):
                            output = link.get('href')
                            if output and output.startswith('magnet'):
                                url = output
                                break
                    if minimumseeders < int(seeders):
                        if not url or not title:
                            logger.debug('Missing url or title')
                        else:
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                            })
                            logger.debug('Found %s. Size: %s' % (title, size))
                    else:
                        logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders)))
                except Exception as e:
                    # Never let one malformed feed entry abort the search.
                    logger.error(u"An error occurred in the %s parser: %s" % (provider, str(e)))
    logger.debug(
        u"Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, book['searchterm']))
    return results
# Scrape the gitweb front page of android.git.kernel.org and write the
# names of every listed project repository to list.txt, one per line.
prefixurl = "https://android.git.kernel.org/"  # the git:// scheme easily times out, so keep https
# Directory containing this script: used as working directory and output location.
currentdir = os.path.abspath(os.path.dirname(sys.argv[0]))
repositorydir = ".git"
os.chdir(currentdir)

conn = httplib.HTTPConnection("android.git.kernel.org")
conn.request("GET", "/")
res = conn.getresponse()
if res.status == httplib.OK:
    data = res.read()
    conn.close()
    soup = BeautifulSoup(data)
    # The project index is the first table in the page body.
    table = soup.body.table
    # Project links carry class="list", no title, and an href like "/?p=...".
    tags = table.findAll('a', attrs={'class': 'list', 'title': None, 'href': re.compile('^/\?p')})
    projectlist = [tag.string for tag in tags]
    # writelines() inserts no separators, so append the newline ourselves.
    file = open(currentdir + "/list.txt", "w")
    file.writelines([x.strip() + "\n" for x in projectlist])
    file.close()
# NOTE(review): this fragment starts with a dangling `except:` whose matching
# `try:` is not visible above it in this file, and the loop body is cut off
# after `songNum = 0` -- the snippet appears to have been pasted incompletely.
except: pass
########################start of main###################################
# Walk singer pages on lyrics.oiktv.com for each singer id in the range.
for i in range(startId, endId):  # startId/endId defined elsewhere -- TODO confirm
    url = "http://lyrics.oiktv.com/singer.php?sid=" + str(i)
    #lyricsWeb = urllib2.urlopen("http://lyrics.oiktv.com/singer.php?sid=51")
    lyricsWeb = urllib2.urlopen(url)
    webContent = lyricsWeb.read()
    lyricsWeb.close()
    soup = BeautifulSoup(webContent)
    pages = soup.findAll('a')
    # Collect the pagination links ("...&page=N") for this singer.
    wantedPages = []
    for page in pages:
        if re.search("&page=", page['href']):
            #print page['href']
            url = page['href']
            wantedPages.append(url)
    if len(wantedPages) > 1:
        #find those who has more than 20 albums
        maxPageNum = 1  #Max 1 page for each singer
        pageNum = 0
        maxSongNum = 250
        songNum = 0
def determine_min_sdk():
    """
    Determine the minimum SDK version supported by the application under test.

    Order of attempts:
      1. android:minSdkVersion in the manifest's <uses-sdk> element;
      2. for source input (common.source_or_apk == 2), the gradle files
         via find_gradle();
      3. interactively: scrape the Google Play Store page for the package
         ('p'), or prompt the user to type a version ('m').

    No return value -- the result is stored in common.minSdkVersion
    (left at the worst-case fallback 7 if the user declines to enter one).
    """
    #determine minimum supported versions
    common.minSdkVersion = 0
    common.sdk = common.xmldoc.getElementsByTagName("uses-sdk")
    determineSdk = ''
    if len(common.sdk) > 0:
        if 'android:minSdkVersion' in common.sdk[0].attributes.keys():
            try:
                common.minSdkVersion = common.sdk[0].attributes['android:minSdkVersion'].value
                logger.info(common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion))
            except Exception as e:
                common.logger.error(
                    "Something went wrong trying to determine the version from the manifest: " + str(e))
    if common.minSdkVersion == 0:
        if common.source_or_apk == 2:
            # Source input: try the gradle build files next.
            common.minSdkVersion = find_gradle()
            if common.minSdkVersion == 0:
                common.logger.info("We were unable to find the minimum SDK version in your source.")
                # Force the manual-entry path below.
                determineSdk = 'm'
            else:
                logger.info(common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion))
        else:
            common.compare(common.sdk.length, 1, common.config.get('qarkhelper', 'USESDK_MISS'), 'false')
            print common.config.get('qarkhelper', 'GEN_OUTPUT_WARN')
            # Ask the user whether to query the Play Store or enter manually.
            # NOTE(review): a valid answer to the inner re-prompt is only
            # honoured on the next pass of the loop (it asks again).
            while True:
                determineSdk = raw_input("Which option would you prefer? (P)lay, (M)anual")
                if determineSdk.lower() in ('p', 'm'):
                    break
                else:
                    determineSdk = raw_input("Please enter either (p) or (m):")
        if determineSdk.lower() == 'p':
            #get package name from manifest if possible
            #make call to Play store
            #determine API version from https://play.google.com/store/apps/details?id=<package name>
            # will need to adjust the sdk[0] value for the checks below
            for a in common.xmldoc.getElementsByTagName('manifest'):
                if 'package' in a.attributes.keys():
                    print common.config.get('qarkhelper', 'PACK_FOUND')
                    package_name = a.attributes['package'].value
                    print package_name
                else:
                    package_name = raw_input(common.config.get('qarkhelper', 'NO_PACK_NAME'))
            try:
                logger.info(common.config.get('qarkhelper', 'DETERMINING_SDK_VERSION'))
                play_url = "https://play.google.com/store/apps/details?id="
                play_url += package_name
                print play_url
                # Scrape the "Requires Android x.y" value from the store page.
                page = urllib2.urlopen(play_url)
                html = BeautifulSoup(page.read())
                play_version = html.find(itemprop="operatingSystems")
                plat_version = re.findall('\d+.\d+', play_version.contents[0])
                if plat_version:
                    plat_version = [str(item) for item in plat_version]
                    # Map Android platform versions to API levels.
                    api_plat_map = []
                    api_plat_map.append(['1', '1.0'])
                    api_plat_map.append(['2', '1.1'])
                    api_plat_map.append(['3', '1.5'])
                    api_plat_map.append(['4', '1.6'])
                    api_plat_map.append(['5', '2.0'])
                    api_plat_map.append(['6', '2.0.1'])
                    api_plat_map.append(['7', '2.1'])
                    api_plat_map.append(['8', '2.2'])
                    api_plat_map.append(['9', '2.3'])
                    api_plat_map.append(['10', '2.3.3'])
                    api_plat_map.append(['11', '3.0'])
                    api_plat_map.append(['12', '3.1'])
                    api_plat_map.append(['13', '3.2'])
                    api_plat_map.append(['14', '4.0'])
                    api_plat_map.append(['15', '4.0.3'])
                    api_plat_map.append(['16', '4.1'])
                    api_plat_map.append(['17', '4.2'])
                    api_plat_map.append(['18', '4.3'])  #Webviews have critical vuln, no more patches from Google
                    api_plat_map.append(['19', '4.4'])
                    api_plat_map.append(['20', '4.4'])  # This is actually 4.4W, a wearable only build, I'm assuming it is the same as 4.4 for our purposes
                    api_plat_map.append(['21', '5.0'])
                    api_plat_map.append(['22', '5.1'])  # This is latest version, we'll assume this for newer, until update
                    #TODO - double check this, adding 5.1 may have broken it
                    # Pick the highest API level whose platform version is
                    # <= the scraped value (loop keeps overwriting).
                    for a in api_plat_map:
                        if StrictVersion(str(plat_version[0])) >= StrictVersion(str(a[1])):
                            common.minSdkVersion = a[0]
                    logger.info(common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion))
                    manual = raw_input(common.config.get('qarkhelper', 'SDK_VALUE_MANUAL'))
                else:
                    print common.config.get('qarkhelper', 'CANT_DET_PLAY')
                #BUG - not processing the cases of wanting to enter if manually, if the retrieval of the play version is broken
            except HTTPError, e:
                print str(e)
                logger.error(common.config.get('qarkhelper', 'MIN_SDK_PLAY_STORE_FAILED'))
        elif (determineSdk.lower() == 'm' or common.minSdkVersion == 0):
            #does not actually become 1, just needs a value, since it wasn't found, so we assume worst case
            print common.term.cyan + common.term.bold + str(
                common.config.get('qarkhelper', 'NO_MIN_SDK')).decode('string-escape').format(t=common.term)
            enterSdk = raw_input(common.config.get('qarkhelper', 'PROMPT_MIN_SDK'))
            if enterSdk.lower() == 'y':
                sdkinput = 0
                # Keep prompting until the value is within (0, MAX_API_VERSION].
                while True:
                    sdkinput = int(
                        raw_input(
                            common.config.get('qarkhelper', 'PROMPT_VER') +
                            common.config.get('qarkhelper', 'MAX_API_VERSION') +
                            common.config.get('qarkhelper', 'PROMPT_VER2')))
                    if 0 < int(sdkinput) <= int(common.config.get('qarkhelper', 'MAX_API_VERSION')):
                        common.minSdkVersion = int(sdkinput)
                        break
            else:
                # Worst-case fallback when the user declines to enter a value.
                common.minSdkVersion = 7
def get_charset_from_html(self, html):
    """Return the character encoding BeautifulSoup detected for *html*."""
    soup = BeautifulSoup(html)
    return soup.originalEncoding