Example #1
    def parse_body(self) -> str:
        """Look for article's body.

        :raise ~.ArticleBodyMissing: when no body is found.
        """
        parser = CSSSelector('html body div#content div.sect1')
        body = parser(self.source)

        if not body:
            raise exceptions.ArticleBodyMissing(self)

        body = ''.join(lxml.etree.tounicode(section) for section in body)

        return body
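The pattern common to these examples: compile a CSS selector once with CSSSelector, then call the resulting object on a parsed tree to get a list of matching elements. A minimal self-contained sketch of that pattern (the HTML string is made up for illustration):

from lxml import html
from lxml.cssselect import CSSSelector

doc = html.fromstring('<div id="content"><div class="sect1">Hello</div></div>')
select = CSSSelector('div#content div.sect1')  # compiled once, reusable
matches = select(doc)                          # list of matching elements
print([m.text for m in matches])               # ['Hello']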
Example #2
async def get_image(queue):
    """When there is an item in queue, parse to get comics url"""
    while True:
        payload = await queue.get()
        await asyncio.sleep(0)
        body, selector = payload
        tree = html.fromstring(body)
        select = CSSSelector(selector)
        elements = [e.get('src') for e in select(tree)]
        if len(elements) > 0:
            url = elements[0]
            if not (url.startswith('http:') or url.startswith('https:')):
                url = 'https:' + url
            print('  ', url)
        # ack queue that the item has been processed
        queue.task_done()
Example #3
def addImageList(root):
    """ Add an image list based on the <caption> tags """

    images = list()

    count = 0
    for caption in root.xpath('//span'):
        # <span> elements with image captions may carry several CSS classes. Unfortunately,
        # BeautifulSoup is unable to find elements by CSS class if the element
        # carries more than one class, so match on a substring instead.
        if 'image-caption-with-description' not in caption.get('class', ''):
            continue
        text = caption.text_content()
        id = 'image-%d' % count
        new_anchor = lxml.html.Element('a')
        new_anchor.attrib['name'] = id
        caption.insert(0, new_anchor)
        images.append(dict(text=text, id=id))
        count += 1

    div_images = lxml.html.Element('div')
    div_images.attrib['id'] = 'images-list'
    div_ul = lxml.html.Element('ul')
    div_images.append(div_ul)

    if images:
        for d in images:
            li = lxml.html.Element('li')
            li.attrib['class'] = 'image-list-entry'
            a = lxml.html.Element('a')
            a.attrib['href'] = '#' + d['id']
            a.attrib['class'] = 'image-list-entry'
            span = lxml.html.Element('span')
            span.text = d['text']
            a.insert(0, span)
            li.append(a)
            div_ul.append(li)

        # check for an existing div#images-list
        nodes = CSSSelector('div#images-list')(root)
        if nodes:
            # replace it
            nodes[0].getparent().replace(nodes[0], div_images)
        else:
            # add to end of document
            body = root.xpath('//body')[0]
            body.append(div_images)
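A small aside on the multi-class caveat mentioned in the comment above: lxml's own CSSSelector matches a class selector even when the element carries several classes, so the substring check could also be written as a selector. A hedged sketch with made-up markup:

from lxml.cssselect import CSSSelector
import lxml.html

root = lxml.html.fromstring(
    '<p><span class="image-caption-with-description extra">A caption</span></p>')
for caption in CSSSelector('span.image-caption-with-description')(root):
    print(caption.text_content())  # matches despite the extra class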
Example #4
def parse_highlights_html(html):
    node = lxml.html.fromstring(html)
    useless_nodes = (list(CSSSelector('.addEditNote')(node)) +
                     list(CSSSelector('.editNote')(node)) +
                     list(CSSSelector('script')(node)) +
                     list(CSSSelector('#header')(node)) +
                     list(CSSSelector('form')(node)))
    for useless in useless_nodes:
        useless.getparent().remove(useless)
    return parse_element_into_books(
        CSSSelector('#allHighlightedBooks')(node)[0])
Example #5
def getkleague():
    print('<Get Kleague>')
    r = requests.get('http://www.kleague.com/')
    html = lxml.html.fromstring(r.text)
    sel = CSSSelector('div#modal_classic-team-rank table tbody tr')
    nodes = sel(html)
    print("Number of table rows:", len(nodes))
    for teams in nodes:
        for cols in teams:
            if cols.xpath('.//a/text()'):
                print(cols.xpath('.//a/text()')[0], end=' ')
            elif cols.xpath('.//span/text()'):
                print(cols.xpath('.//span/text()')[0], end=' ')
            else:
                print(cols.text.strip(), end=' ')
        print()
Example #6
def on_sale(tree):
    '''
    The sale price lives in a separate element, when present. This data gets
    updated daily, so it could change; and since we're collecting prices, it
    would be incomplete of us not to check it.

    Args:
    the parsed tree element of a product page

    Returns:
    the sale price as a string if the item is on sale, otherwise False
    '''
    sale_element = CSSSelector('ul li.newsFont b font')(tree)
    if len(sale_element) > 0:
        return sale_element[0].text
    else:
        return False
Example #7
    def on_process(self):
        self.text_edit_output.clear()
        self.label_error.clear()
        self.button_detail_error.hide()

        self.last_error_message = None
        self.last_detail_error_message = None

        try:
            text = self.text_edit_input.toPlainText()
            if not text or not self.le_xpath_css.text():
                return

            search_text = self.le_xpath_css.text()

            is_html_parser = self.rb_parser_html.isChecked()

            if is_html_parser:
                root = html.fromstring(text)
            else:
                root = etree.fromstring(text)

            if self.rb_xpath.isChecked():
                result = root.xpath(search_text)
            else:
                selector = CSSSelector(search_text, translator='html' if is_html_parser else 'xml')
                result = selector(root)

            print(len(result), result)

            result = map(to_str, result)
            output = '\n'.join('{}. {}'.format(i, x) for i, x in enumerate(result, 1))
            self.text_edit_output.setPlainText(output)

        except Exception as e:
            # # Print the traceback to the console
            # traceback.print_exc()

            # Save the traceback to a variable
            tb = traceback.format_exc()

            self.last_error_message = str(e)
            self.last_detail_error_message = str(tb)
            self.button_detail_error.show()

            self.label_error.setText('Error: ' + self.last_error_message)
Example #8
def pullup_elems(tree, loader_context):
    for elem_child, parent_dist in loader_context.get("pullup_elems",
                                                      {}).items():
        selector = CSSSelector(elem_child)
        for elem in selector(tree):
            parent = elem
            for _ in range(parent_dist):
                parent = parent.getparent()
            if parent is not None and parent.getparent() is not None:
                elem.tail = parent.tail
                parent.getparent().replace(parent, elem)
            else:
                logger.error(
                    'Could not find parent with distance {} for selector "{}".'
                    .format(parent_dist, elem_child))

    return [tree]
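A rough usage sketch for pullup_elems, using made-up markup and a plain dict standing in for the loader context (both are assumptions, not part of the original): pulling each <em> up two levels replaces its grandparent <p> with the <em> itself.

import lxml.html

tree = lxml.html.fragment_fromstring('<div><p><span><em>keep me</em></span></p></div>')
pullup_elems(tree, {"pullup_elems": {"em": 2}})
print(lxml.html.tostring(tree))  # b'<div><em>keep me</em></div>'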
Example #9
def getImages(_htmlobj, path, _base):
    sel = CSSSelector("img")
    htmlobj = _htmlobj
    images = sel(htmlobj)
    for image in images:
        originalsrc = image.attrib["src"]
        if validImageFormat(originalsrc):
            if _base != "":
                fileUrl = getFileUrl(_base, originalsrc)
            else:
                fileUrl = getFileUrl(htmlobj.base, originalsrc)

            fileName = path + fileUrl.split('/')[-1].replace('%20', '_')
            staticUrl = '/static/' + fileName.split('/')[-1]
            saveFile(fileUrl, fileName)
            image.attrib['src'] = staticUrl
    return htmlobj
Example #10
def check_for_unicorn(tree):
    '''
    gives us a quick thumbs-up or thumbs-down on whether each product page
    contains a unicorn.

    Args:
    the parsed DOM tree of a product page
    '''
    check_unicorn = CSSSelector('body table tr td table tr td font')(tree)
    is_unicorn = False
    try:
        if 'one Location' in check_unicorn[0].text:
            is_unicorn = True
        return is_unicorn
    except IndexError:
        print('no text to check here? that would be weird.')
        print([c.text for c in check_unicorn])
Example #11
def getCoinInfo(coin):
    url = 'https://www.feixiaohao.com/currencies/' + coin
    ret = requests.get(url)

    tree = html.fromstring(ret.text)

    h24 = tree.xpath(
        '//*[@id="baseInfo"]/div[1]/div[1]/div[3]/div[1]/span/text()')[0]
    l24 = tree.xpath(
        '//*[@id="baseInfo"]/div[1]/div[1]/div[3]/div[2]/span/text()')[0]

    percentage = tree.xpath(
        '//*[@id="baseInfo"]/div[1]/div[2]/div[5]/div/span/text()')[0]
    flowRate = tree.xpath(
        '//*[@id="baseInfo"]/div[1]/div[3]/div[5]/div/span/text()')[0]
    turnoverRate = tree.xpath(
        '//*[@id="baseInfo"]/div[1]/div[4]/div[5]/div/span/text()')[0]

    icoInfo = tree.xpath('/html/body/div[5]/div/div[6]/table')
    info = {}
    if len(icoInfo) > 0:
        sel = CSSSelector('tr td')
        tds = sel(icoInfo[0])
        vals = [
            "status", "platform", "icoDistribute", "investPercentage",
            "icoTotal", "icoSupply", "icoStart", "icoStop", "icoOpenPrice",
            "icoMethod", "icoTarget", "icoVolume", "icoAveragePrice",
            "icoSuccessCount", "icoSuccessVolume", "characteristic",
            "security", "law", "area", "consultant", "sellAgentAddress",
            "blogAddress"
        ]
        for index, td in enumerate(tds):
            if index < len(vals):
                text = "" if td.text is None else td.text
                info[vals[index]] = text

    result = {
        "coin": coin,
        "h24": h24,
        "l24": l24,
        "percentage": percentage,
        "flowRate": flowRate,
        "turnoverRate": turnoverRate,
        "info": str(info)
    }
    return result
Example #12
    def prepare_text(self, obj):
        """Return text for indexing.

        Args:
            obj (GeneralPage): Object for indexing.

        Returns:
            String for indexing.
        """
        rendered = render_to_string(obj.template, {"LANGUAGE_CODE": "en"})
        html = fromstring(rendered)
        selector = CSSSelector("#general-page-content")
        try:
            contents = selector(html)[0].text_content()
        except IndexError:
            raise TemplateSyntaxError(CONTENT_NOT_FOUND_ERROR_MESSAGE)
        return contents
Example #13
def scrappWeb(url, filter, path, base):
    try:
        webdata = urllib2.urlopen(url)
    except:
        return url
    webstring = webdata.read()
    if filter == u'':
        return webstring
    # quick-and-dirty hack for the android course.
    try:
        htmlobj = html.fromstring(
            webstring.replace(u'font-size: 12px', u'font-size: 12pt'))
        sel = CSSSelector(filter)
        htmlobj = getImages(sel(htmlobj)[0], path, base)
        return html.tostring(sel(htmlobj)[0])
    except:
        return ""
Example #14
def getPageInfos(num):
    url = 'https://www.feixiaohao.com/exchange'
    if num > 1:
        url = url + '/list_' + str(num) + '.html'

    ret = requests.get(url)

    tree = html.fromstring(ret.text)
    trs = tree.xpath('/html/body/div[5]/div/div[1]/div[3]/table/tbody/tr')
    infos = []
    for tr in trs:
        info = {}
        tradeTypes = []
        sel = CSSSelector('td a')
        tds = sel(tr)
        for index, td in enumerate(tds):
            if td.text is not None:
                if index == 1:
                    info['h24Volume'] = td.text
                elif index == 2:
                    info['marketNum'] = td.text
                elif index == 3:
                    info['country'] = td.text

                if len(td.cssselect('a img')) > 0:
                    img = td.cssselect('a img')[0]
                    info['icon'] = img.get('src')
                    href = td.cssselect('a')[0].get('href')
                    info['code'] = href[10:len(href) - 1]

            aElm = td.cssselect('a')
            if len(aElm) > 0:

                for elm in aElm:
                    href = elm.get('href')
                    pos = href.find('type=')
                    if pos != -1:
                        tradeTypes.append(href[16:])

        info['tradeTypes'] = ','.join(tradeTypes)
        infos.append(info)

    for info in infos:
        setExInfo(info)
    return infos
Example #15
def replaceUnresolvedLinks(root):
    """ This transformation replaces all a.external-link
        nodes with a proper footnote.
        Used for PDF generation only (html_mode = 'split')
    """

    for link in CSSSelector('a.external-link')(root):
        href = link.attrib['href']

        span1 = lxml.html.Element('span')
        span1.attrib['class'] = 'generated-footnote-text'
        span1.text = link.text_content()

        span2 = lxml.html.Element('span')
        span2.attrib['class'] = 'generated-footnote'
        span2.text = href
        span1.insert(1, span2)
        link.getparent().replace(link, span1)
Example #16
    def get_pagination(self, server, url):

        resp = server.get(url)
        tree = lxml.html.fromstring(resp.text)
        sel = CSSSelector('#SDTOPDESTCONTENT > div.deckTools.btm > div > div > :last-child')

        pageNum = sel(tree)

        if len(pageNum) != 0:
            count_of_page = pageNum[0].get('data-page-number')
        else:
            count_of_page = 1

        page_numbers = range(0, int(count_of_page), 1)
        for page in page_numbers:
            link_number = int(page) * 20

        return link_number
Example #17
    def get_submission_dates(self, arxiv_tree, queried_version):
        links = CSSSelector("div.submission-history")(arxiv_tree)[0]
        #print("links are", links)
        versions = {}
        blob = self.clean_gunky_arxiv_data(links.text_content())

        #print( "Parsing", blob)
        for line in blob.split("\n"):
            match = self.version_re.match(line)
            if match:
                version, d = match.group(1), match.group(2)
                d = datetime.datetime.strptime(d, '%a, %d %b %Y').date()
                versions[version] = d
                if queried_version == version:
                    return {version: d}
                #print(version, date)

        return versions
Example #18
def _get_post_images(post_page):
    """Scrape the images from a post"""

    images = CSSSelector(".postbody img")(post_page)

    # filter out forum images that are not part of the post
    images = [
        img for img in images
        if "smilies" not in img.get("src") and "styles" not in img.get("src")
    ]

    # add base url for relative urls
    images = [
        _forum_url +
        img.get("src")[2:] if img.get("src")[0:2] == "./" else img.get("src")
        for img in images
    ]
    return images
Example #19
    def _selector_query_found(self, bodies, selector):
        selector = selector.split(':')[0]

        if '}' in selector:
            # XXX does this ever happen any more?
            return

        for body in bodies:
            try:
                for _ in CSSSelector(selector)(body):
                    return True
            except SelectorSyntaxError:
                print('TROUBLEMAKER', file=sys.stderr)
                print(repr(selector), file=sys.stderr)
            except ExpressionError:
                print('EXPRESSIONERROR', file=sys.stderr)
                print(repr(selector), file=sys.stderr)
        return False
Example #20
def book_keywords(obj):
    html = obj.getText()
    if not html:
        return []

    if isinstance(html, str):
        html = html.decode('utf-8')

    doc = lxml.html.parse(StringIO(html))
    nodes = doc.xpath(CSSSelector('span.keyword').path)

    keywords = []
    for node in nodes:
        if not node.attrib.get('title'):
            continue
        keywords.append(node.attrib['title'].encode('utf-8').strip())

    return keywords
Example #21
    def get_balance(self):
        self.open_with_retry(self.LOGIN_URL, u'login')

        logging.debug(u'Filling login form')
        self.browser.select_form(nr=0)
        self.browser[u'username'] = CONFIG[u'odesk'][u'username']
        self.browser[u'password'] = CONFIG[u'odesk'][u'password']

        logging.debug(u'Submitting login form')
        self.browser.submit()

        page = self.open_with_retry(self.ACCOUNT_URL, u'account')

        logging.debug(u'Parsing page and extracting balance')
        html = etree.HTML(page)
        raw_balance = CSSSelector(u'div.oMain div.oTxtMega span.oPos')(html)[0].text

        return Currency(self.string_to_decimal(raw_balance), u'USD')
Example #22
def extract_jokes():
    result = []

    url = 'https://bestlifeonline.com/bad-funny-puns/'
    response = requests.get(url)

    sel = CSSSelector('.content li')

    html_elmnts = html.fromstring(response.content)

    for joke in sel(html_elmnts):
        result.append({
            'joke': joke.text_content().strip(),
            'score': None,
            'categories': ['Pun']
        })

    return result
Example #23
def extract_jokes(page):
    result = []

    url = 'https://www.rd.com/jokes/puns/page/{}/'
    response = requests.get(url.format(page))

    sel = CSSSelector('.excerpt-wrapper')

    html_elmnts = html.fromstring(response.content)

    for joke in sel(html_elmnts):
        result.append({
            'joke': joke.text_content().strip(),
            'score': None,
            'categories': ['Pun']
        })

    return result
Example #24
def convert_iframes(tree, loader_context):
    """Convert iframes to divs with links to its src.

    convert_iframes() is called after remove_elems() so that unwanted iframes can be
    eliminated first.
    """
    base_url = loader_context.get("base_url", None) if loader_context else None
    selector = CSSSelector("iframe")
    for elem in selector(tree):
        if "src" not in elem.attrib:
            continue
        url = urljoin(base_url, elem.attrib.pop("src"))
        elem_new = lxml.html.fragment_fromstring(
            '<div><a href="{url}">{url}</a></div>'.format(url=url))
        elem_new.tail = elem.tail
        elem.getparent().replace(elem, elem_new)

    return [tree]
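A small usage sketch for convert_iframes with made-up markup; the base_url value and the plain-dict loader context are assumptions:

import lxml.html

tree = lxml.html.fragment_fromstring('<div><iframe src="/embed/1"></iframe></div>')
convert_iframes(tree, {"base_url": "https://example.com/page"})
print(lxml.html.tostring(tree))
# the <iframe> is now <div><a href="https://example.com/embed/1">https://example.com/embed/1</a></div>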
Example #25
def extract_jokes():
    result = []

    url = 'https://www.sunnyskyz.com/funny-jokes/221/58-Funny-Puns-You-Can-t-Wait-To-Use'
    response = requests.get(url)

    sel = CSSSelector('#picofday p')

    html_elmnts = html.fromstring(response.content)

    for joke in sel(html_elmnts):
        result.append({
            'joke': joke.text_content().strip(),
            'score': None,
            'categories': ['Pun']
        })

    return result
Example #26
def url_selector_values(url, selector, attr):
    """Provide attribute list of type `attr` from `selector` match at `url`.

    If an attribute is not present for a matched element, `None` is added to the
    returned list.

    Common use:
    >>> url_selector_values(my_url, 'span a', 'href')
    ['a.html', 'b.html', 'c.html']
    >>> url_selector_values(my_url, 'span a', 'rel')
    [None, None, 'external']
    >>> url_selector_values(my_url, 'span a', 'foo')
    [None, None, None]
    >>>
    """
    tree = etree.parse(urllib2.urlopen(url), parser)
    cs = CSSSelector(selector)
    return [branch.get(attr) for branch in cs(tree)]
Example #27
def init():
    global meta_selectors
    global prop2selectors
    global sel2value_attr
    global bad_chars
    global website_pattern
    for prop, sels in meta_selectors.items():
        objs = []
        for sel in sels:
            sel_obj = CSSSelector(sel)
            objs.append(sel_obj)
            if 'link' in sel:
                sel2value_attr[sel_obj] = 'href'
            elif 'meta' in sel:
                sel2value_attr[sel_obj] = 'content'
            else:
                sel2value_attr[sel_obj] = None
        prop2selectors[prop] = objs
Example #28
def get_week_names():
    all_weeks = dict()
    for year in range(1922, 2021):
        print(year)
        url = f'{base_url}/years/{year}/week_1.htm'
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        sel = CSSSelector('a')
        h = html.fromstring(response.text)
        weeks = {
            f'{year} Week {r["href"].split("_")[1].replace(".htm", "")}':
            f'{year} {r.text}'
            for r in soup.find_all('a', href=True)
            if f'/years/{year}/week' in r['href']
        }
        all_weeks = {**all_weeks, **weeks}
    with open('data/weeks.json', 'w') as f:
        f.write(json.dumps(all_weeks))
Example #29
def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    html = re.sub(r'\<a(?:.*?)resultsSectionLabel(?:.*?)/a\>', '', html)
#    print(html)
    root = lxml.html.fromstring(html)
    sel = CSSSelector('div.cat a')
    links = [a.get('href') for a in sel(root)]
    print(links)
#    print(links)
    scrape_links(links)
    scraperwiki.sqlite.save_var('last_link', links[0])
    scraperwiki.sqlite.save_var('dclastlink', links[0])
    next_link = root.cssselect("a.next")
#    print(next_link)
    if next_link:
        next_url = urlparse.urljoin(base_url, next_link[0].attrib.get('href'))
#        print(next_url)
        scrape_and_look_for_next_link(next_url)
Example #30
def find_language_rows(content: str, language: str):
    html_content = fromstring(content)
    language_selector = CSSSelector(f"span.{LANGUAGE_CSS_CLASS}")
    language_cells = language_selector(html_content)

    rows = []
    for cell in language_cells:
        language_text = cell.text_content().lower()
        if "/" not in language_text:
            if language == language_text:
                row = cell.getparent().getparent()
                rows.append(row)
        else:
            if language in language_text:
                row = cell.getparent().getparent()
                rows.append(row)

    return rows