Example #1
def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    is_onion = 'onions' in categories

    if results_xpath:
        for result in eval_xpath(dom, results_xpath):
            url = extract_url(eval_xpath(result, url_xpath), search_url)
            title = extract_text(eval_xpath(result, title_xpath))
            content = extract_text(eval_xpath(result, content_xpath))
            tmp_result = {'url': url, 'title': title, 'content': content}

            # add thumbnail if available
            if thumbnail_xpath:
                thumbnail_xpath_result = eval_xpath(result, thumbnail_xpath)
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(
                        thumbnail_xpath_result, search_url)

            # add alternative cached url if available
            if cached_xpath:
                tmp_result['cached_url'] = cached_url + extract_text(
                    eval_xpath(result, cached_xpath))

            if is_onion:
                tmp_result['is_onion'] = True

            results.append(tmp_result)
    else:
        if cached_xpath:
            for url, title, content, cached in zip(
                    (extract_url(x, search_url) for x in dom.xpath(url_xpath)),
                    map(extract_text, dom.xpath(title_xpath)),
                    map(extract_text, dom.xpath(content_xpath)),
                    map(extract_text, dom.xpath(cached_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'cached_url': cached_url + cached,
                    'is_onion': is_onion
                })
        else:
            for url, title, content in zip(
                    (extract_url(x, search_url) for x in dom.xpath(url_xpath)),
                    map(extract_text, dom.xpath(title_xpath)),
                    map(extract_text, dom.xpath(content_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'is_onion': is_onion
                })

    if not suggestion_xpath:
        return results
    for suggestion in eval_xpath(dom, suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})
    return results
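The module-level names read above (search_url, categories, results_xpath, cached_url, ...) are engine settings defined elsewhere in the module. A minimal sketch of plausible values, assuming a generic result page; every value below is illustrative, not taken from a real engine:

# Illustrative engine settings for the function above (all values assumed):
search_url = 'https://example.org/search?q={query}'
categories = ['general']          # add 'onions' to tag results as onion links

results_xpath = '//div[@class="result"]'
url_xpath = './/a[@class="link"]/@href'
title_xpath = './/a[@class="link"]'
content_xpath = './/p[@class="snippet"]'
thumbnail_xpath = './/img[@class="thumb"]/@src'
suggestion_xpath = '//div[@class="suggestion"]'

# both must be non-empty for the alternative 'cached_url' field to be emitted
cached_xpath = ''
cached_url = ''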
Example #2
def test_extract_url(self):
    def f(html_str, search_url):
        return utils.extract_url(html.fromstring(html_str), search_url)

    self.assertEqual(f('<span id="42">https://example.com</span>', 'http://example.com/'), 'https://example.com/')
    self.assertEqual(f('https://example.com', 'http://example.com/'), 'https://example.com/')
    self.assertEqual(f('//example.com', 'http://example.com/'), 'http://example.com/')
    self.assertEqual(f('//example.com', 'https://example.com/'), 'https://example.com/')
    self.assertEqual(f('/path?a=1', 'https://example.com'), 'https://example.com/path?a=1')
    with self.assertRaises(lxml.etree.ParserError):
        f('', 'https://example.com')
    with self.assertRaises(Exception):
        utils.extract_url([], 'https://example.com')
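The assertions above pin down the contract of utils.extract_url: absolute URLs pass through, scheme-relative and path-relative fragments are resolved against search_url, a bare authority gains a trailing slash, and empty input raises. A rough stand-in built on urllib.parse (a sketch of that contract, not the real searx implementation):

from urllib.parse import urljoin, urlparse, urlunparse

def extract_url_sketch(url_fragment, base_url):
    # hypothetical stand-in mirroring the assertions above
    if not url_fragment:
        raise ValueError('could not extract url')
    resolved = urlparse(urljoin(base_url, url_fragment))
    if not resolved.path:    # 'https://example.com' -> 'https://example.com/'
        resolved = resolved._replace(path='/')
    return urlunparse(resolved)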
Example #3

def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        urls = result.xpath(url_xpath)
        if len(urls) != 1:
            continue
        url = sanitize_url(parse_url(extract_url(urls, search_url)))
        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])

        # parse publishedDate
        publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])

        # handle relative dates like '3 minutes ago' first
        if re.match(r'^\d+ minutes? ago$', publishedDate):
            publishedDate = datetime.now() - timedelta(
                minutes=int(re.match(r'\d+', publishedDate).group()))
        elif re.match(r'^\d+ days? ago$', publishedDate):
            publishedDate = datetime.now() - timedelta(
                days=int(re.match(r'\d+', publishedDate).group()))
        elif re.match(r'^\d+ hours?, \d+ minutes? ago$', publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))\
                - timedelta(minutes=int(timeNumbers[1]))
        else:
            try:
                publishedDate = parser.parse(publishedDate)
            except Exception:
                publishedDate = datetime.now()

        # some date formats parse without a year and fall back to 1900;
        # assume the current year in that case
        if publishedDate.year == 1900:
            publishedDate = publishedDate.replace(year=datetime.now().year)

        # append result
        results.append({
            'url': url,
            'title': title,
            'content': content,
            'publishedDate': publishedDate
        })

    # return results
    return results
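For reference, how the branches above handle a few hypothetical inputs:

# '5 minutes ago'          -> datetime.now() - timedelta(minutes=5)
# '2 days ago'             -> datetime.now() - timedelta(days=2)
# '1 hour, 30 minutes ago' -> datetime.now() - timedelta(hours=1, minutes=30)
# '2021-01-05'             -> dateutil parser.parse(...)
# unparseable text         -> datetime.now() (fallback)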
Example #4
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    try:
        results_num = int(
            eval_xpath(dom,
                       '//div[@class="compPagination"]/span[last()]/text()')
            [0].split()[0].replace(',', ''))
        results.append({'number_of_results': results_num})
    except Exception:
        # pagination block missing or in an unexpected format
        pass

    # parse results
    for result in eval_xpath(dom, results_xpath):
        try:
            url = parse_url(
                extract_url(eval_xpath(result, url_xpath), search_url))
            title = extract_text(eval_xpath(result, title_xpath)[0])
        except Exception:
            # skip results without a usable url or title
            continue

        content = extract_text(eval_xpath(result, content_xpath)[0])

        # append result
        results.append({'url': url, 'title': title, 'content': content})

    # if no suggestion found, return results
    suggestions = eval_xpath(dom, suggestion_xpath)
    if not suggestions:
        return results

    # parse suggestion
    for suggestion in suggestions:
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
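The number_of_results parsing takes the first whitespace-separated token of the pagination span and strips the thousands separators; it assumes text shaped roughly like this (format assumed, not verified against the live page):

text = '1,234 results'
int(text.split()[0].replace(',', ''))   # -> 1234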
Example #5
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, results_xpath):

        single_result = {'template': template}

        for single_field in field_definition:
            single_field = {**default_field_settings, **single_field}
            try:
                if single_field['single_element']:
                    node = eval_xpath(result, single_field['xpath'])
                else:
                    node = eval_xpath_list(result, single_field['xpath'])

                extract = single_field.get('extract')
                if extract == 'url':
                    value = extract_url(node, search_url)
                elif extract == 'boolean':
                    value = (isinstance(node, list) and len(node) > 0)
                elif extract == 'boolean_negate':
                    value = (isinstance(node, list) and len(node) < 1)
                else:
                    value = extract_text(node)

                single_result[single_field['field_name']] = value
            except Exception as e:
                logger.warning('error in resolving field %s:\n%s',
                               single_field['field_name'], e)
                single_result[single_field['field_name']] = unresolvable_value

        results.append(single_result)

    return results
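The shape of field_definition is implied by the keys the loop reads ('field_name', 'xpath', plus optional 'extract' and 'single_element'). A plausible value, with every name and xpath below assumed for illustration:

default_field_settings = {'single_element': False}
unresolvable_value = None
template = 'default.html'

field_definition = [
    {'field_name': 'url', 'xpath': './/a/@href', 'extract': 'url'},
    {'field_name': 'title', 'xpath': './/a[@class="title"]'},
    {'field_name': 'content', 'xpath': './/p[@class="excerpt"]'},
    {'field_name': 'verified', 'xpath': './/img[@class="check"]',
     'extract': 'boolean'},
]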
Example #6
def response(resp):
    '''Scrape *results* from the response (see :ref:`engine results`).'''
    results = []
    dom = html.fromstring(resp.text)
    is_onion = 'onions' in categories  # pylint: disable=undefined-variable

    if results_xpath:
        for result in eval_xpath_list(dom, results_xpath):

            url = extract_url(eval_xpath_list(result, url_xpath, min_len=1),
                              search_url)
            title = extract_text(
                eval_xpath_list(result, title_xpath, min_len=1))
            content = extract_text(
                eval_xpath_list(result, content_xpath, min_len=1))
            tmp_result = {'url': url, 'title': title, 'content': content}

            # add thumbnail if available
            if thumbnail_xpath:
                thumbnail_xpath_result = eval_xpath_list(
                    result, thumbnail_xpath)
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(
                        thumbnail_xpath_result, search_url)

            # add alternative cached url if available
            if cached_xpath:
                tmp_result['cached_url'] = (cached_url + extract_text(
                    eval_xpath_list(result, cached_xpath, min_len=1)))

            if is_onion:
                tmp_result['is_onion'] = True

            results.append(tmp_result)

    else:
        if cached_xpath:
            for url, title, content, cached in zip(
                    (extract_url(x, search_url)
                     for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath)),
                    map(extract_text, eval_xpath_list(dom, cached_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'cached_url': cached_url + cached,
                    'is_onion': is_onion
                })
        else:
            for url, title, content in zip(
                    (extract_url(x, search_url)
                     for x in eval_xpath_list(dom, url_xpath)),
                    map(extract_text, eval_xpath_list(dom, title_xpath)),
                    map(extract_text, eval_xpath_list(dom, content_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'is_onion': is_onion
                })

    if suggestion_xpath:
        for suggestion in eval_xpath(dom, suggestion_xpath):
            results.append({'suggestion': extract_text(suggestion)})

    logger.debug("found %s results", len(results))
    return results
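Unlike Example #1, this version validates the mandatory fields with min_len=1, so a result lacking a url, title, or content raises an engine exception instead of producing a broken entry. The contract of eval_xpath_list, roughly (a sketch of the assumed behaviour, not the real searx.utils helper):

def eval_xpath_list_sketch(element, xpath_spec, min_len=None):
    # evaluate the xpath and enforce a minimum number of matches
    result = element.xpath(xpath_spec)
    if min_len is not None and len(result) < min_len:
        raise ValueError('xpath found no match: %s' % xpath_spec)
    return result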
Example #7
def f(html_str, search_url):
    return utils.extract_url(html.fromstring(html_str), search_url)
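This is the same helper as in Example #2; a usage sketch with inputs taken from that test:

f('/path?a=1', 'https://example.com')       # -> 'https://example.com/path?a=1'
f('//example.com', 'https://example.com/')  # -> 'https://example.com/'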