Example #1
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    for result in dom.xpath('//div[@class="dg_u"]'):

        # try to extract the url
        url_container = result.xpath('.//div[@class="sa_wrapper"]/@data-eventpayload')
        if len(url_container) > 0:
            url = loads(url_container[0])['purl']
        else:
            url = result.xpath('./a/@href')[0]

            # discard results that do not return an external url
            # very recent results sometimes don't return the video's url
            if url.startswith('/videos/search?'):
                continue

        title = extract_text(result.xpath('./a//div[@class="tl"]'))
        content = extract_text(result.xpath('.//div[@class="pubInfo"]'))
        thumbnail = result.xpath('.//div[@class="vthumb"]/img/@src')[0]

        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'thumbnail': thumbnail,
                        'template': 'videos.html'})

        # first page ignores requested number of results
        if len(results) >= number_of_results:
            break

    return results
Example #2
def response(resp):
    results = []

    # we get html in a JSON container...
    response = loads(resp.text)
    if "content" not in response:
        return []
    dom = html.fromstring(response["content"])
    p = HTMLParser()

    # parse results
    for result in dom.xpath(results_xpath):
        videoid = result.xpath(url_xpath)[0]
        url = base_url + videoid
        title = p.unescape(extract_text(result.xpath(title_xpath)))
        thumbnail = extract_text(result.xpath(thumbnail_xpath)[0])
        if thumbnail[0] == '/':
            thumbnail = base_url + thumbnail
        d = extract_text(result.xpath(publishedDate_xpath)[0])
        d = d.split('/')
        # force ISO date to avoid wrong parsing
        d = "%s-%s-%s" % (d[2], d[1], d[0])
        publishedDate = parser.parse(d)
        content = extract_text(result.xpath(content_xpath))

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'template': 'videos.html',
                        'publishedDate': publishedDate,
                        'thumbnail': thumbnail})

    # return results
    return results
Example #3
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        link = result.xpath(link_xpath)[0]
        href = urljoin(base_url, link.attrib.get('href'))
        # there's also a span (class="rdf-meta element-hidden" property="dc:title")'s content property for this...
        title = escape(extract_text(link))
        thumbnail_tags = result.xpath(thumbnail_xpath)
        thumbnail = None
        if len(thumbnail_tags) > 0:
            thumbnail = extract_text(thumbnail_tags[0])
            if thumbnail[0] == '/':
                thumbnail = base_url + thumbnail
        content = escape(extract_text(result.xpath(content_xpath)))

        # append result
        results.append({'url': href,
                        'title': title,
                        'img_src': thumbnail,
                        'content': content})

    # return results
    return results
Example #4
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    p = HTMLParser()

    # parse results
    for result in dom.xpath(results_xpath):
        url = base_url + result.xpath(url_xpath)[0]
        title = p.unescape(extract_text(result.xpath(title_xpath)))
        thumbnail = extract_text(result.xpath(content_xpath)[0])
        publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))

        # append result
        results.append(
            {
                "url": url,
                "title": title,
                "content": "",
                "template": "videos.html",
                "publishedDate": publishedDate,
                "thumbnail": thumbnail,
            }
        )

    # return results
    return results
Example #5
def response(resp):
    results = []

    dom = html.fromstring(resp.content)

    # parse results
    for result in dom.xpath(results_xpath):
        links = result.xpath(link_xpath)
        if not links:
            continue
        link = links[0]
        url = link.attrib.get('href')

        # block google-ad urls
        if re.match(r"^https?://www\.google\.[a-z]+/aclk.*$", url):
            continue

        title = escape(extract_text(link))

        if result.xpath('./p[@class="desc"]'):
            content = escape(extract_text(result.xpath('./p[@class="desc"]')))
        else:
            content = ''

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    # return results
    return results
Example #6
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        title = extract_text(result.xpath(title_xpath)[0])
        try:
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            parsed_url = urlparse(url)
            if parsed_url.netloc == google_hostname and parsed_url.path == search_path:
                # remove the link to google news
                continue

            if parsed_url.netloc == google_hostname and parsed_url.path == images_path:
                # images result
                results = results + parse_images(result)
            else:
                # normal result
                content = extract_text(result.xpath(content_xpath)[0])
                # append result
                results.append({'url': url, 
                                'title': title, 
                                'content': content})
        except:
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
Example #7
def response(resp):
    results = []

    dom = html.fromstring(resp.content)

    # parse results
    for result in dom.xpath('//div[@class="sa_cc"]'):
        link = result.xpath('.//h3/a')[0]
        url = link.attrib.get('href')
        title = extract_text(link)
        content = escape(extract_text(result.xpath('.//p')))

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    # return results if something is found
    if results:
        return results

    # parse results again if nothing is found yet
    for result in dom.xpath('//li[@class="b_algo"]'):
        link = result.xpath('.//h2/a')[0]
        url = link.attrib.get('href')
        title = extract_text(link)
        content = escape(extract_text(result.xpath('.//p')))

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    # return results
    return results
Example #8
def response(resp):
    results = []

    dom = html.fromstring(resp.text)
    p = HTMLParser()

    # parse results
    for result in dom.xpath(results_xpath):
        videoid = result.xpath(url_xpath)[0]
        url = base_url + videoid
        title = p.unescape(extract_text(result.xpath(title_xpath)))
        thumbnail = extract_text(result.xpath(thumbnail_xpath)[0])
        publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))
        embedded = embedded_url.format(videoid=videoid)

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': '',
                        'template': 'videos.html',
                        'publishedDate': publishedDate,
                        'embedded': embedded,
                        'thumbnail': thumbnail})

    # return results
    return results
Example #9
File: bing.py Project: kvch/searx
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    try:
        results.append(
            {"number_of_results": int(dom.xpath('//span[@class="sb_count"]/text()')[0].split()[0].replace(",", ""))}
        )
    except:
        pass

    # parse results
    for result in dom.xpath('//div[@class="sa_cc"]'):
        link = result.xpath(".//h3/a")[0]
        url = link.attrib.get("href")
        title = extract_text(link)
        content = extract_text(result.xpath(".//p"))

        # append result
        results.append({"url": url, "title": title, "content": content})

    # parse results again if nothing is found yet
    for result in dom.xpath('//li[@class="b_algo"]'):
        link = result.xpath(".//h2/a")[0]
        url = link.attrib.get("href")
        title = extract_text(link)
        content = extract_text(result.xpath(".//p"))

        # append result
        results.append({"url": url, "title": title, "content": content})

    # return results
    return results
Example #10
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for tweet in dom.xpath(results_xpath):
        try:
            link = tweet.xpath(link_xpath)[0]
            content = extract_text(tweet.xpath(content_xpath)[0])
        except Exception:
            continue

        url = urljoin(base_url, link.attrib.get('href'))
        title = extract_text(tweet.xpath(title_xpath))

        pubdate = tweet.xpath(timestamp_xpath)
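        # attach a publishedDate only when the tweet provides a timestamp node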
        if len(pubdate) > 0:
            timestamp = float(pubdate[0].attrib.get('data-time'))
            publishedDate = datetime.fromtimestamp(timestamp, None)
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content,
                            'publishedDate': publishedDate})
        else:
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})

    # return results
    return results
Example #11
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath('//div[@class="g"]'):

        title = extract_text(result.xpath('.//h3'))
        url = result.xpath('.//div[@class="r"]/a/@href')[0]
        content = extract_text(result.xpath('.//span[@class="st"]'))

        # get thumbnails
        script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text)
        id = result.xpath('.//div[@class="s"]//img/@id')[0]
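        # the inline _setImagesSrc script maps image element ids to base64-encoded thumbnail data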
        thumbnails_data = re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + id,
                                     script)
        tmp = []
        if len(thumbnails_data) != 0:
            tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0])
        thumbnail = ''
        if len(tmp) != 0:
            thumbnail = tmp[-1]

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'thumbnail': thumbnail,
                        'template': 'videos.html'})

    return results
Example #12
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            title = extract_text(result.xpath(title_xpath)[0])
        except:
            continue

        content = extract_text(result.xpath(content_xpath)[0])

        # append result
        results.append({'url': url, 
                        'title': title, 
                        'content': content})

    # if no suggestion found, return results
    if not suggestion_xpath:
        return results

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
Example #13
def response(resp):
    dom = html.fromstring(resp.content)
    search_res = dom.xpath('.//td[@class="x-item"]')

    if not search_res:
        return list()

    results = list()
    for result in search_res:
        url = urljoin(URL, result.xpath('.//a[@title]/@href')[0])
        title = result.xpath('.//a[@title]/text()')[0]
        content = extract_text(result.xpath('.//div[@class="files"]'))
        files_data = extract_text(result.xpath('.//div[@class="tail"]')).split()
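        # the "tail" text exposes the size value and its unit at the FILESIZE / FILESIZE_MULTIPLIER offsets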
        filesize = get_torrent_size(files_data[FILESIZE], files_data[FILESIZE_MULTIPLIER])
        magnetlink = result.xpath('.//div[@class="tail"]//a[@class="title"]/@href')[0]

        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'filesize': filesize,
                        'magnetlink': magnetlink,
                        'seed': 'N/A',
                        'leech': 'N/A',
                        'template': 'torrent.html'})

    return results
Example #14
def response(resp):
    results = []

    doc = fromstring(resp.text)

    # parse results
    for r in doc.xpath(result_xpath):
        try:
            res_url = r.xpath(url_xpath)[-1]
        except:
            continue

        if not res_url:
            continue

        title = extract_text(r.xpath(title_xpath))
        content = extract_text(r.xpath(content_xpath))

        # append result
        results.append({'title': title,
                        'content': content,
                        'url': res_url})

    # return results
    return results
Example #15
def response(resp):
    results = []

    dom = html.fromstring(resp.text)
    regex = re.compile(r'3\.jpg.*$')

    # parse results
    for result in dom.xpath('//div[@class="photo"]'):
        link = result.xpath('.//a')[0]
        url = urljoin(base_url, link.attrib.get('href'))
        title = extract_text(result.xpath('.//div[@class="title"]'))
        thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
        # To have a bigger thumbnail, uncomment the next line
        # thumbnail_src = regex.sub('4.jpg', thumbnail_src)
        content = extract_text(result.xpath('.//div[@class="info"]'))
        img_src = regex.sub('2048.jpg', thumbnail_src)

        # append result
        results.append({'url': url,
                        'title': title,
                        'img_src': img_src,
                        'content': content,
                        'thumbnail_src': thumbnail_src,
                        'template': 'images.html'})

    # return results
    return results
Example #16
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
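        # the data-context-item-id attribute holds the video id used for the url, thumbnail and embed code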
        videoid = result.xpath('@data-context-item-id')[0]

        url = base_youtube_url + videoid
        thumbnail = 'https://i.ytimg.com/vi/' + videoid + '/hqdefault.jpg'

        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])

        embedded = embedded_url.format(videoid=videoid)

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'template': 'videos.html',
                        'embedded': embedded,
                        'thumbnail': thumbnail})

    # return results
    return results
Example #17
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    search_res = dom.xpath('//div[@id="search_res"]/table/tr')

    # return empty array if nothing is found
    if not search_res:
        return []

    # parse results
    for result in search_res:
        link = result.xpath('.//td[@class="torrent_name"]//a')[0]
        href = urljoin(url, link.attrib.get('href'))
        title = extract_text(link)
        content = extract_text(result.xpath('.//pre[@class="snippet"]')[0])
        content = "<br />".join(content.split("\n"))

        filesize = result.xpath('.//span[@class="attr_val"]/text()')[0].split()[0]
        filesize_multiplier = result.xpath('.//span[@class="attr_val"]/text()')[0].split()[1]
        files = result.xpath('.//span[@class="attr_val"]/text()')[1]
        seed = result.xpath('.//span[@class="attr_val"]/text()')[2]

        # convert seed to int if possible
        if seed.isdigit():
            seed = int(seed)
        else:
            seed = 0

        leech = 0

        # convert filesize to byte if possible
        filesize = get_torrent_size(filesize, filesize_multiplier)

        # convert files to int if possible
        if files.isdigit():
            files = int(files)
        else:
            files = None

        magnetlink = result.xpath('.//td[@class="ttth"]//a')[0].attrib['href']

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': content,
                        'seed': seed,
                        'leech': leech,
                        'filesize': filesize,
                        'files': files,
                        'magnetlink': magnetlink,
                        'template': 'torrent.html'})

    # return results sorted by seeder
    return sorted(results, key=itemgetter('seed'), reverse=True)
Example #18
def parse_images(result, google_hostname):
    results = []
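    # every matched image node contributes an images.html result with its target url and img_src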
    for image in result.xpath(images_xpath):
        url = parse_url(extract_text(image.xpath(image_url_xpath)[0]), google_hostname)
        img_src = extract_text(image.xpath(image_img_src_xpath)[0])

        # append result
        results.append({"url": url, "title": "", "content": "", "img_src": img_src, "template": "images.html"})

    return results
Example #19
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    search_res = dom.xpath('//table[@id="searchResult"]//tr')

    # return empty array if nothing is found
    if not search_res:
        return []

    # parse results
    for result in search_res[1:]:
        link = result.xpath('.//div[@class="detName"]//a')[0]
        href = urljoin(url, link.attrib.get("href"))
        title = extract_text(link)
        content = escape(extract_text(result.xpath(content_xpath)))
        seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]

        # convert seed to int if possible
        if seed.isdigit():
            seed = int(seed)
        else:
            seed = 0

        # convert leech to int if possible
        if leech.isdigit():
            leech = int(leech)
        else:
            leech = 0

        magnetlink = result.xpath(magnet_xpath)[0]
        torrentfile_links = result.xpath(torrent_xpath)
        if torrentfile_links:
            torrentfile_link = torrentfile_links[0].attrib.get("href")
        else:
            torrentfile_link = None

        # append result
        results.append(
            {
                "url": href,
                "title": title,
                "content": content,
                "seed": seed,
                "leech": leech,
                "magnetlink": magnetlink.attrib.get("href"),
                "torrentfile": torrentfile_link,
                "template": "torrent.html",
            }
        )

    # return results sorted by seeder
    return sorted(results, key=itemgetter("seed"), reverse=True)
Example #20
def response(resp):
    results = []

    dom = html.fromstring(resp.content)

    # parse results
    for result in dom.xpath('//div[@class="sn_r"]'):
        link = result.xpath('.//div[@class="newstitle"]/a')[0]
        url = link.attrib.get('href')
        title = extract_text(link)
        contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]')
        content = escape(extract_text(contentXPath))

        # parse publishedDate
        publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
                                          '//div[contains(@class,"sn_ST")]'
                                          '//span[contains(@class,"sn_tm")]')

        publishedDate = escape(extract_text(publishedDateXPath))

        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now() - timedelta(minutes=int(timeNumbers[0]))
        elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now() - timedelta(hours=int(timeNumbers[0]))
        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))\
                - timedelta(minutes=int(timeNumbers[1]))
        elif re.match("^[0-9]+ day(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now() - timedelta(days=int(timeNumbers[0]))
        else:
            try:
                publishedDate = parser.parse(publishedDate, dayfirst=False)
            except TypeError:
                publishedDate = datetime.now()
            except ValueError:
                publishedDate = datetime.now()

        # append result
        results.append({'url': url,
                        'title': title,
                        'publishedDate': publishedDate,
                        'content': content})

    # return results
    return results
Example #21
def parse_images(result):
    results = []
    for image in result.xpath(images_xpath):
        url = parse_url(extract_text(image.xpath(image_url_xpath)[0]))
        img_src = extract_text(image.xpath(image_img_src_xpath)[0])

        # append result
        results.append({'url': url,
                        'title': '',
                        'content': '',
                        'img_src': img_src,
                        'template': 'images.html'})

    return results
Example #22
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    search_res = dom.xpath('//table[@id="searchResult"]//tr')

    # return empty array if nothing is found
    if not search_res:
        return []

    # parse results
    for result in search_res[1:]:
        link = result.xpath('.//div[@class="detName"]//a')[0]
        href = urljoin(url, link.attrib.get('href'))
        title = extract_text(link)
        content = extract_text(result.xpath(content_xpath))
        seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]

        # convert seed to int if possible
        if seed.isdigit():
            seed = int(seed)
        else:
            seed = 0

        # convert leech to int if possible
        if leech.isdigit():
            leech = int(leech)
        else:
            leech = 0

        magnetlink = result.xpath(magnet_xpath)[0]
        torrentfile_links = result.xpath(torrent_xpath)
        if torrentfile_links:
            torrentfile_link = torrentfile_links[0].attrib.get('href')
        else:
            torrentfile_link = None

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': content,
                        'seed': seed,
                        'leech': leech,
                        'magnetlink': magnetlink.attrib.get('href'),
                        'torrentfile': torrentfile_link,
                        'template': 'torrent.html'})

    # return results sorted by seeder
    return sorted(results, key=itemgetter('seed'), reverse=True)
Example #23
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    search_lang = ""

    # dirty fix for languages named differently on their site
    if resp.search_params['language'][:2] == 'fa':
        search_lang = 'Farsi'
    elif resp.search_params['language'] == 'pt-BR':
        search_lang = 'Brazilian'
    else:
        search_lang = [lc[3]
                       for lc in language_codes
                       if lc[0].split('-')[0] == resp.search_params['language'].split('-')[0]]
        search_lang = search_lang[0].split(' (')[0]

    # parse results
    for result in dom.xpath(results_xpath):
        link = result.xpath(".//a")[0]
        href = link.attrib.get('href')

        if language != "":
            href = href + language + '/'
        elif search_lang:
            href = href + search_lang + '/'

        title = extract_text(link)

        content = extract_text(result.xpath('.//div[contains(@class,"red")]'))
        content = content + " - "
        text = extract_text(result.xpath('.//div[contains(@class,"grey-web")]')[0])
        content = content + text

        if result.xpath(".//span") != []:
            content = content +\
                " - (" +\
                extract_text(result.xpath(".//span")) +\
                ")"

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': content})

    # return results
    return results
Example #24
def response(resp):
    '''post-response callback
    resp: requests response object
    '''
    results = []

    dom = html.fromstring(resp.text)

    try:
        number_of_results_string = re.sub('[^0-9]', '', dom.xpath(
            '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0]
        )

        results.append({'number_of_results': int(number_of_results_string)})

    except:
        logger.debug("Couldn't read number of results.")
        pass

    for result in dom.xpath('//section[@class="wide" and not(contains(@style,"overflow:hidden"))]'):
        try:
            logger.debug("running for %s" % str(result))
            link = result.xpath('.//h2/a')[0]
            url = link.attrib.get('href')
            title = result.xpath('string(.//h2/a)')
            content = extract_text(result.xpath('.//p'))
            # append result
            results.append({'url': url,
                            'title': title,
                            'content': content})
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    return results
Example #25
def response(resp):
    results = []

    # return empty array if a redirection code is returned
    if resp.status_code == 302:
        return []

    dom = html.fromstring(resp.text)

    regex = re.compile(r'/200H/')

    # parse results
    for result in dom.xpath('//div[contains(@class, "tt-a tt-fh")]'):
        link = result.xpath('.//a[contains(@class, "thumb")]')[0]
        url = urljoin(base_url, link.attrib.get('href'))
        title_links = result.xpath('.//span[@class="details"]//a[contains(@class, "t")]')
        title = extract_text(title_links[0])
        thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
        img_src = regex.sub('/', thumbnail_src)

        # http to https, remove domain sharding
        thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src)
        thumbnail_src = re.sub(r"http://", "https://", thumbnail_src)

        url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url)

        # append result
        results.append({'url': url,
                        'title': title,
                        'img_src': img_src,
                        'thumbnail_src': thumbnail_src,
                        'template': 'images.html'})

    # return results
    return results
Example #26
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    for result in dom.xpath('//div[@class="results"]/dl'):
        name_cell = result.xpath('./dt')[0]
        title = extract_text(name_cell)

        # skip rows that do not contain a link to a torrent
        links = name_cell.xpath('./a')
        if len(links) != 1:
            continue

        # extract url and remove a slash in the beginning
        link = links[0].attrib.get('href').lstrip('/')

        seed = 0
        leech = 0
        try:
            seed = int(result.xpath('./dd/span[4]/text()')[0].replace(',', ''))
            leech = int(result.xpath('./dd/span[5]/text()')[0].replace(',', ''))
        except:
            pass

        params = {
            'url': base_url + link,
            'title': title,
            'seed': seed,
            'leech': leech,
            'template': 'torrent.html'
        }

        # let's try to calculate the torrent size
        try:
            filesize_info = result.xpath('./dd/span[3]/text()')[0]
            filesize, filesize_multiplier = filesize_info.split()
            filesize = get_torrent_size(filesize, filesize_multiplier)

            params['filesize'] = filesize
        except:
            pass

        # does our link contain a valid SHA1 sum?
        if re.compile('[0-9a-fA-F]{40}').match(link):
            # add a magnet link to the result
            params['magnetlink'] = 'magnet:?xt=urn:btih:' + link

        # extract and convert creation date
        try:
            date_ts = result.xpath('./dd/span[2]')[0].attrib.get('title')
            date = datetime.fromtimestamp(float(date_ts))
            params['publishedDate'] = date
        except:
            pass

        results.append(params)

    return results
Example #27
def result_to_text(url, text, htmlResult):
    # TODO : remove result ending with "Meaning" or "Category"
    dom = html.fromstring(htmlResult)
    a = dom.xpath('//a')
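    # prefer the text of the first link in the snippet, otherwise fall back to the plain text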
    if len(a) >= 1:
        return extract_text(a[0])
    else:
        return text
Example #28
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    for result in dom.xpath(results_xpath):
        url = parse_url(extract_url(result.xpath(url_xpath), search_url))
        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])
        results.append({'url': url, 'title': title, 'content': content})

    if not suggestion_xpath:
        return results

    for suggestion in dom.xpath(suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})

    return results
Example #29
def response(resp):
    results = []

    doc = fromstring(resp.text)

    # parse results
    # Quickhits
    for r in doc.xpath('//div[@class="search_quickresult"]/ul/li'):
        try:
            res_url = r.xpath('.//a[@class="wikilink1"]/@href')[-1]
        except:
            continue

        if not res_url:
            continue

        title = extract_text(r.xpath('.//a[@class="wikilink1"]/@title'))

        # append result
        results.append({'title': title,
                        'content': "",
                        'url': base_url + res_url})

    # Search results
    for r in doc.xpath('//dl[@class="search_results"]/*'):
        try:
            if r.tag == "dt":
                res_url = r.xpath('.//a[@class="wikilink1"]/@href')[-1]
                title = extract_text(r.xpath('.//a[@class="wikilink1"]/@title'))
            elif r.tag == "dd":
                content = extract_text(r.xpath('.'))

                # append result
                results.append({'title': title,
                                'content': content,
                                'url': base_url + res_url})
        except:
            continue

        if not res_url:
            continue

    # return results
    return results
Example #30
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        urls = result.xpath(url_xpath)
        if len(urls) != 1:
            continue
        url = sanitize_url(parse_url(extract_url(urls, search_url)))
        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])

        # parse publishedDate
        publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])

        # still useful ?
        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))
        elif re.match("^[0-9]+ days? ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(days=int(re.match(r'\d+', publishedDate).group()))
        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))\
                - timedelta(minutes=int(timeNumbers[1]))
        else:
            try:
                publishedDate = parser.parse(publishedDate)
            except:
                publishedDate = datetime.now()

        if publishedDate.year == 1900:
            publishedDate = publishedDate.replace(year=datetime.now().year)

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'publishedDate': publishedDate})

    # return results
    return results
Example #31
def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)
    img_src_script = eval_xpath(
        dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #     The data-id matches to a item in a json-data structure in::
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #     In this structure the link to the origin PNG, JPG or whatever is given
    # first link per image-div contains an <img> with the data-iid for base64 encoded image data::
    #      <img class="rg_i Q4LuWd" data-iid="0"
    # second link per image-div is the target link::
    #      <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #      <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #      <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):

        try:
            img_alt = eval_xpath(img_node, '@alt')[0]

            img_base64_id = eval_xpath(img_node, '@data-iid')
            if img_base64_id:
                img_base64_id = img_base64_id[0]
                thumbnail_src = img_bas64_map[img_base64_id]
            else:
                thumbnail_src = eval_xpath(img_node, '@src')
                if not thumbnail_src:
                    thumbnail_src = eval_xpath(img_node, '@data-src')
                if thumbnail_src:
                    thumbnail_src = thumbnail_src[0]
                else:
                    thumbnail_src = ''

            link_node = eval_xpath(img_node, '../../../a[2]')[0]
            url = eval_xpath(link_node, '@href')[0]

            pub_nodes = eval_xpath(link_node, './div/div')
            pub_descr = img_alt
            pub_source = ''
            if pub_nodes:
                pub_descr = extract_text(pub_nodes[0])
                pub_source = extract_text(pub_nodes[1])

            img_src_id = eval_xpath(img_node, '../../../@data-id')[0]
            src_url = scrap_img_by_id(img_src_script, img_src_id)
            if not src_url:
                src_url = thumbnail_src

            results.append({
                'url': url,
                'title': img_alt,
                'content': pub_descr,
                'source': pub_source,
                'img_src': src_url,
                # 'img_format': img_format,
                'thumbnail_src': thumbnail_src,
                'template': 'images.html'
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(img_node, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    return results
Example #32
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    search_res = dom.xpath('//section[@id="#torrents"]/div/table/tbody/tr')

    # return empty array if nothing is found
    if not search_res:
        return []

    # parse results
    for result in search_res:
        link = result.xpath('.//a[@id="torrent_name"]')[0]
        href = link.attrib.get('href')
        title = extract_text(link)
        seed = result.xpath('.//td[8]/text()')[0]
        leech = result.xpath('.//td[9]/text()')[0]

        # convert seed to int if possible
        if seed.isdigit():
            seed = int(seed)
        else:
            seed = 0

        # convert leech to int if possible
        if leech.isdigit():
            leech = int(leech)
        else:
            leech = 0

        params = {
            'url': href,
            'title': title,
            'seed': seed,
            'leech': leech,
            'template': 'torrent.html'
        }

        # let's try to calculate the torrent size
        try:
            filesize_info = result.xpath('.//td[6]/text()')[0]
            filesize = filesize_info[:-2]
            filesize_multiplier = filesize_info[-2:].lower()
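            # map the French size suffixes (To/Go/Mo/Ko) onto the IEC units expected by get_torrent_size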
            multiplier_french_to_english = {
                'to': 'TiB',
                'go': 'GiB',
                'mo': 'MiB',
                'ko': 'KiB'
            }
            filesize = get_torrent_size(
                filesize, multiplier_french_to_english[filesize_multiplier])
            params['filesize'] = filesize
        except:
            pass

        # extract and convert creation date
        try:
            date_ts = result.xpath('.//td[5]/div/text()')[0]
            date = datetime.fromtimestamp(float(date_ts))
            params['publishedDate'] = date
        except:
            pass

        # append result
        results.append(params)

    # return results sorted by seeder
    return sorted(results, key=itemgetter('seed'), reverse=True)
Example #33
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    for result in dom.xpath(xpath_results):
        # defaults (category is preset so a failed xpath lookup below cannot leave it undefined)
        filesize = 0
        magnet_link = ""
        torrent_link = ""
        category = ""

        # category in which our torrent belongs
        try:
            category = result.xpath(xpath_category)[0].attrib.get('title')
        except:
            pass

        # torrent title
        page_a = result.xpath(xpath_title)[0]
        title = extract_text(page_a)

        # link to the page
        href = base_url + page_a.attrib.get('href')

        for link in result.xpath(xpath_torrent_links):
            url = link.attrib.get('href')
            if 'magnet' in url:
                # link to the magnet
                magnet_link = url
            else:
                # link to the torrent file
                torrent_link = url

        # seed count
        seed = int_or_zero(result.xpath(xpath_seeds))

        # leech count
        leech = int_or_zero(result.xpath(xpath_leeches))

        # torrent downloads count
        downloads = int_or_zero(result.xpath(xpath_downloads))

        # let's try to calculate the torrent size
        try:
            filesize_info = result.xpath(xpath_filesize)[0]
            filesize, filesize_multiplier = filesize_info.split()
            filesize = get_torrent_size(filesize, filesize_multiplier)
        except:
            pass

        # content string contains all information not included into template
        content = 'Category: "{category}". Downloaded {downloads} times.'
        content = content.format(category=category, downloads=downloads)

        results.append({
            'url': href,
            'title': title,
            'content': content,
            'seed': seed,
            'leech': leech,
            'filesize': filesize,
            'torrentfile': torrent_link,
            'magnetlink': magnet_link,
            'template': 'torrent.html'
        })

    return results
Example #34
def response(resp):
    results = []

    dom = html.fromstring(resp.text)
    rows = dom.xpath(
        '//table[@class="listing"]//tr[contains(@class, "category_0")]')

    # check if there are no results or page layout was changed so we cannot parse it
    # currently there are two rows for each result, so total count must be even
    if len(rows) == 0 or len(rows) % 2 != 0:
        return []

    # regular expression for parsing torrent size strings
    size_re = re.compile(r'Size:\s*([\d.]+)(TB|GB|MB|B)', re.IGNORECASE)

    # processing the results, two rows at a time
    for i in xrange(0, len(rows), 2):
        # parse the first row
        name_row = rows[i]

        links = name_row.xpath('./td[@class="desc-top"]/a')
        params = {
            'template': 'torrent.html',
            'url': links[-1].attrib.get('href'),
            'title': extract_text(links[-1])
        }
        # I have not yet seen any torrents without magnet links, but
        # it's better to be prepared to stumble upon one some day
        if len(links) == 2:
            magnet = links[0].attrib.get('href')
            if magnet.startswith('magnet'):
                # okay, we have a valid magnet link, let's add it to the result
                params['magnetlink'] = magnet

        # no more info in the first row, start parsing the second one
        info_row = rows[i + 1]
        desc = extract_text(info_row.xpath('./td[@class="desc-bot"]')[0])
        for item in desc.split('|'):
            item = item.strip()
            if item.startswith('Size:'):
                try:
                    # ('1.228', 'GB')
                    groups = size_re.match(item).groups()
                    multiplier = get_filesize_mul(groups[1])
                    params['filesize'] = int(multiplier * float(groups[0]))
                except Exception as e:
                    pass
            elif item.startswith('Date:'):
                try:
                    # Date: 2016-02-21 21:44 UTC
                    date = datetime.strptime(item, 'Date: %Y-%m-%d %H:%M UTC')
                    params['publishedDate'] = date
                except Exception as e:
                    pass
            elif item.startswith('Comment:'):
                params['content'] = item
        stats = info_row.xpath('./td[@class="stats"]/span')
        # has the layout not changed yet?
        if len(stats) == 3:
            params['seed'] = int_or_zero(extract_text(stats[0]))
            params['leech'] = int_or_zero(extract_text(stats[1]))

        results.append(params)

    return results
Example #35
def response(resp):
    from searx.webapp import sentry
    results = []

    dom = html.fromstring(resp.text)

    try:
        results.append({'number_of_results': int(dom.xpath('//p[@class="num-tips"]/text()')[0]
                                                 .split(u'\u7ea6')[1].split(u'\u6761')[0].replace(',', ''))})
    except Exception:
        sentry.captureException()

    # parse results
    try:
        for result in dom.xpath('//div[@class="vrwrap"]'):
            try:
                url = result.xpath('.//a')[0].attrib.get('href') if result.xpath('.//a')[0].attrib.get(
                    'href').startswith("http") else "https://sogou.com" + result.xpath('.//a')[0].attrib.get('href')
                # parse weixin.sogou html
                if "http://weixin.sogou.com/" == url.strip():
                    url = result.xpath('.//div[@class="str-pd-box str-pd-none"]//a')[0].attrib.get('href')
                    title = extract_text(
                        result.xpath('.//div[@class="str-pd-box str-pd-none"]//p[@class="str_time"]/a')[0])
                    content = extract_text(
                        result.xpath('.//div[@class="str-pd-box str-pd-none"]//p[@class="str_info"]')[0])
                else:
                    title = extract_text(result.xpath('.//h3/a')[0])
                    content = extract_text(result.xpath('.//div')[0])

                if 'sogou.com/link?url' in url:
                    url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                        url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
                    showurl = re.findall(WEB_URL_REGEX, extract_text(result.xpath('.//div[@class="fb"]')))[0]
                    showurl = showurl.lstrip('.')
                else:
                    showurl = url

                # append result
                results.append({'url': url,
                                'showurl': showurl,
                                'title': title,
                                'content': content})
            except Exception:
                sentry.captureException()
                continue

    except Exception as e:
        sentry.captureException()

    try:
        for result in dom.xpath('//div[@class="rb"]'):
            try:
                url = result.xpath('.//a')[0].attrib.get('href') if result.xpath('.//a')[0].attrib.get(
                    'href').startswith("http") else "https://sogou.com" + result.xpath('.//a')[0].attrib.get('href')
                # to parse sogou weixin html
                if "http://weixin.sogou.com/" == url.strip():
                    url = result.xpath('.//div[@class="str-pd-box str-pd-none"]//a')[0].attrib.get('href')
                    title = extract_text(
                        result.xpath('.//div[@class="str-pd-box str-pd-none"]//p[@class="str_time"]/a')[0])
                    content = extract_text(
                        result.xpath('.//div[@class="str-pd-box str-pd-none"]//p[@class="str_info"]')[0])
                else:
                    title = extract_text(result.xpath('.//h3/a')[0])
                    content = extract_text(result.xpath('.//div')[0])

                if 'sogou.com/link?url' in url:
                    url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                        url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
                    showurl = re.findall(WEB_URL_REGEX, extract_text(result.xpath('.//div[@class="fb"]')))[0]
                    showurl = showurl.lstrip('.')
                else:
                    showurl = url

                results.append({'url': url,
                                'showurl': showurl,
                                'title': title,
                                'content': content})
            except Exception as e:
                sentry.captureException()
                continue

    except Exception as e:
        sentry.captureException()

    # return results
    return results
Example #36
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url),
                            google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if ((parsed_url.netloc == google_hostname
                 and parsed_url.path.startswith(maps_path))
                    or (parsed_url.netloc.startswith(map_hostname_start))):
                x = result.xpath(map_near)
                if len(x) > 0:
                    # map : near the location
                    results = results + parse_map_near(parsed_url, x,
                                                       google_hostname)
                else:
                    # map : detail about a location
                    results = results + parse_map_detail(
                        parsed_url, result, google_hostname)

            # google news
            elif (parsed_url.netloc == google_hostname
                  and parsed_url.path == search_path):
                # skipping news results
                pass

            # images result
            elif (parsed_url.netloc == google_hostname
                  and parsed_url.path == images_path):
                # only thumbnail image provided,
                # so skipping image results
                # results = results + parse_images(result, google_hostname)
                pass

            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result,
                                                     content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({
                    'url': url,
                    'title': title,
                    'content': content
                })
        except:
            logger.debug('result parse error in:\n%s',
                         etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': escape(extract_text(suggestion))})

    # return results
    return results
Example #37
def response(resp):
    results = []

    dom = html.fromstring(resp.content)

    # parse results
    for result in dom.xpath(results_xpath):
        links = result.xpath(link_xpath)
        if not links:
            continue
        link = links[0]
        url = link.attrib.get('href')

        # block google-ad url's
        if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
            continue

        # block startpage search url's
        if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
            continue

        # block ixquick search url's
        if re.match(r"^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url):
            continue

        title = extract_text(link)

        if result.xpath('./p[@class="desc clk"]'):
            content = extract_text(result.xpath('./p[@class="desc clk"]'))
        else:
            content = ''

        published_date = None

        # check if search result starts with something like: "2 Sep 2014 ... "
        if re.match(
                r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ",
                content):
            date_pos = content.find('...') + 4
            date_string = content[0:date_pos - 5]
            published_date = parser.parse(date_string, dayfirst=True)

            # fix content string
            content = content[date_pos:]

        # check if search result starts with something like: "5 days ago ... "
        elif re.match(r"^[0-9]+ days? ago \.\.\. ", content):
            date_pos = content.find('...') + 4
            date_string = content[0:date_pos - 5]

            # calculate datetime
            published_date = datetime.now() - timedelta(
                days=int(re.match(r'\d+', date_string).group()))

            # fix content string
            content = content[date_pos:]

        if published_date:
            # append result
            results.append({
                'url': url,
                'title': title,
                'content': content,
                'publishedDate': published_date
            })
        else:
            # append result
            results.append({'url': url, 'title': title, 'content': content})

    # return results
    return results
Example #38
def extract_text_from_dom(result, xpath):
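    # return the text of the first node matched by xpath, or None when nothing matches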
    r = result.xpath(xpath)
    if len(r) > 0:
        return extract_text(r[0])
    return None
Example #39
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = dom.xpath('//div[@id="_vBb"]//text()')
    if instant_answer:
        results.append({'answer': u' '.join(instant_answer)})
    try:
        results_num = int(
            dom.xpath('//div[@id="resultStats"]//text()')[0].split()
            [1].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url),
                            google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = result.xpath(map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass

                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass

            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result,
                                                     content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({
                    'url': url,
                    'title': title,
                    'content': content
                })
        except:
            logger.debug('result parse error in:\n%s',
                         etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in dom.xpath(spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
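
The results-count parsing in this example indexes into the resultStats text and strips commas, which breaks on locales that use other thousands separators. Purely as a sketch (parse_result_count is not part of the engine), the first numeric token can instead be pulled out with a regular expression:

import re

def parse_result_count(stats_text):
    # e.g. "About 1,234,000 results" or "Ungefähr 1.234.000 Ergebnisse"
    match = re.search(r'\d[\d,.\s]*', stats_text)
    if not match:
        return None
    digits = re.sub(r'\D', '', match.group(0))
    return int(digits) if digits else None

print(parse_result_count('About 1,234,000 results'))        # 1234000
print(parse_result_count('Ungefähr 1.234.000 Ergebnisse'))  # 1234000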
Example #40
0
def getDetail(jsonresponse, wikidata_id, language, locale):
    results = []
    urls = []
    attributes = []

    title = jsonresponse.get('parse', {}).get('displaytitle', {})
    result = jsonresponse.get('parse', {}).get('text', {})

    if not title or not result:
        return results

    title = fromstring(title)
    for elem in title.xpath(language_fallback_xpath):
        elem.getparent().remove(elem)
    title = extract_text(title.xpath(title_xpath))

    result = fromstring(result)
    for elem in result.xpath(language_fallback_xpath):
        elem.getparent().remove(elem)

    description = extract_text(result.xpath(description_xpath))

    # URLS

    # official website
    add_url(urls, result, 'P856', results=results)

    # wikipedia
    wikipedia_link_count = 0
    wikipedia_link = get_wikilink(result, language + 'wiki')
    if wikipedia_link:
        wikipedia_link_count += 1
        urls.append({'title': 'Wikipedia (' + language + ')',
                     'url': wikipedia_link})

    if language != 'en':
        wikipedia_en_link = get_wikilink(result, 'enwiki')
        if wikipedia_en_link:
            wikipedia_link_count += 1
            urls.append({'title': 'Wikipedia (en)',
                         'url': wikipedia_en_link})

    # TODO: get_wiki_firstlanguage
    # if wikipedia_link_count == 0:

    # more wikis
    add_url(urls, result, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage')
    add_url(urls, result, default_label='Wikiquote (' + language + ')', link_type=language + 'wikiquote')
    add_url(urls, result, default_label='Wikimedia Commons', link_type='commonswiki')

    add_url(urls, result, 'P625', 'OpenStreetMap', link_type='geo')

    # musicbrainz
    add_url(urls, result, 'P434', 'MusicBrainz', 'http://musicbrainz.org/artist/')
    add_url(urls, result, 'P435', 'MusicBrainz', 'http://musicbrainz.org/work/')
    add_url(urls, result, 'P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/')
    add_url(urls, result, 'P966', 'MusicBrainz', 'http://musicbrainz.org/label/')

    # IMDb
    add_url(urls, result, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb')
    # source code repository
    add_url(urls, result, 'P1324')
    # blog
    add_url(urls, result, 'P1581')
    # social media links
    add_url(urls, result, 'P2397', 'YouTube', 'https://www.youtube.com/channel/')
    add_url(urls, result, 'P1651', 'YouTube', 'https://www.youtube.com/watch?v=')
    add_url(urls, result, 'P2002', 'Twitter', 'https://twitter.com/')
    add_url(urls, result, 'P2013', 'Facebook', 'https://facebook.com/')
    add_url(urls, result, 'P2003', 'Instagram', 'https://instagram.com/')

    urls.append({'title': 'Wikidata',
                 'url': 'https://www.wikidata.org/wiki/'
                 + wikidata_id + '?uselang=' + language})

    # INFOBOX ATTRIBUTES (ROWS)

    # DATES
    # inception date
    add_attribute(attributes, result, 'P571', date=True)
    # dissolution date
    add_attribute(attributes, result, 'P576', date=True)
    # start date
    add_attribute(attributes, result, 'P580', date=True)
    # end date
    add_attribute(attributes, result, 'P582', date=True)
    # date of birth
    add_attribute(attributes, result, 'P569', date=True)
    # date of death
    add_attribute(attributes, result, 'P570', date=True)
    # date of spacecraft launch
    add_attribute(attributes, result, 'P619', date=True)
    # date of spacecraft landing
    add_attribute(attributes, result, 'P620', date=True)

    # nationality
    add_attribute(attributes, result, 'P27')
    # country of origin
    add_attribute(attributes, result, 'P495')
    # country
    add_attribute(attributes, result, 'P17')
    # headquarters
    add_attribute(attributes, result, 'Q180')

    # PLACES
    # capital
    add_attribute(attributes, result, 'P36', trim=True)
    # head of state
    add_attribute(attributes, result, 'P35', trim=True)
    # head of government
    add_attribute(attributes, result, 'P6', trim=True)
    # type of government
    add_attribute(attributes, result, 'P122')
    # official language
    add_attribute(attributes, result, 'P37')
    # population
    add_attribute(attributes, result, 'P1082', trim=True)
    # area
    add_attribute(attributes, result, 'P2046')
    # currency
    add_attribute(attributes, result, 'P38', trim=True)
    # height (building)
    add_attribute(attributes, result, 'P2048')

    # MEDIA
    # platform (videogames)
    add_attribute(attributes, result, 'P400')
    # author
    add_attribute(attributes, result, 'P50')
    # creator
    add_attribute(attributes, result, 'P170')
    # director
    add_attribute(attributes, result, 'P57')
    # performer
    add_attribute(attributes, result, 'P175')
    # developer
    add_attribute(attributes, result, 'P178')
    # producer
    add_attribute(attributes, result, 'P162')
    # manufacturer
    add_attribute(attributes, result, 'P176')
    # screenwriter
    add_attribute(attributes, result, 'P58')
    # production company
    add_attribute(attributes, result, 'P272')
    # record label
    add_attribute(attributes, result, 'P264')
    # publisher
    add_attribute(attributes, result, 'P123')
    # original network
    add_attribute(attributes, result, 'P449')
    # distributor
    add_attribute(attributes, result, 'P750')
    # composer
    add_attribute(attributes, result, 'P86')
    # publication date
    add_attribute(attributes, result, 'P577', date=True)
    # genre
    add_attribute(attributes, result, 'P136')
    # original language
    add_attribute(attributes, result, 'P364')
    # isbn
    add_attribute(attributes, result, 'Q33057')
    # software license
    add_attribute(attributes, result, 'P275')
    # programming language
    add_attribute(attributes, result, 'P277')
    # version
    add_attribute(attributes, result, 'P348', trim=True)
    # narrative location
    add_attribute(attributes, result, 'P840')

    # LANGUAGES
    # number of speakers
    add_attribute(attributes, result, 'P1098')
    # writing system
    add_attribute(attributes, result, 'P282')
    # regulatory body
    add_attribute(attributes, result, 'P1018')
    # language code
    add_attribute(attributes, result, 'P218')

    # OTHER
    # ceo
    add_attribute(attributes, result, 'P169', trim=True)
    # founder
    add_attribute(attributes, result, 'P112')
    # legal form (company/organization)
    add_attribute(attributes, result, 'P1454')
    # operator
    add_attribute(attributes, result, 'P137')
    # crew members
    add_attribute(attributes, result, 'P1029')
    # taxon
    add_attribute(attributes, result, 'P225')
    # chemical formula
    add_attribute(attributes, result, 'P274')
    # winner (sports/contests)
    add_attribute(attributes, result, 'P1346')
    # number of deaths
    add_attribute(attributes, result, 'P1120')
    # currency code
    add_attribute(attributes, result, 'P498')

    image = add_image(result)

    if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
        results.append({
                       'url': urls[0]['url'],
                       'title': title,
                       'content': description
                       })
    else:
        results.append({
                       'infobox': title,
                       'id': wikipedia_link,
                       'content': description,
                       'img_src': image,
                       'attributes': attributes,
                       'urls': urls
                       })

    return results
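
This example relies on add_url, add_attribute, add_image and get_wikilink helpers defined elsewhere in the wikidata engine module. The snippet below is only an assumption about the rough shape of an add_attribute-style helper, included to make the long run of calls above easier to read; it is not the searx implementation.

# Hypothetical sketch (assumption, not the real searx helper): look up the
# HTML rendered for a Wikidata property id and append a label/value row.
def add_attribute_sketch(attributes, dom, property_id, default_label=None,
                         date=False, trim=False):
    nodes = dom.xpath('//div[@id="' + property_id + '"]')
    if not nodes:
        return
    label = default_label if default_label else property_id
    value = nodes[0].text_content().strip()
    if trim and value:
        # keep only the first rendered line when several values are present
        value = value.splitlines()[0]
    # the real helper also normalizes values to dates when date=True
    attributes.append({'label': label, 'value': value})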
Example #41
0
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    search_res = dom.xpath('//div[@id="search_res"]/table/tr')

    # return empty array if nothing is found
    if not search_res:
        return []

    # parse results
    for result in search_res:
        link = result.xpath('.//td[@class="torrent_name"]//a')[0]
        href = urljoin(url, link.attrib.get('href'))
        title = escape(extract_text(link))
        content = escape(
            extract_text(result.xpath('.//pre[@class="snippet"]')[0]))
        content = "<br />".join(content.split("\n"))

        filesize_info = result.xpath(
            './/span[@class="attr_val"]/text()')[0].split()
        filesize = filesize_info[0]
        filesize_multiplier = filesize_info[1]
        files = result.xpath('.//span[@class="attr_val"]/text()')[1]
        seed = result.xpath('.//span[@class="attr_val"]/text()')[2]

        # convert seed to int if possible
        if seed.isdigit():
            seed = int(seed)
        else:
            seed = 0

        leech = 0

        # convert filesize to byte if possible
        try:
            filesize = float(filesize)

            # convert filesize to byte
            if filesize_multiplier == 'TB':
                filesize = int(filesize * 1024 * 1024 * 1024 * 1024)
            elif filesize_multiplier == 'GB':
                filesize = int(filesize * 1024 * 1024 * 1024)
            elif filesize_multiplier == 'MB':
                filesize = int(filesize * 1024 * 1024)
            elif filesize_multiplier == 'KB':
                filesize = int(filesize * 1024)
        except ValueError:
            filesize = None

        # convert files to int if possible
        if files.isdigit():
            files = int(files)
        else:
            files = None

        magnetlink = result.xpath('.//td[@class="ttth"]//a')[0].attrib['href']

        # append result
        results.append({
            'url': href,
            'title': title,
            'content': content,
            'seed': seed,
            'leech': leech,
            'filesize': filesize,
            'files': files,
            'magnetlink': magnetlink,
            'template': 'torrent.html'
        })

    # return results sorted by seeder
    return sorted(results, key=itemgetter('seed'), reverse=True)
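
The chain of elif branches that converts the size to bytes can also be written with a multiplier table; the snippet below is only an illustrative sketch (to_bytes and FILESIZE_MULTIPLIERS are hypothetical names), using the same 1024-based units as the example.

# Illustrative sketch of the unit conversion above (names are hypothetical).
FILESIZE_MULTIPLIERS = {
    'KB': 1024,
    'MB': 1024 ** 2,
    'GB': 1024 ** 3,
    'TB': 1024 ** 4,
}

def to_bytes(size_str, unit):
    # returns the size in bytes, or None if the value cannot be parsed
    try:
        return int(float(size_str) * FILESIZE_MULTIPLIERS.get(unit, 1))
    except ValueError:
        return None

print(to_bytes('1.5', 'GB'))   # 1610612736
print(to_bytes('n/a', 'MB'))   # None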