def _merge_infobox(self, infobox):
    add_infobox = True
    infobox_id = infobox.get('id', None)
    if infobox_id is not None:
        for existingIndex in self.infoboxes:
            if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)):
                merge_two_infoboxes(existingIndex, infobox)
                add_infobox = False

    if add_infobox:
        self.infoboxes.append(infobox)

def _merge_result(self, result, position):
    if 'showurl' in result:
        result['parsed_url'] = urlparse(result['showurl'])
    else:
        result['parsed_url'] = urlparse(result['url'])

    # if the result has no scheme, use http as default
    if not result['parsed_url'].scheme:
        result['parsed_url'] = result['parsed_url']._replace(scheme="http")
        result['url'] = urlparse(result['url']).geturl()

    result['engines'] = set([result['engine']])

    # strip multiple spaces and carriage returns from content
    if result.get('content'):
        result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])

    # check for duplicates
    duplicated = False
    for merged_result in self._merged_results:
        if compare_urls(result['parsed_url'], merged_result['parsed_url'])\
                and result.get('template') == merged_result.get('template'):
            duplicated = merged_result
            break

    # merge duplicates together
    if duplicated:
        # use the content with more text
        if result_content_len(result.get('content', '')) >\
                result_content_len(duplicated.get('content', '')):
            duplicated['content'] = result['content']

        # merge all result's parameters not found in duplicate
        for key in result.keys():
            if not duplicated.get(key):
                duplicated[key] = result.get(key)

        # add the new position
        duplicated['positions'].append(position)

        # add engine to list of result-engines
        duplicated['engines'].add(result['engine'])

        # prefer https if possible
        if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
            duplicated['url'] = result['parsed_url'].geturl()
            duplicated['parsed_url'] = result['parsed_url']

    # if there is no duplicate found, append result
    else:
        result['positions'] = [position]
        with RLock():
            self._merged_results.append(result)
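
# Stand-alone sketch of the scheme-defaulting step above (Python 3 stdlib,
# illustrative URL, not part of the original module): a scheme-less URL is
# upgraded to http via _replace().
from urllib.parse import urlparse

parsed = urlparse('//example.org/page')
if not parsed.scheme:
    parsed = parsed._replace(scheme='http')
print(parsed.geturl())  # -> 'http://example.org/page'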

def _merge_infobox(self, infobox):
    add_infobox = True
    infobox_id = infobox.get('id', None)
    infobox['engines'] = set([infobox['engine']])
    if infobox_id is not None:
        parsed_url_infobox_id = urlparse(infobox_id)
        for existingIndex in self.infoboxes:
            if compare_urls(urlparse(existingIndex.get('id', '')), parsed_url_infobox_id):
                merge_two_infoboxes(existingIndex, infobox)
                add_infobox = False

    if add_infobox:
        self.infoboxes.append(infobox)

def response(resp):
    dom = html.fromstring(resp.text)

    results = []
    for element in dom.xpath('//div[@id="search"] //td'):
        link = element.xpath('./a')[0]

        google_url = urlparse(link.xpath('.//@href')[0])
        query = parse_qs(google_url.query)
        source_url = next(iter(query.get('q', [])), None)

        title_parts = element.xpath('./cite//following-sibling::*/text()')
        title_parts.extend(element.xpath('./cite//following-sibling::text()')[:-1])

        result = {
            'title': ''.join(title_parts),
            'content': '',
            'template': 'images.html',
            'url': source_url,
            'img_src': source_url,
            'thumbnail_src': next(iter(link.xpath('.//img //@src')), None)
        }

        if not source_url or not result['thumbnail_src']:
            continue

        results.append(result)

    return results

def clean_url(url):
    parsed = urlparse(url)
    query = [(k, v) for (k, v) in parse_qsl(parsed.query) if k not in ['ixid', 's']]

    return urlunparse((parsed.scheme,
                       parsed.netloc,
                       parsed.path,
                       parsed.params,
                       urlencode(query),
                       parsed.fragment))
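
# Usage sketch for clean_url() above (the URL is illustrative): the 'ixid' and 's'
# tracking parameters are stripped, everything else is kept.
print(clean_url('https://images.unsplash.com/photo-1?ixid=abc&s=xyz&w=1080'))
# -> 'https://images.unsplash.com/photo-1?w=1080'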

def on_result(request, search, result):
    doi = extract_doi(result['parsed_url'])
    if doi and len(doi) < 50:
        for suffix in ('/', '.pdf', '/full', '/meta', '/abstract'):
            if doi.endswith(suffix):
                doi = doi[:-len(suffix)]
        result['url'] = get_doi_resolver(request.args, request.preferences.get_value('doi_resolver')) + doi
        result['parsed_url'] = urlparse(result['url'])
    return True

def on_result(request, search, result):
    doi = extract_doi(result['parsed_url'])
    if doi and len(doi) < 50:
        for suffix in ('/', '.pdf', '/full', '/meta', '/abstract'):
            if doi.endswith(suffix):
                doi = doi[:-len(suffix)]
        result['url'] = 'http://doai.io/' + doi
        result['parsed_url'] = urlparse(result['url'])
    return True

def parse_url(url_string, google_hostname):
    # sanity check
    if url_string is None:
        return url_string

    # normal case
    parsed_url = urlparse(url_string)
    if (parsed_url.netloc in [google_hostname, '']
            and parsed_url.path == redirect_path):
        query = dict(parse_qsl(parsed_url.query))
        return query['q']
    else:
        return url_string
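
# Stand-alone illustration of the unwrapping parse_url() performs (Python 3 stdlib;
# assumes the module-level redirect_path constant is '/url'; the URL is illustrative):
# the real target sits in the 'q' query parameter of the redirect.
from urllib.parse import urlparse, parse_qsl

wrapped = 'https://www.google.com/url?q=https://example.org/&sa=U'
print(dict(parse_qsl(urlparse(wrapped).query))['q'])  # -> 'https://example.org/'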

def https_url_rewrite(result):
    skip_https_rewrite = False

    # check if HTTPS rewrite is possible
    for target, rules, exclusions in https_rules:

        # check if target regex matches the url
        if target.match(result['parsed_url'].netloc):
            # process exclusions
            for exclusion in exclusions:

                # check if exclusion matches the url
                if exclusion.match(result['url']):
                    skip_https_rewrite = True
                    break

            # skip https rewrite if required
            if skip_https_rewrite:
                break

            # process rules
            for rule in rules:
                try:
                    new_result_url = rule[0].sub(rule[1], result['url'])
                except:
                    break

                # parse new url
                new_parsed_url = urlparse(new_result_url)

                # continue if nothing was rewritten
                if result['url'] == new_result_url:
                    continue

                # get domain name from result
                # TODO: this only works correctly for TLDs like
                #       asdf.com, not for asdf.com.de
                # TODO: use publicsuffix instead of this rewrite rule
                old_result_domainname = '.'.join(
                    result['parsed_url'].hostname.split('.')[-2:])
                new_result_domainname = '.'.join(
                    new_parsed_url.hostname.split('.')[-2:])

                # check if the rewritten hostname is the same,
                # to protect against wrong or malicious rewrite rules
                if old_result_domainname == new_result_domainname:
                    # set new url
                    result['url'] = new_result_url

            # target has matched, do not search over the other rules
            break

    return result
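
# Stand-alone sketch of the hostname guard above (Python 3 stdlib; the rule is
# illustrative, not a real HTTPS Everywhere entry): a rewrite is accepted only
# when the last two hostname labels stay the same.
import re
from urllib.parse import urlparse

rule = (re.compile(r'^http://(www\.)?example\.com/'), r'https://www.example.com/')
old_url = 'http://example.com/page'
new_url = rule[0].sub(rule[1], old_url)
same_domain = ('.'.join(urlparse(old_url).hostname.split('.')[-2:])
               == '.'.join(urlparse(new_url).hostname.split('.')[-2:]))
print(new_url, same_domain)  # -> https://www.example.com/page True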

def extract_url(xpath_results, search_url):
    if xpath_results == []:
        raise Exception('Empty url resultset')
    url = extract_text(xpath_results)

    if url.startswith('//'):
        # add http or https to this kind of url //example.com/
        parsed_search_url = urlparse(search_url)
        url = u'{0}:{1}'.format(parsed_search_url.scheme, url)
    elif url.startswith('/'):
        # fix relative url to the search engine
        url = urljoin(search_url, url)

    # normalize url
    url = normalize_url(url)

    return url

def extract_url(xpath_results, search_url):
    if xpath_results == []:
        raise Exception('Empty url resultset')
    url = extract_text(xpath_results)

    if url.startswith('//'):
        # add http or https to this kind of url //example.com/
        parsed_search_url = urlparse(search_url)
        url = u'{0}:{1}'.format(parsed_search_url.scheme or 'http', url)
    elif url.startswith('/'):
        # fix relative url to the search engine
        url = urljoin(search_url, url)

    # normalize url
    url = normalize_url(url)

    return url
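
# Stdlib-only sketch of the two URL fix-ups above (Python 3; URLs are illustrative):
# protocol-relative links inherit the search URL's scheme, root-relative links are
# joined onto the engine's base URL.
from urllib.parse import urlparse, urljoin

search_url = 'https://searx.example/search'
print(u'{0}:{1}'.format(urlparse(search_url).scheme or 'http', '//example.org/page'))
# -> 'https://example.org/page'
print(urljoin(search_url, '/relative/result'))
# -> 'https://searx.example/relative/result'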

def normalize_url(url):
    parsed_url = urlparse(url)

    # add a / at the end of the url if there is no path
    if not parsed_url.netloc:
        raise Exception('Cannot parse url')
    if not parsed_url.path:
        url += '/'

    # FIXME : hack for yahoo
    if parsed_url.hostname == 'search.yahoo.com'\
            and parsed_url.path.startswith('/r'):
        p = parsed_url.path
        mark = p.find('/**')
        if mark != -1:
            return unquote(p[mark + 3:]).decode('utf-8')

    return url
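
# Usage sketch for normalize_url() above (illustrative URLs): a URL without a path
# gets a trailing slash; URLs that already have a path are returned unchanged.
print(normalize_url('https://example.org'))    # -> 'https://example.org/'
print(normalize_url('https://example.org/a'))  # -> 'https://example.org/a'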

def fetch_firefox_versions():
    resp = requests.get(URL, timeout=2.0)
    if resp.status_code != 200:
        raise Exception("Error fetching firefox versions, HTTP code " + str(resp.status_code))
    else:
        dom = html.fromstring(resp.text)
        versions = []

        for link in dom.xpath('//a/@href'):
            url = urlparse(urljoin(URL, link))
            path = url.path
            if path.startswith(RELEASE_PATH):
                version = path[len(RELEASE_PATH):-1]
                if NORMAL_REGEX.match(version):
                    versions.append(LooseVersion(version))

        list.sort(versions, reverse=True)
        return versions

def response(resp):
    img_results = []
    text_results = []

    search_results = json.loads(resp.text)

    # return empty array if there are no results
    if 'data' not in search_results:
        return []

    posts = search_results.get('data', {}).get('children', [])

    # process results
    for post in posts:
        data = post['data']

        # extract post information
        params = {
            'url': urljoin(base_url, data['permalink']),
            'title': data['title']
        }

        # if thumbnail field contains a valid URL, we need to change template
        thumbnail = data['thumbnail']
        url_info = urlparse(thumbnail)

        # netloc & path
        if url_info[1] != '' and url_info[2] != '':
            params['img_src'] = data['url']
            params['thumbnail_src'] = thumbnail
            params['template'] = 'images.html'
            img_results.append(params)
        else:
            created = datetime.fromtimestamp(data['created_utc'])
            content = data['selftext']
            if len(content) > 500:
                content = content[:500] + '...'
            params['content'] = content
            params['publishedDate'] = created
            text_results.append(params)

    # show images first and text results second
    return img_results + text_results

def url_proxy():
    """get real url for baidu, sogou and 360sousuo"""
    url = request.args.get('proxyurl')
    token = request.args.get('token')
    if token != new_hmac(settings['result_proxy']['key'], url.encode('utf-8')):
        return render('404.html'), 404

    if "www.baidu.com/link?url" in url:
        try:
            resp = requests.head(url, timeout=1)
        except requests.exceptions.Timeout:
            return redirect(url)
        if resp.status_code == 200:
            realurl = resp.url
        else:
            realurl = url
        return redirect(realurl)
    else:
        try:
            resp = requests.get(url, timeout=1)
        except requests.exceptions.Timeout:
            return redirect(url)
        if resp.status_code == 200:
            if "http:" not in resp.text and "https:" not in resp.text:
                # try to fix response with host in window.location.replace function
                resp_content = resp.text.strip()
                count = resp_content.index("window.location.replace(")
                str_content = list(resp_content)
                # 25 is len("window.location.replace(")+1
                str_content.insert(count + 25, "https://" + urlparse(url)[1])
                resp_content = "".join(str_content)
                return resp_content
            else:
                # to get url from html response
                return resp.content
        else:
            return redirect(url)

def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for img in dom.xpath('//a'):
        r = {
            'title': u' '.join(img.xpath('.//div[class="rg_ilmbg"]//text()')),
            'content': '',
            'template': 'images.html',
        }
        url = urlparse(img.xpath('.//@href')[0])
        query = parse_qs(url.query)
        r['url'] = query['imgrefurl'][0]
        r['img_src'] = query['imgurl'][0]
        r['thumbnail_src'] = r['img_src']

        # append result
        results.append(r)

    # return results
    return results

def __merge_url_result(self, result, position):
    result['parsed_url'] = urlparse(result['url'])

    # if the result has no scheme, use http as default
    if not result['parsed_url'].scheme:
        result['parsed_url'] = result['parsed_url']._replace(scheme="http")
        result['url'] = result['parsed_url'].geturl()

    result['engines'] = set([result['engine']])

    # strip multiple spaces and carriage returns from content
    if result.get('content'):
        result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])

    duplicated = self.__find_duplicated_http_result(result)
    if duplicated:
        self.__merge_duplicated_http_result(duplicated, result, position)
        return

    # if there is no duplicate found, append result
    result['positions'] = [position]
    with RLock():
        self._merged_results.append(result)

def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_bas64_map = scrap_out_thumbs(dom)

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    #     The data-id matches an item in a json-data structure in::
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    #     In this structure the link to the origin PNG, JPG or whatever is given
    #     (we do not blow out the link there, you could still implement that)
    # first link per image-div contains an <img> with the data-iid for bas64 encoded image data::
    #     <img class="rg_i Q4LuWd" data-iid="0"
    # second link per image-div is the target link::
    #     <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #     <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #     <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):

        try:
            img_alt = eval_xpath(img_node, '@alt')[0]

            img_base64_id = eval_xpath(img_node, '@data-iid')
            if img_base64_id:
                img_base64_id = img_base64_id[0]
                thumbnail_src = img_bas64_map[img_base64_id]
            else:
                thumbnail_src = eval_xpath(img_node, '@src')
                if not thumbnail_src:
                    thumbnail_src = eval_xpath(img_node, '@data-src')
                if thumbnail_src:
                    thumbnail_src = thumbnail_src[0]
                else:
                    thumbnail_src = ''

            link_node = eval_xpath(img_node, '../../../a[2]')[0]
            url = eval_xpath(link_node, '@href')[0]

            pub_nodes = eval_xpath(link_node, './div/div')
            pub_descr = img_alt
            pub_source = ''
            if pub_nodes:
                pub_descr = extract_text(pub_nodes[0])
                pub_source = extract_text(pub_nodes[1])

            results.append({
                'url': url,
                'title': img_alt,
                'content': pub_descr,
                'source': pub_source,
                'img_src': url,
                # 'img_format': img_format,
                'thumbnail_src': thumbnail_src,
                'template': 'images.html'
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(img_node, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    return results

def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = dom.xpath('//div[contains(@id, "ires")]//div[contains(@class, "hp-xpdbox")]')
    if instant_answer:
        answer_re = r'(?P<prefix><a\s+href=")\/url\?q=(?P<url>[^"]+?)\&\;[^"]*(?P<suffix>"\s*>)'
        answer_subst = "\\g<prefix>\\g<url>\\g<suffix>"
        answer_html = ['<br>']
        for element in instant_answer:
            answer_html.append(etree.tostring(element, method="html"))
        answer_str = u' '.join(answer_html)
        answer_fixed = re.sub(answer_re, answer_subst, answer_str, 0, re.MULTILINE)
        results.append({'answer': answer_fixed})

    try:
        results_num = int(dom.xpath('//div[@id="resultStats"]//text()')[0]
                          .split()[1].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = result.xpath(map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass
                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass
            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result, content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({'url': url,
                                'title': title,
                                'content': content
                                })
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in dom.xpath(spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results

def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # results --> answer
    answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()')
    if answer:
        results.append({'answer': ' '.join(answer)})
    else:
        logger.debug("did not find 'answer'")

    # results --> number_of_results
    try:
        _txt = eval_xpath(dom, '//div[@id="result-stats"]//text()')[0]
        _digit = ''.join([n for n in _txt if n.isdigit()])
        number_of_results = int(_digit)
        results.append({'number_of_results': number_of_results})
    except Exception as e:  # pylint: disable=broad-except
        logger.debug("did not find 'number_of_results'")
        logger.error(e, exc_info=True)

    # parse results
    for result in eval_xpath(dom, results_xpath):

        # google *sections*
        if extract_text(eval_xpath(result, g_section_with_header)):
            logger.debug("ignoring <g-section-with-header>")
            continue

        try:
            title = extract_text(eval_xpath(result, title_xpath)[0])
            url = eval_xpath(result, href_xpath)[0]
            content = extract_text_from_dom(result, content_xpath)
            results.append({'url': url, 'title': title, 'content': content})
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(result, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    # parse suggestion
    for suggestion in eval_xpath(dom, suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in eval_xpath(dom, spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results

def image_url_cleanup(url_string):
    parsed_url = urlparse(url_string)
    if parsed_url.netloc.endswith('bing4.com') and parsed_url.path == '/th':
        query = dict(parse_qsl(parsed_url.query))
        return "https://www.bing.com/th?id=" + query.get('id')
    return url_string
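
# Usage sketch for image_url_cleanup() above (the thumbnail URL is illustrative):
# a bing4.com thumbnail URL is rewritten to the canonical www.bing.com/th endpoint,
# keeping only the image id.
print(image_url_cleanup('https://tse1.mm.bing4.com/th?id=OIP.abc123&pid=Api'))
# -> 'https://www.bing.com/th?id=OIP.abc123'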

def url_cleanup(url_string):
    parsed_url = urlparse(url_string)
    if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx':
        query = dict(parse_qsl(parsed_url.query))
        return query.get('url', None)
    return url_string
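
# Usage sketch for url_cleanup() above (illustrative apiclick URL): the real target
# is taken from the percent-encoded 'url' query parameter.
print(url_cleanup('https://www.bing.com/news/apiclick.aspx?url=https%3A%2F%2Fexample.org%2Fstory&c=14'))
# -> 'https://example.org/story'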

def extract_domain(url):
    return urlparse(url)[1]
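
# Usage sketch for extract_domain() above: index 1 of the parse result is the netloc.
print(extract_domain('https://en.wikipedia.org/wiki/URL'))  # -> 'en.wikipedia.org'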

def merge_two_infoboxes(infobox1, infobox2):
    # get engines weights
    if hasattr(engines[infobox1['engine']], 'weight'):
        weight1 = engines[infobox1['engine']].weight
    else:
        weight1 = 1
    if hasattr(engines[infobox2['engine']], 'weight'):
        weight2 = engines[infobox2['engine']].weight
    else:
        weight2 = 1

    if weight2 > weight1:
        infobox1['engine'] = infobox2['engine']

    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []

        for url2 in infobox2.get('urls', []):
            unique_url = True
            for url1 in infobox1.get('urls', []):
                if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))):
                    unique_url = False
                    break
            if unique_url:
                urls1.append(url2)

        infobox1['urls'] = urls1

    if 'img_src' in infobox2:
        img1 = infobox1.get('img_src', None)
        img2 = infobox2.get('img_src')
        if img1 is None:
            infobox1['img_src'] = img2
        elif weight2 > weight1:
            infobox1['img_src'] = img2

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes', None)
        if attributes1 is None:
            attributes1 = []
            infobox1['attributes'] = attributes1

        attributeSet = set()
        for attribute in infobox1.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributeSet.add(attribute.get('label', None))

        for attribute in infobox2.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            if result_content_len(content2) > result_content_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2
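
# Stand-alone sketch of the attribute merge above (hypothetical infobox data):
# attributes from the second infobox are added only when their label is not
# already present in the first one.
box1 = {'attributes': [{'label': 'Born', 'value': '1920'}]}
box2 = {'attributes': [{'label': 'Born', 'value': '1920'},
                       {'label': 'Died', 'value': '1999'}]}
seen = {a.get('label') for a in box1['attributes']}
box1['attributes'].extend(a for a in box2['attributes'] if a.get('label') not in seen)
print([a['label'] for a in box1['attributes']])  # -> ['Born', 'Died']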

def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = dom.xpath('//div[@id="_vBb"]//text()')
    if instant_answer:
        results.append({'answer': u' '.join(instant_answer)})
    try:
        results_num = int(dom.xpath('//div[@id="resultStats"]//text()')[0]
                          .split()[1].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = result.xpath(map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass
                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass
            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result, content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({'url': url,
                                'title': title,
                                'content': content
                                })
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in dom.xpath(spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results

def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = dom.xpath('//div[@id="_vBb"]//text()')
    if instant_answer:
        results.append({'answer': u' '.join(instant_answer)})
    try:
        results_num = int(
            dom.xpath('//div[@id="resultStats"]//text()')[0].split()[1].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = result.xpath(map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass
                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass
            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result, content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({
                    'url': url,
                    'title': title,
                    'content': content
                })
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in dom.xpath(spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results