from json import loads
from urllib.parse import unquote

# regex_json, regex_json_remove_start, regex_json_remove_end and
# regex_img_url_remove_start are bytes regexes defined at module level.


def response(resp):
    results = []

    # the regexes are bytes patterns, so search the raw body, not resp.text
    json_regex = regex_json.search(resp.content)

    # check if results are returned
    if not json_regex:
        return []

    json_raw = regex_json_remove_end.sub(b'', regex_json_remove_start.sub(b'', json_regex.group()))
    json = loads(json_raw.decode('utf-8'))

    # parse results
    for result in json['Results'].get('items', []):
        result_title = result['Title'].replace(u'\uE000', '').replace(u'\uE001', '')

        # parse image results
        if result.get('ContentType', '').startswith('image'):
            img_url = unquote(regex_img_url_remove_start.sub(b'', result['Url'].encode('utf-8')).decode('utf-8'))

            # append result
            results.append({'url': result['SourceUrl'],
                            'title': result['Title'],
                            'content': '',
                            'img_src': img_url,
                            'template': 'images.html'})

        # parse general results
        else:
            result_url = result['Url'].replace(u'\uE000', '').replace(u'\uE001', '')
            result_content = result['Description'].replace(u'\uE000', '').replace(u'\uE001', '')

            # append result
            results.append({'url': result_url,
                            'title': result_title,
                            'content': result_content})

    # parse images
    for result in json.get('Images', []):
        # decode image url
        img_url = unquote(regex_img_url_remove_start.sub(b'', result['Url'].encode('utf-8')).decode('utf-8'))

        # append result
        results.append({'url': result['SourceUrl'],
                        'title': result['Title'],
                        'content': '',
                        'img_src': img_url,
                        'template': 'images.html'})

    # return results
    return results
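# Illustrative sketch, not part of the engine module: the API appears to wrap
# query-term hit highlights in the private-use characters U+E000/U+E001, which
# is why every title, url and description above strips both markers. The
# helper name below is hypothetical.
def _strip_highlight_markers(text):
    # remove the opening (U+E000) and closing (U+E001) highlight markers
    return text.replace(u'\uE000', '').replace(u'\uE001', '')


assert _strip_highlight_markers(u'\uE000Paris\uE001 is highlighted') == 'Paris is highlighted'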
from urllib.parse import unquote


def compare_urls(url_a, url_b):
    # ignore www. in comparison
    if url_a.netloc.startswith('www.'):
        host_a = url_a.netloc.replace('www.', '', 1)
    else:
        host_a = url_a.netloc
    if url_b.netloc.startswith('www.'):
        host_b = url_b.netloc.replace('www.', '', 1)
    else:
        host_b = url_b.netloc

    if host_a != host_b or url_a.query != url_b.query or url_a.fragment != url_b.fragment:
        return False

    # remove a trailing / from the path if present
    path_a = url_a.path[:-1] if url_a.path.endswith('/') else url_a.path
    path_b = url_b.path[:-1] if url_b.path.endswith('/') else url_b.path

    return unquote(path_a) == unquote(path_b)
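# Usage sketch with made-up urls: compare_urls expects already parsed urls
# (urllib.parse.ParseResult), ignores a leading 'www.' and a single trailing
# slash, and compares paths after percent-decoding.
from urllib.parse import urlparse

assert compare_urls(urlparse('https://www.example.org/a/'),
                    urlparse('https://example.org/a'))
assert not compare_urls(urlparse('https://example.org/a?q=1'),
                        urlparse('https://example.org/a?q=2'))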
from urllib.parse import unquote


def parse_url(url_string):
    endings = ['/RS', '/RK']
    endpositions = []
    start = url_string.find('http', url_string.find('/RU=') + 1)

    for ending in endings:
        endpos = url_string.rfind(ending)
        if endpos > -1:
            endpositions.append(endpos)

    if start == 0 or len(endpositions) == 0:
        return url_string
    else:
        end = min(endpositions)
        return unquote(url_string[start:end])
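# Illustrative call; the redirect url is a made-up example of the Yahoo
# '/RU=<percent-encoded target>/RK=.../RS=...' shape this parser targets:
# the target starts at the first 'http' after '/RU=' and ends at the
# earliest '/RK' or '/RS' marker.
redirect = ('https://r.search.yahoo.com/_ylt=A0geK9wO/RU='
            'https%3a%2f%2fwww.example.org%2fpage/RK=2/RS=abcdef-')
assert parse_url(redirect) == 'https://www.example.org/page'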
from urllib.parse import urlparse, unquote


def normalize_url(url):
    parsed_url = urlparse(url)

    # add a / at the end of the url if there is no path
    if not parsed_url.netloc:
        raise Exception('Cannot parse url')
    if not parsed_url.path:
        url += '/'

    # FIXME : hack for yahoo
    if parsed_url.hostname == 'search.yahoo.com' and parsed_url.path.startswith('/r'):
        p = parsed_url.path
        mark = p.find('/**')
        if mark != -1:
            # unquote already returns str under Python 3, no extra decode needed
            return unquote(p[mark + 3:])

    return url
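# Usage sketch with made-up urls: a bare host gains a trailing slash, and a
# yahoo redirect path is unwrapped after its '/**' marker.
assert normalize_url('https://example.org') == 'https://example.org/'
assert normalize_url('https://search.yahoo.com/r/_ylt=abc/**'
                     'https%3a//example.org/') == 'https://example.org/'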
from json import loads
from urllib.parse import unquote

# modelexport_re, image_sizes, logger and build_flickr_url are defined at
# module level in the enclosing flickr engine module.


def response(resp):
    results = []

    matches = modelexport_re.search(resp.text)
    if matches is None:
        return results

    match = matches.group(1)
    model_export = loads(match)

    if 'legend' not in model_export:
        return results

    legend = model_export['legend']

    # handle empty page
    if not legend or not legend[0]:
        return results

    for index in legend:
        photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][int(index[4])]
        author = unquote(photo.get('realname', ''))
        source = unquote(photo.get('username', '')) + ' @ Flickr'
        title = unquote(photo.get('title', ''))
        content = unquote(photo.get('description', ''))
        img_src = None

        # pick the first available size, from the biggest to the smallest
        for image_size in image_sizes:
            if image_size in photo['sizes']:
                img_src = photo['sizes'][image_size]['url']
                img_format = 'jpg {0}x{1}'.format(photo['sizes'][image_size]['width'],
                                                  photo['sizes'][image_size]['height'])
                break

        if not img_src:
            logger.debug('cannot find valid image size: {0}'.format(repr(photo)))
            continue

        # prefer the small 'n' size for the thumbnail, fall back to the bigger
        # 'z' size, then to the full image
        if 'n' in photo['sizes']:
            thumbnail_src = photo['sizes']['n']['url']
        elif 'z' in photo['sizes']:
            thumbnail_src = photo['sizes']['z']['url']
        else:
            thumbnail_src = img_src

        if 'ownerNsid' not in photo:
            # should not happen, disowned photo? Show it anyway
            url = img_src
        else:
            url = build_flickr_url(photo['ownerNsid'], photo['id'])

        results.append({'url': url,
                        'title': title,
                        'img_src': img_src,
                        'thumbnail_src': thumbnail_src,
                        'content': content,
                        'author': author,
                        'source': source,
                        'img_format': img_format,
                        'template': 'images.html'})

    return results
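# Hedged sketch of the module-level helpers the parser above relies on; the
# exact tuple and url pattern are assumptions about the enclosing flickr
# engine module, not confirmed by this excerpt. Flickr size suffixes run from
# 'o' (original) down to 's' (small square), so ordering them largest-first
# makes the size loop above pick the biggest available rendition.
image_sizes = ('o', 'k', 'h', 'b', 'c', 'z', 'n', 'm', 't', 'q', 's')


def build_flickr_url(user_id, photo_id):
    # canonical photo page: https://www.flickr.com/photos/<user>/<photo>
    return 'https://www.flickr.com/photos/{0}/{1}'.format(user_id, photo_id)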