def xpath_iter(xpath):
    """Yield each step of the xpath parsed into (separator, tag, index, attributes).

    separator is '' for a direct child step (/) and '/' for a descendant step (//).
    index is the 1-based numeric predicate if given (negative allowed), else None.
    attributes is a list of (name, value) pairs, lower-cased for
    case-insensitive matching.

    >>> list(xpath_iter('/div[1]//span[@class="text"]'))
    [('', 'div', 1, []), ('/', 'span', None, [('class', 'text')])]
    >>> list(xpath_iter('//li[-2]'))
    [('/', 'li', -2, [])]
    >>> list(xpath_iter('/div[@id="content"]//span[1][@class="text"][@title=""]/a'))
    [('', 'div', None, [('id', 'content')]), ('/', 'span', 1, [('class', 'text'), ('title', '')]), ('', 'a', None, [])]
    """
    # compile the patterns once (the originals were re-compiled per token and
    # per attribute) and use raw strings so `\.` / `\[` are explicit escapes
    step_re = re.compile(r'(|/|\.\.)/([^/]+)')
    predicate_re = re.compile(r'\[(.*?)\]')
    attribute_re = re.compile(r'@(.*?)=["\']?(.*?)["\']?$')
    for separator, token in step_re.findall(xpath):
        index, attributes = None, []
        if '[' in token:
            tag = token[:token.find('[')]
            # each [...] predicate is either a numeric index or an @attr=value test
            for attribute in predicate_re.findall(token):
                try:
                    index = int(attribute)
                except ValueError:
                    match = attribute_re.search(attribute)
                    if match:
                        key, value = match.groups()
                        # lower-case both sides for case-insensitive matching
                        attributes.append((key.lower(), value.lower()))
                    else:
                        raise common.WebScrapingError('Unknown format: ' + attribute)
        else:
            tag = token
        yield separator, tag, index, attributes
def parse(self, xpath):
    """Parse the xpath into a list of (counter, separator, tag, index, attributes) steps.

    counter is the 0-based position of the step; separator is '' for a direct
    child step (/) and '/' for a descendant step (//); index is the 1-based
    numeric predicate (negative allowed) or None; attributes is a list of
    lower-cased (name, value) pairs.

    >>> doc = Doc('')
    >>> doc.parse('/div[1]//span[@class="text"]')
    [(0, '', 'div', 1, []), (1, '/', 'span', None, [('class', 'text')])]
    >>> doc.parse('//li[-2]')
    [(0, '/', 'li', -2, [])]
    >>> doc.parse('/div[@id="content"]//span[1][@class="text"][@title=""]/a')
    [(0, '', 'div', None, [('id', 'content')]), (1, '/', 'span', 1, [('class', 'text'), ('title', '')]), (2, '', 'a', None, [])]
    """
    # compile once per call instead of per token/attribute; raw strings keep
    # the regex escapes explicit
    step_re = re.compile(r'(|/|\.\.)/([^/]+)')
    predicate_re = re.compile(r'\[(.*?)\]')
    attribute_re = re.compile(r'@(.*?)=["\']?(.*?)["\']?$')
    tokens = []
    # enumerate() replaces the hand-maintained counter variable
    for counter, (separator, token) in enumerate(step_re.findall(xpath)):
        index, attributes = None, []
        if '[' in token:
            tag = token[:token.find('[')]
            # each [...] predicate is either a numeric index or an @attr=value test
            for attribute in predicate_re.findall(token):
                try:
                    index = int(attribute)
                except ValueError:
                    match = attribute_re.search(attribute)
                    if match:
                        key, value = match.groups()
                        # lower-case both sides for case-insensitive matching
                        attributes.append((key.lower(), value.lower()))
                    else:
                        raise common.WebScrapingError('Unknown format: ' + attribute)
        else:
            tag = token
        tokens.append((counter, separator, tag, index, attributes))
    return tokens
def get_earth_radius(scale):
    """Return the earth's radius in the unit named by `scale`.

    scale=None gives 1.0, so distances stay expressed as a ratio of the
    earth's radius; 'km' gives the radius in kilometres and 'miles' in miles.
    Any other value raises common.WebScrapingError.
    """
    if scale is None:
        return 1.0
    if scale == 'km':
        return 6373.0
    if scale == 'miles':
        return 3960.0
    raise common.WebScrapingError('Invalid scale: %s' % str(scale))
def _xpath(self, path, html, limit):
    """Recursively search HTML for content at XPath.

    path: list of (counter, separator, tag, index, attributes) steps as
        produced by parse(); the head step is consumed here (pop), and a
        copy of the remainder is passed down each recursive call.
    html: the HTML fragment (or node wrapper) to search within.
    limit: stop collecting once more than this many results are gathered.

    Returns a list of matched content values.
    """
    # consume the next step — note this mutates the caller's list, which is
    # why recursive calls below pass a copy (path[:])
    counter, separator, tag, index, attributes = path.pop(0)
    if counter == 0:
        # only the root step of a search increments the counter
        self.num_searches += 1
    results = []
    if tag == '..':
        # parent
        raise common.WebScrapingError('.. not yet supported')
        # NOTE(review): unreachable while the raise above is in place
        results.append(self.get_parent(html))
    elif tag == 'text()':
        # extract child text
        text = self._get_content(self._get_html(html))
        results.append(common.remove_tags(text, keep_children=False))
        # check if next tag is selecting attribute
    elif tag.startswith('@'):
        # selecting an attribute value, e.g. @href
        attr = tag[1:].lower()
        #parent = self.get_parent(context)
        value = self._get_attributes(html).get(attr, '')
        results.append(value)
    else:
        # have tag
        if counter > 0:
            # get child html when not at root
            html = self._get_content(html)
        # search direct children if / and all descendants if //
        search_fn = self._find_children if separator == '' else self._find_descendants
        matches = search_fn(html, tag)
        # support negative indices
        if index is not None and index < 0:
            # must materialize the matches to know the length for translation
            matches = list(matches)
            index += len(matches) + 1
        for child_i, child in enumerate(matches):
            # check if matches index (XPath indices are 1-based)
            if index is None or index == child_i + 1:
                # check if matches attributes
                if not attributes or self._match_attributes(attributes, self._get_attributes(child)):
                    if path:
                        # more steps remain: recurse with a copy of the path so
                        # sibling children each see the same remaining steps
                        results.extend(self._xpath(path[:], child, limit))
                    else:
                        # final node
                        results.append(self._get_content(child))
                    if len(results) > limit:
                        break
    #if not children:
    #    attributes_s = attributes and ''.join('[@%s="%s"]' % a for a in attributes) or ''
    #    common.logger.debug('No matches for <%s%s%s> (tag %d)' % (tag, index and '[%d]' % index or '', attributes_s, tag_i + 1))
    return results
def _find_descendants(self, html, tag):
    """Find all descendants with this tag type, yielded in document order.

    >>> doc = Doc('')
    >>> list(doc._find_descendants('<span>1</span><div>abc<div>def</div>abc</div>ghi<div>jkl</div>', 'div'))
    ['<div>abc<div>def</div>abc</div>', '<div>def</div>', '<div>jkl</div>']
    """
    # XXX search with attribute here
    if tag == '*':
        raise common.WebScrapingError("`*' not currently supported for //")
    # \b guards against matching a longer tag name that merely starts with
    # `tag` (e.g. tag='div' must not match '<divider>')
    tag_re = re.compile(r'<%s\b' % tag, re.DOTALL | re.IGNORECASE)
    for match in tag_re.finditer(html):
        tag_html = html[match.start():]
        tag_html, _ = self._split_tag(tag_html)
        yield tag_html
def distance(p1, p2, scale=None):
    """Calculate distance between 2 (latitude, longitude) points.

    scale:
        By default the distance will be returned as a ratio of the earth's radius
        Use 'km' to return distance in kilometres, 'miles' to return distance in miles

    Raises common.WebScrapingError for an unrecognized scale.

    >>> melbourne = -37.7833, 144.9667
    >>> san_francisco = 37.7750, -122.4183
    >>> int(distance(melbourne, san_francisco, 'km'))
    12659
    """
    if p1 == p2:
        return 0
    lat1, long1 = p1
    lat2, long2 = p2
    # Convert latitude and longitude to
    # spherical coordinates in radians.
    degrees_to_radians = math.pi / 180.0
    # phi = 90 - latitude
    phi1 = (90.0 - lat1) * degrees_to_radians
    phi2 = (90.0 - lat2) * degrees_to_radians
    # theta = longitude
    theta1 = long1 * degrees_to_radians
    theta2 = long2 * degrees_to_radians
    # Compute spherical distance from spherical coordinates.
    # For two locations in spherical coordinates
    # (1, theta, phi) and (1, theta, phi)
    # cosine( arc length ) =
    #    sin phi sin phi' cos(theta-theta') + cos phi cos phi'
    # distance = rho * arc length
    cos = (math.sin(phi1) * math.sin(phi2) * math.cos(theta1 - theta2) +
           math.cos(phi1) * math.cos(phi2))
    # clamp to acos's domain [-1, 1]: floating-point error can push `cos`
    # fractionally outside it for nearly coincident or antipodal points,
    # which would raise ValueError from math.acos
    arc = math.acos(max(-1.0, min(1.0, cos)))
    if scale is None:
        return arc
    elif scale == 'km':
        return arc * 6373
    elif scale == 'miles':
        return arc * 3960
    else:
        raise common.WebScrapingError('Invalid scale: %s' % str(scale))
def find_descendants(html, tag):
    """Find all descendants with this tag type, returned in document order.

    >>> [str(b) for b in find_descendants('<span>1</span><div>abc<div>def</div>abc</div>ghi<div>jkl</div>', 'div')]
    ['<div>abc<div>def</div>abc</div>', '<div>def</div>', '<div>jkl</div>']
    """
    if tag == '*':
        raise common.WebScrapingError(
            "`*' not currently supported for // because too inefficient")
    results = []
    # \b guards against matching a longer tag name that merely starts with
    # `tag` (e.g. tag='div' must not match '<divider>')
    tag_re = re.compile(r'<%s\b' % tag, re.DOTALL | re.IGNORECASE)
    for match in tag_re.finditer(html):
        if USE_BUFFER:
            # avoid copying the tail of a large document for each match
            tag_html = buffer(html, match.start())
        else:
            tag_html = html[match.start():]
        tag_html, _ = split_tag(tag_html)
        results.append(tag_html)
    return results
def search(html, xpath, remove=None):
    """Query HTML document using XPath.

    remove is a list of tags to ignore

    Returns the list of matched content strings; returns an empty list (and
    logs a debug message) as soon as one step of the path has no matches.

    >>> search('<span>1</span><div>abc<a>LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a>LINK 3</a>jkl</div>', '/div/a')
    ['LINK 1', 'LINK 3']
    >>> search('<div>abc<a class="link">LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a class="link">LINK 3</a>jkl</div>', '/div[1]/a[@class="link"]')
    ['LINK 1']
    >>> search('<div>abc<a class="link">LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a class="link">LINK 3</a>jkl</div>', '/div[1]//a')
    ['LINK 1', 'LINK 2']
    >>> search('<div>abc<a class="link">LINK 1</a></div>', '/div/a/@class')
    ['link']

    # test searching unicode
    >>> search(u'<a href="http://www.google.com" class="flink">google</a>', '//a[@class="flink"]')
    [u'google']

    # test scraping a large amount of content
    len(search('<div><span>!</span></div>' * 10000, '//span'))
    10000
    """
    html = clean_html(html, remove)
    contexts = [html]  # initial context is entire webpage
    # attributes of the nodes matched by the previous step, needed when the
    # next step selects an attribute (@name)
    parent_attributes = []
    for tag_i, (separator, tag, index, attributes) in enumerate(xpath_iter(xpath)):
        children = []
        if tag == '..':
            # parent
            raise common.WebScrapingError('.. not yet supported')
        elif tag == 'text()':
            # extract child text
            for context in contexts:
                children.append(
                    common.remove_tags(context, keep_children=False))
        elif tag.startswith('@'):
            # selecting attribute
            name = tag[1:].lower()
            for a in parent_attributes:
                children.append(a.get(name, ''))
        else:
            # have tag
            parent_attributes = []
            for context in contexts:
                # search direct children if / and all descendants if //
                matches = (separator == '' and find_children or find_descendants)(context, tag)
                # XXX change to iterator
                abs_index = index
                if abs_index is not None and abs_index < 0:
                    # support negative indices
                    abs_index += len(matches) + 1
                for child_i, child in enumerate(matches):
                    if index is None or abs_index == child_i + 1:
                        # matches index if defined
                        child_attributes = get_attributes(child)
                        if match_attributes(attributes, child_attributes):
                            # child matches tag and any defined indices or attributes
                            children.append(get_content(child))
                            parent_attributes.append(child_attributes)
        if not children and tag == 'tbody':
            pass  # skip tbody, which firefox includes in xpath when does not exist
        else:
            contexts = children
        if not contexts:
            attributes_s = attributes and ''.join('[@%s="%s"]' % a for a in attributes) or ''
            common.logger.debug(
                'No matches for <%s%s%s> (tag %d)' % (tag, index and '[%d]' % index or '', attributes_s, tag_i + 1))
            break
    return contexts