示例#1
0
def xpath_iter(xpath):
    """Return an iterator of the xpath parsed into the separator, tag, index, and attributes

    >>> list(xpath_iter('/div[1]//span[@class="text"]'))
    [('', 'div', 1, []), ('/', 'span', None, [('class', 'text')])]
    >>> list(xpath_iter('//li[-2]'))
    [('/', 'li', -2, [])]
    >>> list(xpath_iter('/div[@id="content"]//span[1][@class="text"][@title=""]/a'))
    [('', 'div', None, [('id', 'content')]), ('/', 'span', 1, [('class', 'text'), ('title', '')]), ('', 'a', None, [])]
    """
    # Raw strings avoid deprecated escape sequences ('\.' and '\[' in
    # non-raw literals trigger DeprecationWarning in modern Python).
    for separator, token in re.findall(r'(|/|\.\.)/([^/]+)', xpath):
        index, attributes = None, []
        if '[' in token:
            # tag name is everything before the first [...] predicate
            tag = token[:token.find('[')]
            for attribute in re.findall(r'\[(.*?)\]', token):
                try:
                    # a numeric predicate selects by 1-based position
                    index = int(attribute)
                except ValueError:
                    # otherwise expect an @key="value" attribute test
                    match = re.search(r'@(.*?)=["\']?(.*?)["\']?$', attribute)
                    if match:
                        key, value = match.groups()
                        attributes.append((key.lower(), value.lower()))
                    else:
                        raise common.WebScrapingError('Unknown format: ' +
                                                      attribute)
        else:
            tag = token
        yield separator, tag, index, attributes
示例#2
0
    def parse(self, xpath):
        """Parse the xpath into: counter, separator, tag, index, and attributes

        >>> doc = Doc('')
        >>> doc.parse('/div[1]//span[@class="text"]')
        [(0, '', 'div', 1, []), (1, '/', 'span', None, [('class', 'text')])]
        >>> doc.parse('//li[-2]')
        [(0, '/', 'li', -2, [])]
        >>> doc.parse('/div[@id="content"]//span[1][@class="text"][@title=""]/a')
        [(0, '', 'div', None, [('id', 'content')]), (1, '/', 'span', 1, [('class', 'text'), ('title', '')]), (2, '', 'a', None, [])]
        """
        tokens = []
        # Raw strings avoid deprecated escape sequences, and enumerate()
        # replaces the manual counter bookkeeping.
        for counter, (separator, token) in enumerate(
                re.findall(r'(|/|\.\.)/([^/]+)', xpath)):
            index, attributes = None, []
            if '[' in token:
                # tag name is everything before the first [...] predicate
                tag = token[:token.find('[')]
                for attribute in re.findall(r'\[(.*?)\]', token):
                    try:
                        # a numeric predicate selects by 1-based position
                        index = int(attribute)
                    except ValueError:
                        # otherwise expect an @key="value" attribute test
                        match = re.search(r'@(.*?)=["\']?(.*?)["\']?$',
                                          attribute)
                        if match:
                            key, value = match.groups()
                            attributes.append((key.lower(), value.lower()))
                        else:
                            raise common.WebScrapingError('Unknown format: ' +
                                                          attribute)
            else:
                tag = token
            tokens.append((counter, separator, tag, index, attributes))
        return tokens
示例#3
0
def get_earth_radius(scale):
    """Return the earth's radius in the requested unit of measurement.

    scale may be None (unit radius, so distances are a ratio of the
    earth's radius), 'km' for kilometres, or 'miles' for miles.
    Raises common.WebScrapingError for any other value.
    """
    if scale is None:
        return 1.0
    for known_scale, radius in (('km', 6373.0), ('miles', 3960.0)):
        if scale == known_scale:
            return radius
    raise common.WebScrapingError('Invalid scale: %s' % str(scale))
示例#4
0
    def _xpath(self, path, html, limit):
        """Recursively search HTML for content at XPath

        path: list of (counter, separator, tag, index, attributes) steps as
            produced by parse(); the first step is consumed destructively,
            so recursive calls pass a copy (path[:]).
        html: HTML fragment to search within.
        limit: stop collecting matches once more than this many results
            have been gathered at one level.

        Returns a list of matched content strings / attribute values.
        """
        # Consume the next path step (mutates the caller's list).
        counter, separator, tag, index, attributes = path.pop(0)
        if counter == 0:
            # counter 0 marks the root step of a fresh query
            self.num_searches += 1

        results = []
        if tag == '..':
            # parent axis not implemented - the append below is unreachable
            raise common.WebScrapingError('.. not yet supported')
            results.append(self.get_parent(html))
        elif tag == 'text()':
            # extract child text with all markup stripped
            text = self._get_content(self._get_html(html))
            results.append(common.remove_tags(text, keep_children=False))
            # check if next tag is selecting attribute
        elif tag.startswith('@'):
            # @name selects an attribute value of the current element;
            # missing attributes yield the empty string
            attr = tag[1:].lower()
            #parent = self.get_parent(context)
            value = self._get_attributes(html).get(attr, '')
            results.append(value)
        else:
            # have tag
            if counter > 0:
                # get child html when not at root
                html = self._get_content(html)

            # search direct children if / and all descendants if //
            search_fn = self._find_children if separator == '' else self._find_descendants
            matches = search_fn(html, tag)

            # support negative indices: -1 is the last match, etc.
            # (materializing the generator is needed to know its length)
            if index is not None and index < 0:
                matches = list(matches)
                index += len(matches) + 1

            for child_i, child in enumerate(matches):
                # check if matches index (XPath indices are 1-based)
                if index is None or index == child_i + 1:
                    # check if matches attributes
                    if not attributes or self._match_attributes(
                            attributes, self._get_attributes(child)):
                        if path:
                            # recurse on a copy so sibling matches can
                            # re-consume the same remaining path
                            results.extend(self._xpath(path[:], child, limit))
                        else:
                            # final node
                            results.append(self._get_content(child))
                        if len(results) > limit:
                            break
        return results
示例#5
0
    def _find_descendants(self, html, tag):
        """Yield the HTML of each descendant element with this tag type

        >>> doc = Doc('')
        >>> list(doc._find_descendants('<span>1</span><div>abc<div>def</div>abc</div>ghi<div>jkl</div>', 'div'))
        ['<div>abc<div>def</div>abc</div>', '<div>def</div>', '<div>jkl</div>']
        """
        # XXX search with attribute here
        if tag == '*':
            raise common.WebScrapingError("`*' not currently supported for //")
        open_tag_re = re.compile('<%s' % tag, re.DOTALL | re.IGNORECASE)
        for match in open_tag_re.finditer(html):
            element_html, _ = self._split_tag(html[match.start():])
            yield element_html
示例#6
0
文件: alg.py 项目: yuzi3150/SeatPJ2
def distance(p1, p2, scale=None):
    """Calculate distance between 2 (latitude, longitude) points.

    scale:
        By default the distance will be returned as a ratio of the earth's radius
        Use 'km' to return distance in kilometres, 'miles' to return distance in miles

    Raises common.WebScrapingError for any other scale value.

    >>> melbourne = -37.7833, 144.9667
    >>> san_francisco = 37.7750, -122.4183
    >>> int(distance(melbourne, san_francisco, 'km'))
    12659
    """
    if p1 == p2:
        return 0
    lat1, long1 = p1
    lat2, long2 = p2
    # Convert latitude and longitude to
    # spherical coordinates in radians.
    degrees_to_radians = math.pi / 180.0

    # phi = 90 - latitude
    phi1 = (90.0 - lat1) * degrees_to_radians
    phi2 = (90.0 - lat2) * degrees_to_radians

    # theta = longitude
    theta1 = long1 * degrees_to_radians
    theta2 = long2 * degrees_to_radians

    # Compute spherical distance from spherical coordinates.

    # For two locations in spherical coordinates
    # (1, theta, phi) and (1, theta, phi)
    # cosine( arc length ) =
    #    sin phi sin phi' cos(theta-theta') + cos phi cos phi'
    # distance = rho * arc length

    cos = (math.sin(phi1) * math.sin(phi2) * math.cos(theta1 - theta2) +
           math.cos(phi1) * math.cos(phi2))
    # Floating-point rounding can push cos fractionally outside [-1, 1]
    # (nearly identical or antipodal points), which would make math.acos
    # raise ValueError - clamp before taking the arc cosine.
    arc = math.acos(max(-1.0, min(1.0, cos)))

    if scale is None:
        return arc
    elif scale == 'km':
        return arc * 6373
    elif scale == 'miles':
        return arc * 3960
    else:
        raise common.WebScrapingError('Invalid scale: %s' % str(scale))
示例#7
0
def find_descendants(html, tag):
    """Return the HTML of all descendant elements with this tag type

    >>> [str(b) for b in find_descendants('<span>1</span><div>abc<div>def</div>abc</div>ghi<div>jkl</div>', 'div')]
    ['<div>abc<div>def</div>abc</div>', '<div>def</div>', '<div>jkl</div>']
    """
    if tag == '*':
        raise common.WebScrapingError(
            "`*' not currently supported for // because too inefficient")
    open_tag_re = re.compile('<%s' % tag, re.DOTALL | re.IGNORECASE)
    descendants = []
    for match in open_tag_re.finditer(html):
        # buffer() gives a zero-copy view of the remainder when enabled
        if USE_BUFFER:
            remainder = buffer(html, match.start())
        else:
            remainder = html[match.start():]
        element_html, _ = split_tag(remainder)
        descendants.append(element_html)
    return descendants
示例#8
0
def search(html, xpath, remove=None):
    """Query HTML document using XPath
    
    remove is a list of tags to ignore

    Returns a list of the matched content strings (or attribute values
    when the final step is an @attribute selector).

    >>> search('<span>1</span><div>abc<a>LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a>LINK 3</a>jkl</div>', '/div/a')
    ['LINK 1', 'LINK 3']
    >>> search('<div>abc<a class="link">LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a class="link">LINK 3</a>jkl</div>', '/div[1]/a[@class="link"]')
    ['LINK 1']
    >>> search('<div>abc<a class="link">LINK 1</a><div><a>LINK 2</a>def</div>abc</div>ghi<div><a class="link">LINK 3</a>jkl</div>', '/div[1]//a')
    ['LINK 1', 'LINK 2']
    >>> search('<div>abc<a class="link">LINK 1</a></div>', '/div/a/@class')
    ['link']

    # test searching unicode
    >>> search(u'<a href="http://www.google.com" class="flink">google</a>', '//a[@class="flink"]')
    [u'google']

    # test scraping a large amount of content
    len(search('<div><span>!</span></div>' * 10000, '//span'))
    10000
    """
    html = clean_html(html, remove)
    contexts = [html]  # initial context is entire webpage
    # attributes of the elements matched by the previous step, needed
    # when a later step is an @attribute selector
    parent_attributes = []
    for tag_i, (separator, tag, index,
                attributes) in enumerate(xpath_iter(xpath)):
        children = []
        if tag == '..':
            # parent axis is not implemented
            raise common.WebScrapingError('.. not yet supported')
        elif tag == 'text()':
            # extract child text with all markup stripped
            for context in contexts:
                children.append(
                    common.remove_tags(context, keep_children=False))
        elif tag.startswith('@'):
            # select an attribute value from each previously matched element
            name = tag[1:].lower()
            for a in parent_attributes:
                children.append(a.get(name, ''))
        else:
            # have tag
            parent_attributes = []
            for context in contexts:
                # search direct children if / and all descendants if //
                matches = (separator == '' and find_children
                           or find_descendants)(context, tag)
                # XXX change to iterator
                abs_index = index
                if abs_index is not None and abs_index < 0:
                    # support negative indices: -1 is the last match (1-based)
                    abs_index += len(matches) + 1
                for child_i, child in enumerate(matches):
                    if index is None or abs_index == child_i + 1:
                        # matches index if defined
                        child_attributes = get_attributes(child)
                        if match_attributes(attributes, child_attributes):
                            # child matches tag and any defined indices or attributes
                            children.append(get_content(child))
                            parent_attributes.append(child_attributes)
        if not children and tag == 'tbody':
            pass  # skip tbody, which firefox includes in xpath when does not exist
        else:
            contexts = children
        if not contexts:
            # no matches at this step - log which step failed and stop early
            attributes_s = attributes and ''.join('[@%s="%s"]' % a
                                                  for a in attributes) or ''
            common.logger.debug(
                'No matches for <%s%s%s> (tag %d)' %
                (tag, index and '[%d]' % index or '', attributes_s, tag_i + 1))
            break
    return contexts