Пример #1
0
 def test_get_sanitized_local_url(self):
     """Check get_sanitized_url against a table of (input, expected) pairs.

     Covers an absolute local path, a file:// URL, an http:// URL and a
     relative path.
     """
     expectations = (
         ('/foo/bar', 'file:///foo/bar'),
         ('file:///foo/bar', 'file:///foo/bar'),
         ('http://foo/bar', 'http://foo/bar'),
         ('foo/bar', 'foo/bar'),
     )
     for raw_url, sanitized in expectations:
         self.assertEqual(uu.get_sanitized_url(raw_url), sanitized)
Пример #2
0
    def __init__(self, input_url, output_url):
        """Initialise the syncer with a scope derived from ``input_url``.

        Both URLs are passed through ``get_sanitized_url`` first.  If the
        sanitized input URL ends with '/', it is used as the scope as-is;
        otherwise the scope is the head returned by ``os.path.split`` with
        a trailing '/' appended.
        """
        source_url = get_sanitized_url(input_url)
        target_url = get_sanitized_url(output_url)

        # os.path.dirname(p) is the first element of os.path.split(p).
        scope_root = (source_url if source_url.endswith('/')
                      else os.path.dirname(source_url) + '/')

        GenericSyncer.logger.debug('SCOPE: {0}'.format(scope_root))

        super(SingleURLSyncer, self).__init__(
            input_urls=source_url,
            output_url=target_url,
            scope=[scope_root])
Пример #3
0
    def __init__(self, input_url, output_url):
        """Create a syncer restricted to the directory of ``input_url``.

        ``input_url`` and ``output_url`` are sanitized with
        ``get_sanitized_url``.  The single scope entry handed to the parent
        constructor always ends with '/': either the input URL itself or
        its ``os.path.split`` head followed by '/'.
        """
        input_url = get_sanitized_url(input_url)
        output_url = get_sanitized_url(output_url)

        scope_url = input_url
        if not scope_url.endswith('/'):
            # Drop the last path component and restore the trailing slash.
            head, _tail = os.path.split(scope_url)
            scope_url = head + '/'

        GenericSyncer.logger.debug('SCOPE: {0}'.format(scope_url))

        parent_kwargs = {
            'input_urls': input_url,
            'output_url': output_url,
            'scope': [scope_url],
        }
        super(SingleURLSyncer, self).__init__(**parent_kwargs)
Пример #4
0
    def __init__(self, input_url, output_url):
        """Initialise the syncer with a fixed list of in-scope pages.

        Both URLs are sanitized with ``get_sanitized_url``.  The scope is
        built by appending each known page suffix to the
        ``os.path.split`` head of the sanitized input URL.
        """
        input_url = get_sanitized_url(input_url)
        output_url = get_sanitized_url(output_url)

        page_suffixes = (
            '/gettingStarted.html',
            '/keyboard-howto.html',
            '/table-howto.html',
            '/frame-howto.html',
            '/window-howto.html',
            '/javascript-howto.html',
            '/activeX-howto.html',
            '/logging.html',
            '/faq.html',
            '/index.html',
        )
        base = os.path.dirname(input_url)
        scope = [base + suffix for suffix in page_suffixes]

        super(HtmlUnitSyncer, self).__init__(input_urls=input_url,
                                             output_url=output_url,
                                             scope=scope)
Пример #5
0
    def __init__(self, input_url, output_url):
        """Set up the syncer so that only a fixed set of pages is in scope.

        The input and output URLs are sanitized with ``get_sanitized_url``;
        every scope entry is the ``os.path.split`` head of the input URL
        followed by one of the page suffixes listed below.
        """
        source_url = get_sanitized_url(input_url)
        target_url = get_sanitized_url(output_url)
        parent_url = os.path.split(source_url)[0]

        scope = []
        for page in ('/gettingStarted.html',
                     '/keyboard-howto.html',
                     '/table-howto.html',
                     '/frame-howto.html',
                     '/window-howto.html',
                     '/javascript-howto.html',
                     '/activeX-howto.html',
                     '/logging.html',
                     '/faq.html',
                     '/index.html'):
            scope.append(parent_url + page)

        super(HtmlUnitSyncer, self).__init__(
                input_urls=source_url,
                output_url=target_url,
                scope=scope)
Пример #6
0
 def process_page_links(self, tree, local_url, url):
     """Return a DocumentLink for each link tag in ``tree`` that has an href.

     The href is resolved against ``url`` with ``urlparse.urljoin`` and
     passed through ``get_url_without_hash``; the local target is computed
     by ``get_local_url`` from ``self.output_url`` and then sanitized.
     Tags without an ``href`` attribute are skipped.  ``local_url`` is not
     used by this method.
     """
     links = []
     for link_tag in self.links(tree):
         attributes = link_tag.attrib
         if 'href' not in attributes:
             continue
         target_url = get_url_without_hash(
             urlparse.urljoin(url, attributes['href']))
         local_target = get_sanitized_url(
             get_local_url(self.output_url, target_url))
         links.append(DocumentLink(target_url, local_target))
     return links
Пример #7
0
 def process_page_links(self, tree, local_url, url):
     """Collect DocumentLink objects for the href-bearing link tags of a page.

     For each tag returned by ``self.links(tree)`` that carries an ``href``
     attribute, the absolute link URL (joined with ``url``, passed through
     ``get_url_without_hash``) is paired with its sanitized local
     counterpart under ``self.output_url``.  ``local_url`` is accepted but
     not read here.
     """
     collected = []
     tags = self.links(tree)
     for tag in tags:
         tag_attrs = tag.attrib
         if 'href' in tag_attrs:
             absolute = urlparse.urljoin(url, tag_attrs['href'])
             remote_url = get_url_without_hash(absolute)
             local_copy_url = get_local_url(self.output_url, remote_url)
             local_copy_url = get_sanitized_url(local_copy_url)
             collected.append(DocumentLink(remote_url, local_copy_url))
     return collected
Пример #8
0
def download_content(file_from_path, force=False, real_browser=False):
    """Download ``file_from_path`` and return its decoded content.

    :param file_from_path: Path or URL of the resource; it is sanitized
        with ``get_sanitized_url`` before being opened.
    :param force: Accepted for signature compatibility; not used here.
    :param real_browser: If true, the resource is opened with
        ``get_file_from_real_browser`` instead of ``get_file_from``.
    :returns: A ``(content, encoding)`` tuple where ``content`` is the
        unicode text decoded with the encoding reported by
        ``get_encoding``.
    :raises RecoDocError: If anything goes wrong while opening, reading
        or decoding the resource.
    """
    url = get_sanitized_url(file_from_path)

    try:
        if real_browser:
            file_from = get_file_from_real_browser(url)
        else:
            file_from = get_file_from(url)

        logger.info('Downloading {0}'.format(url))
        try:
            content = file_from.read()
        finally:
            # Always release the handle, even when read() fails.
            file_from.close()

        encoding = get_encoding(content)
        content = unicode(content, encoding)
        return (content, encoding)
    except Exception:
        logger.exception('Error while downloading a file: {0}'.format(url))
        raise RecoDocError('Error downloading {0}'.format(url))
Пример #9
0
def download_content(file_from_path, force=False, real_browser=False):
    """Fetch a resource and return it as ``(unicode_content, encoding)``.

    :param file_from_path: Path or URL to fetch; sanitized with
        ``get_sanitized_url`` first.
    :param force: Unused by this function; kept so callers can pass it.
    :param real_browser: Selects ``get_file_from_real_browser`` over
        ``get_file_from`` for opening the resource.
    :returns: Tuple of the decoded content and the encoding detected by
        ``get_encoding``.
    :raises RecoDocError: On any failure; the original error is logged
        with its traceback.
    """
    # Function-scope import so this block does not depend on the module's
    # import list.
    from contextlib import closing

    url = get_sanitized_url(file_from_path)

    try:
        opener = get_file_from_real_browser if real_browser else get_file_from
        file_from = opener(url)

        logger.info('Downloading {0}'.format(url))
        # closing() guarantees close() runs even if read() raises.
        with closing(file_from):
            raw_content = file_from.read()

        encoding = get_encoding(raw_content)
        return (unicode(raw_content, encoding), encoding)
    except Exception:
        logger.exception('Error while downloading a file: {0}'.format(
            url))
        raise RecoDocError('Error downloading {0}'.format(url))
Пример #10
0
def download_file(file_from_path,
                  file_to_path,
                  force=False,
                  binary=False,
                  real_browser=False):
    """Download ``file_from_path`` into the local file ``file_to_path``.

    :param file_from_path: Path or URL of the source; sanitized with
        ``get_sanitized_url`` before being opened.
    :param file_to_path: Local destination path.
    :param force: If false, the download is skipped when the destination
        already exists and is non-empty.
    :param binary: If true, bytes are copied verbatim; otherwise the
        content is decoded with the encoding reported by ``get_encoding``
        and written out as UTF-8.
    :param real_browser: If true, the source is opened with
        ``get_file_from_real_browser`` instead of ``get_file_from``.
    :raises RecoDocError: If the download fails; a destination file left
        behind by the failed attempt is removed first.
    """
    url = get_sanitized_url(file_from_path)

    if os.path.exists(file_to_path) and \
       os.path.getsize(file_to_path) > 0 and \
       not force:
        logger.info('Skipped downloading {0} because it already exists in '
                    '{1}'.format(url, file_to_path))
        return
    try:
        if real_browser:
            file_from = get_file_from_real_browser(url)
        else:
            file_from = get_file_from(url)

        try:
            if binary:
                file_to = open(file_to_path, 'wb')
            else:
                file_to = codecs.open(file_to_path, 'w', encoding='utf8')

            try:
                logger.info(
                    'Downloading {0} to {1} in mode binary? {2}'.format(
                        url, file_to_path, binary))
                if binary:
                    shutil.copyfileobj(file_from, file_to)
                else:
                    content = file_from.read()
                    encoding = get_encoding(content)
                    file_to.write(unicode(content, encoding))
            finally:
                # Close before the cleanup below tries to delete the file.
                file_to.close()
        finally:
            file_from.close()
    except Exception:
        # logger.exception keeps the traceback, matching download_content.
        logger.exception('Error while downloading a file: {0}'.format(url))
        if os.path.exists(file_to_path):
            os.remove(file_to_path)
        raise RecoDocError('Error downloading {0}'.format(url))
Пример #11
0
def download_file(file_from_path, file_to_path, force=False, binary=False,
        real_browser=False):
    """Copy the resource at ``file_from_path`` to ``file_to_path``.

    :param file_from_path: Source path or URL; passed through
        ``get_sanitized_url``.
    :param file_to_path: Destination path on the local file system.
    :param force: When false, an existing non-empty destination causes the
        download to be skipped.
    :param binary: When true the bytes are copied unchanged; when false the
        content is decoded using ``get_encoding`` and re-encoded as UTF-8.
    :param real_browser: When true ``get_file_from_real_browser`` is used
        to open the source, otherwise ``get_file_from``.
    :raises RecoDocError: On any failure.  The destination file, if it
        exists at that point, is deleted before the error is raised.
    """
    url = get_sanitized_url(file_from_path)

    destination_has_data = (os.path.exists(file_to_path) and
                            os.path.getsize(file_to_path) > 0)
    if destination_has_data and not force:
        logger.info('Skipped downloading {0} because it already exists in '
                '{1}'.format(url, file_to_path))
        return

    try:
        opener = get_file_from_real_browser if real_browser else get_file_from
        file_from = opener(url)
        try:
            if binary:
                file_to = open(file_to_path, 'wb')
            else:
                file_to = codecs.open(file_to_path, 'w', encoding='utf8')

            # The with-block closes the destination even on failure, so the
            # cleanup in the except clause never deletes an open file.
            with file_to:
                logger.info(
                    'Downloading {0} to {1} in mode binary? {2}'.format(
                        url, file_to_path, binary))
                if binary:
                    shutil.copyfileobj(file_from, file_to)
                else:
                    raw = file_from.read()
                    file_to.write(unicode(raw, get_encoding(raw)))
        finally:
            # Release the source handle on success and on failure.
            file_from.close()
    except Exception:
        # Log with traceback, consistent with download_content.
        logger.exception('Error while downloading a file: {0}'.format(
            url))
        if os.path.exists(file_to_path):
            os.remove(file_to_path)
        raise RecoDocError('Error downloading {0}'.format(url))
Пример #12
0
 def test_get_sanitized_local_url(self):
     """Verify get_sanitized_url for local, file://, http:// and relative inputs."""
     sanitize = uu.get_sanitized_url
     cases = [
         ("/foo/bar", "file:///foo/bar"),
         ("file:///foo/bar", "file:///foo/bar"),
         ("http://foo/bar", "http://foo/bar"),
         ("foo/bar", "foo/bar"),
     ]
     for given, expected in cases:
         self.assertEqual(sanitize(given), expected)