Exemplo n.º 1
0
def _generate_index(source, logger):
    """Return the path of the cached user-docs ``index.html``, building it if needed.

    When the cached file already exists it is returned as-is.  Otherwise
    ``source`` is parsed as HTML and every ``<div>`` whose ``id`` is
    ``clist`` or ``tlist`` is passed to ``_update_list`` (with the matching
    ``_update_commands`` / ``_update_tools`` callback) so that commands and
    tools from bundles that come with documentation can be inserted.  The
    resulting document is then written to the cache.

    Returns ``None`` when no toolshed is available.
    """
    from chimerax import app_dirs
    cache_dir = os.path.join(app_dirs.user_cache_dir, 'docs', 'user')
    index_path = os.path.join(cache_dir, 'index.html')
    if os.path.exists(index_path):
        # A previously generated index is reused untouched.
        return index_path
    os.makedirs(cache_dir, exist_ok=True)

    from chimerax.core import toolshed
    shed = toolshed.get_toolshed()
    if shed is None:
        return None

    # Only <div id="..."> elements are candidates for list insertion.
    import lxml.html
    tree = lxml.html.parse(source)
    for div in tree.iterfind(".//div[@id]"):
        div_id = div.attrib["id"]
        if div_id == "clist":
            kind, updater = 'commands', _update_commands
        elif div_id == "tlist":
            kind, updater = 'tools', _update_tools
        else:
            continue
        _update_list(shed, div, kind, updater, logger)

    serialized = lxml.html.tostring(tree)
    os.makedirs(cache_dir, exist_ok=True)
    with open(index_path, 'wb') as out:
        out.write(serialized)
    return index_path
Exemplo n.º 2
0
    def handle(self, **options):
        """Parse brand tables from an HTML file and optionally save them.

        Presumably the entry point of a Django management command (it uses
        ``self.stdout`` and ``**options``) -- confirm against the class.

        Reads ``options['html_file']`` and walks every ``<table>`` in it.  A
        table is treated as a brand table when the element following its
        first child (the caption) has ``colspan="5"`` on both of its first
        two cells; anything else is reported as "Not a brand table" and
        skipped.  For each brand table the brand row is handed to
        ``parseBrand`` and the header row to ``parseProducts``.

        ``Company``, ``Brand`` and ``Site`` objects are only built and saved
        when ``options['save']`` is truthy; a ``Note`` is built whenever
        ``parseBrand`` collected notes but is likewise only saved when
        ``options['save']`` is truthy.
        """
        self.stdout.write('Loading file %s...' % options['html_file'])
        self.stdout.write('Save %s...' % options['save'])
        save = options['save']
        html = lxml.html.parse(options['html_file'])
        tables = html.findall(".//table")
        self.stdout.write('nb tables %s' % len(tables))
        for table in tables:
            self.error_warning = 0

            # First child must be a caption
            if len(table) == 0:
                self.stdout.write('\tNot a brand table')
                continue
            self.caption = table[0]

            # Next row: parse brand name, url, and date
            row = self.caption.getnext()
            # Compare colspan by value: ``is not '5'`` tested object identity,
            # which only worked through an interpreter caching detail.
            is_brand_row = (
                row is not None
                and len(row) >= 2
                and row[0].get('colspan') == '5'
                and row[1].get('colspan') == '5'
            )
            if not is_brand_row:
                self.stdout.write('\tNot a brand table')
                continue

            # Parse Brand
            self.parseBrand(row[1])
            brand_note = None
            new_company = None
            if self.brand_note != []:
                brand_note = Note(note='. '.join(self.brand_note))
                self.stdout.write("\tNote: %s" % brand_note.note)
                if save:
                    brand_note.save()
            if save:
                new_company = Company(
                    name=self.caption.text,
                    validation_date=self.date,
                    note=brand_note,
                    certification=' / '.join(self.certification),
                )
                new_company.save()
                # One Brand per '/'-separated name, one Site per URL.
                for name in self.brand_name.split('/'):
                    Brand(name=name.strip(), company=new_company).save()
                for url in self.url:
                    Site(domain=url.strip('/'), company=new_company).save()

            # Next row must be for table header
            row = row.getnext()
            text = row.find("td").xpath("string()")
            if text != "Description":
                self.stdout.write(
                    red + ('Header table first column %s is not Description' % text) + reset
                )

            # Parse products (new_company stays None when not saving)
            self.parseProducts(row, new_company, save)
Exemplo n.º 3
0
def iter_links(body):
    """Yield the normalized ``href`` of every ``<a>`` element found in *body*.

    *body* is parsed with ``lxml.html.fromstring``; if parsing fails the
    error is logged as a warning and nothing is yielded.

    For each link:

    * empty or missing ``href`` attributes are skipped;
    * a relative href containing ``../`` is first joined to the document's
      ``<base href>`` (when there is one) and reduced to a path;
    * every ``segment/../`` pair is collapsed;
    * a link whose ``../`` cannot be collapsed because it sits at the very
      start of the href is skipped;
    * the fragment (``#...``) and query string (``?...``) are stripped, and
      the result is yielded if anything is left.
    """
    try:
        html = lxml.html.fromstring(body)
    except (lxml.etree.ParseError, lxml.etree.ParserError) as exc:
        # ``warn`` is a deprecated alias of ``warning`` on stdlib loggers.
        logger.warning(exc)
        return

    # "<base href>/" once looked up; "" when the document has no usable
    # <base>.  The document is not modified here, so one lookup is enough.
    base = None

    for link in html.iterfind('.//a'):
        href = link.attrib.get('href')

        if not href:
            continue

        # Resolve against <base> exactly once per link.  Re-applying the base
        # after each collapsed segment would prepend its path again.
        if '../' in href and '://' not in href:
            if base is None:
                base_node = html.find('.//base')
                base_href = base_node.get('href') if base_node is not None else None
                base = base_href.rstrip('/') + '/' if base_href else ''

            if base:
                href = base + href.lstrip('/')
                # Drop scheme and host when the base is an absolute URL; a
                # scheme-less base is already a path.
                if '://' in href:
                    href = '/' + href.split('://', 1)[1].split('/', 1)[-1]

        while '../' in href:
            i = href.find('../')

            if i == 0:
                # Nothing precedes the '../', so it cannot be collapsed.
                # Give up on this link instead of looping forever.
                break

            previous = href.rfind('/', 0, i - 1)
            after = href[i + 3:]

            if previous == -1:
                href = after
            else:
                href = href[:previous] + "/" + after
        else:
            # Reached only when every '../' was collapsed (no ``break``).
            href = href.split('#', 1)[0]
            href = href.split('?', 1)[0]

            if href:
                yield href
def iter_links(body):
    """Yield the normalized ``href`` of every ``<a>`` element found in *body*.

    *body* is parsed with ``lxml.html.fromstring``; if parsing fails the
    error is logged as a warning and nothing is yielded.

    For each link:

    * empty or missing ``href`` attributes are skipped;
    * a relative href containing ``../`` is first joined to the document's
      ``<base href>`` (when there is one) and reduced to a path;
    * every ``segment/../`` pair is collapsed;
    * a link whose ``../`` cannot be collapsed because it sits at the very
      start of the href is skipped;
    * the fragment (``#...``) and query string (``?...``) are stripped, and
      the result is yielded if anything is left.
    """
    try:
        html = lxml.html.fromstring(body)
    except (lxml.etree.ParseError, lxml.etree.ParserError) as exc:
        # ``warn`` is a deprecated alias of ``warning`` on stdlib loggers.
        logger.warning(exc)
        return

    # "<base href>/" once looked up; "" when the document has no usable
    # <base>.  The document is not modified here, so one lookup is enough.
    base = None

    for link in html.iterfind('.//a'):
        href = link.attrib.get('href')

        if not href:
            continue

        # Resolve against <base> exactly once per link.  Re-applying the base
        # after each collapsed segment would prepend its path again.
        if '../' in href and '://' not in href:
            if base is None:
                base_node = html.find('.//base')
                base_href = base_node.get('href') if base_node is not None else None
                base = base_href.rstrip('/') + '/' if base_href else ''

            if base:
                href = base + href.lstrip('/')
                # Drop scheme and host when the base is an absolute URL; a
                # scheme-less base is already a path.
                if '://' in href:
                    href = '/' + href.split('://', 1)[1].split('/', 1)[-1]

        while '../' in href:
            i = href.find('../')

            if i == 0:
                # Nothing precedes the '../', so it cannot be collapsed.
                # Give up on this link instead of looping forever.
                break

            previous = href.rfind('/', 0, i - 1)
            after = href[i + 3:]

            if previous == -1:
                href = after
            else:
                href = href[:previous] + "/" + after
        else:
            # Reached only when every '../' was collapsed (no ``break``).
            href = href.split('#', 1)[0]
            href = href.split('?', 1)[0]

            if href:
                yield href
Exemplo n.º 5
0
    def parse_html(self, url):
        """Extract spatial coverage, author and image URLs from an archived page.

        The part of *url* after ``articles/`` is looked up under
        ``self.base_path``.  If no such file exists the error is logged, *url*
        is added to ``self.error_urls`` and nothing else happens.

        Otherwise the file is parsed and:

        * ``self.spatial_coverage`` is set from the dateline paragraph,
          falling back to ``'Blacksburg, Va.'`` when the dateline is missing
          or the extracted value looks wrong;
        * ``self.author`` is set to the whitespace-normalized text of the
          first ``<li>`` of the last ``<ul>`` (or the one before it), or
          ``None`` when neither yields text;
        * the ``src`` of every ``<img>`` is added to ``self.image_urls``.
        """
        page = url.split('articles/')[-1]
        page_path = self.base_path.joinpath(page)
        if not page_path.exists():
            logging.error('Url `{}` does not map to an HTML file in the archive.'.format(url))
            self.error_urls.add(url)
            return

        # Parse the file whose existence was just checked; the bare relative
        # name would be resolved against the current working directory.
        html = lxml.html.parse(str(page_path))
        logging.info('HTML page `{}` exists, and parses.'.format(url))

        # Dateline is in the first p, unless that is an image, then it is in the third.
        paragraphs = html.findall('.//{*}p')
        dateline_text = paragraphs[0].text if paragraphs else None
        if dateline_text is None and len(paragraphs) > 2:
            dateline_text = paragraphs[2].text
        if dateline_text is None:
            # No usable dateline at all: use the same default as the sanity
            # check below rather than failing on the missing text.
            self.spatial_coverage = 'Blacksburg, Va.'
        elif 'BLACKSBURG, Va.' in dateline_text:
            self.spatial_coverage = 'Blacksburg, Va.'
        else:
            date_issued = self.date_issued.strftime(', %b')
            self.spatial_coverage = dateline_text.split(date_issued)[0].title()
        if (len(self.spatial_coverage) > 25
                or '\n' in self.spatial_coverage
                or self.spatial_coverage == ' '):
            # Sanity check: These are symptoms of errors. Change them to Blacksburg.
            self.spatial_coverage = 'Blacksburg, Va.'
        logging.debug('Spatial Coverage: {}'.format(self.spatial_coverage))

        # Author is in the first li of the last ul, or the one before that, if it exists.
        author = None
        html_lists = html.findall('.//{*}ul')
        for html_list in reversed(html_lists[-2:]):
            first_item = html_list.find('./{*}li')
            if first_item is not None and first_item.text is not None:
                author = first_item.text
                break
        if author is None:
            logging.error('No author found.')
        else:
            author = ' '.join(author.split())
        self.author = author
        logging.debug('Author: {}'.format(self.author))

        # Any img tag is a related file.
        for image in html.iterfind('.//{*}img'):
            src = image.get('src')
            if src is not None:  # an <img> without src has no URL to record
                self.image_urls.add(src)
        if self.image_urls:
            logging.debug('All image urls: {}'.format(self.image_urls))