Example #1
 def _toc_from_html(self, opf):
     if 'toc' not in self.oeb.guide:
         return False
     self.log.debug('Reading TOC from HTML...')
     itempath, frag = urldefrag(self.oeb.guide['toc'].href)
     item = self.oeb.manifest.hrefs[itempath]
     html = item.data
     if frag:
         elems = xpath(html, './/*[@id="%s"]' % frag)
         if not elems:
             elems = xpath(html, './/*[@name="%s"]' % frag)
         elem = elems[0] if elems else html
         while elem != html and not xpath(elem, './/h:a[@href]'):
             elem = elem.getparent()
         html = elem
     titles = defaultdict(list)
     order = []
     for anchor in xpath(html, './/h:a[@href]'):
         href = anchor.attrib['href']
         href = item.abshref(urlnormalize(href))
         path, frag = urldefrag(href)
         if path not in self.oeb.manifest.hrefs:
             continue
         title = xml2text(anchor)
         title = COLLAPSE_RE.sub(' ', title.strip())
         if href not in titles:
             order.append(href)
         titles[href].append(title)
     toc = self.oeb.toc
     for href in order:
         toc.add(' '.join(titles[href]), href)
     return True
Example #2
def get_links(url, depth, atmost_count):
    urldfg = urlparse.urldefrag(url)
    url = urldfg[0]
    urls_list = []
    myopener = MyOpener()
    try:
        page = myopener.open(url)
    except:
        return []

    text = page.read()
    page.close()
    url_parsed = urlparse.urlparse(url)
    domain_name_url_arr = url_parsed.netloc.split(".")
    soup = BeautifulSoup(text, "html.parser")
    for tag in soup.findAll('a', href=True):
        if atmost_count == 0:
            break
        tag['href'] = urlparse.urljoin(url, tag['href'])
        new_url = urlparse.urldefrag(tag['href'])[0]
        new_url_parsed = urlparse.urlparse(new_url)
        domain_name_new_url_arr = new_url_parsed.netloc.split('.')
        # Only follow links within the seed URL's registered domain
        if len(domain_name_url_arr) >= 2 and len(domain_name_new_url_arr) >= 2:
            if domain_name_url_arr[-1] != domain_name_new_url_arr[-1] or domain_name_url_arr[-2] != domain_name_new_url_arr[-2]:
                continue
        else:
            continue
        if new_url[-4:] == '.pdf':
            continue
        # urls_list holds [url, depth] pairs, so compare against the stored urls
        if new_url not in [u for u, _ in urls_list]:
            urls_list.append([new_url, depth + 1])
        atmost_count -= 1
    return urls_list
Example #3
def make_correct_link(base, link):
    """
    makes links correct:
        http://... -- pass
        /...       -- adding link to site before
        www. ...   -- adding http:// before
        smth.html  -- adding full path before

        it's planned to be a wrapper over urlparse's functions
        done in order to handle all possible cases of url presentation

        handles absolute url and relative url
        clean up all 'fragments' in url (like http://site.ru/1.html#4 -> http://site.ru/1.html)
    """

    defrag_link, _ = urlparse.urldefrag(link)
    defrag_base, _ = urlparse.urldefrag(base)
    # Case: 'g.html' relative to 'http://ya.ru/a'  ==>  'http://ya.ru/a/g.html'
    # A trailing slash is appended to the base when its last path segment looks
    # like an unslashed folder: no query (unlike 'a.php?set=1'), no dots and no
    # closing slash.
    scheme, netloc, url, params, query, fragment = urlparse.urlparse(defrag_base)
    if url and not query and not re.search(r"/$", url) and not re.search(r"\.", url):
        url += '/'
        defrag_base = urlparse.urlunparse((scheme, netloc, url, params, query, fragment))
    # Rejoin all the parts.
    return_link = urlparse.urljoin(defrag_base, defrag_link)
    return return_link
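A quick usage sketch of the function above (not part of the original example; the URLs are illustrative and assume the Python 2 urlparse module used here):

print(make_correct_link("http://ya.ru/a", "g.html"))            # http://ya.ru/a/g.html
print(make_correct_link("http://site.ru/x/y.html", "/1.html"))  # http://site.ru/1.html
print(make_correct_link("http://site.ru/", "1.html#4"))         # http://site.ru/1.html (fragment stripped)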
Example #4
    def __init__(self, toc, j, renderlist, redirects):
        self.typedoc = StringIO.StringIO()
        self.toc = toc
        self.subs = {}  # type: Dict
        self.docParent = {}  # type: Dict
        self.docAfter = {}  # type: Dict
        self.rendered = set()  # type: Set
        self.redirects = redirects
        self.title = None  # type: str

        for t in j:
            if "extends" in t:
                for e in aslist(t["extends"]):
                    add_dictlist(self.subs, e, t["name"])
                    #if "docParent" not in t and "docAfter" not in t:
                    #    add_dictlist(self.docParent, e, t["name"])

            if t.get("docParent"):
                add_dictlist(self.docParent, t["docParent"], t["name"])

            if t.get("docChild"):
                for c in aslist(t["docChild"]):
                    add_dictlist(self.docParent, t["name"], c)

            if t.get("docAfter"):
                add_dictlist(self.docAfter, t["docAfter"], t["name"])

        _, _, metaschema_loader = schema.get_metaschema()
        alltypes = schema.extend_and_specialize(j, metaschema_loader)

        self.typemap = {}  # type: Dict
        self.uses = {}  # type: Dict
        self.record_refs = {}  # type: Dict
        for t in alltypes:
            self.typemap[t["name"]] = t
            try:
                if t["type"] == "record":
                    self.record_refs[t["name"]] = []
                    for f in t.get("fields", []):
                        p = has_types(f)
                        for tp in p:
                            if tp not in self.uses:
                                self.uses[tp] = []
                            if (t["name"], f["name"]) not in self.uses[tp]:
                                _, frg1 = urlparse.urldefrag(t["name"])
                                _, frg2 = urlparse.urldefrag(f["name"])
                                self.uses[tp].append((frg1, frg2))
                            if tp not in basicTypes and tp not in self.record_refs[t["name"]]:
                                self.record_refs[t["name"]].append(tp)
            except KeyError as e:
                _logger.error("Did not find 'type' in %s", t)
                raise

        for f in alltypes:
            if (f["name"] in renderlist or
                ((not renderlist) and
                 ("extends" not in f) and
                 ("docParent" not in f) and
                 ("docAfter" not in f))):
                self.render_type(f, 1)
Example #5
def crawlWeb(UrlafterConnect, keyword):
    if not UrlafterConnect:
        print("Url is empty")
        return list()
    # Get all the links
    soup = BeautifulSoup(UrlafterConnect)
    urllist = []
    # Check for the existence of the keyword and crawl on those urls
    if re.search(keyword, str(soup), re.IGNORECASE) != None:
        for link in soup.find_all('a', href=True):
            crawl = link.get('href')
            crawl_url = crawl.encode('utf-8')
            if not crawl_url:
                continue
            # Links in the same directory as /wiki: convert them to http form
            if crawl_url.startswith('/wiki'):
                if (crawl_url.find(':') == -1) and (crawl_url != "/wiki/Main_Page"):
                    crawl_url = urlparse.urljoin("http://en.wikipedia.org", crawl_url)
                    crawl_url, frag = urlparse.urldefrag(crawl_url)
                    urllist.append(crawl_url)
            else:
                # Keep only wiki links without colons that do not redirect to the main page
                if crawl_url.startswith('http://en.wikipedia.org'):
                    if crawl_url != "http://en.wikipedia.org/wiki/Main_Page":
                        # Strip the 'http://en' prefix (lstrip would strip characters, not a prefix)
                        crawl = crawl_url[len("http://en"):]
                        if crawl.find(':') == -1:
                            crawl_url, frag = urlparse.urldefrag(crawl_url)
                            urllist.append(crawl_url)
    # Remove duplicate entries from the list while returning
    return list(set(urllist))
Example #6
 def startElementNS(self, name, qname, attrs):
     stack = self.stack
     stack.append(ElementHandler())
     current = self.current
     parent = self.parent
     base = attrs.get(BASE, None)
     if base is not None:
         base, frag = urldefrag(base)
         if parent and parent.base:
             base = urljoin(parent.base, base)
         else:
             systemId = self.locator.getPublicId() or self.locator.getSystemId()
             if systemId:
                 base = urljoin(systemId, base)
     else:
         if parent:
             base = parent.base
         if base is None:
             systemId = self.locator.getPublicId() or self.locator.getSystemId()
             if systemId:
                 base, frag = urldefrag(systemId)
     current.base = base
     language = attrs.get(LANG, None)
     if language is None:
         if parent:
             language = parent.language
     current.language = language
     current.start(name, qname, attrs)
Example #7
def _urljoin(base, url):
    """
    Construct a full ("absolute") URL by combining a "base URL" with another
    URL. Informally, this uses components of the base URL, in particular the
    addressing scheme, the network location and (part of) the path, to provide
    missing components in the relative URL.

    Additionally, the fragment identifier is preserved according to the HTTP
    1.1 bis draft.

    @type base: C{bytes}
    @param base: Base URL.

    @type url: C{bytes}
    @param url: URL to combine with C{base}.

    @return: An absolute URL resulting from the combination of C{base} and
        C{url}.

    @see: L{urlparse.urljoin}

    @see: U{https://tools.ietf.org/html/draft-ietf-httpbis-p2-semantics-22#section-7.1.2}
    """
    base, baseFrag = urldefrag(base)
    url, urlFrag = urldefrag(urljoin(base, url))
    return urljoin(url, b'#' + (urlFrag or baseFrag))
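A brief sketch of the fragment handling the docstring describes (illustrative values, not from the Twisted source; byte strings as the helper expects):

# _urljoin(b"http://example.com/a#sec1", b"b")      returns http://example.com/b#sec1
# _urljoin(b"http://example.com/a#sec1", b"b#sec2") returns http://example.com/b#sec2
# The relative URL's fragment wins; otherwise the base URL's fragment is carried over.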
Example #8
    def parse_showings_table(self, response):
        movie_title = response.meta['movieTitle']
        movie_url = response.meta['movieUrl']
        showings_table_value = response.meta['showingsTableValue']
        theater_url = response.meta['theaterUrl']
        version = response.meta['version']

        showings_table = response.xpath('//div[@class="cinema-movie clearfix"]/div[@value="' + showings_table_value + '"]')
        at_least_one_showing_found = False

        jump_links = showings_table.css('.jump-to-show').xpath('a')
        if len(jump_links) >= 1:
            jump_link = jump_links[-1]
            if jump_link.xpath('text()').extract_first().endswith(u'>'):
                jump_url = urldefrag(response.urljoin(jump_link.xpath('@href').extract_first()))[0]
                request = scrapy.Request(jump_url, callback=self.parse_showings_table)
                request.meta['movieTitle'] = movie_title
                request.meta['movieUrl'] = movie_url
                request.meta['showingsTableValue'] = showings_table_value
                request.meta['theaterUrl'] = theater_url
                request.meta['version'] = version
                yield request
        else:
            for showings_column in showings_table.css('.cinema-movie-dates').xpath('li'):
                for showing_cell in showings_column.xpath('ul/li/a'):
                    at_least_one_showing_found = True
                    dayAndMonth = showings_column.xpath('div[2]/text()').extract_first().split('/')
                    day = int(dayAndMonth[0])
                    month = int(dayAndMonth[1])
                    hourAndMinute = showing_cell.xpath('text()').extract_first().split(':')
                    hour = int(hourAndMinute[0])
                    minute = int(hourAndMinute[1])
                    #seating_info = showing_cell.xpath('@title').extract_first()[len('<div>'):len('</div>')]
                    seating_info = showing_cell.xpath('@title').extract_first()[len('<div>'):-len('</div>')].split('</div><div>')
                    date_obj = datetime(datetime.now().year, month, day, hour, minute)
                    if date_obj < datetime.now():
                        date_obj = datetime(datetime.now().year + 1, month, day, hour, minute)

                    showing = ShowingItem()
                    showing['movieTitle'] = movie_title
                    showing['movieUrl'] = movie_url
                    showing['theaterUrl'] = theater_url
                    showing['seatingInfo'] = seating_info
                    showing['showingUrl'] = response.urljoin(showing_cell.xpath('@href').extract_first())
                    showing['start'] = date_obj.strftime('%Y-%m-%dT%H:%M:00')
                    showing['version'] = version
                    yield showing

            if at_least_one_showing_found:
                next_page = showings_table.css('.showtimes-extra').xpath('a[last()]')
                if next_page:
                    next_page_url = urldefrag(response.urljoin(next_page.xpath('@href')[0].extract()))[0]
                    request = scrapy.Request(next_page_url, callback=self.parse_showings_table)
                    request.meta['movieTitle'] = movie_title
                    request.meta['movieUrl'] = movie_url
                    request.meta['showingsTableValue'] = showings_table_value
                    request.meta['theaterUrl'] = theater_url
                    request.meta['version'] = version
                    yield request
Example #9
def get_links(response):
    if 300 <= response.status_code < 400 and response.headers.get('location'):
        # redirect
        yield urlparse.urldefrag(urlparse.urljoin(response.url, response.headers['location'], False))[0]
    try:
        html = beautify(response)
        for i in html.findAll('a', href=True):
            yield urlparse.urldefrag(urlparse.urljoin(response.url, i['href'], False))[0]
    except NotHtmlException:
        pass
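Example #10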
    def job(self, joborder, basedir, output_callback, **kwargs):
        # Validate job order
        validate.validate_ex(self.names.get_name("input_record_schema", ""), joborder)

        requirements = kwargs.get("requirements", []) + self.tool.get("requirements", [])
        hints = kwargs.get("hints", []) + self.tool.get("hints", [])

        steps = [makeTool(step, basedir) for step in self.tool.get("steps", [])]
        random.shuffle(steps)

        self.state = {}
        self.processStatus = "success"
        for i in self.tool["inputs"]:
            (_, iid) = urlparse.urldefrag(i["id"])
            if iid in joborder:
                self.state[i["id"]] = WorkflowStateItem(i, copy.deepcopy(joborder[iid]))
            elif "default" in i:
                self.state[i["id"]] = WorkflowStateItem(i, copy.deepcopy(i["default"]))
            else:
                raise WorkflowException("Input '%s' not in input object and does not have a default value." % (i["id"]))

        for s in steps:
            for out in s.tool["outputs"]:
                self.state[out["id"]] = None
            s.completed = False

        completed = 0
        while completed < len(steps):
            made_progress = False
            completed = 0
            for step in steps:
                if step.completed:
                    completed += 1
                else:
                    for newjob in self.try_make_job(step, basedir, requirements=requirements, hints=hints, **kwargs):
                        if newjob:
                            made_progress = True
                            yield newjob
            if not made_progress and completed < len(steps):
                yield None

        wo = {}
        for i in self.tool["outputs"]:
            if "connect" in i:
                (_, src) = urlparse.urldefrag(i['id'])
                if i["connect"]["source"] not in self.state:
                    raise WorkflowException("Connect source '%s' on parameter '%s' does not exist" % (i["connect"]["source"], inp["id"]))
                wo[src] = self.state[i["connect"]["source"]].value

        output_callback(wo, self.processStatus)
Example #11
def crawl_web( scope, tocrawl, index, graph, url_info, limits = [-1, 0, 0.0, 1.0]): # returns index, graph of inlinks
    tocrawl_next = []    # used for depth control
    depth = 0
    pages = 0
    max_pages, max_depth, max_time, time_delay = limits

    if max_time > 0.0: start_time = time()
    while tocrawl or tocrawl_next:
        if not tocrawl:
            #
            #   Descent one more level (depth)
            #
            tocrawl = tocrawl_next
            tocrawl_next = []
            depth += 1
            if max_depth >= 0 and depth > max_depth:
                print 'Reached maximum depth. Interrupting crawler.'
                break
            
        page = tocrawl.pop(0)
        # Remove fragment portion from the url
        page = urlparse.urldefrag(page)[0]
        if page not in graph:
            pages += 1
            print 'Crawling page:', page
            if max_time != 0.0: print 'time = ', time()-start_time, ' max_time = ', max_time 
            if max_pages > 0:
                print 'Pages crawled:', pages, 'max_pages = ', max_pages

            # [ToDo:]Transform meta_data into a dictionary
            text, outlinks, meta_data = get_page( page)
            add_page_to_index( index, page, text)
            # Need to filter outlinks only to current scope
            outlinks = [ [urlparse.urldefrag(l[0])[0],l[1]] for l in outlinks if is_inscope( scope, l[0]) and (l[0].endswith('.html') or l[0].endswith('.htm')) ]
            newlinks = [ urlparse.urldefrag(l[0])[0] for l in outlinks]
            graph[page] = outlinks
            url_info[page] = meta_data
            tocrawl_next = list( set(tocrawl_next + newlinks))
            
            if max_pages > 0 and pages >= max_pages:
                print 'Reached number of pages limit. Interrupting crawler.'
                break
            if max_time > 0.0 and time()-start_time > max_time:
                print 'Reached time limit. Interrupting crawler.'
                break

    tocrawl = list( set(tocrawl + tocrawl_next))
    return tocrawl, index, graph, url_info
Example #12
    def _toc_from_navpoint(self, item, toc, navpoint):
        children = xpath(navpoint, 'ncx:navPoint')
        for child in children:
            title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
            title = COLLAPSE_RE.sub(' ', title.strip())
            href = xpath(child, 'ncx:content/@src')
            if not title:
                self._toc_from_navpoint(item, toc, child)
                continue
            if not href:
                gc = xpath(child, 'ncx:navPoint')
                if not gc:
                    # This node is useless
                    continue
                href = 'missing.html'

            href = item.abshref(urlnormalize(href[0]))
            path, _ = urldefrag(href)
            if path not in self.oeb.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % href)
                gc = xpath(child, 'ncx:navPoint')
                if not gc:
                    # This node is useless
                    continue
            id = child.get('id')
            klass = child.get('class', 'chapter')

            try:
                po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
            except:
                po = self.oeb.toc.next_play_order()

            authorElement = xpath(child,
                    'descendant::calibre:meta[@name = "author"]')
            if authorElement:
                author = authorElement[0].text
            else:
                author = None

            descriptionElement = xpath(child,
                    'descendant::calibre:meta[@name = "description"]')
            if descriptionElement:
                description = etree.tostring(descriptionElement[0],
                        method='text', encoding=unicode).strip()
                if not description:
                    description = None
            else:
                description = None

            index_image = xpath(child,
                    'descendant::calibre:meta[@name = "toc_thumbnail"]')
            toc_thumbnail = (index_image[0].text if index_image else None)
            if not toc_thumbnail or not toc_thumbnail.strip():
                toc_thumbnail = None

            node = toc.add(title, href, id=id, klass=klass,
                    play_order=po, description=description, author=author,
                           toc_thumbnail=toc_thumbnail)

            self._toc_from_navpoint(item, node, child)
Example #13
def load_path(db, path):
    basepath, fragment = urldefrag(path)

    with closing(urllib.urlopen(basepath)) as f:
        contents = f.read()

    # Is it a db pickle?
    try:
        source = DbSource(contents)
        for a, b, cs in source.word_lists():
            db.extend(a, b, cs)
        return
    except:
        pass


    # Is it a pdf?
    try:
        source = PdfSource(contents)
        for a, b, c in source.triples():
            db.append(a, b, c)
        return
    except:
        pass

    # treat it as text
    source = TextSource(contents, fragment)
    for a, b, c in source.triples():
        db.append(a, b, c)
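Example #14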
    def __init__(self, toolpath_object, **kwargs):
        try:
            makeTool = kwargs.get("makeTool")
            self.embedded_tool = makeTool(toolpath_object["run"], **kwargs)
        except validate.ValidationException as v:
            raise WorkflowException("Tool definition %s failed validation:\n%s" % (toolpath_object["run"]["id"], validate.indent(str(v))))

        if "id" in toolpath_object:
            self.id = toolpath_object["id"]
        else:
            self.id = "#step_" + str(random.randint(1, 1000000000))

        for field in ("inputs", "outputs"):
            for i in toolpath_object[field]:
                inputid = i["id"]
                (_, d) = urlparse.urldefrag(inputid)
                frag = d.split(".")[-1]
                p = urlparse.urljoin(toolpath_object["run"].get("id", self.id), "#" + frag)
                found = False
                for a in self.embedded_tool.tool[field]:
                    if a["id"] == p:
                        i.update(a)
                        found = True
                if not found:
                    raise WorkflowException("Did not find %s parameter '%s' in workflow step" % (field, p))
                i["id"] = inputid

        super(WorkflowStep, self).__init__(toolpath_object, "Process", do_validate=False, **kwargs)

        if self.embedded_tool.tool["class"] == "Workflow":
            (feature, _) = self.get_requirement("SubworkflowFeatureRequirement")
            if not feature:
                raise WorkflowException("Workflow contains embedded workflow but SubworkflowFeatureRequirement not declared")
Example #15
    def __init__(self, request, timeout=180):
        self.url = urldefrag(request.url)[0]
        self.method = request.method
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support, as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # Since Scrapy implements its own redirect handling, there is no need
        # to add the _waitForDisconnect callback.
        # Specifically, this avoids the AttributeError raised when the
        # clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based on the length of the body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("Connection", "close")
Example #16
    def grab_links(self):
        if self.document is not None:
            for item in self.document.xpath('//a/@href'):
                item = urldefrag(item)[0]
                url = urlparse(item)
                if url.geturl() and item not in self.crawler.visited_urls and url.hostname in self.processor.allowed_urls:
                    self.crawler.urls.put(item)
Example #17
 def serialize_href(self, href, base=None):
     '''
     Serialize the href attribute of an <a> or <reference> tag. It is
     serialized as filepos="000000000" and a pointer to its location is
     stored in self.href_offsets so that the correct value can be filled in
     at the end.
     '''
     hrefs = self.oeb.manifest.hrefs
     try:
         path, frag = urldefrag(urlnormalize(href))
     except ValueError:
         # Unparseable URL
         return False
     if path and base:
         path = base.abshref(path)
     if path and path not in hrefs:
         return False
     buf = self.buf
     item = hrefs[path] if path else None
     if item and item.spine_position is None:
         return False
     path = item.href if item else base.href
     href = '#'.join((path, frag)) if frag else path
     buf.write(b'filepos=')
     self.href_offsets[href].append(buf.tell())
     buf.write(b'0000000000')
     return True
Example #18
    def write_opf(self, guide, toc, spine, resource_map):
        mi = self.header.exth.mi
        if (self.cover_offset is not None and self.cover_offset <
                len(resource_map)):
            mi.cover = resource_map[self.cover_offset]

        if len(list(toc)) < 2:
            self.log.warn('KF8 has no metadata Table of Contents')

            for ref in guide:
                if ref.type == 'toc':
                    href = ref.href()
                    href, frag = urldefrag(href)
                    if os.path.exists(href.replace('/', os.sep)):
                        try:
                            toc = self.read_inline_toc(href, frag)
                        except:
                            self.log.exception('Failed to read inline ToC')

        opf = OPFCreator(os.getcwdu(), mi)
        opf.guide = guide

        def exclude(path):
            return os.path.basename(path) == 'debug-raw.html'

        opf.create_manifest_from_files_in([os.getcwdu()], exclude=exclude)
        opf.create_spine(spine)
        opf.set_toc(toc)

        with open('metadata.opf', 'wb') as of, open('toc.ncx', 'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx')
        return 'metadata.opf'
Example #19
def validate_document(document_loader, workflowobj, uri,
                      enable_dev=False, strict=True, preprocess_only=False):
    # type: (Loader, Dict[unicode, Any], unicode, bool, bool, bool) -> Tuple[Loader, Names, Any, Dict[str, str], unicode]
    """Validate a CWL document."""
    jobobj = None
    if "cwl:tool" in workflowobj:
        jobobj = workflowobj
        uri = urlparse.urljoin(uri, jobobj["cwl:tool"])
        del jobobj["cwl:tool"]
        workflowobj = fetch_document(uri)[1]

    if isinstance(workflowobj, list):
        workflowobj = {
            "$graph": workflowobj
        }

    fileuri = urlparse.urldefrag(uri)[0]

    if "cwlVersion" in workflowobj:
        workflowobj["cwlVersion"] = re.sub(
            r"^(?:cwl:|https://w3id.org/cwl/cwl#)", "",
            workflowobj["cwlVersion"])
    else:
        workflowobj["cwlVersion"] = "draft-2"

    if workflowobj["cwlVersion"] == "draft-2":
        workflowobj = update._draft2toDraft3dev1(
            workflowobj, document_loader, uri, update_steps=False)
        if "@graph" in workflowobj:
            workflowobj["$graph"] = workflowobj["@graph"]
            del workflowobj["@graph"]

    (document_loader, avsc_names) = \
            process.get_schema(workflowobj["cwlVersion"])[:2]

    if isinstance(avsc_names, Exception):
        raise avsc_names

    workflowobj["id"] = fileuri
    processobj, metadata = document_loader.resolve_all(workflowobj, fileuri)

    if preprocess_only:
        return document_loader, avsc_names, processobj, metadata, uri

    document_loader.validate_links(processobj)
    schema.validate_doc(avsc_names, processobj, document_loader, strict)

    if not metadata:
        metadata = {"$namespaces": processobj.get("$namespaces", {}),
                    "$schemas": processobj.get("$schemas", []),
                    "cwlVersion": processobj["cwlVersion"]}

    if metadata.get("cwlVersion") != update.LATEST:
        processobj = update.update(
            processobj, document_loader, fileuri, enable_dev, metadata)

    if jobobj:
        metadata["cwl:defaults"] = jobobj

    return document_loader, avsc_names, processobj, metadata, uri
Example #20
    def serialize_guide(self):
        '''
        The Kindle decides where to open a book based on the presence of
        an item in the guide that looks like
        <reference type="text" title="Start" href="chapter-one.xhtml"/>

        Similarly an item with type="toc" controls where the Goto Table of
        Contents operation on the kindle goes.
        '''

        buf = self.buf
        hrefs = self.oeb.manifest.hrefs
        buf.write(b'<guide>')
        for ref in self.oeb.guide.values():
            path = urldefrag(ref.href)[0]
            if path not in hrefs or hrefs[path].media_type not in OEB_DOCS:
                continue

            buf.write(b'<reference type="')
            if ref.type.startswith('other.') :
                self.serialize_text(ref.type.replace('other.',''), quot=True)
            else:
                self.serialize_text(ref.type, quot=True)
            buf.write(b'" ')
            if ref.title is not None:
                buf.write(b'title="')
                self.serialize_text(ref.title, quot=True)
                buf.write(b'" ')
                if is_guide_ref_start(ref):
                    self._start_href = ref.href
            self.serialize_href(ref.href)
            # Space required or won't work, I kid you not
            buf.write(b' />')

        buf.write(b'</guide>')
Example #21
	def reduce_url(cls, url):
		"""
		>>> url = "qpfer://mother/qfid#module_and_type_hint"
		>>> qfurl.reduce_url(url)
		'qpfer://mother/qfid'
		"""
		return urlparse.urldefrag(url)[0].replace("///", "", 1)
Example #22
def make_tool(document_loader, avsc_names, processobj, metadata, uri, makeTool,
              kwargs):
    # type: (Loader, Names, Dict[str, Any], Dict[str, Any], unicode, Callable[..., Process], Dict[str, Any]) -> Process
    """Make a Python CWL object."""
    resolveduri = document_loader.resolve_ref(uri)[0]

    if isinstance(resolveduri, list):
        if len(resolveduri) == 1:
            processobj = resolveduri[0]
        else:
            raise WorkflowException(
                u"Tool file contains graph of multiple objects, must specify "
                "one of #%s" % ", #".join(
                    urlparse.urldefrag(i["id"])[1] for i in resolveduri
                    if "id" in i))
    else:
        processobj = cast(Dict[str, Any], resolveduri)

    kwargs = kwargs.copy()
    kwargs.update({
        "makeTool": makeTool,
        "loader": document_loader,
        "avsc_names": avsc_names,
        "metadata": metadata
    })
    tool = makeTool(processobj, **kwargs)

    if "cwl:defaults" in metadata:
        jobobj = metadata["cwl:defaults"]
        for inp in tool.tool["inputs"]:
            if shortname(inp["id"]) in jobobj:
                inp["default"] = jobobj[shortname(inp["id"])]

    return tool
Example #23
    def url(self, name, force=False):
        """
        Returns the real URL in DEBUG mode.
        """
        if settings.DEBUG and not force:
            hashed_name, fragment = name, ''
        else:
            clean_name, fragment = urldefrag(name)
            if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
                hashed_name = name
            else:
                cache_key = self.cache_key(name)
                hashed_name = self.cache.get(cache_key)
                if hashed_name is None:
                    hashed_name = self.hashed_name(clean_name).replace('\\', '/')
                    # set the cache if there was a miss
                    # (e.g. if cache server goes down)
                    self.cache.set(cache_key, hashed_name)

        final_url = super(CachedFilesMixin, self).url(hashed_name)

        # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
        # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
        query_fragment = '?#' in name  # [sic!]
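        # (Aside, not in the original: a plain urldefrag() drops the bare '?' that
        # this hack relies on, e.g. urldefrag('myfont.eot?#iefix') returns
        # ('myfont.eot', 'iefix'), which is why query_fragment is checked separately.)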
        if fragment or query_fragment:
            urlparts = list(urlsplit(final_url))
            if fragment and not urlparts[4]:
                urlparts[4] = fragment
            if query_fragment and not urlparts[3]:
                urlparts[2] += '?'
            final_url = urlunsplit(urlparts)

        return unquote(final_url)
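Example #24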
 def receive_output(self, jobout, processStatus):
     _logger.debug("WorkflowStep output from run is %s", jobout)
     self.output = {}
     for i in self.tool["outputs"]:
         (_, d) = urlparse.urldefrag(i["param"] if "param" in i else i["id"])
         self.output[i["id"]] = jobout[d]
     self.processStatus = processStatus
Example #25
 def map_resources(self, oeb_book):
     for item in oeb_book.manifest:
         if item.media_type in OEB_IMAGES:
             if item.href not in self.images:
                 ext = os.path.splitext(item.href)[1]
                 fname = '%s%s' % (len(self.images), ext)
                 fname = fname.zfill(10)
                 self.images[item.href] = fname
         if item in oeb_book.spine:
             self.get_link_id(item.href)
             root = item.data.find(XHTML('body'))
             link_attrs = set(html.defs.link_attrs)
             link_attrs.add(XLINK('href'))
             for el in root.iter():
                 attribs = el.attrib
                 try:
                     if not isinstance(el.tag, basestring):
                         continue
                 except:
                     continue
                 for attr in attribs:
                     if attr in link_attrs:
                         href = item.abshref(attribs[attr])
                         href, id = urldefrag(href)
                         if href in self.base_hrefs:
                             self.get_link_id(href, id)
Example #26
    def handle_starttag(self, tag, attrs):
        """
        """
        self.html += "<%s" % tag

        for attr in attrs:
            if attr[0] == "href":
                try:
                    # split anchor from url
                    baseurl, anchor = urlparse.urldefrag(attr[1])
                    o = self.context.restrictedTraverse(
                        urllib.unquote(baseurl))
                    if getattr(o, 'absolute_url', None):
                        url = o.absolute_url()
                    else:
                        # maybe we got a view instead of an traversal object:
                        if getattr(o, 'context', None):
                            url = o.context.absolute_url()
                        else:
                            url = attr[1]
                    if anchor:
                        url += '#' + anchor
                except Exception:
                    url = attr[1]
                self.html += ' href="%s"' % self._encode(url)
            else:
                self.html += ' %s="%s"' % (attr)

        self.html += ">"
Example #27
def escape_ajax(url):
    """
    Return the crawleable url according to:
    http://code.google.com/web/ajaxcrawling/docs/getting-started.html

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key=value'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key=value'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key=value'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    defrag, frag = urlparse.urldefrag(url)
    if not frag.startswith('!'):
        return url
    return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])
Example #28
  def find_mention_item(self, items):
    """Returns the mf2 item that mentions (or replies to, likes, etc) the target.

    May modify the items arg, e.g. may set or replace content.html or
    content.value.

    Args:
      items: sequence of mf2 item dicts

    Returns:
      mf2 item dict or None
    """
    # find target URL in source
    for item in items:
      props = item.setdefault('properties', {})

      # find first non-empty content element
      content = props.setdefault('content', [{}])[0]
      text = content.get('html') or content.get('value')

      for type in 'in-reply-to', 'like', 'like-of', 'repost', 'repost-of':
        urls = [urlparse.urldefrag(u)[0] for u in
                microformats2.get_string_urls(props.get(type, []))]
        if self.any_target_in(urls):
          break
      else:
        if text and self.any_target_in(text):
          type = 'post'
          url = first_value(props, 'url') or self.source_url
          name = first_value(props, 'name') or first_value(props, 'summary')
          text = content['html'] = ('mentioned this in %s.' %
                                    util.pretty_link(url, text=name, max_length=280))
        else:
          type = None

      if type:
        # found the target!
        rsvp = first_value(props, 'rsvp')
        if rsvp:
          self.entity.type = 'rsvp'
          if not text:
            content['value'] = 'RSVPed %s.' % rsvp
        else:
          self.entity.type = {'in-reply-to': 'comment',
                              'like-of': 'like',
                              'repost-of': 'repost',
                              }.get(type, type)
          if not text:
            content['value'] = {'comment': 'replied to this.',
                                'like': 'liked this.',
                                'repost': 'reposted this.',
                                }[self.entity.type]
        return item

      # check children in case this is eg an h-feed
      found = self.find_mention_item(item.get('children', []))
      if found:
        return found

    return None
Example #29
 def _spine_add_extra(self):
     manifest = self.oeb.manifest
     spine = self.oeb.spine
     unchecked = set(spine)
     selector = XPath('h:body//h:a/@href')
     extras = set()
     while unchecked:
         new = set()
         for item in unchecked:
             if item.media_type not in OEB_DOCS:
                 # TODO: handle fallback chains
                 continue
             for href in selector(item.data):
                 href, _ = urldefrag(href)
                 if not href:
                     continue
                 try:
                     href = item.abshref(urlnormalize(href))
                 except ValueError:  # Malformed URL
                     continue
                 if href not in manifest.hrefs:
                     continue
                 found = manifest.hrefs[href]
                 if found.media_type not in OEB_DOCS or \
                    found in spine or found in extras:
                     continue
                 new.add(found)
         extras.update(new)
         unchecked = new
     version = int(self.oeb.version[0])
     for item in sorted(extras):
         if version >= 2:
             self.logger.warn(
                 'Spine-referenced file %r not in spine' % item.href)
         spine.add(item, linear=False)
Example #30
 def typefmt(self, tp, redirects, nbsp=False):
     # type: (Any, Dict[str, str], bool) -> Union[str, unicode]
     global primitiveType
     if isinstance(tp, list):
         if nbsp and len(tp) <= 3:
             return "&nbsp;|&nbsp;".join([self.typefmt(n, redirects) for n in tp])
         else:
             return " | ".join([self.typefmt(n, redirects) for n in tp])
     if isinstance(tp, dict):
         if tp["type"] == "https://w3id.org/cwl/salad#array":
             return "array&lt;%s&gt;" % (self.typefmt(tp["items"], redirects, nbsp=True))
         if tp["type"] in ("https://w3id.org/cwl/salad#record", "https://w3id.org/cwl/salad#enum"):
             frg = schema.avro_name(tp["name"])
             if tp["name"] in redirects:
                 return """<a href="%s">%s</a>""" % (redirects[tp["name"]], frg)
             elif tp["name"] in self.typemap:
                 return """<a href="#%s">%s</a>""" % (to_id(frg), frg)
             else:
                 return frg
         if isinstance(tp["type"], dict):
             return self.typefmt(tp["type"], redirects)
     else:
         if str(tp) in redirects:
             return """<a href="%s">%s</a>""" % (redirects[tp], redirects[tp])
         elif str(tp) in basicTypes:
             return """<a href="%s">%s</a>""" % (primitiveType, schema.avro_name(str(tp)))
         else:
             _, frg = urlparse.urldefrag(tp)
             if frg:
                 tp = frg
             return """<a href="#%s">%s</a>""" % (to_id(tp), tp)
Example #31
 def tree_to_binary(self, elem, nsrmap=NSRMAP, parents=[],
                    inhead=False, preserve=False):
     if not isinstance(elem.tag, basestring):
         # Don't emit any comments or raw entities
         return
     nsrmap = copy.copy(nsrmap)
     attrib = dict(elem.attrib)
     style = self.stylizer.style(elem) if self.stylizer else None
     for key, value in elem.nsmap.items():
         if value not in nsrmap or nsrmap[value] != key:
             xmlns = ('xmlns:' + key) if key else 'xmlns'
             attrib[xmlns] = value
         nsrmap[value] = key
     tag = prefixname(elem.tag, nsrmap)
     tag_offset = self.buf.tell()
     if tag == 'head':
         inhead = True
     flags = FLAG_OPENING
     if not elem.text and len(elem) == 0:
         flags |= FLAG_CLOSING
     if inhead:
         flags |= FLAG_HEAD
     if style and self.is_block(style):
         flags |= FLAG_BLOCK
     self.write(0, flags)
     tattrs = self.tattrs[0]
     if tag in self.tags:
         index = self.tags[tag]
         self.write(index)
         if self.tattrs[index]:
             tattrs = self.tattrs[index]
     else:
         self.write(FLAG_CUSTOM, len(tag)+1, tag)
     last_break = self.page_breaks[-1][0] if self.page_breaks else None
     if style and last_break != tag_offset \
        and style['page-break-before'] in PAGE_BREAKS:
         self.page_breaks.append((tag_offset, list(parents)))
     for attr, value in attrib.items():
         attr = prefixname(attr, nsrmap)
         if attr in ('href', 'src'):
             value = urlnormalize(value)
             path, frag = urldefrag(value)
             if self.item:
                 path = self.item.abshref(path)
             prefix = unichr(3)
             if path in self.manifest.hrefs:
                 prefix = unichr(2)
                 value = self.manifest.hrefs[path].id
                 if frag:
                     value = '#'.join((value, frag))
             value = prefix + value
         elif attr in ('id', 'name'):
             self.anchors.append((value, tag_offset))
         elif attr.startswith('ms--'):
             attr = '%' + attr[4:]
         elif tag == 'link' and attr == 'type' and value in OEB_STYLES:
             value = CSS_MIME
         if attr in tattrs:
             self.write(tattrs[attr])
         else:
             self.write(FLAG_CUSTOM, len(attr)+1, attr)
         try:
             self.write(ATTR_NUMBER, int(value)+1)
         except ValueError:
             self.write(len(value)+1, value)
     self.write(0)
     old_preserve = preserve
     if style:
         preserve = (style['white-space'] in ('pre', 'pre-wrap'))
     xml_space = elem.get(XML('space'))
     if xml_space == 'preserve':
         preserve = True
     elif xml_space == 'normal':
         preserve = False
     if elem.text:
         if preserve:
             self.write(elem.text)
         elif len(elem) == 0 or not elem.text.isspace():
             self.write(COLLAPSE.sub(' ', elem.text))
         # else: de nada
     parents.append(tag_offset)
     child = cstyle = nstyle = None
     for next in chain(elem, [None]):
         if self.stylizer:
             nstyle = None if next is None else self.stylizer.style(next)
         if child is not None:
             if not preserve \
                and (inhead or not nstyle
                     or self.is_block(cstyle)
                     or self.is_block(nstyle)) \
                and child.tail and child.tail.isspace():
                 child.tail = None
             self.tree_to_binary(child, nsrmap, parents, inhead, preserve)
         child, cstyle = next, nstyle
     parents.pop()
     preserve = old_preserve
     if not flags & FLAG_CLOSING:
         self.write(0, (flags & ~FLAG_OPENING) | FLAG_CLOSING, 0)
     if elem.tail and tag != 'html':
         tail = elem.tail
         if not preserve:
             tail = COLLAPSE.sub(' ', tail)
         self.write(tail)
     if style and style['page-break-after'] not in ('avoid', 'auto'):
         self.page_breaks.append((self.buf.tell(), list(parents)))
Example #32
 def spine_item(tocitem):
     href = urldefrag(tocitem.href)[0]
     for item in self.oeb.spine:
         if item.href == href:
             return item
Example #33
def submit_video(request):
    sitelocation = SiteLocation.objects.get_current()
    if not (request.user_is_admin() or sitelocation.display_submit_button):
        raise Http404

    # Extract construction hint, if it exists.
    # This is a hint that plugins can use to slightly change the behavior
    # of the video submission forms.
    construction_hint = (request.POST.get('construction_hint', None) or
                         request.GET.get('construction_hint', None))

    url = request.POST.get('url') or request.GET.get('url', '')

    if request.method == "GET" and not url:
        submit_form = forms.SubmitVideoForm(
            construction_hint=construction_hint)
        return render_to_response(
            'localtv/submit_video/submit.html',
            {'form': submit_form},
            context_instance=RequestContext(request))
    else:
        url = urlparse.urldefrag(url)[0]
        submit_form = forms.SubmitVideoForm({'url': url or ''})
        if submit_form.is_valid():
            existing = Video.objects.filter(
                Q(website_url=submit_form.cleaned_data['url']) |
                Q(file_url=submit_form.cleaned_data['url']),
                site=sitelocation.site)
            existing.filter(status=Video.REJECTED).delete()
            if existing.count():
                if request.user_is_admin():
                    # even if the video was rejected, an admin submitting it
                    # should make it approved
                    # FIXME: This initiates a new query against the database -
                    # so the rejected videos which were deleted will not be
                    # marked approved.
                    for v in existing.exclude(
                        status=Video.ACTIVE):
                        v.user = request.user
                        v.status = Video.ACTIVE
                        v.when_approved = datetime.datetime.now()
                        v.save()
                    return HttpResponseRedirect(
                        reverse('localtv_submit_thanks',
                                args=[existing[0].pk]))
                else:
                    # pick the first approved video to point the user at
                    videos = existing.filter(status=Video.ACTIVE)
                    if videos.count():
                        video = videos[0]
                    else:
                        video = None
                    return render_to_response(
                        'localtv/submit_video/submit.html',
                        {'form': forms.SubmitVideoForm(
                                construction_hint=construction_hint),
                         'was_duplicate': True,
                         'video': video},
                        context_instance=RequestContext(request))

            vidscraper_video = utils.get_vidscraper_video(
                submit_form.cleaned_data['url'])

            get_dict = {'url': submit_form.cleaned_data['url']}
            if 'construction_hint' in request.GET:
                get_dict['construction_hint'] = construction_hint
            if 'bookmarklet' in request.GET:
                get_dict['bookmarklet'] = '1'
            get_params = urllib.urlencode(get_dict)
            if vidscraper_video:
                if (vidscraper_video.link and
                    vidscraper_video.link != get_dict['url']):
                    request.POST = {
                        'url': vidscraper_video.link.encode('utf8')}
                    # rerun the view, but with the canonical URL
                    return submit_video(request)

                if (vidscraper_video.embed_code
                    or (vidscraper_video.file_url
                        and not vidscraper_video.file_url_expires)):
                    return HttpResponseRedirect(
                        reverse('localtv_submit_scraped_video') + '?' +
                        get_params)

            # otherwise if it looks like a video file
            if is_video_url(submit_form.cleaned_data['url']):
                return HttpResponseRedirect(
                    reverse('localtv_submit_directlink_video')
                    + '?' + get_params)
            else:
                return HttpResponseRedirect(
                    reverse('localtv_submit_embedrequest_video')
                    + '?' + get_params)

        else:
            return render_to_response(
                'localtv/submit_video/submit.html',
                {'form': submit_form},
                context_instance=RequestContext(request))
Example #34
# Assumed setup (not part of the original excerpt): the snippet needs these
# imports and a parsed URL `p` whose query carries shape/dpi parameters.
from urlparse import urlparse, parse_qs, urldefrag
p = urlparse('http://example.com/map?shape=square&dpi=96')  # illustrative URL

print('scheme = ', p.scheme)  # Print the scheme component of the result

# page 140
print(parse_qs(p.query))  # {'shape': ['square'], 'dpi': ['96']}
print("Query parameters:")

r = parse_qs('mode=topographic&pin=Boston&pin=San%20Francisco')
print(r)  # r is a dictionary

import pprint
pp = pprint.PrettyPrinter(indent=8)
print("Query parameters using pprint:")
pp.pprint(r)

# Remove the anchor (#)
u = 'http://docs.python.org/library/urlparse.html#item22'
udfrag = urldefrag(u)  # the return value is a tuple
# ('http://docs.python.org/library/urlparse.html', 'item22')

print("URL defrag:")

length = len(udfrag)  # Get the number of items in the udfrag tuple
print("Tuple length:", length)
print(udfrag)     # ('http://docs.python.org/library/urlparse.html', 'item22')
print(udfrag[0])  # http://docs.python.org/library/urlparse.html
print(udfrag[1])  # 'item22'

print("slice", udfrag[0:length])
print "this is a tuple: %s" % (udfrag, )  # Another way to print a tuple

# Build a URL by calling its geturl() method.
#  When combined with the urlencode() function, which knows how to build
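A short, hedged sketch (not from the original) of what that trailing comment points toward: urlencode() builds the query string, and urlunsplit() (or geturl() on a parse result) reassembles the URL.

from urllib import urlencode            # urllib.parse.urlencode on Python 3
from urlparse import urlsplit, urlunsplit
query = urlencode([('shape', 'square'), ('dpi', 96)])   # illustrative parameters
parts = urlsplit('http://example.com/map')              # illustrative URL
print(urlunsplit((parts.scheme, parts.netloc, parts.path, query, '')))
# http://example.com/map?shape=square&dpi=96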
Example #35
def joinUrls(baseUrl, newUrl):
    helpUrl, fragment = urlparse.urldefrag(newUrl)
    return urlparse.urljoin(baseUrl, helpUrl)
Example #36
    def __init__(self, toolpath_object, **kwargs):
        (_, self.names, _) = get_schema()
        self.tool = toolpath_object
        self.requirements = kwargs.get("requirements", []) + self.tool.get(
            "requirements", [])
        self.hints = kwargs.get("hints", []) + self.tool.get("hints", [])
        if "loader" in kwargs:
            self.formatgraph = kwargs["loader"].graph

        self.validate_hints(self.tool.get("hints", []),
                            strict=kwargs.get("strict"))

        self.schemaDefs = {}

        sd, _ = self.get_requirement("SchemaDefRequirement")

        if sd:
            sdtypes = sd["types"]
            av = schema_salad.schema.make_valid_avro(
                sdtypes, {t["name"]: t
                          for t in sdtypes}, set())
            for i in av:
                self.schemaDefs[i["name"]] = i
            avro.schema.make_avsc_object(av, self.names)

        # Build record schema from inputs
        self.inputs_record_schema = {
            "name": "input_record_schema",
            "type": "record",
            "fields": []
        }
        self.outputs_record_schema = {
            "name": "outputs_record_schema",
            "type": "record",
            "fields": []
        }

        for key in ("inputs", "outputs"):
            for i in self.tool[key]:
                c = copy.copy(i)
                doc_url, _ = urlparse.urldefrag(c['id'])
                c["name"] = shortname(c["id"])
                del c["id"]

                if "type" not in c:
                    raise validate.ValidationException(
                        "Missing `type` in parameter `%s`" % c["name"])

                if "default" in c and "null" not in aslist(c["type"]):
                    c["type"] = ["null"] + aslist(c["type"])
                else:
                    c["type"] = c["type"]

                if key == "inputs":
                    self.inputs_record_schema["fields"].append(c)
                elif key == "outputs":
                    self.outputs_record_schema["fields"].append(c)

        try:
            self.inputs_record_schema = schema_salad.schema.make_valid_avro(
                self.inputs_record_schema, {}, set())
            avro.schema.make_avsc_object(self.inputs_record_schema, self.names)
        except avro.schema.SchemaParseException as e:
            raise validate.ValidationException(
                "Got error `%s` while prcoessing inputs of %s:\n%s" %
                (str(e), self.tool["id"],
                 json.dumps(self.inputs_record_schema, indent=4)))

        try:
            self.outputs_record_schema = schema_salad.schema.make_valid_avro(
                self.outputs_record_schema, {}, set())
            avro.schema.make_avsc_object(self.outputs_record_schema,
                                         self.names)
        except avro.schema.SchemaParseException as e:
            raise validate.ValidationException(
                "Got error `%s` while prcoessing outputs of %s:\n%s" %
                (str(e), self.tool["id"],
                 json.dumps(self.outputs_record_schema, indent=4)))
Example #37
def normalize(seed_url, link):
    # urldefrag splits the link at the first '#': the part after the '#' (the
    # fragment, without the '#') is separated out and discarded here
    link, _ = urlparse.urldefrag(link)
    return urlparse.urljoin(seed_url, link)
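A tiny usage sketch for normalize() (not part of the original; Python 2, illustrative URLs):

print(normalize('http://example.com/index.html', '/about.html#team'))  # http://example.com/about.html
print(normalize('http://example.com/a/', 'b.html'))                    # http://example.com/a/b.html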
Example #38
    def read_inline_toc(self, href, frag):
        ans = TOC()
        base_href = '/'.join(href.split('/')[:-1])
        with open(href.replace('/', os.sep), 'rb') as f:
            raw = f.read().decode(self.header.codec)
        root = parse_html(raw, log=self.log)
        body = XPath('//h:body')(root)
        reached = False
        if body:
            start = body[0]
        else:
            start = None
            reached = True
        if frag:
            elems = XPath('//*[@id="%s"]' % frag)(root)
            if elems:
                start = elems[0]

        def node_depth(elem):
            ans = 0
            parent = elem.getparent()
            while parent is not None:
                parent = parent.getparent()
                ans += 1
            return ans

        # Layer the ToC based on nesting order in the source HTML
        current_depth = None
        parent = ans
        seen = set()
        links = []
        for elem in root.iterdescendants(etree.Element):
            if reached and elem.tag == XHTML('a') and elem.get('href', False):
                href = elem.get('href')
                href, frag = urldefrag(href)
                href = base_href + '/' + href
                text = xml2text(elem).strip()
                if (text, href, frag) in seen:
                    continue
                seen.add((text, href, frag))
                links.append((text, href, frag, node_depth(elem)))
            elif elem is start:
                reached = True

        depths = sorted(set(x[-1] for x in links))
        depth_map = {x: i for i, x in enumerate(depths)}
        for text, href, frag, depth in links:
            depth = depth_map[depth]
            if current_depth is None:
                current_depth = 0
                parent.add_item(href, frag, text)
            elif current_depth == depth:
                parent.add_item(href, frag, text)
            elif current_depth < depth:
                parent = parent[-1] if len(parent) > 0 else parent
                parent.add_item(href, frag, text)
                current_depth += 1
            else:
                delta = current_depth - depth
                while delta > 0 and parent.parent is not None:
                    parent = parent.parent
                    delta -= 1
                parent.add_item(href, frag, text)
                current_depth = depth
        return ans
Exemplo n.º 39
0
    def analyze(self, fname, find_sources=False, check_remote=False):
        """Analyze links on a page."""
        rv = False
        self.whitelist = [re.compile(x) for x in self.site.config['LINK_CHECK_WHITELIST']]
        self.internal_redirects = [urljoin('/', _[0]) for _ in self.site.config['REDIRECTIONS']]
        base_url = urlparse(self.site.config['BASE_URL'])
        self.existing_targets.add(self.site.config['SITE_URL'])
        self.existing_targets.add(self.site.config['BASE_URL'])
        url_type = self.site.config['URL_TYPE']
        atom_extension = self.site.config['ATOM_EXTENSION']

        deps = {}
        if find_sources:
            deps = _call_nikola_list(self.site, self.cache)[1]

        if url_type in ('absolute', 'full_path'):
            url_netloc_to_root = urlparse(self.site.config['BASE_URL']).path
        try:
            filename = fname

            if filename.startswith(self.site.config['CACHE_FOLDER']):
                # Do not look at links in the cache, which are not parsed by
                # anyone and may result in false positives.  Problems arise
                # with galleries, for example.  Full rationale: (Issue #1447)
                self.logger.notice("Ignoring {0} (in cache, links may be incorrect)".format(filename))
                return False

            if not os.path.exists(fname):
                # Quietly ignore files that don’t exist; use `nikola check -f` instead (Issue #1831)
                return False

            if '.html' == fname[-5:]:
                with open(filename, 'rb') as inf:
                    d = lxml.html.fromstring(inf.read())
                extra_objs = lxml.html.fromstring('<html/>')

                # Turn elements with a srcset attribute into individual img elements with src attributes
                for obj in list(d.xpath('(*//img|*//source)')):
                    if 'srcset' in obj.attrib:
                        for srcset_item in obj.attrib['srcset'].split(','):
                            extra_objs.append(lxml.etree.Element('img', src=srcset_item.strip().split(' ')[0]))
                link_elements = list(d.iterlinks()) + list(extra_objs.iterlinks())
            # Extract links from XML formats to minimal HTML, allowing those to go through the link checks
            elif atom_extension == filename[-len(atom_extension):]:
                d = lxml.etree.parse(filename)
                link_elements = lxml.html.fromstring('<html/>')
                for elm in d.findall('*//{http://www.w3.org/2005/Atom}link'):
                    feed_link = elm.attrib['href'].split('?')[0].strip()  # strip FEED_LINKS_APPEND_QUERY
                    link_elements.append(lxml.etree.Element('a', href=feed_link))
                link_elements = list(link_elements.iterlinks())
            elif filename.endswith('sitemap.xml') or filename.endswith('sitemapindex.xml'):
                d = lxml.etree.parse(filename)
                link_elements = lxml.html.fromstring('<html/>')
                for elm in d.getroot().findall("*//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"):
                    link_elements.append(lxml.etree.Element('a', href=elm.text.strip()))
                link_elements = list(link_elements.iterlinks())
            else:  # unsupported file type
                return False

            for l in link_elements:
                target = l[2]
                if target == "#":
                    continue
                target = urldefrag(target)[0]

                if any([urlparse(target).netloc.endswith(_) for _ in ['example.com', 'example.net', 'example.org']]):
                    self.logger.debug("Not testing example address \"{0}\".".format(target))
                    continue

                # absolute URL to root-relative
                if target.startswith(base_url.geturl()):
                    target = target.replace(base_url.geturl(), '/')

                parsed = urlparse(target)

                # Warn about links from https to http (mixed-security)
                if base_url.netloc == parsed.netloc and base_url.scheme == "https" and parsed.scheme == "http":
                    self.logger.warn("Mixed-content security for link in {0}: {1}".format(filename, target))

                # Link to an internal REDIRECTIONS page
                if target in self.internal_redirects:
                    redir_status_code = 301
                    redir_target = [_dest for _target, _dest in self.site.config['REDIRECTIONS'] if urljoin('/', _target) == target][0]
                    self.logger.warn("Remote link moved PERMANENTLY to \"{0}\" and should be updated in {1}: {2} [HTTP: 301]".format(redir_target, filename, target))

                # Absolute links to other domains, skip
                # Absolute links when using only paths, skip.
                if ((parsed.scheme or target.startswith('//')) and parsed.netloc != base_url.netloc) or \
                        ((parsed.scheme or target.startswith('//')) and url_type in ('rel_path', 'full_path')):
                    if not check_remote or parsed.scheme not in ["http", "https"]:
                        continue
                    if target in self.checked_remote_targets:  # already checked this exact target
                        if self.checked_remote_targets[target] in [301, 308]:
                            self.logger.warn("Remote link PERMANENTLY redirected in {0}: {1} [Error {2}]".format(filename, target, self.checked_remote_targets[target]))
                        elif self.checked_remote_targets[target] in [302, 307]:
                            self.logger.debug("Remote link temporarily redirected in {0}: {1} [HTTP: {2}]".format(filename, target, self.checked_remote_targets[target]))
                        elif self.checked_remote_targets[target] > 399:
                            self.logger.error("Broken link in {0}: {1} [Error {2}]".format(filename, target, self.checked_remote_targets[target]))
                        continue

                    # Skip whitelisted targets
                    if any(re.search(_, target) for _ in self.whitelist):
                        continue

                    # Check the remote link works
                    req_headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0 (Nikola)'}  # I’m a real boy!
                    resp = requests.head(target, headers=req_headers, allow_redirects=False)

                    # Retry client errors (4xx) as GET requests because many servers are broken
                    if resp.status_code >= 400 and resp.status_code <= 499:
                        time.sleep(0.5)
                        resp = requests.get(target, headers=req_headers, allow_redirects=False)

                    # Follow redirects and see where they lead, redirects to errors will be reported twice
                    if resp.status_code in [301, 302, 307, 308]:
                        redir_status_code = resp.status_code
                        time.sleep(0.5)
                        # Known redirects are retested using GET because IIS servers otherwise get HEADaches
                        resp = requests.get(target, headers=req_headers, allow_redirects=True)
                        # Permanent redirects should be updated
                        if redir_status_code in [301, 308]:
                            self.logger.warn("Remote link moved PERMANENTLY to \"{0}\" and should be updated in {1}: {2} [HTTP: {3}]".format(resp.url, filename, target, redir_status_code))
                        if redir_status_code in [302, 307]:
                            self.logger.debug("Remote link temporarily redirected to \"{0}\" in {1}: {2} [HTTP: {3}]".format(resp.url, filename, target, redir_status_code))
                        self.checked_remote_targets[resp.url] = resp.status_code
                        self.checked_remote_targets[target] = redir_status_code
                    else:
                        self.checked_remote_targets[target] = resp.status_code

                    if resp.status_code > 399:  # Error
                        self.logger.error("Broken link in {0}: {1} [Error {2}]".format(filename, target, resp.status_code))
                        continue
                    elif resp.status_code <= 399:  # The address leads *somewhere* that is not an error
                        self.logger.debug("Successfully checked remote link in {0}: {1} [HTTP: {2}]".format(filename, target, resp.status_code))
                        continue
                    self.logger.warn("Could not check remote link in {0}: {1} [Unknown problem]".format(filename, target))
                    continue

                if url_type == 'rel_path':
                    if target.startswith('/'):
                        target_filename = os.path.abspath(
                            os.path.join(self.site.config['OUTPUT_FOLDER'], unquote(target.lstrip('/'))))
                    else:  # Relative path
                        unquoted_target = unquote(target).encode('utf-8')
                        target_filename = os.path.abspath(
                            os.path.join(os.path.dirname(filename).encode('utf-8'), unquoted_target))

                elif url_type in ('full_path', 'absolute'):
                    relative = False
                    if url_type == 'absolute':
                        # convert to 'full_path' case, ie url relative to root
                        if parsed.path.startswith(url_netloc_to_root):
                            url_rel_path = parsed.path[len(url_netloc_to_root):]
                        else:
                            url_rel_path = parsed.path
                            if not url_rel_path.startswith('/'):
                                relative = True
                    else:
                        # convert to relative to base path
                        if target.startswith(url_netloc_to_root):
                            url_rel_path = target[len(url_netloc_to_root):]
                        else:
                            url_rel_path = target
                            if not url_rel_path.startswith('/'):
                                relative = True
                    if url_rel_path == '' or url_rel_path.endswith('/'):
                        url_rel_path = urljoin(url_rel_path, self.site.config['INDEX_FILE'])
                    if relative:
                        unquoted_target = unquote(target).encode('utf-8')
                        target_filename = os.path.abspath(
                            os.path.join(os.path.dirname(filename).encode('utf-8'), unquoted_target))
                    else:
                        fs_rel_path = fs_relpath_from_url_path(url_rel_path)
                        target_filename = os.path.join(self.site.config['OUTPUT_FOLDER'], fs_rel_path)

                if any(re.search(x, target_filename) for x in self.whitelist):
                    continue

                elif target_filename not in self.existing_targets:
                    if os.path.exists(target_filename):
                        self.logger.info("Good link {0} => {1}".format(target, target_filename))
                        self.existing_targets.add(target_filename)
                    else:
                        rv = True
                        self.logger.warn("Broken link in {0}: {1}".format(filename, target))
                        if find_sources:
                            self.logger.warn("Possible sources:")
                            self.logger.warn("\n".join(deps[filename]))
                            self.logger.warn("===============================\n")
        except Exception as exc:
            self.logger.error(u"Error with: {0} {1}".format(filename, exc))
        return rv
Exemplo n.º 40
0
def linkto(item):
    _, frg = urlparse.urldefrag(item)
    return "[%s](#%s)" % (frg, to_id(frg))
Exemplo n.º 41
0
    def _manifest_add_missing(self, invalid):
        import cssutils
        manifest = self.oeb.manifest
        known = set(manifest.hrefs)
        unchecked = set(manifest.values())
        cdoc = OEB_DOCS|OEB_STYLES
        invalid = set()
        while unchecked:
            new = set()
            for item in unchecked:
                data = None
                if (item.media_type in cdoc or
                        item.media_type[-4:] in ('/xml', '+xml')):
                    try:
                        data = item.data
                    except:
                        self.oeb.log.exception(u'Failed to read from manifest '
                                u'entry with id: %s, ignoring'%item.id)
                        invalid.add(item)
                        continue
                if data is None:
                    continue

                if (item.media_type in OEB_DOCS or
                        item.media_type[-4:] in ('/xml', '+xml')):
                    hrefs = [r[2] for r in iterlinks(data)]
                    for href in hrefs:
                        href, _ = urldefrag(href)
                        if not href:
                            continue
                        try:
                            href = item.abshref(urlnormalize(href))
                            scheme = urlparse(href).scheme
                        except:
                            self.oeb.log.exception(
                                'Skipping invalid href: %r'%href)
                            continue
                        if not scheme and href not in known:
                            new.add(href)
                elif item.media_type in OEB_STYLES:
                    try:
                        urls = list(cssutils.getUrls(data))
                    except:
                        urls = []
                    for url in urls:
                        href, _ = urldefrag(url)
                        href = item.abshref(urlnormalize(href))
                        scheme = urlparse(href).scheme
                        if not scheme and href not in known:
                            new.add(href)
            unchecked.clear()
            warned = set([])
            for href in new:
                known.add(href)
                is_invalid = False
                for item in invalid:
                    if href == item.abshref(urlnormalize(href)):
                        is_invalid = True
                        break
                if is_invalid:
                    continue
                if not self.oeb.container.exists(href):
                    if href not in warned:
                        self.logger.warn('Referenced file %r not found' % href)
                        warned.add(href)
                    continue
                if href not in warned:
                    self.logger.warn('Referenced file %r not in manifest' % href)
                    warned.add(href)
                id, _ = manifest.generate(id='added')
                guessed = guess_type(href)[0]
                media_type = guessed or BINARY_MIME
                added = manifest.add(id, href, media_type)
                unchecked.add(added)

            for item in invalid:
                self.oeb.manifest.remove(item)
Exemplo n.º 42
0
 def get_src_obj_url(url):
     """Get and return source object's URL part from full URL."""
     return urldefrag(url)[0]
Exemplo n.º 43
0
def urldefrag(url):
    url, fragment = _urlparse.urldefrag(url)
    return urlnormalize(url), fragment
Exemplo n.º 44
0
    def render_type(self, f, depth):  # type: (Dict[str, Any], int) -> None
        if f["name"] in self.rendered or f["name"] in self.redirects:
            return
        self.rendered.add(f["name"])

        if "doc" not in f:
            f["doc"] = ""

        f["type"] = copy.deepcopy(f)
        f["doc"] = ""
        f = f["type"]

        if "doc" not in f:
            f["doc"] = ""

        def extendsfrom(item, ex):
            # type: (Dict[str, Any], List[Dict[str, Any]]) -> None
            if "extends" in item:
                for e in aslist(item["extends"]):
                    ex.insert(0, self.typemap[e])
                    extendsfrom(self.typemap[e], ex)

        ex = [f]
        extendsfrom(f, ex)

        enumDesc = {}
        if f["type"] == "enum" and isinstance(f["doc"], list):
            for e in ex:
                for i in e["doc"]:
                    idx = i.find(":")
                    if idx > -1:
                        enumDesc[i[:idx]] = i[idx + 1:]
                e["doc"] = [
                    i for i in e["doc"]
                    if i.find(":") == -1 or i.find(" ") < i.find(":")
                ]

        f["doc"] = fix_doc(f["doc"])

        if f["type"] == "record":
            for field in f.get("fields", []):
                if "doc" not in field:
                    field["doc"] = ""

        if f["type"] != "documentation":
            lines = []
            for l in f["doc"].splitlines():
                if len(l) > 0 and l[0] == "#":
                    l = ("#" * depth) + l
                lines.append(l)
            f["doc"] = "\n".join(lines)

            _, frg = urlparse.urldefrag(f["name"])
            num = self.toc.add_entry(depth, frg)
            doc = "## %s %s\n" % (num, frg)
        else:
            doc = ""

        if self.title is None and f["doc"]:
            self.title = f["doc"][0:f["doc"].index("\n")]
            if self.title.startswith('# '):
                self.title = self.title[2:]

        if f["type"] == "documentation":
            f["doc"] = number_headings(self.toc, f["doc"])

        # if "extends" in f:
        #    doc += "\n\nExtends "
        #    doc += ", ".join([" %s" % linkto(ex) for ex in aslist(f["extends"])])
        # if f["name"] in self.subs:
        #    doc += "\n\nExtended by"
        #    doc += ", ".join([" %s" % linkto(s) for s in self.subs[f["name"]]])
        # if f["name"] in self.uses:
        #    doc += "\n\nReferenced by"
        #    doc += ", ".join([" [%s.%s](#%s)" % (s[0], s[1], to_id(s[0]))
        #       for s in self.uses[f["name"]]])

        doc = doc + "\n\n" + f["doc"]

        doc = mistune.markdown(doc, renderer=MyRenderer())

        if f["type"] == "record":
            doc += "<h3>Fields</h3>"
            doc += """<table class="table table-striped">"""
            doc += "<tr><th>field</th><th>type</th><th>required</th><th>description</th></tr>"
            required = []
            optional = []
            for i in f.get("fields", []):
                tp = i["type"]
                if isinstance(
                        tp,
                        list) and tp[0] == "https://w3id.org/cwl/salad#null":
                    opt = False
                    tp = tp[1:]
                else:
                    opt = True

                desc = i["doc"]
                # if "inherited_from" in i:
                #    desc = "%s _Inherited from %s_" % (desc, linkto(i["inherited_from"]))

                rfrg = schema.avro_name(i["name"])
                tr = "<td><code>%s</code></td><td>%s</td><td>%s</td>"\
                    "<td>%s</td>" % (
                        rfrg, self.typefmt(tp, self.redirects), opt,
                        mistune.markdown(desc))
                if opt:
                    required.append(tr)
                else:
                    optional.append(tr)
            for i in required + optional:
                doc += "<tr>" + i + "</tr>"
            doc += """</table>"""
        elif f["type"] == "enum":
            doc += "<h3>Symbols</h3>"
            doc += """<table class="table table-striped">"""
            doc += "<tr><th>symbol</th><th>description</th></tr>"
            for e in ex:
                for i in e.get("symbols", []):
                    doc += "<tr>"
                    efrg = schema.avro_name(i)
                    doc += "<td><code>%s</code></td><td>%s</td>" % (
                        efrg, enumDesc.get(efrg, ""))
                    doc += "</tr>"
            doc += """</table>"""
        f["doc"] = doc

        self.typedoc.write(f["doc"])

        subs = self.docParent.get(f["name"], []) + \
            self.record_refs.get(f["name"], [])
        if len(subs) == 1:
            self.render_type(self.typemap[subs[0]], depth)
        else:
            for s in subs:
                self.render_type(self.typemap[s], depth + 1)

        for s in self.docAfter.get(f["name"], []):
            self.render_type(self.typemap[s], depth)
Exemplo n.º 45
0
    def __init__(self,
                 toolpath_object,
                 validateAs,
                 do_validate=True,
                 **kwargs):
        (_, self.names) = get_schema()
        self.tool = toolpath_object

        if do_validate:
            try:
                # Validate tool document
                validate.validate_ex(self.names.get_name(validateAs, ""),
                                     self.tool,
                                     strict=kwargs.get("strict"))
            except validate.ValidationException as v:
                raise validate.ValidationException(
                    "Could not validate %s as %s:\n%s" %
                    (self.tool.get("id"), validateAs, validate.indent(str(v))))

        self.requirements = kwargs.get("requirements", []) + self.tool.get(
            "requirements", [])
        self.hints = kwargs.get("hints", []) + self.tool.get("hints", [])

        self.validate_hints(self.tool.get("hints", []),
                            strict=kwargs.get("strict"))

        self.schemaDefs = {}

        sd, _ = self.get_requirement("SchemaDefRequirement")

        if sd:
            for i in sd["types"]:
                avro.schema.make_avsc_object(i, self.names)
                self.schemaDefs[i["name"]] = i

        # Build record schema from inputs
        self.inputs_record_schema = {
            "name": "input_record_schema",
            "type": "record",
            "fields": []
        }
        for i in self.tool["inputs"]:
            c = copy.copy(i)
            doc_url, fragment = urlparse.urldefrag(c['id'])
            c["name"] = fragment
            del c["id"]

            if "type" not in c:
                raise validate.ValidationException(
                    "Missing `type` in parameter `%s`" % c["name"])

            if "default" in c:
                c["type"] = ["null"] + aslist(c["type"])
            else:
                c["type"] = c["type"]
            self.inputs_record_schema["fields"].append(c)

        avro.schema.make_avsc_object(self.inputs_record_schema, self.names)

        self.outputs_record_schema = {
            "name": "outputs_record_schema",
            "type": "record",
            "fields": []
        }
        for i in self.tool["outputs"]:
            c = copy.copy(i)
            doc_url, fragment = urlparse.urldefrag(c['id'])
            c["name"] = fragment
            del c["id"]

            if "type" not in c:
                raise validate.ValidationException(
                    "Missing `type` in parameter `%s`" % c["name"])

            if "default" in c:
                c["type"] = ["null"] + aslist(c["type"])
            else:
                c["type"] = c["type"]
            self.outputs_record_schema["fields"].append(c)

        avro.schema.make_avsc_object(self.outputs_record_schema, self.names)
Exemplo n.º 46
0
"""
The urlsplit() function is an alternative to urlparse(). It behaves a little
differently because it does not split the parameters from the URL. 
This is useful for URLs following RFC 2396, which supports parameters for each segment of the path.
Since the parameters are not split out, the tuple API will show five elements instead
of six, and there is no params attribute.
"""
from urlparse import urlsplit
url = 'http://user:pwd@NetLoc:80/p1;param/p2;param?query=arg#frag'
parsed = urlsplit(url)
print "using urlsplit=", parsed
print 'scheme :', parsed.scheme
print 'netloc :', parsed.netloc
print 'path:', parsed.path
print 'query :', parsed.query
print 'fragment:', parsed.fragment
print 'username:', parsed.username
print 'password:', parsed.password
print 'hostname:', parsed.hostname, '(netloc in lowercase)'
print 'port:', parsed.port
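
# For contrast: urlparse() on the same URL does split the per-segment
# parameters, so its result has six elements and a params attribute
# (only the last path segment's parameters end up there).
from urlparse import urlparse
parsed_full = urlparse(url)
print "using urlparse=", parsed_full
print 'params :', parsed_full.params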

print "============================================"
"""
To simply strip the fragment identifier from a URL, 
such as when finding a base page name from a URL, use urldefrag() .
"""
from urlparse import urldefrag
original = 'http://netloc/path;param?query=arg#frag'
print 'original:', original
url, fragment = urldefrag(original)
print 'url:', url
print 'fragment:', fragment
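
# Edge case: when the URL has no fragment, urldefrag() still returns a
# two-tuple, with an empty string in the fragment slot.
no_frag = 'http://netloc/path;param?query=arg'
url, fragment = urldefrag(no_frag)
print 'url:', url
print 'fragment: %r' % fragment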
Exemplo n.º 47
0
    def resolve_ref(
            self,
            ref,  # type: Union[CommentedMap, CommentedSeq, unicode]
            base_url=None,  # type: unicode
            checklinks=True  # type: bool
    ):
        # type: (...) -> Tuple[Union[CommentedMap, CommentedSeq, unicode], Dict[unicode, Any]]

        obj = None  # type: CommentedMap
        resolved_obj = None  # type: Union[CommentedMap, CommentedSeq, unicode]
        inc = False
        mixin = None  # type: Dict[unicode, Any]

        if not base_url:
            base_url = file_uri(os.getcwd()) + "/"

        if isinstance(ref, (str, unicode)) and os.sep == "\\":
            # Convert Windows path separator in ref
            ref = ref.replace("\\", "/")

        sl = SourceLine(obj, None, ValueError)
        # If `ref` is a dict, look for special directives.
        if isinstance(ref, CommentedMap):
            obj = ref
            if "$import" in obj:
                sl = SourceLine(obj, "$import", RuntimeError)
                if len(obj) == 1:
                    ref = obj[u"$import"]
                    obj = None
                else:
                    raise sl.makeError(
                        u"'$import' must be the only field in %s" %
                        (unicode(obj)))
            elif "$include" in obj:
                sl = SourceLine(obj, "$include", RuntimeError)
                if len(obj) == 1:
                    ref = obj[u"$include"]
                    inc = True
                    obj = None
                else:
                    raise sl.makeError(
                        u"'$include' must be the only field in %s" %
                        (unicode(obj)))
            elif "$mixin" in obj:
                sl = SourceLine(obj, "$mixin", RuntimeError)
                ref = obj[u"$mixin"]
                mixin = obj
                obj = None
            else:
                ref = None
                for identifier in self.identifiers:
                    if identifier in obj:
                        ref = obj[identifier]
                        break
                if not ref:
                    raise sl.makeError(
                        u"Object `%s` does not have identifier field in %s" %
                        (relname(obj), self.identifiers))

        if not isinstance(ref, (str, unicode)):
            raise ValueError(u"Expected CommentedMap or string, got %s: `%s`" %
                             (type(ref), unicode(ref)))

        url = self.expand_url(ref, base_url, scoped_id=(obj is not None))
        # Has this reference been loaded already?
        if url in self.idx and (not mixin):
            return self.idx[url], {}

        sl.raise_type = RuntimeError
        with sl:
            # "$include" directive means load raw text
            if inc:
                return self.fetch_text(url), {}

            doc = None
            if obj:
                for identifier in self.identifiers:
                    obj[identifier] = url
                doc_url = url
            else:
                # Load structured document
                doc_url, frg = urlparse.urldefrag(url)
                if doc_url in self.idx and (not mixin):
                    # If the base document is in the index, it was already loaded,
                    # so if we didn't find the reference earlier then it must not
                    # exist.
                    raise validate.ValidationException(
                        u"Reference `#%s` not found in file `%s`." %
                        (frg, doc_url))
                doc = self.fetch(doc_url, inject_ids=(not mixin))

        # Recursively expand urls and resolve directives
        if mixin:
            doc = copy.deepcopy(doc)
            doc.update(mixin)
            del doc["$mixin"]
            url = None
            resolved_obj, metadata = self.resolve_all(doc,
                                                      base_url,
                                                      file_base=doc_url,
                                                      checklinks=checklinks)
        else:
            resolved_obj, metadata = self.resolve_all(doc if doc else obj,
                                                      doc_url,
                                                      checklinks=checklinks)

        # Requested reference should be in the index now, otherwise it's a bad
        # reference
        if url is not None:
            if url in self.idx:
                resolved_obj = self.idx[url]
            else:
                raise RuntimeError(
                    "Reference `%s` is not in the index. Index contains:\n  %s"
                    % (url, "\n  ".join(self.idx)))

        if isinstance(resolved_obj, CommentedMap):
            if u"$graph" in resolved_obj:
                metadata = _copy_dict_without_key(resolved_obj, u"$graph")
                return resolved_obj[u"$graph"], metadata
            else:
                return resolved_obj, metadata
        else:
            return resolved_obj, metadata
Exemplo n.º 48
0
def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)
Exemplo n.º 49
0
    def resolve_ref(self, ref, base_url=None, checklinks=True):
        # type: (Union[Dict[unicode, Any], unicode], unicode, bool) -> Tuple[Union[List, Dict[unicode, Any], unicode], Dict[unicode, Any]]
        base_url = base_url or u'file://%s/' % os.path.abspath('.')

        obj = None  # type: Dict[unicode, Any]
        inc = False

        # If `ref` is a dict, look for special directives.
        if isinstance(ref, dict):
            obj = ref
            if u"$import" in ref:
                if len(obj) == 1:
                    ref = obj[u"$import"]
                    obj = None
                else:
                    raise ValueError("'$import' must be the only field in %s" %
                                     (str(obj)))
            elif u"$include" in obj:
                if len(obj) == 1:
                    ref = obj[u"$include"]
                    inc = True
                    obj = None
                else:
                    raise ValueError(
                        "'$include' must be the only field in %s" % (str(obj)))
            else:
                ref = None
                for identifier in self.identifiers:
                    if identifier in obj:
                        ref = obj[identifier]
                        break
                if not ref:
                    raise ValueError(
                        "Object `%s` does not have identifier field in %s" %
                        (obj, self.identifiers))

        if not isinstance(ref, (str, unicode)):
            raise ValueError("Must be string: `%s`" % str(ref))

        url = self.expand_url(ref, base_url, scoped_id=(obj is not None))

        # Has this reference been loaded already?
        if url in self.idx:
            return self.idx[url], {}

        # "$include" directive means load raw text
        if inc:
            return self.fetch_text(url), {}

        doc = None
        if obj:
            for identifier in self.identifiers:
                obj[identifier] = url
            doc_url = url
        else:
            # Load structured document
            doc_url, frg = urlparse.urldefrag(url)
            if doc_url in self.idx:
                raise validate.ValidationException(
                    "Reference `#%s` not found in file `%s`." % (frg, doc_url))
            doc = self.fetch(doc_url)

        # Recursively expand urls and resolve directives
        resolved_obj, metadata = self.resolve_all(doc if doc else obj,
                                                  doc_url,
                                                  checklinks=checklinks)

        # Requested reference should be in the index now, otherwise it's a bad
        # reference
        if url is not None:
            if url in self.idx:
                resolved_obj = self.idx[url]
            else:
                raise RuntimeError("Reference `%s` is not in the index. "
                                   "Index contains:\n  %s" %
                                   (url, "\n  ".join(self.idx)))

        if isinstance(resolved_obj, (dict)):
            if u"$graph" in resolved_obj:
                metadata = _copy_dict_without_key(resolved_obj, u"$graph")
                return resolved_obj[u"$graph"], metadata
            else:
                return resolved_obj, metadata
        else:
            return resolved_obj, metadata
Exemplo n.º 50
0
def normalize(seed_url, link):
    link, _ = urlparse.urldefrag(link)
    return urlparse.urljoin(seed_url, link)
Exemplo n.º 51
0
    def storage_volume_deletable(self, request):
        """
		Returns a list of domains that use the given volume.

		options: [{
			'domainURI': <domain URI>,
			'pool': <pool name>,
			'source': <file name>
			}, ...]

		return: [{
			'domainURI': <domain URI>,
			'pool': <pool name>,
			'source': <file name>
			'deletable': (True|False|None)
			}, ...]

		where 'deletable' is
			True: disk can be deleted
			False: disk is shared and should not be deleted
			None: disk can not be deleted
		"""
        _tmp_cache = {}

        for volume in request.options:
            # safe default: not deletable
            volume['deletable'] = None

            node_uri, domain_uuid = urldefrag(volume['domainURI'])
            # Must be in a pool
            pool = self.get_pool(node_uri, volume['pool'])
            if not pool:
                continue
            # Pool must be modifiable
            if pool['type'] not in POOLS_RW:
                continue
            # Pool must be mapped to the file system
            pool_path = pool['path']
            if not pool_path:
                continue
            volume_path = volume['source']

            # check if volume is used by any other domain
            success, result = self.uvmm.send('STORAGE_VOLUME_USEDBY',
                                             None,
                                             volume=volume_path)
            if not success:
                raise UMC_Error(
                    _('Failed to check if the drive is used by any other virtual instance'
                      ))

            if len(result) > 1:  # is used by at least one other domain
                volume['deletable'] = False
                continue

            try:
                domain = _tmp_cache[volume['domainURI']]
            except LookupError:
                success, domain = self.uvmm.send('DOMAIN_INFO',
                                                 None,
                                                 uri=node_uri,
                                                 domain=domain_uuid)
                if not success:
                    raise UMC_Error(
                        _('Could not retrieve details for domain %s') %
                        domain_uuid)
                _tmp_cache[volume['domainURI']] = domain

            drive = None
            for disk in domain.disks:
                if disk.source == volume_path:
                    drive = disk
                    break
            else:
                continue

            volume['deletable'] = drive.device == Disk.DEVICE_DISK

        self.finished(request.id, request.options)
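For orientation, a hypothetical `request.options` payload shaped after the docstring above (URIs, pool and file names are made up); `urldefrag()` splits `domainURI` into the node URI and the domain UUID carried in the fragment:

request_options = [{
    'domainURI': 'qemu://node01.example.com/system#0f3bb2c7-9d24-4c3e-8a11-2b5f0e6d7a90',
    'pool': 'default',
    'source': '/var/lib/libvirt/images/web01-0.qcow2',
}]
# After the handler runs, each entry additionally carries 'deletable':
# True, False or None, as documented above.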
Exemplo n.º 52
0
    def _toc_from_navpoint(self, item, toc, navpoint):
        children = xpath(navpoint, 'ncx:navPoint')
        for child in children:
            title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
            title = COLLAPSE_RE.sub(' ', title.strip())
            href = xpath(child, 'ncx:content/@src')
            if not title:
                self._toc_from_navpoint(item, toc, child)
                continue
            if (not href or not href[0]) and not xpath(child, 'ncx:navPoint'):
                # This node is useless
                continue
            href = item.abshref(urlnormalize(
                href[0])) if href and href[0] else ''
            path, _ = urldefrag(href)
            if href and path not in self.oeb.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % href)
                gc = xpath(child, 'ncx:navPoint')
                if not gc:
                    # This node is useless
                    continue
            id = child.get('id')
            klass = child.get('class', 'chapter')

            try:
                po = int(child.get('playOrder',
                                   self.oeb.toc.next_play_order()))
            except:
                po = self.oeb.toc.next_play_order()

            authorElement = xpath(
                child, 'descendant::calibre:meta[@name = "author"]')
            if authorElement:
                author = authorElement[0].text
            else:
                author = None

            descriptionElement = xpath(
                child, 'descendant::calibre:meta[@name = "description"]')
            if descriptionElement:
                description = etree.tostring(descriptionElement[0],
                                             method='text',
                                             encoding=unicode).strip()
                if not description:
                    description = None
            else:
                description = None

            index_image = xpath(
                child, 'descendant::calibre:meta[@name = "toc_thumbnail"]')
            toc_thumbnail = (index_image[0].text if index_image else None)
            if not toc_thumbnail or not toc_thumbnail.strip():
                toc_thumbnail = None

            node = toc.add(title,
                           href,
                           id=id,
                           klass=klass,
                           play_order=po,
                           description=description,
                           author=author,
                           toc_thumbnail=toc_thumbnail)

            self._toc_from_navpoint(item, node, child)
Exemplo n.º 53
0
def scandeps(base,
             doc,
             reffields,
             urlfields,
             loadref,
             urljoin=urlparse.urljoin):
    # type: (Text, Any, Set[Text], Set[Text], Callable[[Text, Text], Any], Callable[[Text, Text], Text]) -> List[Dict[Text, Text]]
    r = []  # type: List[Dict[Text, Text]]
    deps = None  # type: Dict[Text, Any]
    if isinstance(doc, dict):
        if "id" in doc:
            if doc["id"].startswith("file://"):
                df, _ = urlparse.urldefrag(doc["id"])
                if base != df:
                    r.append({"class": "File", "location": df})
                    base = df

        if doc.get("class") in ("File",
                                "Directory") and "location" in urlfields:
            u = doc.get("location", doc.get("path"))
            if u and not u.startswith("_:"):
                deps = {"class": doc["class"], "location": urljoin(base, u)}
                if doc["class"] == "Directory" and "listing" in doc:
                    deps["listing"] = doc["listing"]
                if doc["class"] == "File" and "secondaryFiles" in doc:
                    deps["secondaryFiles"] = doc["secondaryFiles"]
                deps = nestdir(base, deps)
                r.append(deps)
            else:
                if doc["class"] == "Directory" and "listing" in doc:
                    r.extend(
                        scandeps(base,
                                 doc["listing"],
                                 reffields,
                                 urlfields,
                                 loadref,
                                 urljoin=urljoin))
                elif doc["class"] == "File" and "secondaryFiles" in doc:
                    r.extend(
                        scandeps(base,
                                 doc["secondaryFiles"],
                                 reffields,
                                 urlfields,
                                 loadref,
                                 urljoin=urljoin))

        for k, v in doc.iteritems():
            if k in reffields:
                for u in aslist(v):
                    if isinstance(u, dict):
                        r.extend(
                            scandeps(base,
                                     u,
                                     reffields,
                                     urlfields,
                                     loadref,
                                     urljoin=urljoin))
                    else:
                        sub = loadref(base, u)
                        subid = urljoin(base, u)
                        deps = {"class": "File", "location": subid}
                        sf = scandeps(subid,
                                      sub,
                                      reffields,
                                      urlfields,
                                      loadref,
                                      urljoin=urljoin)
                        if sf:
                            deps["secondaryFiles"] = sf
                        deps = nestdir(base, deps)
                        r.append(deps)
            elif k in urlfields and k != "location":
                for u in aslist(v):
                    deps = {"class": "File", "location": urljoin(base, u)}
                    deps = nestdir(base, deps)
                    r.append(deps)
            elif k not in ("listing", "secondaryFiles"):
                r.extend(
                    scandeps(base,
                             v,
                             reffields,
                             urlfields,
                             loadref,
                             urljoin=urljoin))
    elif isinstance(doc, list):
        for d in doc:
            r.extend(
                scandeps(base,
                         d,
                         reffields,
                         urlfields,
                         loadref,
                         urljoin=urljoin))

    if r:
        normalizeFilesDirs(r)
        r = mergedirs(r)

    return r
Exemplo n.º 54
0
def remove_fragment(url):
    pure_url, frag = urldefrag(url)
    return pure_url
Exemplo n.º 55
0
def normalize(seed_url, link):
    link, _ = urlparse.urldefrag(
        link)  # split the link into the URL without its fragment and the fragment itself
    return urlparse.urljoin(seed_url, link)
Exemplo n.º 56
0
    def fetch(self):
        """Attempt to fetch the contents of the URL.

        If successful, and the data is HTML, extract further links and
        add them to the crawler.  Redirects are also added back there.
        """
        while self.tries < self.max_tries:
            self.tries += 1
            self.request = None
            try:
                self.request = Request(self.log, self.url, self.crawler.pool)
                yield From(self.request.connect())
                yield From(self.request.send_request())
                self.response = yield From(self.request.get_response())
                self.body = yield From(self.response.read())
                h_conn = self.response.get_header('connection').lower()
                if h_conn != 'close':
                    self.request.close(recycle=True)
                    self.request = None
                if self.tries > 1:
                    self.log(1, 'try', self.tries, 'for', self.url, 'success')
                break
            except (BadStatusLine, OSError) as exc:
                self.exceptions.append(exc)
                self.log(1, 'try', self.tries, 'for', self.url, 'raised',
                         repr(exc))
                ##import pdb; pdb.set_trace()
                # Don't reuse the connection in this case.
            finally:
                if self.request is not None:
                    self.request.close()
        else:
            # We never broke out of the while loop, i.e. all tries failed.
            self.log(0, 'no success for', self.url, 'in', self.max_tries,
                     'tries')
            return
        next_url = self.response.get_redirect_url()
        if next_url:
            self.next_url = urlparse.urljoin(self.url, next_url)
            if self.max_redirect > 0:
                self.log(1, 'redirect to', self.next_url, 'from', self.url)
                self.crawler.add_url(self.next_url, self.max_redirect - 1)
            else:
                self.log(0, 'redirect limit reached for', self.next_url,
                         'from', self.url)
        else:
            if self.response.status == 200:
                self.ctype = self.response.get_header('content-type')
                self.pdict = {}
                if self.ctype:
                    self.ctype, self.pdict = cgi.parse_header(self.ctype)
                self.encoding = self.pdict.get('charset', 'utf-8')
                if self.ctype == 'text/html':
                    body = self.body.decode(self.encoding, 'replace')
                    # Replace href with (?:href|src) to follow image links.
                    self.urls = set(
                        re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', body))
                    if self.urls:
                        self.log(1, 'got', len(self.urls),
                                 'distinct urls from', self.url)
                    self.new_urls = set()
                    for url in self.urls:
                        url = unescape(url)
                        url = urlparse.urljoin(self.url, url)
                        url, frag = urlparse.urldefrag(url)
                        if self.crawler.add_url(url):
                            self.new_urls.add(url)
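The comment above about widening `href` to `(?:href|src)` is easy to check by running the extraction regex over a made-up snippet of HTML:

import re

body = '<a href="/about">About</a> <img src="/logo.png">'
print re.findall(r'(?i)href=["\']?([^\s"\'<>]+)', body)
# ['/about']
print re.findall(r'(?i)(?:href|src)=["\']?([^\s"\'<>]+)', body)
# ['/about', '/logo.png']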
Exemplo n.º 57
0
    def analyze(self, task, find_sources=False):
        rv = False
        self.whitelist = [re.compile(x) for x in self.site.config['LINK_CHECK_WHITELIST']]
        base_url = urlparse(self.site.config['BASE_URL'])
        self.existing_targets.add(self.site.config['SITE_URL'])
        self.existing_targets.add(self.site.config['BASE_URL'])
        url_type = self.site.config['URL_TYPE']
        if url_type == 'absolute':
            url_netloc_to_root = urlparse(self.site.config['SITE_URL']).path
        try:
            filename = task.split(":")[-1]

            if filename.startswith(self.site.config['CACHE_FOLDER']):
                # Do not look at links in the cache, which are not parsed by
                # anyone and may result in false positives.  Problems arise
                # with galleries, for example.  Full rationale: (Issue #1447)
                self.logger.notice("Ignoring {0} (in cache, links may be incorrect)".format(filename))
                return False

            d = lxml.html.fromstring(open(filename, 'rb').read())
            for l in d.iterlinks():
                target = l[0].attrib[l[1]]
                if target == "#":
                    continue
                target, _ = urldefrag(target)
                parsed = urlparse(target)

                # Warn about links from https to http (mixed-security)
                if base_url.netloc == parsed.netloc and base_url.scheme == "https" and parsed.scheme == "http":
                    self.logger.warn("Mixed-content security for link in {0}: {1}".format(filename, target))

                # Absolute links when using only paths, skip.
                if (parsed.scheme or target.startswith('//')) and url_type in ('rel_path', 'full_path'):
                    continue

                # Absolute links to other domains, skip
                if (parsed.scheme or target.startswith('//')) and parsed.netloc != base_url.netloc:
                    continue

                if url_type == 'rel_path':
                    target_filename = os.path.abspath(
                        os.path.join(os.path.dirname(filename), unquote(target)))

                elif url_type in ('full_path', 'absolute'):
                    if url_type == 'absolute':
                        # convert to 'full_path' case, ie url relative to root
                        url_rel_path = parsed.path[len(url_netloc_to_root):]
                    else:
                        url_rel_path = parsed.path
                    if url_rel_path == '' or url_rel_path.endswith('/'):
                        url_rel_path = urljoin(url_rel_path, self.site.config['INDEX_FILE'])
                    fs_rel_path = fs_relpath_from_url_path(url_rel_path)
                    target_filename = os.path.join(self.site.config['OUTPUT_FOLDER'], fs_rel_path)

                if any(re.match(x, target_filename) for x in self.whitelist):
                    continue
                elif target_filename not in self.existing_targets:
                    if os.path.exists(target_filename):
                        self.logger.notice("Good link {0} => {1}".format(target, target_filename))
                        self.existing_targets.add(target_filename)
                    else:
                        rv = True
                        self.logger.warn("Broken link in {0}: {1}".format(filename, target))
                        if find_sources:
                            self.logger.warn("Possible sources:")
                            self.logger.warn("\n".join(_call_nikola_list(self.site, ["--deps", task])))
                            self.logger.warn("===============================\n")
        except Exception as exc:
            self.logger.error("Error with: {0} {1}".format(filename, exc))
        return rv
Exemplo n.º 58
0
    def resolve_ref(self, ref, base_url=None):
        base_url = base_url or 'file://%s/' % os.path.abspath('.')

        obj = None

        # If `ref` is a dict, look for special directives.
        if isinstance(ref, dict):
            obj = ref
            if "import" in ref:
                if len(obj) == 1:
                    ref = obj["import"]
                    obj = None
                else:
                    raise ValueError("'import' must be the only field in %s" %
                                     (str(obj)))
            elif "include" in obj:
                if len(obj) == 1:
                    ref = obj["include"]
                else:
                    raise ValueError("'include' must be the only field in %s" %
                                     (str(obj)))
            else:
                if "id" in obj:
                    ref = obj["id"]
                else:
                    raise ValueError("Object `%s` does not have `id` field" %
                                     obj)

        if not isinstance(ref, basestring):
            raise ValueError("Must be string: `%s`" % str(ref))

        url = expand_url(ref, base_url)

        # Has this reference been loaded already?
        if url in self.idx:
            return self.idx[url]

        # "include" directive means load raw text
        if obj and "include" in obj:
            return self.fetch_text(url)

        if obj:
            obj["id"] = url
            self.idx[url] = obj
        else:
            # Load structured document
            doc_url, frg = urlparse.urldefrag(url)
            if doc_url in self.idx:
                raise validate.ValidationException(
                    "Reference `#%s` not found in file `%s`." % (frg, doc_url))
            obj = self.fetch(doc_url)

        # Recursively expand urls and resolve directives
        self.resolve_all(obj, url)

        # Requested reference should be in the index now, otherwise it's a bad reference
        if self.idx.get(url) is not None:
            return self.idx[url]
        else:
            raise RuntimeError(
                "Reference `%s` is not in the index.  Index contains:\n  %s" %
                (url, "\n  ".join(self.idx)))
Exemplo n.º 59
0
 def __init__(self, url):
     self.__url = urlparse.urldefrag(url)
Exemplo n.º 60
0
    def binary_to_text_inner(self, bin, buf, stack):
        (depth, tag_name, current_map, dynamic_tag, errors, in_censorship,
         is_goingdown, state, flags) = stack.pop()

        if state == 'close tag':
            if not tag_name:
                raise LitError('Tag ends before it begins.')
            buf.write(encode(u''.join(('</', tag_name, '>'))))
            dynamic_tag = 0
            tag_name = None
            state = 'text'

        while self.cpos < len(bin):
            c, self.cpos = read_utf8_char(bin, self.cpos)
            oc = ord(c)

            if state == 'text':
                if oc == 0:
                    state = 'get flags'
                    continue
                elif c == '\v':
                    c = '\n'
                elif c == '>':
                    c = '>>'
                elif c == '<':
                    c = '<<'
                buf.write(encode(c))

            elif state == 'get flags':
                if oc == 0:
                    state = 'text'
                    continue
                flags = oc
                state = 'get tag'

            elif state == 'get tag':
                state = 'text' if oc == 0 else 'get attr'
                if flags & FLAG_OPENING:
                    tag = oc
                    buf.write('<')
                    if not (flags & FLAG_CLOSING):
                        is_goingdown = True
                    if tag == 0x8000:
                        state = 'get custom length'
                        continue
                    if flags & FLAG_ATOM:
                        if not self.tag_atoms or tag not in self.tag_atoms:
                            raise LitError("atom tag %d not in atom tag list" %
                                           tag)
                        tag_name = self.tag_atoms[tag]
                        current_map = self.attr_atoms
                    elif tag < len(self.tag_map):
                        tag_name = self.tag_map[tag]
                        current_map = self.tag_to_attr_map[tag]
                    else:
                        dynamic_tag += 1
                        errors += 1
                        tag_name = '?' + unichr(tag) + '?'
                        current_map = self.tag_to_attr_map[tag]
                        print 'WARNING: tag %s unknown' % unichr(tag)
                    buf.write(encode(tag_name))
                elif flags & FLAG_CLOSING:
                    if depth == 0:
                        raise LitError('Extra closing tag %s at %d' %
                                       (tag_name, self.cpos))
                    break

            elif state == 'get attr':
                in_censorship = False
                if oc == 0:
                    state = 'text'
                    if not is_goingdown:
                        tag_name = None
                        dynamic_tag = 0
                        buf.write(' />')
                    else:
                        buf.write('>')
                        frame = (depth, tag_name, current_map, dynamic_tag,
                                 errors, in_censorship, False, 'close tag',
                                 flags)
                        stack.append(frame)
                        frame = (depth + 1, None, None, 0, 0, False, False,
                                 'text', 0)
                        stack.append(frame)
                        break
                else:
                    if oc == 0x8000:
                        state = 'get attr length'
                        continue
                    attr = None
                    if current_map and oc in current_map and current_map[oc]:
                        attr = current_map[oc]
                    elif oc in self.attr_map:
                        attr = self.attr_map[oc]
                    if not attr or not isinstance(attr, basestring):
                        raise LitError('Unknown attribute %d in tag %s' %
                                       (oc, tag_name))
                    if attr.startswith('%'):
                        in_censorship = True
                        state = 'get value length'
                        continue
                    buf.write(' ' + encode(attr) + '=')
                    if attr in ['href', 'src']:
                        state = 'get href length'
                    else:
                        state = 'get value length'

            elif state == 'get value length':
                if not in_censorship:
                    buf.write('"')
                count = oc - 1
                if count == 0:
                    if not in_censorship:
                        buf.write('"')
                    in_censorship = False
                    state = 'get attr'
                    continue
                state = 'get value'
                if oc == 0xffff:
                    continue
                if count < 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)

            elif state == 'get value':
                if count == 0xfffe:
                    if not in_censorship:
                        buf.write('%s"' % (oc - 1))
                    in_censorship = False
                    state = 'get attr'
                elif count > 0:
                    if not in_censorship:
                        if c == '"':
                            c = '&quot;'
                        elif c == '<':
                            c = '&lt;'
                        buf.write(c.encode('ascii', 'xmlcharrefreplace'))
                    count -= 1
                if count == 0:
                    if not in_censorship:
                        buf.write('"')
                    in_censorship = False
                    state = 'get attr'

            elif state == 'get custom length':
                count = oc - 1
                if count <= 0 or count > len(bin) - self.cpos:
                    raise LitError('Invalid character count %d' % count)
                dynamic_tag += 1
                state = 'get custom'
                tag_name = ''

            elif state == 'get custom':
                tag_name += c
                count -= 1
                if count == 0:
                    buf.write(encode(tag_name))
                    state = 'get attr'

            elif state == 'get attr length':
                count = oc - 1
                if count <= 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)
                buf.write(' ')
                state = 'get custom attr'

            elif state == 'get custom attr':
                buf.write(encode(c))
                count -= 1
                if count == 0:
                    buf.write('=')
                    state = 'get value length'

            elif state == 'get href length':
                count = oc - 1
                if count <= 0 or count > (len(bin) - self.cpos):
                    raise LitError('Invalid character count %d' % count)
                href = ''
                state = 'get href'

            elif state == 'get href':
                href += c
                count -= 1
                if count == 0:
                    doc, frag = urldefrag(href[1:])
                    path = self.item_path(doc)
                    if frag:
                        path = '#'.join((path, frag))
                    path = urlnormalize(path)
                    buf.write(encode(u'"%s"' % path))
                    state = 'get attr'