예제 #1
0
def explainAnchorTextZoning(guid, max_inlinks=10, use_full_text=False):
    """
        Builds an HTML summary of how other documents cite the paper ``guid``
        and writes it to ``<guid>_ilc_zoning.html`` inside
        ``cp.Corpus.paths.output``.

        The page opens with the paper's title and formatted citation, followed
        by one rendered block per entry of the paper's "inlinks" metadata.

        Side effect: the module-level ``CURRENT_CITATION`` is set to the
        regex-escaped formatted citation of the paper before any inlink is
        rendered.

        :param guid: GUID of the cited paper
        :param max_inlinks: rendering stops once this many inlinks were processed
        :param use_full_text: when false, each citing document is first reduced
            with ``trimDocToRelevantBits``; when true, the whole document is
            rendered and its bibliography is included
    """
    global CURRENT_CITATION

    metadata = cp.Corpus.getMetadataByGUID(guid)
    header = """<h1 class="title">%s</h1><span>Inlink context summary for %s</span>""" % (
        metadata["title"],
        formatCitation(metadata),
    )
    fragments = [header]
    CURRENT_CITATION = re.escape(formatCitation(metadata))

    for position, inlink in enumerate(metadata["inlinks"]):
        # Stop as soon as the requested number of inlinks has been handled.
        if position == max_inlinks:
            break

        print("Processing anchor text from %s" % link_guid if False else "Processing anchor text from %s" % inlink)
        citing_doc = cp.Corpus.loadSciDoc(inlink)

        if not use_full_text:
            # Reduces citing_doc in place to the paragraphs citing ``guid``.
            trimDocToRelevantBits(citing_doc, guid)

        fragments.append(
            SciDocRenderer(citing_doc).prettyPrintDocumentHTML(
                formatspans=True,
                include_bibliography=use_full_text,
                wrap_with_HTML_tags=False,
                extra_attribute_function=extraAttributes,
                citation_formatting_function=citationFormatting,
                reference_formatting_function=referenceFormatting,
            )
        )

    page = padWithHTML(" ".join(fragments))
    destination = os.path.join(cp.Corpus.paths.output, guid + "_ilc_zoning.html")
    writeFileText(page, destination)
예제 #2
0
def trimDocToRelevantBits(doc, guid):
    """
        Keeps only the paragraphs of ``doc`` in which the paper identified by
        ``guid`` is cited.

        The reference pointing at ``guid`` is found through its own "guid" key
        or, when that key is absent, through
        ``cp.Corpus.matcher.matchReference``. If no reference matches, ``doc``
        is left untouched.

        Otherwise ``doc["content"]`` is replaced by: a new section, a header
        paragraph and a "MATCHED REFERENCE: ..." sentence, then every citing
        paragraph together with the elements listed in its "content", in order
        of first citation and without duplicates. The citing paragraphs are
        re-parented to the new section.

        Warning: modifies doc in-place; nothing is returned.

        :param doc: the citing SciDoc to trim
        :param guid: GUID of the cited paper
    """
    linked_ref = None
    for ref in doc.references:
        if "guid" in ref:
            ref_guid = ref["guid"]
        else:
            match = cp.Corpus.matcher.matchReference(ref)
            ref_guid = match["guid"] if match else None

        if ref_guid == guid:
            linked_ref = ref
            break

    if linked_ref is None:
        # couldn't match the relevant citation, leave the document as it is
        return

    new_sect = doc.addSection("root", "")
    header_para = doc.addParagraph(new_sect["id"])
    header_sent = doc.addSentence(
        header_para["id"], "MATCHED REFERENCE: %s" % formatCitation(linked_ref))
    content = [new_sect, header_para, header_sent]

    # Ids are collected in first-seen order so the trimmed document keeps a
    # deterministic paragraph order; a paragraph citing the paper several
    # times is only added once.
    ordered_ids = []
    seen_ids = set()

    for cit_id in linked_ref["citations"]:
        cit = doc.citation_by_id[cit_id]
        if "parent_p" in cit:
            citing_para = cit["parent_p"]
            # NOTE(review): assumes "parent_p" holds an element id, like
            # "parent_s"/"parent" below - confirm. An element dict stored
            # directly is still accepted as before.
            if not isinstance(citing_para, dict):
                citing_para = doc.element_by_id[citing_para]
        else:
            parent_s = cit.get("parent_s", None) or cit["parent"]
            citing_sent = doc.element_by_id[parent_s]
            citing_para = doc.element_by_id[citing_sent["parent"]]

        citing_para["parent"] = new_sect["id"]

        children = [doc.element_by_id[child_id] for child_id in citing_para["content"]]
        for element in [citing_para] + children:
            element_id = element["id"]
            if element_id not in seen_ids:
                seen_ids.add(element_id)
                ordered_ids.append(element_id)

    to_add = [doc.element_by_id[element_id] for element_id in ordered_ids]
    for element in to_add:
        # Only paragraphs are registered as direct children of the section.
        if element.get("type", "") == "p":
            new_sect["content"].append(element["id"])

    content.extend(to_add)
    doc["content"] = content
    doc.updateContentLists()