def explainAnchorTextZoning(guid, max_inlinks=10, use_full_text=False):
    """
    Write an HTML summary of the contexts in which other documents cite ``guid``.

    For each entry of the document's "inlinks" metadata (stopping once
    ``max_inlinks`` entries have been processed) the citing document is loaded,
    optionally trimmed with ``trimDocToRelevantBits``, rendered to HTML and
    collected. The joined result is written to ``<guid>_ilc_zoning.html`` in
    the corpus output directory.

    Side effect: sets the module-level ``CURRENT_CITATION`` to the
    regex-escaped formatted citation of the target document.

    :param guid: identifier of the cited document
    :param max_inlinks: number of inlinks after which processing stops
    :param use_full_text: if True, citing documents are rendered untrimmed and
        with their bibliography; if False, they are trimmed first
    """
    global CURRENT_CITATION

    meta = cp.Corpus.getMetadataByGUID(guid)
    page_header = '<h1 class="title">%s</h1><span>Inlink context summary for %s</span>' % (
        meta["title"],
        formatCitation(meta),
    )
    fragments = [page_header]
    CURRENT_CITATION = re.escape(formatCitation(meta))

    for position, inlink in enumerate(meta["inlinks"]):
        if position == max_inlinks:
            break

        print("Processing anchor text from %s" % inlink)
        citing_doc = cp.Corpus.loadSciDoc(inlink)
        if not use_full_text:
            # Keep only the paragraphs that cite the target document.
            trimDocToRelevantBits(citing_doc, guid)

        renderer = SciDocRenderer(citing_doc)
        render_options = {
            "formatspans": True,
            "include_bibliography": use_full_text,
            "wrap_with_HTML_tags": False,
            "extra_attribute_function": extraAttributes,
            "citation_formatting_function": citationFormatting,
            "reference_formatting_function": referenceFormatting,
        }
        fragments.append(renderer.prettyPrintDocumentHTML(**render_options))

    page = padWithHTML(" ".join(fragments))
    output_path = os.path.join(cp.Corpus.paths.output, guid + "_ilc_zoning.html")
    writeFileText(page, output_path)
def trimDocToRelevantBits(doc, guid):
    """
    Reduce ``doc`` to the paragraphs in which the paper ``guid`` is cited.

    The reference that points at ``guid`` is located first: a reference's own
    "guid" field is used when present, otherwise the corpus reference matcher
    is asked. If no reference matches, the document is left untouched.

    Otherwise a new section is added, headed by a "MATCHED REFERENCE: ..."
    sentence, and every paragraph containing one of the reference's citations
    is re-parented to that section. ``doc["content"]`` is replaced by the new
    section, its header paragraph/sentence, and the selected paragraphs and
    their child elements, in the order the citations are listed.

    Warning: modifies doc in-place.

    :param doc: document to trim
    :param guid: identifier of the cited paper
    :returns: None
    """
    linked_ref = None
    for ref in doc.references:
        if "guid" in ref:
            ref_guid = ref["guid"]
        else:
            match = cp.Corpus.matcher.matchReference(ref)
            ref_guid = match["guid"] if match else None

        if ref_guid == guid:
            linked_ref = ref
            break

    if not linked_ref:
        # couldn't match the relevant citation, return
        return

    new_sect = doc.addSection("root", "")
    header_para = doc.addParagraph(new_sect["id"])
    header_sent = doc.addSentence(
        header_para["id"], "MATCHED REFERENCE: %s" % formatCitation(linked_ref))
    content = [new_sect, header_para, header_sent]

    # Collect each paragraph (and its children) once, preserving the order in
    # which they are first encountered. A plain set() here would make the
    # paragraph order in the output depend on hash ordering.
    selected = []
    seen_ids = set()
    for cit_id in linked_ref["citations"]:
        cit = doc.citation_by_id[cit_id]
        if "parent_p" in cit:
            para = cit["parent_p"]
            # NOTE(review): "parent_s"/"parent" are element ids, so "parent_p"
            # is presumably an id too - confirm. Anything that does not support
            # item assignment is resolved through element_by_id; values that
            # already are elements are used as before.
            if not hasattr(para, "__setitem__"):
                para = doc.element_by_id[para]
        else:
            parent_s = cit.get("parent_s", None) or cit["parent"]
            sent = doc.element_by_id[parent_s]
            para = doc.element_by_id[sent["parent"]]

        para["parent"] = new_sect["id"]

        candidates = [para]
        candidates.extend(doc.element_by_id[child_id] for child_id in para["content"])
        for candidate in candidates:
            element_id = candidate["id"]
            if element_id not in seen_ids:
                seen_ids.add(element_id)
                # Re-resolve through the index so the document's own element
                # object is the one that ends up in the content list.
                selected.append(doc.element_by_id[element_id])

    for element in selected:
        if element.get("type", "") == "p":
            new_sect["content"].append(element["id"])

    content.extend(selected)
    doc["content"] = content
    doc.updateContentLists()