def prettyPrintSentenceHTML(self, s, text_formatting_function=lambda text, glob=None: text, glob=None):
    """
        Fill in the <cit> placeholders of a sentence with a nice representation
        of each reference, and call the formatting function on the text.

        Args:
            s: sentence dict. Reads s["text"] and the optional s["citations"]
                list of citation ids.
            text_formatting_function: callable invoked as f(text, glob) on the
                sentence text after every <cit .../> tag has been turned into a
                "__cit__<id>" placeholder. Defaults to returning the text
                unchanged.
            glob: optional dict of rendering options. If it contains
                "citation_formatting_function", that callable is invoked as
                f(citation_html, match) for every citation that resolves to a
                reference.

        Returns:
            The rendered sentence, stripped of surrounding whitespace. A "." is
            appended when the result is non-empty and does not already end in
            one of . , ! ? : ;
    """
    regex_flags = re.IGNORECASE | re.DOTALL
    text = s["text"]

    # Put ", " between neighbouring <cit/> tags. One pass only separates
    # non-overlapping pairs, so a second pass is needed for runs of three.
    for _ in range(2):
        text = re.sub(r"(<cit.*?/>)\s*?(<cit.*?/>)", r"\1, \2", text, flags=regex_flags)

    # Swap the tags for plain-text placeholders so that the text formatting
    # function does not have to deal with markup.
    text = re.sub(r"<cit\sid=\"?(.+?)\"?\s*?/>", r"__cit__\1", text, flags=regex_flags)
    text = text_formatting_function(text, glob)

    # Build every replacement first, in the order the citations are listed, so
    # the formatting callbacks are invoked in that same order.
    replacements = []
    for cit_id in s.get("citations", []):
        match = self.doc.matchReferenceByCitationId(cit_id)
        if match:
            sub = formatCitation(match)
            sub = u"<a href=\"#%s\" class=\"citation\" id=\"%s\">%s</a>" % (match["id"], cit_id, sub)
            # glob is optional, and so is the citation formatting hook in it.
            if glob and "citation_formatting_function" in glob:
                sub = glob["citation_formatting_function"](sub, match)
        else:
            sub = "[MISSING REFERENCE (" + str(cit_id) + ")] "
        replacements.append((str(cit_id), sub))

    # Longest ids first: otherwise the placeholder for "cit1" would also match
    # the beginning of the placeholder for "cit10" and corrupt it.
    replacements.sort(key=lambda pair: len(pair[0]), reverse=True)
    for cit_key, sub in replacements:
        # re.escape keeps regex metacharacters in the id literal; a function
        # replacement keeps backslashes in the generated HTML literal.
        text = re.sub(
            r"__cit__" + re.escape(cit_key),
            lambda _match, replacement=sub: replacement,
            text,
            flags=regex_flags,
        )

    text = text.strip()
    if text and not text.endswith((".", ",", "!", "?", ":", ";")):
        text += u"."
    return text
def generateVisualizationOneFile(self, guid):
    """
        Prepare the explainer document for the paper identified by guid.

        Loads the document, records in self.term_info which of its references
        share tokens with it, renders the document and each matched reference
        to HTML, and writes both as JSON to self.output_dir + guid + "_data.json".

        Returns:
            dict with the keys "json_file", "id", "title" and "details"
            describing the file that was written.
    """
    doc = cp.Corpus.loadSciDoc(guid)
    cp.Corpus.tagAllReferencesAsInCollectionOrNot(doc)
    doc_tokens = self.getDocumentTokens(doc)

    # every unique term gets its own id and an (initially empty) list of the
    # references it also occurs in
    for position, term in enumerate(doc_tokens):
        self.term_info[term] = {"token_id": str(position), "references": []}

    self.overlapping_tokens = {}
    ref_data = {}

    # NOTE(review): the result of this lookup is not used anywhere in this
    # method; the call is kept in case it validates the guid -- confirm.
    in_collection_references = cp.Corpus.getMetadataByGUID(guid)["outlinks"]

    for ref in doc["references"]:
        match = cp.Corpus.matchReferenceInIndex(ref)
        if not match:
            # references that are not matched in the index are skipped
            continue

        ref_id = ref["id"]
        cited_doc = cp.Corpus.loadSciDoc(match["guid"])
        cited_tokens = self.getDocumentTokens(cited_doc)

        # per reference id: the tokens it shares with the current document
        shared_tokens = self.getOverlappingTokens(doc_tokens, cited_tokens)
        self.overlapping_tokens[ref_id] = shared_tokens

        for term in shared_tokens:
            referencing = self.term_info[term]["references"]
            if ref_id not in referencing:
                referencing.append(ref_id)

        # only the text formatting hook is passed for the cited document;
        # the citation and reference formatting hooks are not
        cited_html = cited_doc.prettyPrintDocumentHTML(
            True,
            True,
            False,
            text_formatting_function=self.textFormatting
        )

        cited_details = "%s - %s - %s" % (
            match["guid"],
            formatCitation(ref),
            cited_doc["metadata"]["title"],
        )
        ref_data[ref_id] = {"full_html": cited_html, "details": cited_details}

    # try to find some signal in the noise
    self.filterTokens(doc)

    html = doc.prettyPrintDocumentHTML(
        True,
        True,
        False,
        citation_formatting_function=self.citationFormatting,
        reference_formatting_function=self.referenceFormatting,
        text_formatting_function=self.textFormatting
    )

    json_file = guid + "_data.json"
    json_str = json.dumps({"full_html": html, "ref_data": ref_data})
    writeFileText(json_str, self.output_dir + json_file)

    # TODO: generate file description
    # TODO: add title
    details = "%s - %s - %s" % (
        guid,
        formatCitation(doc["metadata"]),
        doc["metadata"]["title"],
    )
    return {
        "json_file": json_file,
        "id": guid,
        "title": doc["metadata"]["title"],
        "details": details,
    }