Exemplo n.º 1
0
    def prettyPrintSentenceHTML(self, s, text_formatting_function=lambda x, glob=None: x, glob=None):
        """
            Render one sentence as HTML, filling in its <cit> placeholders.

            s -- sentence dict; reads s["text"] and the optional s["citations"]
                 list of citation ids.
            text_formatting_function -- called as f(text, glob) on the sentence
                 text after every <cit .../> tag has been turned into a
                 "__cit__<id>" token. The default returns the text unchanged.
            glob -- passed through to text_formatting_function. When it is not
                 None, glob["citation_formatting_function"](html, match) is
                 applied to every resolved citation.

            Returns the stripped text, with a trailing "." appended when it does
            not already end in one of . , ! ? : ;
        """
        flags = re.IGNORECASE | re.DOTALL
        text = s["text"]

        # Put ", " between adjacent <cit/> tags. Matches cannot overlap, so a
        # single pass leaves some pairs in a longer run untouched; run it twice.
        for _ in range(2):
            text = re.sub(r"(<cit.*?/>)\s*?(<cit.*?/>)", r"\1, \2", text, flags=flags)

        # Swap the tags for plain tokens before the text formatter sees them.
        text = re.sub(r"<cit\sid=\"?(.+?)\"?\s*?/>", r"__cit__\1", text, flags=flags)
        text = text_formatting_function(text, glob)

        # Work out the replacement for every citation id first. Keys are
        # lower-cased because the token lookup below is case-insensitive; the
        # first occurrence of an id wins.
        replacements = {}
        for cit_id in s.get("citations", []):
            key = str(cit_id).lower()
            if key in replacements:
                continue
            match = self.doc.matchReferenceByCitationId(cit_id)
            if match:
                sub = formatCitation(match)
                sub = u"<a href=\"#%s\" class=\"citation\" id=\"%s\">%s</a>" % (match["id"], cit_id, sub)
                if glob is not None:
                    sub = glob["citation_formatting_function"](sub, match)
            else:
                sub = "[MISSING REFERENCE %s] " % (cit_id,)
            replacements[key] = sub

        if replacements:
            # One pass over all ids, longest first, so that "c1" can never
            # consume the front of "__cit__c10". Ids are escaped, and the
            # replacement is supplied by a function so that backslashes in the
            # citation HTML are inserted literally rather than parsed as a
            # replacement template.
            ordered_ids = sorted(replacements, key=len, reverse=True)
            pattern = "__cit__(%s)" % "|".join(re.escape(k) for k in ordered_ids)
            text = re.sub(
                pattern,
                lambda m: replacements.get(m.group(1).lower(), m.group(0)),
                text,
                flags=flags,
            )

        text = text.strip()
        if text and not text.endswith((".", ",", "!", "?", ":", ";")):
            text += u"."
        return text
    def generateVisualizationOneFile(self, guid):
        """
            Build the explainer data for the document identified by `guid`.

            Loads the document, records which of its tokens also occur in each
            reference that can be matched in the index, renders the document and
            every matched reference to HTML, and writes the result as JSON to
            self.output_dir + guid + "_data.json".

            Returns a dict with the keys "json_file", "id", "title" and
            "details" describing the generated file.
        """
        doc = cp.Corpus.loadSciDoc(guid)
        cp.Corpus.tagAllReferencesAsInCollectionOrNot(doc)
        doc_tokens = self.getDocumentTokens(doc)

        # Every distinct term gets an id (its enumeration index, as a string)
        # and an initially empty list of the references that share it.
        for position, term in enumerate(doc_tokens):
            self.term_info[term] = {"token_id": str(position), "references": []}

        self.overlapping_tokens = {}
        reference_records = {}

        # NOTE(review): the value is never used below; the lookup is kept so
        # that any error it raises still surfaces here.
        in_collection_references = cp.Corpus.getMetadataByGUID(guid)["outlinks"]

        for ref in doc["references"]:
            match = cp.Corpus.matchReferenceInIndex(ref)
            if not match:
                continue

            cited_doc = cp.Corpus.loadSciDoc(match["guid"])
            cited_tokens = self.getDocumentTokens(cited_doc)

            # Keyed by reference id: the tokens this reference shares with the
            # current document.
            shared_tokens = self.getOverlappingTokens(doc_tokens, cited_tokens)
            ref_id = ref["id"]
            self.overlapping_tokens[ref_id] = shared_tokens

            for term in shared_tokens:
                citing_refs = self.term_info[term]["references"]
                if ref_id not in citing_refs:
                    citing_refs.append(ref_id)

            cited_html = cited_doc.prettyPrintDocumentHTML(
                True,
                True,
                False,
                text_formatting_function=self.textFormatting
                )

            cited_details = "%s - %s - %s" % (
                match["guid"],
                formatCitation(ref),
                cited_doc["metadata"]["title"],
            )
            reference_records[ref["id"]] = {"full_html": cited_html, "details": cited_details}

        # try to find some signal in the noise
        self.filterTokens(doc)

        doc_html = doc.prettyPrintDocumentHTML(
            True,
            True,
            False,
            citation_formatting_function=self.citationFormatting,
            reference_formatting_function=self.referenceFormatting,
            text_formatting_function=self.textFormatting
            )

        json_file = guid + "_data.json"
        payload = json.dumps({"full_html": doc_html, "ref_data": reference_records})
        writeFileText(payload, self.output_dir + json_file)

        # TODO: generate file description
        # TODO: add title
        metadata = doc["metadata"]
        details = "%s - %s - %s" % (guid, formatCitation(metadata), metadata["title"])

        return {
            "json_file": json_file,
            "id": guid,
            "title": doc["metadata"]["title"],
            "details": details,
        }