def fixPaperReferences(annotated_file, pmc_file, pmc_id, original_text=None):
    """
        Overwrite the <ref-list> section of `annotated_file` with the one
        taken from `pmc_file`.

        Verifying that both files are actually the same paper is the caller's
        responsibility.

        Args:
            annotated_file: path of the annotated file; rewritten in place
            pmc_file: path of the original PMC file
            pmc_id: PMC id passed to selectRefListSection for the original
            original_text: already-loaded contents of `pmc_file`; read from
                `pmc_file` when falsy
        Returns:
            None. Nothing is written when locating the ref-list section in
            either text raises ValueError.
    """
    annotated_text=loadFileText(annotated_file)
    source_text=original_text if original_text else loadFileText(pmc_file)

    # Locate the reference list in the original; give up quietly on failure
    try:
        src_from, src_to=selectRefListSection(source_text, pmc_file, pmc_id)
    except ValueError:
        return

    replacement_refs=source_text[src_from:src_to]

    # Locate the reference list that is going to be replaced
    try:
        dst_from, dst_to=selectRefListSection(
            annotated_text,
            annotated_file,
            getFilePMCID(annotated_file))
    except ValueError:
        return

    head=annotated_text[:dst_from]
    tail=annotated_text[dst_to:]
    writeFileText(head+replacement_refs+tail, annotated_file)
示例#2
0
def generateSideBySide(doc_list):
    """
        Generates side-by-side visualizations of a Paper XML.

        For every file two outputs are written to the output directory:
        `<name>_1.html`, holding the input text exactly as loaded, and
        `<name>_2.html`, holding the HTML rendered by SciDocRenderer after the
        XML has been read with PaperXMLReader. A `file_data.json` index
        listing each pair of output file names is written at the end.

        Args:
            doc_list: iterable of file names, each appended to
                cp.Corpus.paths.inputXML to build the input path
    """
    reader=PaperXMLReader()
    output_dir="g:\\nlp\\phd\\aac\\conversion_visualization\\"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    file_list=[]
    for filename in doc_list:
        print("Converting %s" % filename)
        input_file=cp.Corpus.paths.inputXML+filename
        output_file=output_dir+"%s_1.html" % os.path.basename(filename)

        # First view: the untouched input text
        input_text=loadFileText(input_file)
        writeFileText(input_text,output_file)

        doc=reader.read(input_text, input_file)
        # Diagnostic only: report documents whose data cannot be serialized.
        # `Exception` rather than a bare except, so Ctrl-C and SystemExit
        # are not swallowed while the check stays best-effort.
        try:
            json.dumps(doc.data)
        except Exception:
            print("Not JSON Serializable!!!!")

        # Second view: the document rendered back to HTML
        html=SciDocRenderer(doc).prettyPrintDocumentHTML(True,True,True, True)
        output_file2=output_file.replace("_1.html","_2.html")
        writeFileText(html,output_file2)
        file_list.append([os.path.basename(output_file),os.path.basename(output_file2)])

    # The index is written as a JavaScript assignment, not as plain JSON
    file_list_json="file_data=%s;" % json.dumps(file_list)
    writeFileText(file_list_json,output_dir+"file_data.json")
示例#3
0
def convertAANmetadata(infile):
    """
        Load the AAN metadata text file and parse it into a dict.

        Each record is expected as five consecutive lines of the form
        `id = {...}`, `author = {...}`, `title = {...}`, `venue = {...}`,
        `year = {...}` (matched case-insensitively). Authors are separated
        by ";" and each author is split on "," into family name and given
        name(s).

        WARNING: breaks backwards compatibility

        Args:
            infile: path to acl-metadata.txt
        Returns:
            a dict where [lower-cased id] = {"authors", "surnames", "title",
            "conference", "year", "corpus_id"}. If the same id appears more
            than once, the last record wins.
    """
    alltext=loadFileText(infile)
    filedict={}

    for match in re.finditer(r"id\s\=\s\{(.+?)\}\nauthor\s\=\s\{(.+?)\}\ntitle\s\=\s\{(.+?)\}\nvenue\s\=\s\{(.+?)}\nyear\s\=\s\{(.+?)\}",alltext,re.IGNORECASE):
        fn=match.group(1).lower()
        authors=match.group(2).split(";")
        surnames=[]
        parsed_authors=[]
        for a in authors:
            bits=a.split(",")
            family=bits[0].strip()
            surnames.append(family)
            # Everything after the first comma is treated as the given name(s)
            parsed_authors.append({"given":"".join(bits[1:]).strip(),"family":family})
        title=match.group(3)
        conference=match.group(4)
        year=match.group(5)
        filedict[fn]={"authors":parsed_authors,"surnames":surnames, "title":title, "conference":conference, "year":year, "corpus_id":fn}

    return filedict
示例#4
0
 def readFile(self, filename):
     """
         Load the text of a file and hand it to self.read.

         Args:
             filename: full path to file to read
         Returns:
             the result of self.read(text, filename)
     """
     return self.read(loadFileText(filename), filename)
示例#5
0
def getPaperPMCID(filename):
    """
        Load a JATS file and extract its pmcid.

        Args:
            filename: path of the JATS file to load
        Returns:
            a (pmcid, file_text) tuple when an
            <article-id pub-id-type="pmcid"> element is found; otherwise a
            message is printed and None is returned (a bare None, not a tuple)
    """
    jats_text = loadFileText(filename)
    found = re.search(
        r"<article-id pub-id-type=\"pmcid\">(.*?)</article-id>",
        jats_text,
        re.IGNORECASE)
    if found:
        return found.group(1), jats_text

    print ("File %s has no original pmcid " % filename)
    return None
示例#6
0
def fixPaperReferences(annotated_file, pmc_file, original_text=None, pmc_id=None):
    """
        Replaces the <ref-list> section in `annotated_file` with that from
        `pmc_file`

        Checking that they are actually the same file is done outside.

        Args:
            annotated_file: path of the annotated file; rewritten in place
            pmc_file: path of the original PMC file
            original_text: already-loaded contents of `pmc_file`; read from
                `pmc_file` when falsy
            pmc_id: PMC id forwarded to selectRefListSection (optional).
                Previously this name was used without being defined.
                NOTE(review): assumes selectRefListSection accepts None
                here - confirm against its definition.
    """
    annotated_text = loadFileText(annotated_file)
    if not original_text:
        original_text = loadFileText(pmc_file)

    orig_start, orig_end = selectRefListSection(original_text, pmc_file, pmc_id)
    original_refs = original_text[orig_start:orig_end]

    # The offsets to replace must come from the ANNOTATED text; taking them
    # from the original text would splice at the wrong positions.
    # Both files are the same paper, so the same pmc_id applies.
    annot_start, annot_end = selectRefListSection(annotated_text, annotated_file, pmc_id)

    annotated_text = annotated_text[:annot_start] + original_refs + annotated_text[annot_end:]
    writeFileText(annotated_text, annotated_file)
示例#7
0
    def readFile(self, filename):
        """
            Read an XML file from disk and parse it with self.read.

            Args:
                filename: full path to file to read
            Returns:
                SciDoc instance, as produced by self.read
        """
        file_contents=loadFileText(filename)
        scidoc=self.read(file_contents, filename)
        return scidoc
示例#8
0
def basicTest():
    """
        Manual smoke test: connects the local PMC corpus, reads one
        hard-coded JATS article and prints its citations and bibliography
        as rendered by CSLRenderer with the "ama" style.
    """
    print (__file__)
    import minerva.db.corpora as cp
    drive="g"
    cp.useLocalCorpus()
    corpus_root=drive+":\\nlp\\phd\\pmc"
    cp.Corpus.connectCorpus(corpus_root)

    from minerva.proc.general_utils import loadFileText
    from minerva.scidoc.xmlformats.read_jatsxml import JATSXMLReader
    jats_reader = JATSXMLReader()
    sample_path=r"G:\NLP\PhD\pmc\inputXML\articles.O-Z\PLoS_ONE\\PLoS_One_2013_Dec_20_8(12)_e85076.nxml"
    sample_text=loadFileText(sample_path)
    doc=jats_reader.read(sample_text,"one")

    # The style is given by name; an earlier version passed a path to
    # 'ama.csl' under the cit_styles directory instead
    csl=CSLRenderer(doc,"ama")

    print("Citations\n\n")
    for citation in doc.citations:
        citation_text=csl.getCitationText(citation)
        print(citation_text)

    print("Bibliography\n\n")
    for entry in csl.getBibliography():
        print(entry)