예제 #1
0
    def matchReference(self, ref, doc=None):
        """
            Returns the metadata of the document in the db that matches a
            reference, or None if no match is found.

            Matching is attempted first by identifier (doi, pmid, corpus_id,
            guid, in that order). If that fails, candidates are looked up by
            normalized title and one is accepted if ANY author surname matches,
            ignoring case.

            :param ref: reference dict. May contain "doi", "pmid", "corpus_id",
                "guid", "title" and "surnames"; none of them is required
            :param doc: SciDoc (optional). Not used here; only present to enable
                descendant classes to use it
            :returns: metadata of the matching document, or None
        """
        self.corpus.checkConnectedToDB()

        # try matching by ID first: (key in the reference, field queried in the db)
        id_fields=[("doi","doi"),("pmid","corpus_id"),("corpus_id","corpus_id"),("guid","guid")]
        for ref_key, db_field in id_fields:
            ref_id=ref.get(ref_key)
            if ref_id in ["",None]:
                continue
            doc_meta=self.corpus.getMetadataByField(db_field,ref_id)
            if doc_meta:
                return doc_meta

        # if it can't be matched by id, fall back to title + surnames
        title=ref.get("title")
        if title is None:
            # a reference without a title gives us nothing to look up
            return None

        norm_title=normalizeTitle(title)

        # `unicode` only exists in Python 2; in Python 3 str is the text type
        try:
            text_type=unicode
        except NameError:
            text_type=str

        if not isinstance(norm_title, text_type):
            norm_title=text_type(norm_title, errors="ignore")

        rows=self.corpus.listFieldByField("metadata","norm_title",norm_title)

        # lowercase the reference surnames once, skipping empty values
        ref_surnames=set(name.lower() for name in (ref.get("surnames") or []) if name)

        for doc_meta in rows or []:
            doc_surnames=doc_meta["surnames"] or []
            # essentially, if ANY surname matches
            if any(name and name.lower() in ref_surnames for name in doc_surnames):
                return doc_meta
        return None
예제 #2
0
    def loadPaperMetadata(self, newDocument, soup, filename):
        """
            Tries to recover metadata from Paper file

            The title and the authors are read from the <firstpageheader>
            element of the parsed XML. Further fields are copied from the first
            entry of a BibTeXML file that is looked for next to the paper file,
            under the same name without the "-paper" suffix
            (e.g. "X-paper.xml" -> "X.xml"). Failing to load that file is
            reported on stdout and otherwise ignored.

            Also stores the parser in self.bibtex_parser.

            :param newDocument: document whose metadata is filled in
            :param soup: parsed XML of the paper file (must provide find())
            :param filename: path of the paper file
            :raises KeyError: if no title could be recovered from either source
        """
        metadata=newDocument.metadata

        header=soup.find("firstpageheader")
        if header:
            title=header.find("title")
            if title:
                metadata["title"]=title.text

        path,fname=os.path.split(filename)
        metafilename=re.sub(r"(.*)-paper.xml",r"\1.xml",fname,flags=re.IGNORECASE)
        metafilename=os.path.join(path, metafilename)

        self.bibtex_parser = BibTeXMLParser()
        # bound up front so that it is defined whichever way parsing fails
        bib_data=None
        try:
            bib_data = self.bibtex_parser.parse_file(metafilename)
        except BibliographyDataError as e:
            print(e)
        except Exception:
            print("COULDN'T LOAD BIBTEXML FOR ",metafilename)

        if bib_data:
            # only the first entry of the file is used
            entry_keys=list(bib_data.entries.keys())
            if entry_keys:
                entry=bib_data.entries[entry_keys[0]]
                for field in entry.fields:
                    # replace en dashes with plain hyphens
                    metadata[field]=entry.fields[field].replace(u"\u2013",u"-")

        # without a first-page header there are no authors to read
        authors=[]
        if header is not None:
            authors=[self.loadPaperMainAuthorXML(a) for a in header.findChildren("author")]

        metadata["authors"]=authors
        metadata["surnames"]=[a["family"] for a in authors]
        metadata["norm_title"]=normalizeTitle(metadata["title"])
예제 #3
0
def convertXMLAndAddToCorpus(file_path, corpus_id, import_id, collection_id,
    import_options, xml_string=None, existing_guid=None):
    """
        Reads the input XML and saves a SciDoc

        If a document with this corpus_id is already in the collection it is
        only re-read and saved again when the "reload_xml_if_doc_in_collection"
        import option is set; otherwise it is skipped. Documents that were not
        in the collection before are also added to the database.

        :param file_path: path of the XML file
        :param corpus_id: corpus id of the document; used to look up an existing
            document and as a fallback when the document carries none
        :param import_id: passed on to addSciDocToDB
        :param collection_id: passed on to addSciDocToDB
        :param import_options: dict of options
        :param xml_string: XML contents (optional). If given it is parsed
            instead of reading file_path from disk
        :param existing_guid: guid of the document if it is already known to be
            in the collection (optional)
        :returns: the SciDoc, or None if the document was skipped
    """
    if not existing_guid:
        existing=cp.Corpus.getMetadataByField("metadata.corpus_id", corpus_id)
        # NOTE(review): getMetadataByField appears to return the metadata dict
        # of the matching document rather than a bare guid - confirm. If so,
        # the guid has to be taken out of it, otherwise the whole dict ends up
        # stored as the guid of the reloaded document.
        if isinstance(existing, dict) and existing.get("guid"):
            existing_guid=existing["guid"]
        else:
            existing_guid=existing

    update_existing=bool(existing_guid)
    if update_existing and not import_options.get("reload_xml_if_doc_in_collection",False):
        print("Document %s is already in the collection. Ignoring." % corpus_id)
        return

    reader=AutoXMLReader()
    if xml_string:
        doc=reader.read(xml_string, file_path)
    else:
        doc=reader.readFile(file_path)

    doc.metadata["norm_title"]=normalizeTitle(doc.metadata["title"])

    # keep the guid of the document being replaced; only make one up when the
    # document is new and does not carry one itself
    if update_existing:
        doc.metadata["guid"]=existing_guid
    elif doc.metadata.get("guid") in ("", None):
        doc.metadata["guid"]=cp.Corpus.generateGUID(doc.metadata)

    if doc.metadata.get("corpus_id") in ("", None):
        doc.metadata["corpus_id"]=corpus_id

    cp.Corpus.saveSciDoc(doc)

    # an existing document is not added to the database a second time
    if not update_existing:
        addSciDocToDB(doc, import_id, collection_id)

    return doc
예제 #4
0
def addSciDocToDB(doc, import_id, collection_id):
    """
        Extends metadata from doc and adds to database

        Works on a deep copy of the metadata, so doc itself is not modified.

        :param doc: document; its "metadata" and "references" are read
        :param import_id: stored in the metadata as "import_id"
        :param collection_id: stored in the metadata as "collection_id"
    """
    meta=deepcopy(doc["metadata"])

    # fall back to the pm_id (if any) when there is no corpus_id
    if meta.get("corpus_id") in ("", None):
        meta["corpus_id"]=meta.get("pm_id","")

    num_references=len(doc["references"])

    meta["norm_title"]=normalizeTitle(meta["title"])
    meta["numref"]=str(num_references)
    meta["outlinks"]=[]
    meta["inlinks"]=[]

    # this is for later processing and adding to database: apart from
    # num_references, these counters all start out at 0
    meta["num_in_collection_references"]=0
    meta["num_references"]=num_references
    meta["num_resolvable_citations"]=0
    meta["num_citations"]=0
    meta["import_id"]=import_id
    meta["collection_id"]=collection_id
    cp.Corpus.addPaper(meta, check_existing=False)