def createFingerprints(inDir, updateIds=None):
    """Index every article found in inDir by its three fingerprints.

    Resets the module-level counters noIssn and noIssuePage to 0 before
    reading; they are presumably incremented by the fingerprint helpers
    (not visible from here -- confirm) and their final values are returned.

    Returns a tuple (artIds, map0, map1, map2, noIssn, noIssuePage) where
      artIds: articleId -> (externalId, doi, pmid as int)
      map0:   doi fingerprint -> articleId
      map1:   issn/vol/page fingerprint -> articleId
      map2:   author, title, year fingerprint -> articleId
    """
    global noIssn, noIssuePage
    noIssn = 0
    noIssuePage = 0

    idsByArticle = {}  # articleId -> (extId, doi, pmid)
    byDoi = {}         # doi -> articleId
    byLocation = {}    # issn/vol/page -> articleId
    byMeta = {}        # author, title, year -> articleId

    logging.info("Fingerprinting %s" % inDir)
    articles = pubStore.iterArticleDataDir(
        inDir, type="articles", updateIds=updateIds)
    for article in articles:
        artId = int(article.articleId)
        # fingerprints are computed and registered in the order 0, 1, 2
        addFprint(byDoi, getFingerprint0(article), artId)
        addFprint(byLocation, getFingerprint1(article), artId)
        addFprint(byMeta, getFingerprint2(article), artId)
        idsByArticle[artId] = (
            article.externalId, article.doi, int(article.pmid))

    return idsByArticle, byDoi, byLocation, byMeta, noIssn, noIssuePage
def getIssnPmidDict(medlineDir, updateIds, minYear):
    """Collect ISSN -> PMIDs and ISSN -> journal name from medline articles.

    Iterates over the articles in medlineDir (restricted to updateIds) and
    groups their PMIDs by ISSN. The print ISSN is used when it is non-empty,
    otherwise the eIssn; articles with neither are skipped. If minYear is
    not None, articles whose year is all digits and smaller than minYear are
    skipped too. Articles with a non-numeric year are kept.

    Returns two dicts:
      issnToPmid:    issn -> set of PMIDs (as int)
      issnToJournal: issn -> journal name, passed through unidecode; when an
                     ISSN occurs several times the last article seen wins
    """
    issnToPmid = defaultdict(set)
    issnToJournal = {}
    pmidCount = 0
    noIssnPmidCount = 0
    noMinYearCount = 0

    logging.info("Reading ISSN/PMID assignment from directory %s" % medlineDir)
    for artData in pubStore.iterArticleDataDir(medlineDir, updateIds=updateIds):
        # prefer the print ISSN, fall back to the electronic one
        issn = artData.printIssn
        if issn == "":
            issn = artData.eIssn
        if issn == "":
            # lazy %-args: the message is only built when debug is enabled
            logging.debug("PMID %s has not Issn", artData.pmid)
            noIssnPmidCount += 1
            continue

        if (minYear is not None and artData.year.isdigit()
                and int(artData.year) < minYear):
            logging.debug("PMID %s is too early", artData.pmid)
            noMinYearCount += 1
            continue

        issnToPmid[issn].add(int(artData.pmid))
        issnToJournal[issn] = unidecode.unidecode(artData.journal)
        pmidCount += 1

    logging.info("Got %d PMIDs for %d ISSNs" % (pmidCount, len(issnToPmid)))
    logging.info(
        "No info for %d PMIDs, %d PMIDs did not fulfill the minYear"
        % (noIssnPmidCount, noMinYearCount))
    return issnToPmid, issnToJournal
def createFingerprints(inDir, updateIds=None):
    """Index every article found in inDir by its three fingerprints.

    Resets the module-level counters noIssn and noIssuePage to 0 before
    reading; they are presumably incremented by the fingerprint helpers
    (not visible from here -- confirm) and their final values are returned.

    Returns a tuple (artIds, map0, map1, map2, noIssn, noIssuePage) where
      artIds: articleId -> (externalId, doi, pmid as int)
      map0:   doi fingerprint -> articleId
      map1:   issn/vol/page fingerprint -> articleId
      map2:   author, title, year fingerprint -> articleId
    """
    global noIssn, noIssuePage
    noIssn = 0
    noIssuePage = 0

    idsByArticle = {}  # articleId -> (extId, doi, pmid)
    byDoi = {}         # doi -> articleId
    byLocation = {}    # issn/vol/page -> articleId
    byMeta = {}        # author, title, year -> articleId

    logging.info("Fingerprinting %s" % inDir)
    articles = pubStore.iterArticleDataDir(
        inDir, type="articles", updateIds=updateIds)
    for article in articles:
        artId = int(article.articleId)
        # fingerprints are computed and registered in the order 0, 1, 2
        addFprint(byDoi, getFingerprint0(article), artId)
        addFprint(byLocation, getFingerprint1(article), artId)
        addFprint(byMeta, getFingerprint2(article), artId)
        idsByArticle[artId] = (
            article.externalId, article.doi, int(article.pmid))

    return idsByArticle, byDoi, byLocation, byMeta, noIssn, noIssuePage
def getIssnPmidDict(medlineDir, updateIds, minYear):
    """Collect ISSN -> PMIDs and ISSN -> journal name from medline articles.

    Iterates over the articles in medlineDir (restricted to updateIds) and
    groups their PMIDs by ISSN. The print ISSN is used when it is non-empty,
    otherwise the eIssn; articles with neither are skipped. If minYear is
    not None, articles whose year is all digits and smaller than minYear are
    skipped too. Articles with a non-numeric year are kept.

    Returns two dicts:
      issnToPmid:    issn -> set of PMIDs (as int)
      issnToJournal: issn -> journal name, passed through unidecode; when an
                     ISSN occurs several times the last article seen wins
    """
    issnToPmid = defaultdict(set)
    issnToJournal = {}
    pmidCount = 0
    noIssnPmidCount = 0
    noMinYearCount = 0

    logging.info("Reading ISSN/PMID assignment from directory %s" % medlineDir)
    for artData in pubStore.iterArticleDataDir(medlineDir, updateIds=updateIds):
        # prefer the print ISSN, fall back to the electronic one
        issn = artData.printIssn
        if issn == "":
            issn = artData.eIssn
        if issn == "":
            # lazy %-args: the message is only built when debug is enabled
            logging.debug("PMID %s has not Issn", artData.pmid)
            noIssnPmidCount += 1
            continue

        if (minYear is not None and artData.year.isdigit()
                and int(artData.year) < minYear):
            logging.debug("PMID %s is too early", artData.pmid)
            noMinYearCount += 1
            continue

        issnToPmid[issn].add(int(artData.pmid))
        issnToJournal[issn] = unidecode.unidecode(artData.journal)
        pmidCount += 1

    logging.info("Got %d PMIDs for %d ISSNs" % (pmidCount, len(issnToPmid)))
    logging.info(
        "No info for %d PMIDs, %d PMIDs did not fulfill the minYear"
        % (noIssnPmidCount, noMinYearCount))
    return issnToPmid, issnToJournal