def replace_paper(c, paper, url, alljel):
    """Store one paper in ``papers`` along with its authors and JEL codes.

    Args:
        c: Database cursor; only ``execute``, ``executemany`` and
            ``lastrowid`` are used here.
        paper: Raw paper record. It is serialized to JSON as-is and then
            passed through ``collect`` before individual fields are read.
        url: Value stored in the ``url`` column.
        alljel: Passed through unchanged to ``parsejel``.

    Raises:
        KeyError: If the collected record has no ``handle`` or
            ``template-type`` entry.
    """
    # Serialize the record exactly as received, before collect() reshapes it.
    raw = json.dumps(paper, ensure_ascii=False).encode(encoding='utf-8')
    paper = collect(paper)

    row = {
        'url': url,
        'handle': paper['handle'][0],
        'template': parse_template(paper['template-type'][0]),
    }
    for field in ('title', 'abstract', 'journal', 'volume', 'issue', 'pages'):
        row[field] = paper.get(field, [None])[0]
    for field in ('title', 'abstract', 'journal'):
        row[field] = sanitize(row[field])

    # Only a two-character declared language is kept as the default that is
    # handed to lang_and(); anything else falls back to None.
    declared = paper.get('language', ['none'])[0].lower()
    if len(declared) != 2:
        declared = None
    row['language'] = lang_and(row['title'], row['abstract'], default=declared)

    row['year'] = get_year(paper)
    row['redif'] = zlib.compress(raw, level=9)

    # Column names come from the fixed keys above; values are bound as
    # parameters.
    columns = ', '.join(row)
    placeholders = ', '.join('?' for _ in row)
    c.execute(
        f'REPLACE INTO papers ({columns}) VALUES ({placeholders})',
        list(row.values()),
    )
    pid = c.lastrowid

    if 'author-name' in paper:
        author_rows = []
        for raw_name in paper['author-name']:
            name = sanitize(raw_name)
            # Names that sanitize to something falsy are dropped.
            if name:
                author_rows.append((pid, name))
        c.executemany(
            'INSERT INTO authors (pid, name) VALUES (?, ?)', author_rows
        )

    if 'classification-jel' in paper:
        codes = parsejel(paper['classification-jel'][0], alljel)
        c.executemany(
            'INSERT INTO papers_jel (pid, code) VALUES (?, ?)',
            [(pid, code) for code in codes],
        )
def update_remotes(conn, status=1):
    """Resolve series URLs from their archives and refresh ``remotes``.

    Args:
        conn: Database connection; a cursor is obtained via ``conn.cursor()``
            and closed at the end. No commit is issued here.
        status: Only ``seri`` rows of ``series`` with this status are
            (re)resolved.
    """
    cur = conn.cursor()

    # Archives: map lower-cased archive handle -> URL.
    cur.execute(
        'SELECT handle, url FROM series, repec USING (file)'
        ' WHERE series.type = "arch" ORDER BY ftpdate DESC'
    )
    pairs = [(handle.lower(), url) for handle, url in cur.fetchall()]
    # The first URL per handle is kept; rows arrive ordered by ftpdate DESC.
    # NOTE(review): relies on collect() preserving input order — confirm.
    archives = {handle: urls[0] for handle, urls in collect(pairs).items()}
    cur.execute('UPDATE series SET status = 0 WHERE type = "arch"')

    # Series: derive each series URL from the archive part of its handle.
    def lookup(row):
        # row is (file, handle); the handle is split at its last ':'.
        prefix, _, name = row[1].rpartition(':')
        base = archives.get(prefix.lower())
        if not base:
            return (None, 2, 'Archive not found', *row)
        return (base + name + '/', 0, None, *row)

    cur.execute(
        'SELECT file, handle FROM series WHERE type = "seri" AND status = ?',
        (status,),
    )
    series = [lookup(row) for row in cur.fetchall()]
    cur.executemany(
        'UPDATE series SET url = ?, status = ?, error = ?'
        ' WHERE file = ? AND handle = ?',
        series,
    )

    # Remotes: every distinct resolved URL, as one-element parameter tuples.
    remotes = list({(entry[0],) for entry in series if entry[0]})
    cur.executemany('REPLACE INTO remotes (url) VALUES (?)', remotes)
    cur.close()
def collect_names(files):
    """Fetch each file and build a mapping of lower-cased handle -> name.

    Progress is printed for every file. If anything raises while a file is
    being fetched or processed, a skip message is printed and the loop moves
    on; associations gathered before the error are kept.

    Args:
        files: Sized iterable of paths, each appended to
            ``settings.repec_ftp`` to form the location passed to
            ``ftp_get``.

    Returns:
        Dict mapping handle to the first name seen for it.
    """
    names = {}
    for position, file in enumerate(files, start=1):
        print(f'[{position}/{len(files)}] {file}...')
        try:
            payload = ftp_get(settings.repec_ftp + file)
            for raw in redif.load(redif.decode(payload)):
                record = collect(raw)
                if 'name' not in record:
                    continue
                # Account for inconsistent capitalization across records
                handle = record['handle'][0].lower()
                newname = record['name'][0]
                # The first name seen for a handle wins; later differing
                # names are only reported.
                oldname = names.setdefault(handle, newname)
                if newname != oldname:
                    print(f'Conflicting names: "{oldname}" vs. "{newname}"'
                          f' in {handle}')
        except Exception:
            print(f'Skipping {file} due to errors')
    return names