示例#1
0
文件: papers.py 项目: chkgk/repec
def replace_paper(c, paper, url, alljel):
    """Store one paper record together with its authors and JEL codes.

    The raw record is serialized to JSON and compressed, a row is assembled
    from selected fields and written with ``REPLACE INTO papers``, and the
    resulting row id is used to insert rows into ``authors`` and
    ``papers_jel``.

    Args:
        c: Database cursor; ``execute``, ``executemany`` and ``lastrowid``
            are used.
        paper: Raw record. It must be JSON-serializable and is passed to
            ``collect``; the result is presumably a mapping from field name
            to a list of values (it is indexed with ``[0]`` below).
        url: Value stored in the ``url`` column.
        alljel: Passed through unchanged to ``parsejel``.
    """
    # Serialize first: ``paper`` is rebound to the collected form right after.
    raw = json.dumps(paper, ensure_ascii=False).encode(encoding='utf-8')
    paper = collect(paper)

    # Key order matters: it defines the column order of the SQL statement.
    row = {
        'url': url,
        'handle': paper['handle'][0],
        'template': parse_template(paper['template-type'][0]),
    }
    optional = ('title', 'abstract', 'journal', 'volume', 'issue', 'pages')
    row.update({name: paper.get(name, [None])[0] for name in optional})
    for name in ('title', 'abstract', 'journal'):
        row[name] = sanitize(row[name])

    # Only two-letter language codes are accepted as the fallback value.
    declared = paper.get('language', ['none'])[0].lower()
    if len(declared) != 2:
        declared = None
    row['language'] = lang_and(row['title'], row['abstract'], default=declared)
    row['year'] = get_year(paper)
    row['redif'] = zlib.compress(raw, level=9)

    columns = ', '.join(row)
    placeholders = ', '.join('?' for _ in row)
    sql = f'REPLACE INTO papers ({columns}) VALUES ({placeholders})'
    c.execute(sql, list(row.values()))
    pid = c.lastrowid

    if 'author-name' in paper:
        cleaned = (sanitize(name) for name in paper['author-name'])
        # Names that sanitize to something falsy are dropped.
        author_rows = [(pid, name) for name in cleaned if name]
        c.executemany('INSERT INTO authors (pid, name) VALUES (?, ?)',
                      author_rows)
    if 'classification-jel' in paper:
        codes = parsejel(paper['classification-jel'][0], alljel)
        jel_rows = [(pid, code) for code in codes]
        c.executemany('INSERT INTO papers_jel (pid, code) VALUES (?, ?)',
                      jel_rows)
示例#2
0
def update_remotes(conn, status=1):
    """Update the list of remotes in the database.

    Three steps are performed on a single cursor:

    1. Archive rows (``type = 'arch'``) are read, keyed by lowercased
       handle, and their status is reset to 0.
    2. Every series row (``type = 'seri'``) with the requested ``status`` is
       resolved against its archive. On success the series URL is set to
       the archive URL followed by the last ``:``-separated part of the
       handle and a trailing slash, with status 0; otherwise the row gets
       status 2 and the error ``'Archive not found'``.
    3. Every distinct resolved URL is written to ``remotes``.

    The function does not commit; that is left to the caller.

    Args:
        conn: Database connection providing ``cursor()``.
        status: Only series rows with this status are processed.
    """
    c = conn.cursor()
    # Close the cursor even when a query or helper raises.
    try:
        # Archives
        # String literals are single-quoted: SQLite treats double-quoted
        # tokens as identifiers first and only falls back to strings.
        sql = 'SELECT handle, url FROM series, repec USING (file)'
        sql += " WHERE series.type = 'arch' ORDER BY ftpdate DESC"
        c.execute(sql)
        archives = [(h.lower(), u) for h, u in c.fetchall()]
        # Keep the first URL per handle; presumably collect() groups values
        # in input order, so this is the most recent by ftpdate.
        archives = {k: v[0] for k, v in collect(archives).items()}
        c.execute("UPDATE series SET status = 0 WHERE type = 'arch'")

        # Series
        def lookup(s):
            """Map a (file, handle) row to the parameters of the UPDATE."""
            a, sep, f = s[1].rpartition(':')
            a = archives.get(a.lower())
            if a:
                return (a + f + '/', 0, None, *s)
            return (None, 2, 'Archive not found', *s)

        sql = "SELECT file, handle FROM series WHERE type = 'seri'"
        sql += ' AND status = ?'
        c.execute(sql, (status, ))
        series = [lookup(s) for s in c.fetchall()]
        sql = ('UPDATE series SET url = ?, status = ?, error = ?'
               ' WHERE file = ? AND handle = ?')
        c.executemany(sql, series)

        # Remotes
        remotes = list({(s[0], ) for s in series if s[0]})
        c.executemany('REPLACE INTO remotes (url) VALUES (?)', remotes)
    finally:
        c.close()
示例#3
0
def collect_names(files):
    """Fetch each file and build a mapping of lowercased handle to name.

    Progress is printed per file. The first name seen for a handle wins; a
    later, different name for the same handle is reported but not stored.
    A file that raises any ``Exception`` while being fetched or processed is
    reported and skipped; entries gathered before the failure are kept.

    Args:
        files: Sized iterable of paths, each appended to
            ``settings.repec_ftp`` to form the location to fetch.

    Returns:
        dict mapping lowercased handle to the first name found for it.
    """
    names = {}
    for position, path in enumerate(files, start=1):
        print(f'[{position}/{len(files)}] {path}...')
        try:
            location = settings.repec_ftp + path
            records = redif.load(redif.decode(ftp_get(location)))
            for raw in records:
                fields = collect(raw)
                if 'name' not in fields:
                    continue
                # Handles are lowercased because capitalization is
                # inconsistent across records.
                key = fields['handle'][0].lower()
                candidate = fields['name'][0]
                known = names.setdefault(key, candidate)
                if candidate != known:
                    print(f'Conflicting names: "{known}" vs. "{candidate}"'
                          f' in {key}')
        except Exception:
            print(f'Skipping {path} due to errors')
    return names