Example #1
import os

def _tabulate(source, maxsearch=1, locale='ads', diskroot=adsDIR):
    """Tabulate all files in the arXiv sources, adding basic file details.

    Relies on module-level names (adsDIR, fkeys, fkey_formats, n_fkeys)
    and helpers (loadadsJSON, parseADSXML, pparXiv, getSources,
    processSource, tagFormat) defined elsewhere in the module.
    """
    print('query: %s' % source)

    bkeys = ['bibcode', 'eprintid']
    try:
        data = loadadsJSON(source, validate=False, tags=bkeys)
    except Exception:
        # fall back to XML when the file is not fully valid JSON
        print('JSON file did not parse fully; will now treat as XML')
        data = parseADSXML(source, bkeys)

    bibcodes = list(data.keys())
    print('total bibcodes: %s' % len(bibcodes))
    files = []
    for bib in bibcodes[:maxsearch]:
        epid = pparXiv(data[bib]['eprintid'], auth='arXiv')
        binfo = [bib, data[bib]['eprintid']]
        if epid:
            wdir, sources = getSources(epid,
                                       locale=locale,
                                       diskroot=diskroot)
            for s in sources:
                f = os.path.join(wdir, s['file'])
                tcontent = processSource(f, 
                                         type=s['type'],
                                         encoding=s['encoding'],
                                         action='tlist')
                # one output row per file found in the source package
                for t in tcontent:
                    finfo = list(binfo)
                    for k in fkeys:
                        finfo.append(tagFormat(k, t, tagformats=fkey_formats))
                    files.append(dict(zip(bkeys + fkeys, finfo)))
        else:
            # no usable eprint id: pad the row with Nones for the file fields
            finfo = list(binfo) + [None] * n_fkeys
            files.append(dict(zip(bkeys + fkeys, finfo)))
    return files
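
A minimal usage sketch, not part of the original module: the input file name is an assumption, chosen only to show the shape of the rows _tabulate returns.

# Hypothetical call; 'ads_export.json' is an assumed input file.
# Each row is a dict keyed by bkeys + fkeys.
rows = _tabulate('ads_export.json', maxsearch=5)
for row in rows:
    print(row['bibcode'], row['eprintid'])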
Example #2
import os

def _query(source, maxsearch=1, locale="ads", diskroot=adsDIR):
    """Root around arXiv source files for dataset ids.

    Relies on the same module-level names and helpers as _tabulate,
    plus default_tags and searchSource.
    """
    print('query: %s' % source)
    try:
        data = loadadsJSON(source, validate=False, tags=default_tags)
    except Exception:
        # fall back to XML when the file is not fully valid JSON
        print('JSON file did not parse fully; will now treat as XML')
        data = parseADSXML(source, default_tags)

    bibcodes = list(data.keys())
    print('total bibcodes: %s' % len(bibcodes))
    sets = []
    for bib in bibcodes[:maxsearch]:
        print('starting %s' % bib)
        epid = pparXiv(data[bib]['eprintid'], auth='arXiv')
        datasets = []
        if epid:
            wdir, sources = getSources(epid,
                                       locale=locale,
                                       diskroot=diskroot)
            content = []
            data[bib]['sources'] = []
            print(sources)
            for s in sources:
                f = os.path.join(wdir, s['file'])
                tcontent = processSource(f, type=s['type'],
                                         encoding=s['encoding'],
                                         action='read')
                # keep only sources that actually yielded readable content
                if tcontent:
                    data[bib]['sources'].append(f)
                    content.extend(tcontent)

            datasets = searchSource(content, cmd='dataset')
        else:
            print('no eprint')
        data[bib]['datasets'] = datasets
        if datasets:
            sets.append(data[bib])
        print('\n')
    return sets, data
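
A matching usage sketch for _query, again with an assumed input file name. The function returns both the records that mention datasets and the full parsed data.

# Hypothetical call; 'ads_export.json' is an assumed input file.
sets, data = _query('ads_export.json', maxsearch=10)
print('%s of %s records reference datasets' % (len(sets), len(data)))
for rec in sets:
    print(rec['eprintid'], rec['datasets'])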