import os

# The helper functions (loadadsJSON, parseADSXML, pparXiv, getSources,
# processSource, searchSource, tagFormat) and the module-level constants
# (adsDIR, default_tags, fkeys, fkey_formats, n_fkeys) are expected to be
# defined elsewhere in this module.


def _tabulate(source, maxsearch=1, locale='ads', diskroot=adsDIR):
    """ tabulate all files in arXiv sources, add basic file details """
    print 'query: %s' % source
    bkeys = ['bibcode', 'eprintid']
    # try the source as an ADS JSON export first, fall back to XML
    try:
        data = loadadsJSON(source, validate=False, tags=bkeys)
    except Exception:
        print 'json file did not parse fully'
        print 'will now treat as XML'
        data = parseADSXML(source, bkeys)
    bibcodes = data.keys()
    print 'total bibcodes: %s' % len(bibcodes)
    files = []
    for bib in bibcodes[:maxsearch]:
        epid = pparXiv(data[bib]['eprintid'], auth='arXiv')
        binfo = [bib, data[bib]['eprintid']]
        if epid != {}:
            wdir, sources = getSources(epid, locale=locale, diskroot=diskroot)
            # one output row per file listed in each source package
            for s in sources:
                f = os.path.join(wdir, s['file'])
                tcontent = processSource(f, type=s['type'],
                                         encoding=s['encoding'], action='tlist')
                for t in tcontent:
                    finfo = list(binfo)
                    for k in fkeys:
                        finfo.append(tagFormat(k, t, tagformats=fkey_formats))
                    files.append(dict(zip(bkeys + fkeys, finfo)))
        else:
            # no parseable eprint id: emit a single row with empty file details
            finfo = list(binfo)
            finfo.extend([None] * n_fkeys)
            files.append(dict(zip(bkeys + fkeys, finfo)))
    return files
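# Example usage of _tabulate (a sketch, not part of the original module):
# 'records.json' is a hypothetical ADS export carrying 'bibcode' and
# 'eprintid' tags; the extra columns in each row come from the module-level
# fkeys list.
#
#   rows = _tabulate('records.json', maxsearch=5)
#   for row in rows:
#       print row['bibcode'], row['eprintid']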
def _query(source, maxsearch=1, locale="ads", diskroot=adsDIR):
    """ root around arXiv source files for dataset ids """
    print 'query: %s' % source
    # try the source as an ADS JSON export first, fall back to XML
    try:
        data = loadadsJSON(source, validate=False, tags=default_tags)
    except Exception:
        print 'json file did not parse fully'
        print 'will now treat as XML'
        data = parseADSXML(source, default_tags)
    bibcodes = data.keys()
    print 'total bibcodes: %s' % len(bibcodes)
    sets = []
    for bib in bibcodes[:maxsearch]:
        print 'starting %s' % bib
        epid = pparXiv(data[bib]['eprintid'], auth='arXiv')
        datasets = []
        if epid != {}:
            wdir, sources = getSources(epid, locale=locale, diskroot=diskroot)
            content = []
            data[bib]['sources'] = []
            print sources
            # read every file in the source package and pool the content
            for s in sources:
                f = os.path.join(wdir, s['file'])
                tcontent = processSource(f, type=s['type'],
                                         encoding=s['encoding'], action='read')
                if tcontent != []:
                    data[bib]['sources'].append(f)
                    content.extend(tcontent)
            # search the pooled content for dataset identifiers
            datasets = searchSource(content, cmd='dataset')
        else:
            print 'no eprint'
        data[bib]['datasets'] = datasets
        if datasets != []:
            sets.append(data[bib])
        print '\n'
    return sets, data
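# Minimal demo of _query (a sketch, not part of the original module): the
# filename 'records.json' is hypothetical and must point at an ADS export
# carrying the tags listed in default_tags.
if __name__ == '__main__':
    hits, records = _query('records.json', maxsearch=2)
    print 'bibcodes searched: %s' % len(records)
    print 'bibcodes with dataset ids: %s' % len(hits)
    for rec in hits:
        print rec['eprintid'], rec['datasets']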