# --- SPIE harvester: command-line handling and XML output (Python 2) ---
# NOTE(review): indentation reconstructed from a collapsed one-line source.
# This chunk starts INSIDE a try-block whose `try:` header lies above the
# visible excerpt, which is why the `except` below has no visible `try`.
    opts, args = getopt.getopt(sys.argv[1:], "")
    if len(args) > 3:
        raise getopt.GetoptError("Too many arguments given!!!")
    elif not args:
        raise getopt.GetoptError("Missing mandatory argument volume")
except getopt.GetoptError as err:
    print(str(err)) # will print something like "option -a not recognized"
    print(usage)
    sys.exit(2)

# mandatory: proceedings volume; optional: conference number, first cnum part
volume = args[0]
publisher = 'International Society for Optics and Photonics'
recs = spie(volume)
# NOTE(review): the pairing of this `else` with the outer `if` is an
# assumption from the collapsed source -- confirm against the original file.
if len(args) > 1:
    cnum = args[1]
    if len(args) > 2:
        fc = args[2]
    outfile = 'spie%s_%s.xml' % (volume, cnum)
else:
    outfile = 'spie%s.xml' % (volume)
# write records as UTF-8 encoded XML
dokf = codecs.EncodedFile(open(os.path.join(xmldir, outfile), mode='wb'), "utf8")
#dokf = open(os.path.join(xmldir,outfile),'w')
ejlmod2.writenewXML(recs, dokf, publisher, outfile[:-4])
dokf.close()
#retrieval: append the filename to the retrieval list unless already present
retfiles_text = open(retfiles_path, "r").read()
line = outfile + "\n"
if not line in retfiles_text:
    retfiles = open(retfiles_path, "a")
    retfiles.write(line)
    retfiles.close()
#supervisor if tdt in ['dc.contributor.supervisor', 'dc.contributor.advisor']: if td.text.strip(): rec['supervisor'].append([ re.sub(' \(.*', '', td.text.strip()) ]) #ORCID elif tdt == 'dc.identifier.orcid': if re.search('\d\d\d\d\-\d\d\d\d', td.text): rec['autaff'][-1].append('ORCID:' + re.sub('.*orcid.org\/+', '', td.text.strip())) #fulltext if 'pdf_url' in rec.keys(): if 'license' in rec.keys(): rec['FFT'] = rec['pdf_url'] else: rec['hidden'] = rec['pdf_url'] rec['autaff'][-1].append(publisher) print ' ', rec.keys() #closing of files and printing xmlf = os.path.join(xmldir, jnlfilename+'.xml') xmlfile = codecs.EncodedFile(codecs.open(xmlf, mode='wb'),'utf8') ejlmod2.writenewXML(recs, xmlfile, publisher, jnlfilename) xmlfile.close() #retrival retfiles_text = open(retfiles_path, "r").read() line = jnlfilename+'.xml'+ "\n" if not line in retfiles_text: retfiles = open(retfiles_path, "a") retfiles.write(line) retfiles.close()
# --- NARCIS theses: bucket records per university key, split large runs ---
# NOTE(review): indentation reconstructed from a collapsed one-line source;
# the bucketing at the top is presumably inside a per-record loop whose
# header lies above this excerpt, and the chunk is TRUNCATED at the end
# (the else-branch continues past the visible text).
unikey = str(len(recs))
j = 0
# a bucket holds at most 150 records; overflow goes into numbered buckets
while unikey in dictrecs.keys() and len(dictrecs[unikey]) >= 150:
    j += 1
    unikey = re.sub('\d', '', unikey) + str(j)
if unikey in dictrecs.keys():
    dictrecs[unikey].append(rec)
else:
    dictrecs[unikey] = [rec]
# small harvests go into one file; large ones are split by university
if len(recs) < 200:
    jnlfilename = 'THESES-NARCIS-%s' % (stampoftoday)
    #closing of files and printing
    xmlf = os.path.join(xmldir, jnlfilename+'.xml')
    xmlfile = codecs.EncodedFile(codecs.open(xmlf, mode='wb'),'utf8')
    ejlmod2.writenewXML(recs, xmlfile, publisher, jnlfilename)
    xmlfile.close()
    #retrieval: append the filename to the retrieval list unless present
    retfiles_text = open(retfiles_path, "r").read()
    line = jnlfilename+'.xml'+ "\n"
    if not line in retfiles_text:
        retfiles = open(retfiles_path, "a")
        retfiles.write(line)
        retfiles.close()
#if there are too many records, split by university
else:
    for unikey in dictrecs.keys():
        print unikey, len(dictrecs[unikey])
        jnlfilename = 'THESES-NARCIS-%s-%s' % (stampoftoday, unikey)
        #closing of files and printing  (chunk truncated here)
        xmlf = os.path.join(xmldir, jnlfilename+'.xml')
# --- IOP harvester: build output filename, write XML, move input to done ---
# NOTE(review): indentation reconstructed from a collapsed one-line source.
# The first line is the TAIL of an `iopf = 'iop-...' % (` expression whose
# opening -- and the two `if` headers matching the `else:` branches below --
# lie above the visible excerpt.
        iopftrunc, re.sub(' ', '', jnl[issn][0]), recs[0]['vol'], '.'.join(issues))
    else:
        # journal known, but use the externally supplied volume number
        iopf = 'iop-%s-%s%s_%s' % (
            iopftrunc, re.sub(' ', '', jnl[issn][0]), vol, '.'.join(issues))
else:
    # unknown journal: fall back to the raw ISSN in the filename
    iopf = 'iop-%s-%s%s_%s' % (iopftrunc, re.sub(' ', '', issn), vol, '.'.join(issues))
print ' '
# journals listed in jnlskip are not written out
if not issn in jnlskip.keys():
    xmlf = os.path.join(xmldir, iopf + '.xml')
    xmlfile = codecs.EncodedFile(
        codecs.open(xmlf, mode='wb'), 'utf8')
    ejlmod2.writenewXML(recs, xmlfile, 'IOP', iopf)
    xmlfile.close()
    #retrieval: append the filename to the retrieval list unless present
    retfiles_text = open(retfiles_path, "r").read()
    line = iopf + '.xml' + "\n"
    if not line in retfiles_text:
        retfiles = open(retfiles_path, "a")
        retfiles.write(line)
        retfiles.close()
    print '\n %s with %i records\n' % (iopf, len(recs))
#if everything went fine, move the files to done
for datei in todo:
    # NOTE(review): shell interpolation -- assumes filenames contain no
    # spaces or shell metacharacters; confirm upstream guarantees this.
    os.system('mv %s/%s %s/%s' % (iopdirraw, datei, iopdirdone, datei))
shutil.rmtree(iopdirtmp)
timespan = int(sys.argv[1]) except: print '"%s" is not a number' % (timespan) sys.exit(2) else: timespan = 9 (pubdbrecords, jnlfilename) = requestarticles(timespan) records = translatearticles(pubdbrecords) #split too large xmls for tc in records.keys(): if len(tc) <= 1 and len(records[tc]) > chunksize: for i in range(0, len(records[tc]), chunksize): records[tc + str(i)] = records[tc][i:i + chunksize] del records[tc] for tc in records.keys(): if not tc or not tc[0] in ['T']: continue #write xml-file xmlf = os.path.join(xmldir, '%s.%s.xml' % (jnlfilename, tc)) xmlfile = codecs.EncodedFile(codecs.open(xmlf, mode='wb'), 'utf8') ejlmod2.writenewXML(records[tc], xmlfile, publisher, xmlf[:-4]) xmlfile.close() #retrival retfiles_text = open(retfiles_path, "r").read() line = "%s.%s.xml\n" % (jnlfilename, tc) if not line in retfiles_text: retfiles = open(retfiles_path, "a") retfiles.write(line) retfiles.close()
rec['FFT'] = dc2t #DOI for dc in oai.find_all('dc:relation'): if re.search('10\.18154', dc.text): rec['doi'] = re.sub('.*(10\.18154.*)', r'\1', dc.text) #article link for dc in oai.find_all('dc:identifier'): dct = dc.text if re.search('http...publications.rwth.aachen.de.record.\d+', dct): rec['artlink'] = dct if not 'doi' in rec.keys(): rec['doi'] = re.sub('.*\/', '20.2000/AACHEN/', dct) rec['link'] = dct if int(rec['year']) > now.year - years: recs[rec['doi']] = rec jnlfilename = 'THESES-AACHEN_%s' % (stampoftoday) #closing of files and printing xmlf = os.path.join(xmldir,jnlfilename+'.xml') xmlfile = codecs.EncodedFile(codecs.open(xmlf,mode='wb'),'utf8') ejlmod2.writenewXML(recs.values(),xmlfile,publisher, jnlfilename) xmlfile.close() #retrival retfiles_text = open(retfiles_path,"r").read() line = jnlfilename+'.xml'+ "\n" if not line in retfiles_text: retfiles = open(retfiles_path,"a") retfiles.write(line) retfiles.close()
rec['refs'].append([('x', lit)]) #early access for strong in page.body.find_all('strong'): if re.search('is an early access version', strong.text): print 'skip early acccess version' else: recs.append(rec) print ' ', rec.keys() time.sleep(3) if (i % chunksize == 0) or (i == len(artlinks)): #write xml if recs: if i % chunksize == 0: xmlfilename = '%s-%02i_of_%i_of_%i.xml' % (jnlfilename, i/chunksize, numberofchunks, maxnumberofchunks) else: xmlfilename = '%s-fin_of_%i_of_%i.xml' % (jnlfilename, numberofchunks, maxnumberofchunks) xmlf = os.path.join(xmldir, xmlfilename) xmlfile = codecs.EncodedFile(codecs.open(xmlf, mode='wb'), 'utf8') ejlmod2.writenewXML(recs, xmlfile, publisher, xmlfilename[:-4]) xmlfile.close() #retrival retfiles_text = open(retfiles_path, "r").read() line = '%s\n' % (xmlfilename) print ' + wrote %s' % (line) if not line in retfiles_text: retfiles = open(retfiles_path,"a") retfiles.write(line) retfiles.close() recs = []
# --- OSTI theses: keyword-based skipping and chunked XML output ---
# NOTE(review): indentation reconstructed from a collapsed one-line source;
# the keyword check at the top is the interior of a per-keyword loop
# (and a per-record loop) whose headers lie above the visible excerpt.
if unin.search(keyw):
    skipit = True
    print '   skip "%s"' % (keyw)
    break
if not skipit:
    recs.append(rec)
    print '  ', rec.keys()
# Python 2 integer division; the `+ 2` makes the range generous enough to
# cover a final partial chunk -- NOTE(review): when len(recs) is an exact
# multiple of chunksize this writes one or two files with an empty record
# slice; confirm writenewXML tolerates that.
for i in range(len(recs)/chunksize + 2):
    jnlfilename = 'THESES-OSTI-%s_%02i' % (stampoftoday, i)
    xmlf = os.path.join(xmldir, jnlfilename+'.xml')
    xmlfile = codecs.EncodedFile(codecs.open(xmlf, mode='wb'),'utf8')
    ejlmod2.writenewXML(recs[i*chunksize:(i+1)*chunksize], xmlfile, publisher, jnlfilename)
    xmlfile.close()
    #retrieval: append the filename to the retrieval list unless present
    retfiles_text = open(retfiles_path, "r").read()
    line = jnlfilename+'.xml'+ "\n"
    if not line in retfiles_text:
        retfiles = open(retfiles_path, "a")
        retfiles.write(line)
        retfiles.close()
sys.exit(0)
# --- OAPEN books: collect metadata, group by publisher, write one XML
# file per publisher (Python 2) ---
# NOTE(review): indentation reconstructed from a collapsed one-line source;
# the elif-chain at the top continues a per-<meta>-tag dispatch whose `if`
# header lies above the visible excerpt.
elif meta['name'] == 'citation_publisher':
    rec['publisher'] = meta['content']
#DOI
elif meta['name'] == 'citation_doi':
    rec['doi'] = meta['content']
#license: any Creative Commons link on the article page
for a in artpage.find_all('a'):
    if a.has_attr('href') and re.search('creativecommons.org', a['href']):
        rec['license'] = {'url' : a['href']}
# group records per publisher
if rec['publisher'] in publishers.keys():
    publishers[rec['publisher']].append(rec)
else:
    publishers[rec['publisher']] = [rec]
print '  ', rec.keys()
print '  ', [(s, len(publishers[s])) for s in publishers.keys()]
for publisher in publishers.keys():
    # filename suffix is the publisher name with non-word chars removed
    jnlfilename = 'oapen_%s.%s' % (stampoftoday, re.sub('\W', '', publisher))
    #closing of files and printing
    xmlf = os.path.join(xmldir, jnlfilename+'.xml')
    xmlfile = codecs.EncodedFile(codecs.open(xmlf, mode='wb'), 'utf8')
    ejlmod2.writenewXML(publishers[publisher], xmlfile, publisher, jnlfilename)
    xmlfile.close()
    #retrieval: append the filename to the retrieval list unless present
    retfiles_text = open(retfiles_path, "r").read()
    line = jnlfilename+'.xml'+ "\n"
    if not line in retfiles_text:
        retfiles = open(retfiles_path, "a")
        retfiles.write(line)
        retfiles.close()
section = '' #ISBN elif section == 'Identifiers' and gname == 'span': gtext = child.text.strip() if re.search('ISBN:', gtext): isbn = re.sub('.*(978.*?) .*', r'\1', re.sub('[\n\t]', '', gtext)) rec['isbn'] = re.sub('\-', '', isbn) section = '' elif gname == 'script': section = '' print rec time.sleep(10) #closing of files and printing xmlf = os.path.join(xmldir,jnlfilename+'.xml') xmlfile = codecs.EncodedFile(codecs.open(xmlf,mode='wb'),'utf8') ejlmod2.writenewXML(categories[cate]['recs'],xmlfile,publisher, jnlfilename) xmlfile.close() #retrival retfiles_text = open(retfiles_path,"r").read() line = jnlfilename+'.xml'+ "\n" if not line in retfiles_text: retfiles = open(retfiles_path,"a") retfiles.write(line) retfiles.close()
# --- local-file harvester: copy PDFs, write chunked XML, clean up ---
# NOTE(review): indentation reconstructed from a collapsed one-line source;
# the top lines are the tail of a per-record loop whose header lies above
# the visible excerpt, and the chunk is TRUNCATED at the end (the final
# `if` body runs past the visible text).
print '  ', rec.keys()
recs.append(rec)
#copy pdf: the PDF sits next to the record's XML file
pdffilename = re.sub('xml$', 'pdf', rec['artfilename'])
if os.path.isfile(pdffilename):
    # sanitize the DOI for use as a filename
    doi1 = re.sub('[\(\)\/]', '_', rec['doi'])
    # NOTE(review): shell interpolation -- assumes paths contain no spaces
    # or shell metacharacters; confirm upstream guarantees this.
    os.system('mv %s %s/%s.pdf' % (pdffilename, pdfpath, doi1))
    print '   copied pdf file'
#write to disc in chunks of `chunksize` records
# (Python 2 integer division: ceil(len(recs)/chunksize), 0 when recs empty)
numofchunks = (len(recs) - 1) / chunksize + 1
for chunk in range(numofchunks):
    xmlfilename = '%s-%02i_of_%i.xml' % (jnlfilename, chunk + 1, numofchunks)
    xmlf = os.path.join(xmldir, xmlfilename)
    xmlfile = codecs.EncodedFile(codecs.open(xmlf, mode='wb'), 'utf8')
    ejlmod2.writenewXML(recs[chunk * chunksize:(chunk + 1) * chunksize],
                        xmlfile, publisher, xmlfilename[:-4])
    xmlfile.close()
    #retrieval: append the filename to the retrieval list unless present
    retfiles_text = open(retfiles_path, "r").read()
    line = '%s\n' % (xmlfilename)
    print '  + wrote %s' % (line)
    if not line in retfiles_text:
        retfiles = open(retfiles_path, "a")
        retfiles.write(line)
        retfiles.close()
#cleanup: move processed input files from tmp to done
for rec in prerecs:
    # NOTE(review): '\/' in a replacement string is an invalid escape that
    # Python 2's re tolerates as '/'; newer Pythons warn/raise on it.
    donefilename = re.sub('\/tmp', '\/done\/' + jnl, rec['artfilename'])
    targetdir = re.sub('(.*)\/.*', r'\1', donefilename)
    # (chunk truncated here -- body of this `if` lies past the excerpt)
    if not os.path.isdir(targetdir):