Пример #1
0
        opts, args = getopt.getopt(sys.argv[1:], "")
        if len(args) > 3:
            raise getopt.GetoptError("Too many arguments given!!!")
        elif not args:
            raise getopt.GetoptError("Missing mandatory argument volume")
    except getopt.GetoptError as err:
        print(str(err))  # will print something like "option -a not recognized"
        print(usage)
        sys.exit(2)
    # First positional argument is mandatory (checked above).
    volume = args[0]
    publisher = 'International Society for Optics and Photonics'
    recs = spie(volume)
    # The optional second argument (cnum) becomes part of the output name.
    if len(args) > 1:
        cnum = args[1]
        if len(args) > 2:
            # NOTE(review): fc is not read in the lines visible here; kept in
            # case code further down uses it.
            fc = args[2]
        outfile = 'spie%s_%s.xml' % (volume, cnum)
    else:
        outfile = 'spie%s.xml' % (volume)
    # Hand the records to ejlmod2.writenewXML through a UTF-8 recoding wrapper;
    # the handle is closed even when the writer raises.
    dokf = codecs.EncodedFile(open(os.path.join(xmldir, outfile), mode='wb'),
                              "utf8")
    try:
        ejlmod2.writenewXML(recs, dokf, publisher, outfile[:-4])
    finally:
        dokf.close()
    #retrieval
    # Append the file name to the list at retfiles_path unless it is already
    # there.  Whole lines are compared: a substring test would treat the file
    # as listed whenever some other entry merely ends with the same name.
    with open(retfiles_path, "r") as retfiles:
        listed = retfiles.read().splitlines()
    if outfile not in listed:
        with open(retfiles_path, "a") as retfiles:
            retfiles.write(outfile + "\n")
Пример #2
0
            #supervisor
            if tdt in ['dc.contributor.supervisor', 'dc.contributor.advisor']:
                if td.text.strip():
                    rec['supervisor'].append([ re.sub(' \(.*', '', td.text.strip()) ])
            #ORCID
            elif tdt == 'dc.identifier.orcid':
                if re.search('\d\d\d\d\-\d\d\d\d', td.text):
                    rec['autaff'][-1].append('ORCID:' + re.sub('.*orcid.org\/+', '', td.text.strip()))
    #fulltext
    if 'pdf_url' in rec.keys():
        if 'license' in rec.keys():
            rec['FFT'] = rec['pdf_url']
        else:
            rec['hidden'] = rec['pdf_url']
    rec['autaff'][-1].append(publisher)
            
    print '  ', rec.keys()
                
#closing of files and printing
# Pass the collected records to ejlmod2.writenewXML through a UTF-8 recoding
# wrapper; the handle is closed even when the writer raises.
xmlf = os.path.join(xmldir, jnlfilename + '.xml')
xmlfile = codecs.EncodedFile(codecs.open(xmlf, mode='wb'), 'utf8')
try:
    ejlmod2.writenewXML(recs, xmlfile, publisher, jnlfilename)
finally:
    xmlfile.close()
#retrieval
# Append the XML file name to the list at retfiles_path unless it is already
# there.  Whole lines are compared: a substring test would treat the file as
# listed whenever some other entry merely ends with the same name.
xmlname = jnlfilename + '.xml'
with open(retfiles_path, "r") as retfiles:
    listed = retfiles.read().splitlines()
if xmlname not in listed:
    with open(retfiles_path, "a") as retfiles:
        retfiles.write(xmlname + "\n")
Пример #3
0
        unikey = str(len(recs))
        j = 0
        while unikey in dictrecs.keys() and len(dictrecs[unikey]) >= 150:
            j += 1
            unikey = re.sub('\d', '', unikey) + str(j)
        if unikey in dictrecs.keys():
            dictrecs[unikey].append(rec)
        else:
            dictrecs[unikey] = [rec]

# Fewer than 200 records: everything goes into one XML file.
if len(recs) < 200:
    jnlfilename = 'THESES-NARCIS-%s' % (stampoftoday)
    #closing of files and printing
    xmlf = os.path.join(xmldir, jnlfilename + '.xml')
    xmlfile = codecs.EncodedFile(codecs.open(xmlf, mode='wb'), 'utf8')
    # close the handle even when the writer raises
    try:
        ejlmod2.writenewXML(recs, xmlfile, publisher, jnlfilename)
    finally:
        xmlfile.close()
    #retrieval
    # Append the file name to the list at retfiles_path unless already there.
    # Whole lines are compared so that an entry that merely ends with the same
    # name does not count as a match.
    xmlname = jnlfilename + '.xml'
    with open(retfiles_path, "r") as retfiles:
        listed = retfiles.read().splitlines()
    if xmlname not in listed:
        with open(retfiles_path, "a") as retfiles:
            retfiles.write(xmlname + "\n")
#if there are too many records, split by university
else:
    for unikey in dictrecs.keys():
        print unikey, len(dictrecs[unikey])
        jnlfilename = 'THESES-NARCIS-%s-%s' % (stampoftoday, unikey)
        #closing of files and printing
        xmlf    = os.path.join(xmldir,jnlfilename+'.xml')
Пример #4
0
                                iopftrunc, re.sub(' ', '', jnl[issn][0]),
                                recs[0]['vol'], '.'.join(issues))
                        else:
                            iopf = 'iop-%s-%s%s_%s' % (
                                iopftrunc, re.sub(' ', '', jnl[issn][0]), vol,
                                '.'.join(issues))
                    else:
                        iopf = 'iop-%s-%s%s_%s' % (iopftrunc,
                                                   re.sub(' ', '', issn), vol,
                                                   '.'.join(issues))
                    print ' '
                    if not issn in jnlskip.keys():
                        xmlf = os.path.join(xmldir, iopf + '.xml')
                        xmlfile = codecs.EncodedFile(
                            codecs.open(xmlf, mode='wb'), 'utf8')
                        ejlmod2.writenewXML(recs, xmlfile, 'IOP', iopf)
                        xmlfile.close()

                        #retrival
                        retfiles_text = open(retfiles_path, "r").read()
                        line = iopf + '.xml' + "\n"
                        if not line in retfiles_text:
                            retfiles = open(retfiles_path, "a")
                            retfiles.write(line)
                            retfiles.close()
                print '\n   %s with %i records\n' % (iopf, len(recs))

#if everything went fine, move the files to done
# shutil.move is used instead of a shell 'mv' command string: file names with
# spaces or shell metacharacters are handled correctly, and a failed move
# raises instead of being silently ignored.
for datei in todo:
    shutil.move(os.path.join(iopdirraw, datei),
                os.path.join(iopdirdone, datei))
# remove the temporary working directory
shutil.rmtree(iopdirtmp)
Пример #5
0
            timespan = int(sys.argv[1])
        except:
            print '"%s" is not a number' % (timespan)
            sys.exit(2)
    else:
        timespan = 9

    (pubdbrecords, jnlfilename) = requestarticles(timespan)
    records = translatearticles(pubdbrecords)
    #split too large xmls
    # Keys of length <= 1 whose record list exceeds chunksize are replaced by
    # several keys "<key><offset>", each holding at most chunksize records.
    # Iterate over a snapshot of the keys because the dict is modified inside
    # the loop.
    for tc in list(records.keys()):
        if len(tc) <= 1 and len(records[tc]) > chunksize:
            for i in range(0, len(records[tc]), chunksize):
                records[tc + str(i)] = records[tc][i:i + chunksize]
            del records[tc]
    for tc in records:
        # only keys starting with 'T' are written out
        if not tc or not tc[0] in ['T']:
            continue
        #write xml-file
        xmlname = '%s.%s.xml' % (jnlfilename, tc)
        xmlf = os.path.join(xmldir, xmlname)
        xmlfile = codecs.EncodedFile(codecs.open(xmlf, mode='wb'), 'utf8')
        # NOTE(review): the name passed here is the full path without ".xml",
        # not just the file name -- confirm that this is intended.
        try:
            ejlmod2.writenewXML(records[tc], xmlfile, publisher, xmlf[:-4])
        finally:
            # close the handle even when the writer raises
            xmlfile.close()
        #retrieval
        # Append the file name to the list at retfiles_path unless already
        # there; whole lines are compared so that an entry merely ending with
        # the same name does not count as a match.
        with open(retfiles_path, "r") as retfiles:
            listed = retfiles.read().splitlines()
        if xmlname not in listed:
            with open(retfiles_path, "a") as retfiles:
                retfiles.write(xmlname + "\n")
Пример #6
0
                        rec['FFT'] = dc2t
        #DOI
        for dc in oai.find_all('dc:relation'):
            if re.search('10\.18154', dc.text):
                rec['doi'] = re.sub('.*(10\.18154.*)', r'\1', dc.text)
        #article link
        for dc in oai.find_all('dc:identifier'):
            dct = dc.text
            if re.search('http...publications.rwth.aachen.de.record.\d+', dct):
                rec['artlink'] = dct
            if not 'doi' in rec.keys():
                rec['doi'] = re.sub('.*\/', '20.2000/AACHEN/', dct)
                rec['link'] = dct
        if int(rec['year']) > now.year - years:
            recs[rec['doi']] = rec

jnlfilename = 'THESES-AACHEN_%s' % (stampoftoday)

#closing of files and printing
# recs is keyed by DOI; only the record values are handed to the writer.
xmlf = os.path.join(xmldir, jnlfilename + '.xml')
xmlfile = codecs.EncodedFile(codecs.open(xmlf, mode='wb'), 'utf8')
try:
    ejlmod2.writenewXML(recs.values(), xmlfile, publisher, jnlfilename)
finally:
    # close the handle even when the writer raises
    xmlfile.close()
#retrieval
# Append the file name to the list at retfiles_path unless it is already
# there.  Whole lines are compared: a substring test would treat the file as
# listed whenever some other entry merely ends with the same name.
xmlname = jnlfilename + '.xml'
with open(retfiles_path, "r") as retfiles:
    listed = retfiles.read().splitlines()
if xmlname not in listed:
    with open(retfiles_path, "a") as retfiles:
        retfiles.write(xmlname + "\n")
Пример #7
0
                rec['refs'].append([('x', lit)])
        #early access
        for strong in page.body.find_all('strong'):
            if re.search('is an early access version', strong.text):
                print 'skip early acccess version'
        else:
            recs.append(rec)
            print '  ', rec.keys()
        time.sleep(3)
    # Flush the collected records after every chunksize-th article link and
    # after the last one.
    if (i % chunksize == 0) or (i == len(artlinks)):
        #write xml
        if recs:
            if i % chunksize == 0:
                xmlfilename = '%s-%02i_of_%i_of_%i.xml' % (jnlfilename, i // chunksize, numberofchunks, maxnumberofchunks)
            else:
                xmlfilename = '%s-fin_of_%i_of_%i.xml' % (jnlfilename, numberofchunks, maxnumberofchunks)
            xmlf = os.path.join(xmldir, xmlfilename)
            xmlfile = codecs.EncodedFile(codecs.open(xmlf, mode='wb'), 'utf8')
            try:
                ejlmod2.writenewXML(recs, xmlfile, publisher, xmlfilename[:-4])
            finally:
                # close the handle even when the writer raises
                xmlfile.close()
            #retrieval
            # Append the file name to the list at retfiles_path unless already
            # there; whole lines are compared so that an entry merely ending
            # with the same name does not count as a match.
            with open(retfiles_path, "r") as retfiles:
                listed = retfiles.read().splitlines()
            line = '%s\n' % (xmlfilename)
            print(' + wrote %s' % (line))
            if xmlfilename not in listed:
                with open(retfiles_path, "a") as retfiles:
                    retfiles.write(line)
        # start the next chunk with an empty list
        recs = []
            
Пример #8
0
                if unin.search(keyw):
                    skipit = True
                    print '  skip "%s"' % (keyw)
                    break
    if not skipit:
        recs.append(rec)
        print '  ', rec.keys()



# Write the records in slices of chunksize, one XML file per slice, numbered
# from 0.  Stepping through the real offsets avoids the trailing empty slices
# (and thus empty XML files) that "len(recs)/chunksize + 2" iterations produce.
for i, offset in enumerate(range(0, len(recs), chunksize)):
    jnlfilename = 'THESES-OSTI-%s_%02i' % (stampoftoday, i)

    xmlf = os.path.join(xmldir, jnlfilename + '.xml')
    xmlfile = codecs.EncodedFile(codecs.open(xmlf, mode='wb'), 'utf8')
    try:
        ejlmod2.writenewXML(recs[offset:offset + chunksize], xmlfile, publisher, jnlfilename)
    finally:
        # close the handle even when the writer raises
        xmlfile.close()
    #retrieval
    # Append the file name to the list at retfiles_path unless already there;
    # whole lines are compared so that an entry merely ending with the same
    # name does not count as a match.
    xmlname = jnlfilename + '.xml'
    with open(retfiles_path, "r") as retfiles:
        listed = retfiles.read().splitlines()
    if xmlname not in listed:
        with open(retfiles_path, "a") as retfiles:
            retfiles.write(xmlname + "\n")
    





sys.exit(0)
Пример #9
0
            elif meta['name'] == 'citation_publisher':
                rec['publisher'] = meta['content']
            #DOI
            elif meta['name'] == 'citation_doi':
                rec['doi'] = meta['content']
    #license
    for a in artpage.find_all('a'):
        if a.has_attr('href') and re.search('creativecommons.org', a['href']):
            rec['license'] = {'url' : a['href']}
    if rec['publisher'] in publishers.keys():
        publishers[rec['publisher']].append(rec)
    else:
        publishers[rec['publisher']] = [rec]
    print '   ', rec.keys()
    print '    ', [(s, len(publishers[s])) for s in publishers.keys()]

# One XML file per publisher; the publisher name, reduced to word characters,
# becomes part of the file name.
for publisher in publishers:
    jnlfilename = 'oapen_%s.%s' % (stampoftoday, re.sub('\W', '', publisher))
    #closing of files and printing
    xmlf = os.path.join(xmldir, jnlfilename + '.xml')
    xmlfile = codecs.EncodedFile(codecs.open(xmlf, mode='wb'), 'utf8')
    try:
        ejlmod2.writenewXML(publishers[publisher], xmlfile, publisher, jnlfilename)
    finally:
        # close the handle even when the writer raises
        xmlfile.close()
    #retrieval
    # Append the file name to the list at retfiles_path unless already there;
    # whole lines are compared so that an entry merely ending with the same
    # name does not count as a match.
    xmlname = jnlfilename + '.xml'
    with open(retfiles_path, "r") as retfiles:
        listed = retfiles.read().splitlines()
    if xmlname not in listed:
        with open(retfiles_path, "a") as retfiles:
            retfiles.write(xmlname + "\n")
Пример #10
0
                            section = ''
                        #ISBN
                        elif section == 'Identifiers' and gname == 'span':
                            gtext = child.text.strip()
                            if re.search('ISBN:', gtext):
                                isbn = re.sub('.*(978.*?) .*', r'\1', re.sub('[\n\t]', '', gtext))
                                rec['isbn'] = re.sub('\-', '', isbn)
                                section = ''
                        elif gname == 'script':
                            section = ''
                                
        print rec
        time.sleep(10)




                
    #closing of files and printing
    # Pass the records of the current category to ejlmod2.writenewXML through
    # a UTF-8 recoding wrapper; the handle is closed even if the writer raises.
    xmlf = os.path.join(xmldir, jnlfilename + '.xml')
    xmlfile = codecs.EncodedFile(codecs.open(xmlf, mode='wb'), 'utf8')
    try:
        ejlmod2.writenewXML(categories[cate]['recs'], xmlfile, publisher, jnlfilename)
    finally:
        xmlfile.close()
    #retrieval
    # Append the file name to the list at retfiles_path unless already there;
    # whole lines are compared so that an entry merely ending with the same
    # name does not count as a match.
    xmlname = jnlfilename + '.xml'
    with open(retfiles_path, "r") as retfiles:
        listed = retfiles.read().splitlines()
    if xmlname not in listed:
        with open(retfiles_path, "a") as retfiles:
            retfiles.write(xmlname + "\n")
Пример #11
0
            print '  ', rec.keys()
            recs.append(rec)
            #copy pdf
            pdffilename = re.sub('xml$', 'pdf', rec['artfilename'])
            if os.path.isfile(pdffilename):
                doi1 = re.sub('[\(\)\/]', '_', rec['doi'])
                os.system('mv %s %s/%s.pdf' % (pdffilename, pdfpath, doi1))
            print '    copied pdf file'

#write to disc
# Ceiling division: number of slices of at most chunksize records
# (0 when recs is empty).
numofchunks = (len(recs) - 1) // chunksize + 1
for chunk in range(numofchunks):
    # files are numbered from 1: "<jnlfilename>-01_of_N.xml", ...
    xmlfilename = '%s-%02i_of_%i.xml' % (jnlfilename, chunk + 1, numofchunks)
    xmlf = os.path.join(xmldir, xmlfilename)
    xmlfile = codecs.EncodedFile(codecs.open(xmlf, mode='wb'), 'utf8')
    try:
        ejlmod2.writenewXML(recs[chunk * chunksize:(chunk + 1) * chunksize],
                            xmlfile, publisher, xmlfilename[:-4])
    finally:
        # close the handle even when the writer raises
        xmlfile.close()
    #retrieval
    # Append the file name to the list at retfiles_path unless already there;
    # whole lines are compared so that an entry merely ending with the same
    # name does not count as a match.
    with open(retfiles_path, "r") as retfiles:
        listed = retfiles.read().splitlines()
    line = '%s\n' % (xmlfilename)
    print(' + wrote %s' % (line))
    if xmlfilename not in listed:
        with open(retfiles_path, "a") as retfiles:
            retfiles.write(line)

#cleanup
for rec in prerecs:
    donefilename = re.sub('\/tmp', '\/done\/' + jnl, rec['artfilename'])
    targetdir = re.sub('(.*)\/.*', r'\1', donefilename)
    if not os.path.isdir(targetdir):