def get_items(self, nmats=2, dois=None, materials=None):
    """Gather the DOI documents that still need to be processed.

    An ``OstiRecord`` is created for ``nmats`` materials and submitted
    before the collections are queried. The returned list contains

    * every document of the DOI collection with ``valid == False``
      (reduced to ``_id``, ``doi`` and ``valid``), followed by
    * every document of the DOI collection whose ``_id`` is marked valid
      but whose material (matched on ``task_id``) has no ``doi`` key yet
      (projected to ``doi``, ``valid`` and ``bibtex``).

    :param nmats: number of materials for which to request DOIs
    :type nmats: int
    :param dois: 'dois' collection in 'mg_core_dev/prod'
    :type dois: QueryEngine
    :param materials: 'materials' collection in 'mg_core_dev/prod'
    :type materials: QueryEngine
    :returns: list of DOI documents (dicts) to hand to the processing step
    """
    self.osti_record = OstiRecord(
        n=nmats, doicoll=dois.collection, matcoll=materials.collection
    )
    self.osti_record.submit()

    # keep the query engines (and request headers) around for later steps
    self.doi_qe = dois
    self.mat_qe = materials
    self.headers = {'Accept': 'text/bibliography; style=bibtex'}

    doi_coll = self.doi_qe.collection
    mat_coll = self.mat_qe.collection

    # 1) mp-id's w/o valid DOI in doicoll
    pending = []
    for doc in doi_coll.find({'valid': False}):
        pending.append({'_id': doc['_id'], 'doi': doc['doi'], 'valid': False})

    # 2) mp-id's w/ valid DOI in doicoll but w/o doi key in matcoll
    validated_ids = doi_coll.find({'valid': True}).distinct('_id')
    not_built_query = {
        'task_id': {'$in': validated_ids},
        'doi': {'$exists': False},
    }
    not_built_cursor = mat_coll.find(not_built_query, {'_id': 0, 'task_id': 1})
    not_built_ids = not_built_cursor.distinct('task_id')

    rebuild_cursor = doi_coll.find(
        {'_id': {'$in': not_built_ids}},
        {'doi': 1, 'valid': 1, 'bibtex': 1}
    )
    pending.extend(rebuild_cursor)
    return pending
def finalize(self, errors):
    """Submit OSTI records, back up the DOI collection, update plotly streams.

    Steps, in order:

    1. create an ``OstiRecord`` for ``self.nmats`` materials and submit it,
    2. dump the ``created_at``/``doi`` fields of all DOI documents as JSON
       to ``backupfile``,
    3. write the current counts (materials, DOI documents, and the length
       of ``get_all_dois()``) to the plotly streams in ``stream_ids``.

    :param errors: not used by this method
    :returns: always ``True``
    """
    osti_record = OstiRecord(
        n=self.nmats,
        doicoll=self.doi_qe.collection,
        matcoll=self.mat_qe.collection
    )
    osti_record.submit()

    # Query *before* opening the file: open(..., 'w') truncates, so a failing
    # query would otherwise wipe the previous backup.
    doi_docs = list(self.doi_qe.collection.find(
        fields={'created_at': True, 'doi': True}
    ))
    with open(backupfile, 'w') as outfile:
        json.dump(doi_docs, outfile, indent=2)

    # push results to plotly streaming graph
    counts = [
        self.mat_qe.collection.count(),
        self.doi_qe.collection.count(),
        len(osti_record.matad.get_all_dois())
    ]
    for idx, stream_id in enumerate(stream_ids):
        stream = py.Stream(stream_id)
        stream.open()
        try:
            stream.write(dict(x=now, y=counts[idx]))
        finally:
            # always release the connection, even if the write fails
            stream.close()
    return True
def finalize(self, errors):
    """Submit OSTI records, back up the DOI collection, update plotly streams.

    An ``OstiRecord`` for ``self.nmats`` materials is submitted first. Then
    the ``created_at``/``doi`` fields of every DOI document are written as
    JSON to ``backupfile``, and finally one data point per entry of
    ``stream_ids`` is pushed to plotly (materials count, DOI document
    count, length of ``get_all_dois()``).

    :param errors: not used by this method
    :returns: always ``True``
    """
    osti_record = OstiRecord(
        n=self.nmats,
        doicoll=self.doi_qe.collection,
        matcoll=self.mat_qe.collection
    )
    osti_record.submit()

    # Fetch the documents first; opening the backup file in 'w' mode
    # truncates it, so the old backup must survive a failing query.
    backup_docs = list(self.doi_qe.collection.find(
        fields={'created_at': True, 'doi': True}
    ))
    with open(backupfile, 'w') as outfile:
        json.dump(backup_docs, outfile, indent=2)

    # push results to plotly streaming graph
    counts = [
        self.mat_qe.collection.count(),
        self.doi_qe.collection.count(),
        len(osti_record.matad.get_all_dois())
    ]
    for idx, stream_id in enumerate(stream_ids):
        stream = py.Stream(stream_id)
        stream.open()
        try:
            stream.write(dict(x=now, y=counts[idx]))
        finally:
            # close the stream even when writing the data point raised
            stream.close()
    return True
matad.matcoll.count()) elif args.plotly: import os, datetime import plotly.plotly as py from plotly.graph_objs import * stream_ids = ['645h22ynck', '96howh4ip8', 'nnqpv5ra02'] py.sign_in(os.environ.get('MP_PLOTLY_USER'), os.environ.get('MP_PLOTLY_APIKEY'), stream_ids=stream_ids) today = datetime.date.today() counts = [ matad.matcoll.count(), matad.doicoll.count(), len(matad.get_all_dois()) ] names = ['materials', 'requested DOIs', 'validated DOIs'] data = Data([ Scatter(x=[today], y=[counts[idx]], name=names[idx], stream=dict(token=stream_ids[idx], maxpoints=10000)) for idx, count in enumerate(counts) ]) filename = 'dois_{}'.format(today) print py.plot(data, filename=filename, auto_open=False) else: # generate records for either n or all (n=0) not-yet-submitted materials # OR generate records for specific materials (submitted or not) osti = OstiRecord(l=args.l, n=args.n, db_yaml=db_yaml) osti.submit()
class DoiBuilder(Builder):
    """Builder to obtain DOIs for all/new materials"""

    def get_items(self, nmats=2, dois=None, materials=None):
        """DOIs + Materials iterator

        Submits an ``OstiRecord`` for ``nmats`` materials, then returns the
        DOI documents that are not yet valid plus the valid ones whose
        material has no ``doi`` key yet.

        :param nmats: number of materials for which to request DOIs
        :type nmats: int
        :param dois: 'dois' collection in 'mg_core_dev/prod'
        :type dois: QueryEngine
        :param materials: 'materials' collection in 'mg_core_dev/prod'
        :type materials: QueryEngine
        :returns: list of DOI documents (dicts) passed on to `process_item`
        """
        self.osti_record = OstiRecord(
            n=nmats, doicoll=dois.collection, matcoll=materials.collection
        )
        self.osti_record.submit()
        self.doi_qe = dois
        self.mat_qe = materials
        # NOTE(review): only referenced by a former doi.org request that is
        # no longer issued in this class; kept because it is instance state.
        self.headers = {'Accept': 'text/bibliography; style=bibtex'}
        # loop the mp-id's
        # w/o valid DOI in doicoll *OR*
        # w/ valid DOI in doicoll but w/o doi key in matcoll
        mp_ids = [
            {'_id': doc['_id'], 'doi': doc['doi'], 'valid': False}
            for doc in self.doi_qe.collection.find({'valid': False})
        ]
        valid_mp_ids = self.doi_qe.collection.find(
            {'valid': True}
        ).distinct('_id')
        missing_mp_ids = self.mat_qe.collection.find(
            {'task_id': {'$in': valid_mp_ids}, 'doi': {'$exists': False}},
            {'_id': 0, 'task_id': 1}
        ).distinct('task_id')
        mp_ids += list(self.doi_qe.collection.find(
            {'_id': {'$in': missing_mp_ids}},
            {'doi': 1, 'valid': 1, 'bibtex': 1}
        ))
        return mp_ids

    def process_item(self, item):
        """validate DOI, save bibtex and build into matcoll

        For an item already marked valid, its stored DOI and bibtex are
        (re-)written to the matching material. Otherwise the bibtex citation
        page for the DOI's OSTI id is requested; if it answers with status
        200 and contains exactly one ``csl-entry`` div, the DOI is marked
        valid and DOI + bibtex are written to the material.

        :param item: DOI document with keys ``_id``, ``doi``, ``valid``
            (and ``bibtex`` when ``valid`` is true)
        :type item: dict
        :returns: ``0`` if the HTTP request raised, ``None`` otherwise
        """
        if item['valid']:
            _log.info('re-build {} -> {}'.format(item['_id'], item['doi']))
            _log.info(self.mat_qe.collection.update(
                {'task_id': item['_id']},
                {'$set': {'doi': item['doi'], 'doi_bibtex': item['bibtex']}}
            ))
            return

        # Validation uses the OSTI data explorer citation page; the OSTI id
        # is the last path component of the DOI.
        osti_id = item['doi'].split('/')[-1]
        doi_url = 'http://www.osti.gov/dataexplorer/biblio/{}/cite/bibtex'.format(osti_id)
        try:
            r = requests.get(doi_url)
        except Exception as ex:
            # best effort: a failed request leaves the item invalid
            _log.warning('validation exception: {} -> {} -> {}'.format(
                item['_id'], item['doi'], ex
            ))
            return 0
        _log.info('validate {} -> {} -> {}'.format(
            item['_id'], item['doi'], r.status_code
        ))
        if r.status_code != 200:
            return

        soup = BeautifulSoup(r.content, "html.parser")
        rows = soup.find_all('div', attrs={"class": "csl-entry"})
        if len(rows) != 1:
            # previously skipped silently; make the reason visible in the log
            _log.warning('validation skipped: {} -> {} -> {} csl-entry rows'.format(
                item['_id'], item['doi'], len(rows)
            ))
            return

        bibtex = rows[0].text.strip()
        _log.info(self.doi_qe.collection.update(
            {'_id': item['_id']},
            {'$set': {'valid': True, 'bibtex': bibtex}}
        ))
        # only validated DOIs are ready to be built into matcoll
        _log.info(self.mat_qe.collection.update(
            {'task_id': item['_id']},
            {'$set': {'doi': item['doi'], 'doi_bibtex': bibtex}}
        ))

    def finalize(self, errors):
        """Back up the DOI collection to ``dois.json`` and update plotly.

        Writes the ``created_at``/``doi`` fields of all DOI documents as
        JSON to ``dois.json`` inside ``dirname``, then pushes the current
        counts (materials, DOI documents, length of ``get_all_dois()``) to
        the plotly streams listed in ``stream_ids``.

        :param errors: not used by this method
        :returns: always ``True``
        """
        filepath = os.path.join(dirname, 'dois.json')
        # Query before opening: mode 'w' truncates the file, so the previous
        # backup must not be lost when the query fails.
        doi_docs = list(self.doi_qe.collection.find(
            fields={'created_at': True, 'doi': True}
        ))
        with open(filepath, 'w') as outfile:
            json.dump(doi_docs, outfile, indent=2)
        # push results to plotly streaming graph
        counts = [
            self.mat_qe.collection.count(),
            self.doi_qe.collection.count(),
            len(self.osti_record.matad.get_all_dois())
        ]
        for idx, stream_id in enumerate(stream_ids):
            stream = py.Stream(stream_id)
            stream.open()
            try:
                stream.write(dict(x=today, y=counts[idx]))
            finally:
                # always release the connection, even if the write fails
                stream.close()
        return True
print '{}/{} materials have DOIs.'.format(len(dois), matad.matcoll.count()) elif args.plotly: import os, datetime import plotly.plotly as py from plotly.graph_objs import * stream_ids = ['645h22ynck', '96howh4ip8', 'nnqpv5ra02'] py.sign_in( os.environ.get('MP_PLOTLY_USER'), os.environ.get('MP_PLOTLY_APIKEY'), stream_ids=stream_ids ) today = datetime.date.today() counts = [ matad.matcoll.count(), matad.doicoll.count(), len(matad.get_all_dois()) ] names = ['materials', 'requested DOIs', 'validated DOIs'] data = Data([ Scatter( x=[today], y=[counts[idx]], name=names[idx], stream=dict(token=stream_ids[idx], maxpoints=10000) ) for idx,count in enumerate(counts) ]) filename = 'dois_{}'.format(today) print py.plot(data, filename=filename, auto_open=False) else: # generate records for either n or all (n=0) not-yet-submitted materials # OR generate records for specific materials (submitted or not) osti = OstiRecord(l=args.l, n=args.n, db_yaml=db_yaml) osti.submit()