def get_pmh_record(self, record_id):
    my_sickle = _get_my_sickle(self.pmh_url)
    pmh_input_record = my_sickle.GetRecord(identifier=record_id, metadataPrefix=self.metadata_prefix)
    my_pmh_record = pmh_record.PmhRecord()
    my_pmh_record.populate(self.id, pmh_input_record, metadata_prefix=self.metadata_prefix)
    my_pmh_record.repo_id = self.id_old  # delete once endpoint_id is populated
    return my_pmh_record
def get_pmh_record(self, record_id):
    my_sickle = self.get_my_sickle(self.pmh_url)
    pmh_input_record = my_sickle.GetRecord(identifier=record_id, metadataPrefix="oai_dc")
    my_pmh_record = pmh_record.PmhRecord()
    my_pmh_record.populate(pmh_input_record)
    my_pmh_record.repo_id = self.id
    return my_pmh_record
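# Hypothetical sketch, not part of this module: the _get_my_sickle /
# get_my_sickle helper used above is not shown here. Based on how
# call_pmh_endpoint below builds its client inline, it is assumed to
# construct a MySickle with a static-IP proxy for CiteSeerX and a long
# timeout. The name and defaults below are assumptions for illustration.
def _get_my_sickle_sketch(pmh_url, timeout=120):
    # route CiteSeerX requests through the proxy, everything else direct
    if "citeseerx" in pmh_url:
        proxy_url = os.getenv("STATIC_IP_PROXY")
        proxies = {"https": proxy_url, "http": proxy_url}
    else:
        proxies = {}
    return MySickle(pmh_url, proxies=proxies, timeout=timeout)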
def call_pmh_endpoint(self, first=None, last=None, chunk_size=10, scrape=False):
    args = {}
    args['metadataPrefix'] = 'oai_dc'

    # CiteSeerX requires requests to come from a whitelisted static IP
    if "citeseerx" in self.pmh_url:
        proxy_url = os.getenv("STATIC_IP_PROXY")
        proxies = {"https": proxy_url, "http": proxy_url}
    else:
        proxies = {}

    my_sickle = MySickle(self.pmh_url, proxies=proxies, timeout=120)
    logger.info(u"connected to sickle with {} {}".format(self.pmh_url, proxies))

    args['from'] = first
    if last:
        args["until"] = last

    records_to_save = []

    logger.info(u"calling ListRecords with {} {}".format(self.pmh_url, args))
    try:
        pmh_records = my_sickle.ListRecords(ignore_deleted=True, **args)
        logger.info(u"got pmh_records with {} {}".format(self.pmh_url, args))
        pmh_input_record = safe_get_next_record(pmh_records)
    except Exception as e:
        logger.info(u"no records with {} {}".format(self.pmh_url, args))
        # logger.exception(u"no records with {} {}".format(self.pmh_url, args))
        pmh_input_record = None

    while pmh_input_record:
        # create the record and copy over the OAI-PMH header and metadata fields
        my_pmh_record = pmh_record.PmhRecord()
        my_pmh_record.id = pmh_input_record.header.identifier
        my_pmh_record.api_raw = pmh_input_record.raw
        my_pmh_record.record_timestamp = pmh_input_record.header.datestamp
        my_pmh_record.title = oai_tag_match("title", pmh_input_record)
        my_pmh_record.authors = oai_tag_match("creator", pmh_input_record, return_list=True)
        my_pmh_record.oa = oai_tag_match("oa", pmh_input_record)
        my_pmh_record.urls = oai_tag_match("identifier", pmh_input_record, return_list=True)

        # try to extract a DOI from any identifier URL that looks like one
        for fulltext_url in my_pmh_record.urls:
            if fulltext_url and (is_doi_url(fulltext_url)
                                 or fulltext_url.startswith(u"doi:")
                                 or re.findall(u"10\.", fulltext_url)):
                try:
                    my_pmh_record.doi = clean_doi(fulltext_url)
                except NoDoiException:
                    pass

        my_pmh_record.license = oai_tag_match("rights", pmh_input_record)
        my_pmh_record.relations = oai_tag_match("relation", pmh_input_record, return_list=True)
        my_pmh_record.sources = oai_tag_match("collname", pmh_input_record, return_list=True)
        my_pmh_record.source = self.id

        if is_complete(my_pmh_record):
            db.session.merge(my_pmh_record)
            my_pages = my_pmh_record.mint_pages()
            logger.info(u"made {} pages for id {}".format(len(my_pages), my_pmh_record.id))
            for my_page in my_pages:
                if scrape:
                    logger.info(u"scraping pages")
                    my_page.scrape()
                db.session.merge(my_page)
            records_to_save.append(my_pmh_record)
            # logger.info(u":")
            logger.info(u"my_pmh_record {}".format(my_pmh_record.get_good_urls()))
        else:
            logger.info(u"not complete")

        # commit in chunks so a long harvest doesn't hold one giant transaction
        if len(records_to_save) >= chunk_size:
            last_record = records_to_save[-1]
            logger.info(u"last record saved: {} for {}".format(last_record.id, self.id))
            safe_commit(db)
            records_to_save = []

        pmh_input_record = safe_get_next_record(pmh_records)

    # make sure to get the last ones
    if records_to_save:
        last_record = records_to_save[-1]
        logger.info(u"saving {} last ones, last record saved: {} for {}".format(
            len(records_to_save), last_record.id, self.id))
        safe_commit(db)

    logger.info(u"done everything for {}".format(self.id))
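# Hypothetical sketch (assumption, not the project's actual implementation):
# oai_tag_match, as used above, appears to pull a Dublin Core field out of a
# sickle record's .metadata dict, returning the first value by default or the
# whole list when return_list=True. The name is suffixed to mark it as a sketch.
def oai_tag_match_sketch(tagname, record, return_list=False):
    if tagname not in record.metadata:
        return None
    matches = record.metadata[tagname]
    if return_list:
        return matches
    return matches[0] if matches else None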
def call_pmh_endpoint(self, first=None, last=None, chunk_size=50, scrape=False):
    start_time = time()
    records_to_save = []
    num_records_updated = 0
    loop_counter = 0
    self.error = None

    (pmh_input_record, pmh_records, error) = self.get_pmh_input_record(first, last)

    if error:
        self.error = u"error in get_pmh_input_record: {}".format(error)
        return

    while pmh_input_record:
        loop_counter += 1

        # create the record
        my_pmh_record = pmh_record.PmhRecord()

        # set its vars
        my_pmh_record.repo_id = self.id_old  # delete once endpoint_ids are all populated
        my_pmh_record.endpoint_id = self.id
        my_pmh_record.rand = random()
        my_pmh_record.populate(pmh_input_record)

        if is_complete(my_pmh_record):
            my_pages = my_pmh_record.mint_pages()
            my_pmh_record.pages = my_pages
            if scrape:
                for my_page in my_pages:
                    my_page.scrape_if_matches_pub()
            records_to_save.append(my_pmh_record)
            db.session.merge(my_pmh_record)
        else:
            logger.info(u"pmh record is not complete")
            # print my_pmh_record
            pass

        # commit in chunks so a long harvest doesn't hold one giant transaction
        if len(records_to_save) >= chunk_size:
            num_records_updated += len(records_to_save)
            safe_commit(db)
            records_to_save = []

        if loop_counter % 100 == 0:
            logger.info(u"iterated through 100 more items, loop_counter={} for {}".format(
                loop_counter, self.id))

        pmh_input_record = self.safe_get_next_record(pmh_records)

    # make sure to get the last ones
    if records_to_save:
        num_records_updated += len(records_to_save)
        last_record = records_to_save[-1]
        logger.info(u"saving {} last ones, last record saved: {} for {}, loop_counter={}".format(
            len(records_to_save), last_record.id, self.id, loop_counter))
        safe_commit(db)
    else:
        logger.info(u"finished loop, but no records to save, loop_counter={}".format(loop_counter))

    logger.info(u"updated {} PMH records for endpoint_id={}, took {} seconds".format(
        num_records_updated, self.id, elapsed(start_time, 2)))
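# Hypothetical sketch (assumption): safe_get_next_record / self.safe_get_next_record
# is not shown here. From its use above, it appears to wrap iteration over the
# sickle ListRecords generator so that exhaustion or a transient HTTP/parsing error
# returns None, letting the while loop terminate cleanly instead of raising.
def safe_get_next_record_sketch(records):
    try:
        return next(records)
    except StopIteration:
        # no more records in this harvest
        return None
    except Exception as e:
        logger.info(u"error getting next record: {}".format(e))
        return None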