def get_sickle():
    """Build and return the Sickle OAI-PMH harvester used for PMC.

    The service URL is taken from ``endpoint``, a name resolved outside
    this function (its definition is not part of this block).
    """
    # Imported lazily so the dependency is only needed when harvesting.
    from sickle import Sickle

    harvester = Sickle(endpoint=endpoint)
    return harvester
def fetch_date(self, date):
    """Harvest the OAI-PMH records for a single day and push them to Kafka.

    Records are requested from ``self.endpoint_url`` with both ``from`` and
    ``until`` set to ``date`` (ISO formatted), in the ``self.metadata_prefix``
    format. Each record's raw payload is produced to the topic named by
    ``self.produce_topic``, partitioned by the record's OAI identifier.

    When the endpoint reports no matching records, a warning is printed and
    the method returns without producing anything.
    """
    harvester = sickle.Sickle(self.endpoint_url)
    day = date.isoformat()
    topic = self.kafka.topics[self.produce_topic]

    # 'from' is a reserved Python keyword, so the request arguments are built
    # as a dict and splatted into the call (the approach the sickle docs
    # recommend).
    query = {
        'metadataPrefix': self.metadata_prefix,
        'from': day,
        'until': day,
    }
    try:
        records = harvester.ListRecords(**query)
    except sickle.oaiexceptions.NoRecordsMatch:
        print("WARN: no OAI-PMH records for this date: {} (UTC)".format(day))
        return

    with topic.get_producer() as producer:
        for seen, record in enumerate(records, start=1):
            # Progress marker every 50 records.
            if seen % 50 == 0:
                print("... up to {}".format(seen))
            payload = record.raw.encode('utf-8')
            key = record.header.identifier.encode('utf-8')
            producer.produce(payload, partition_key=key)
def __init__(self,
             oai_url: str = "http://export.arxiv.org/oai2",
             metadata_format: str = 'arXivRaw') -> None:
    """Set up an OAI-PMH client for the given endpoint.

    Args:
        oai_url: Base URL of the OAI-PMH service to harvest from.
        metadata_format: Metadata format name stored on the instance as
            ``self.metadata_format`` (how it is used is outside this block).
    """
    self.metadata_format = metadata_format
    # sickle.Sickle(...) returns the client object itself; OAIItemIterator is
    # only the iterator class handed to it, so the attribute is annotated as
    # the client rather than as the iterator.
    self.arxiv: sickle.Sickle = sickle.Sickle(oai_url, iterator=OAIItemIterator)
    print(
        f"*** extracting metadata from {oai_url} in {metadata_format} format ***"
    )
def getSickle(url):
    """Return a ``sickle.Sickle`` client bound to an OAI-PMH service.

    Args:
        url: OAI-PMH service URL

    Returns:
        A ``sickle.Sickle`` instance configured with ``DEFAULT_ENCODING``.
    """
    client = sickle.Sickle(url, encoding=DEFAULT_ENCODING)
    return client
def fetch_date(self, date: datetime.date) -> None:
    """Harvest the OAI-PMH records for one day and publish them to Kafka.

    Records are requested from ``self.endpoint_url`` with ``from`` and
    ``until`` both set to ``date`` and the ``self.metadata_prefix`` format.
    Each record's raw payload is produced to ``self.produce_topic``, keyed by
    the record's OAI identifier.

    Args:
        date: The day to harvest; sent to the endpoint in ISO format.

    Raises:
        KafkaException: If the Kafka client reports a delivery error
            (raised from the delivery callback).
    """

    def fail_fast(err: Any, _msg: Any) -> None:
        # Delivery callback: abort the harvest on the first delivery error.
        if err is not None:
            print("Kafka producer delivery error: {}".format(err), file=sys.stderr)
            print("Bailing out...", file=sys.stderr)
            # TODO: should it be sys.exit(-1)?
            raise KafkaException(err)

    producer_conf = self.kafka_config.copy()
    producer_conf.update({
        "delivery.report.only.error": True,
        "default.topic.config": {
            "request.required.acks": -1,  # all brokers must confirm
        },
    })
    producer = Producer(producer_conf)

    api = sickle.Sickle(self.endpoint_url, max_retries=5, retry_status_codes=[503])
    date_str = date.isoformat()
    # this dict kwargs hack is to work around 'from' as a reserved python keyword
    # recommended by sickle docs
    try:
        records = api.ListRecords(
            **{
                "metadataPrefix": self.metadata_prefix,
                "from": date_str,
                "until": date_str,
            })
    except sickle.oaiexceptions.NoRecordsMatch:
        print(
            "WARN: no OAI-PMH records for this date: {} (UTC)".format(
                date_str),
            file=sys.stderr,
        )
        return

    for count, item in enumerate(records, start=1):
        if count % 50 == 0:
            print("... up to {}".format(count), file=sys.stderr)
        producer.produce(
            self.produce_topic,
            item.raw.encode("utf-8"),
            key=item.header.identifier.encode("utf-8"),
            on_delivery=fail_fast,
        )
        # Serve any pending delivery callbacks now. Without this, fail_fast
        # would only run inside the final flush(), i.e. after the whole day
        # had already been harvested and queued.
        producer.poll(0)
    # Block until every queued message is delivered (or fail_fast raises).
    producer.flush()
def oaipmh_to_elastic(start_date, end_date=None, threads=0, chunk_size=None, url=None): es = set_up_elastic(url) proxy_url = os.getenv("STATIC_IP_PROXY") proxies = {"https": proxy_url, "http": proxy_url} base_sickle = sickle.Sickle("http://oai.base-search.net/oai", proxies=proxies) args = {'metadataPrefix': 'base_dc', 'from': start_date} if end_date: args["until"] = end_date oai_records = base_sickle.ListRecords(ignore_deleted=True, **args) records_to_save = [] print 'chunk_size', chunk_size oai_record = safe_get_next_record(oai_records) while oai_record: record = {} record["id"] = oai_record.header.identifier record["base_timestamp"] = oai_record.header.datestamp record["added_timestamp"] = datetime.datetime.utcnow().isoformat() record["title"] = oai_tag_match("title", oai_record) record["license"] = oai_tag_match("rights", oai_record) try: record["oa"] = int(oai_tag_match("oa", oai_record)) except TypeError: record["oa"] = 0 record["urls"] = oai_tag_match("identifier", oai_record, return_list=True) record["authors"] = oai_tag_match("creator", oai_record, return_list=True) record["relations"] = oai_tag_match("relation", oai_record, return_list=True) record["sources"] = oai_tag_match("collname", oai_record, return_list=True) if is_complete(record): action_record = make_record_for_es(record) records_to_save.append(action_record) print ":", else: print ".", if len(records_to_save) >= 1000: save_records_in_es(es, records_to_save, threads, chunk_size) print "last record saved:", records_to_save[-1] print "last timestamp saved:", records_to_save[-1]["base_timestamp"] records_to_save = [] oai_record = safe_get_next_record(oai_records) # make sure to get the last ones if records_to_save: save_records_in_es(es, records_to_save, 1, chunk_size) print "last record saved:", records_to_save[-1]
def fetch_date(self, date):
    """Harvest the OAI-PMH records for one day and publish them to Kafka.

    Records are requested from ``self.endpoint_url`` with ``from`` and
    ``until`` both set to ``date`` (ISO formatted) and the
    ``self.metadata_prefix`` format. Each record's raw payload is produced to
    ``self.produce_topic``, keyed by the record's OAI identifier.

    Raises:
        KafkaException: If the Kafka client reports a delivery error
            (raised from the delivery callback).
    """

    def fail_fast(err, msg):
        # Delivery callback: abort the harvest on the first delivery error.
        if err is not None:
            print("Kafka producer delivery error: {}".format(err))
            print("Bailing out...")
            # TODO: should it be sys.exit(-1)?
            raise KafkaException(err)

    producer_conf = self.kafka_config.copy()
    producer_conf.update({
        'delivery.report.only.error': True,
        'default.topic.config': {
            'request.required.acks': -1,  # all brokers must confirm
        },
    })
    producer = Producer(producer_conf)

    api = sickle.Sickle(self.endpoint_url)
    date_str = date.isoformat()
    # this dict kwargs hack is to work around 'from' as a reserved python keyword
    # recommended by sickle docs
    try:
        records = api.ListRecords(
            **{
                'metadataPrefix': self.metadata_prefix,
                'from': date_str,
                'until': date_str,
            })
    except sickle.oaiexceptions.NoRecordsMatch:
        print("WARN: no OAI-PMH records for this date: {} (UTC)".format(
            date_str))
        return

    for count, item in enumerate(records, start=1):
        if count % 50 == 0:
            print("... up to {}".format(count))
        producer.produce(self.produce_topic,
                         item.raw.encode('utf-8'),
                         key=item.header.identifier.encode('utf-8'),
                         on_delivery=fail_fast)
        # Serve any pending delivery callbacks now. Without this, fail_fast
        # would only run inside the final flush(), i.e. after the whole day
        # had already been harvested and queued.
        producer.poll(0)
    # Block until every queued message is delivered (or fail_fast raises).
    producer.flush()