def etl_hypothesis_annotations(searchurl, last_update=""): etl = ETL() etl.read_configfile ('/etc/etl/config') etl.read_configfile ('/etc/opensemanticsearch/etl') etl.read_configfile ('/etc/opensemanticsearch/hypothesis') etl.verbose = verbose if verbose: print ( "Get from hypothesis API {}".format(searchurl) ) request = requests.get(searchurl) result = json.loads(request.content.decode('utf-8')) parameters = {} parameters['plugins'] = [] # since there can be multiple annotations for same URI, # do not overwrite but add value to existent values of the facet/field/property parameters['add'] = True newest_update = last_update stat_downloaded_annotations = 0 stat_imported_annotations = 0 for annotation in result['rows']: stat_downloaded_annotations += 1 if annotation['updated'] > last_update: stat_imported_annotations += 1 # save update time from newest annotation/edit if annotation['updated'] > newest_update: newest_update = annotation['updated'] data = {} # id/uri of the annotated document, not the annotation id parameters['id'] = annotation['uri'] # first index / etl the webpage / document that has been annotated if not yet in index result = etl_document(uri=annotation['uri']) if not result: data['etl_error_hypothesis_ss']="Error while indexing the document that has been annotated" # annotation id data['annotation_id_ss'] = annotation['id'] data['annotation_text_tt'] = annotation['text'] tags = [] if 'tags' in annotation: for tag in annotation['tags']: tags.append(tag) data['annotation_tag_ss'] = tags # write annotation to database or index etl.process(parameters=parameters, data=data) etl.commit() if verbose: print ("Downloaded annotations: {}".format(stat_downloaded_annotations)) print ("Imported new annotations: {}".format(stat_imported_annotations)) return newest_update