import os
import re
from datetime import datetime
from http import client
from json import dump, loads

from elasticsearch import Elasticsearch, helpers
from jsmin import jsmin
from pyld import jsonld
from rdflib import Graph


class Rdfxml2Es:
    def __init__(self, file, frame, host, port, esindex, indctrl, bulksize,
                 devmode, filemode, outsubDir):
        """
        1) Initializes some attributes
        2) Checks if a connection to the ES node can be established
        3) Checks if the ES index does not already exist
        4) If 2) and 3) are true, creates the index and type mappings
        :param file: The RDF-XML file
        :param frame: File containing the JSON-LD framing
        :param host: Host of ES node
        :param port: Port of ES node
        :param esindex: Name of ES index
        :param indctrl: Settings and mapping for ES
        :param bulksize: Size of bulk uploads
        :param devmode: Number of samples for performing a performance test
            on different bulk upload sizes
        :param filemode: If True, write bulk files to disk instead of
            indexing directly into ES
        :param outsubDir: Base output directory for the bulk files
        :return: None
        """
        self.file = file
        self.frame = frame
        self.host = host
        self.port = port
        self.index = esindex
        self.indctrl = indctrl
        self.bulksize = bulksize
        self.bulknum = 0
        self.devmode = devmode
        self.filemode = filemode
        self.esdocs = list()
        self.outsubDir = outsubDir
        self.numberOfFilesInSubDir = 300
        self.openedFilesInSubDir = 0
        self.currentSubDir = 1
        self.writtenDocuments = 0
        if self.devmode > 0:
            self.doccounter = 0
        # In file mode self.of is the current output file handle,
        # otherwise it holds the Elasticsearch client.
        if self.filemode:
            self._openFile()
        else:
            try:
                # Make sure the ES node is reachable before creating the client
                h1 = client.HTTPConnection(self.host, self.port)
                h1.connect()
                h1.close()
                self.of = Elasticsearch([{'host': self.host, 'port': self.port}])
                if not self.of.indices.exists(self.index):
                    if self.indctrl is not None:
                        self.of.indices.create(index=self.index,
                                               body=self.loadjson(self.indctrl))
                    else:
                        self.of.indices.create(index=self.index)
            except Exception as inst:
                exit("Error: " + str(inst))

    @staticmethod
    def loadjson(ifile):
        """
        Loads a file containing valid JSON-LD objects and removes comments
        :param ifile: Path to the JSON file
        :return: Dictionary with the parsed content
        """
        with open(ifile, 'r') as f:
            raw = f.read()
        jsonstr = jsmin(raw)
        return loads(jsonstr)

    @staticmethod
    def stripchars(string):
        """
        Removes tabs and newlines from string.
        :param string: Input string
        :return: Cleaned string
        """
        return ''.join(re.split(r'\t+|\n+', string))

    def parsexml(self):
        """
        Parses XML and kicks off the transformation and indexing of the
        individual documents. Must be implemented in child classes.
        :return: None
        """
        raise NotImplementedError

    def rdf2es(self, string, bibo):
        """
        Does the really interesting stuff: transformation of the triples
        grouped by subject into the body for ES indexing
        :param string: The RDF triples as a concatenated string.
        :param bibo: Is the subject a bibo:Document?
        :return: Body for ES indexing
        """
        g = Graph().parse(data=string)
        jldstr = g.serialize(format='json-ld', indent=4)
        if bibo:
            esdoc = jsonld.compact(loads(jldstr.decode('utf-8')),
                                   self.loadjson(self.frame))
            doctype = 'document'
        else:
            esdoc = loads(jldstr.decode('utf-8'))
            esdoc = jsonld.frame(esdoc, self.loadjson(self.frame))['@graph'][0]
            esdoc['@context'] = self.loadjson(self.frame)['@context']
            doctype = 'bibliographicResource'
        docid = re.findall(r'\w{9}', esdoc['@id'])[0]
        if self.filemode:
            # Bulk file format: action line followed by the document itself
            bulkfile = [{'index': {'_index': self.index,
                                   '_type': doctype,
                                   '_id': docid}},
                        esdoc]
            return bulkfile
        else:
            esdoc.update({'_index': self.index,
                          '_type': doctype,
                          '_id': docid})
            return esdoc

    def bulkupload(self, string, bibo):
        """
        Creates a list of single JSON-LD documents and indexes them as a bulk
        upload (or writes them to bulk files in file mode)
        :param string: The RDF triples as a concatenated string.
        :param bibo: Is the subject a bibo:Document?
        :return: None
        """
        if not self.filemode:
            self.bulknum += 1
        self.esdocs.append(self.rdf2es(string, bibo))
        if self.filemode:
            # Write the content to file. In file mode the documents should not
            # be serialized in memory; they are stored as dictionaries, so
            # json.dump is used instead of writing plain strings.
            for outer in self.esdocs:
                for inner in outer:
                    dump(inner, self.of)
                    self.writtenDocuments += 1
                    self.of.write('\n')
            # Perhaps flush only in bigger chunks later
            del self.esdocs[:]
            if self.writtenDocuments >= self.bulksize:
                self._closeFile()
                self.writtenDocuments = 0
                self._openFile()
        elif self.bulknum >= self.bulksize:
            # Perform bulk upload
            helpers.bulk(client=self.of, actions=self.esdocs, stats_only=True)
            # Reset counter and empty list
            self.bulknum = 0
            del self.esdocs[:]

    def _openFile(self):
        subDir = self.outsubDir + os.sep + str(self.currentSubDir)
        if not os.path.isdir(subDir):
            os.mkdir(subDir)
        # Every time the script is started, the current subdir counter starts
        # at 1 again, so we also need to check how many files are already
        # stored in the current subdir.
        elif self.openedFilesInSubDir >= self.numberOfFilesInSubDir or \
                len(os.listdir(subDir)) >= self.numberOfFilesInSubDir:
            self.currentSubDir += 1
            subDir = self.outsubDir + os.sep + str(self.currentSubDir)
            if not os.path.isdir(subDir):
                os.mkdir(subDir)
            self.openedFilesInSubDir = 0
        outfile = "es." + datetime.now().strftime("%Y%m%d_%H%M%S") + "_" + \
            str(datetime.now().microsecond) + ".json"
        # Writing gzip-compressed output directly is awkward in combination
        # with json.dump, so the finished file is gzipped in _closeFile().
        absoluteFileName = "".join([subDir, os.sep, outfile])
        self.of = open(absoluteFileName, 'w')
        self.openedFilesInSubDir += 1

    def _closeFile(self):
        if self.of is not None:
            # Trailing newline is required by the bulk API
            self.of.write("\n")
            self.of.flush()
            name = self.of.name
            self.of.close()
            os.system("gzip " + name)
            self.of = None
import logging
import os
import re
import textwrap
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from datetime import datetime
from pprint import pprint

from dateutil.relativedelta import relativedelta
from elasticsearch import Elasticsearch, helpers
from elasticsearch_dsl import Index

# VERSION, MONTHS, LIMIT, WINDOW_LIMIT, setup_logging and the Indicator
# doc type are assumed to be provided by the surrounding CIF code base.


def main():
    p = ArgumentParser(
        description=textwrap.dedent('''\
        example usage:
            $ cif-es-archive
        '''),
        formatter_class=RawDescriptionHelpFormatter,
        prog='cif-es-archive'
    )

    # options
    p.add_argument("-v", "--verbose", action="store_true", help="logging level: INFO")
    p.add_argument('-d', '--debug', action="store_true", help="logging level: DEBUG")
    p.add_argument('-V', '--version', action='version', version=VERSION)
    p.add_argument('-m', '--months', help='how many months ago to archive [default %(default)s]', default=MONTHS)
    p.add_argument('--dry-run', action="store_true", help='dry run, do not delete')
    p.add_argument('--nodes', default=['localhost:9200'])
    p.add_argument('--limit', help='specify scroll batch limit [default %(default)s]', default=LIMIT)

    if os.getenv('CIF_ELASTICSEARCH_TEST') != '1':
        raise SystemError('This has NOT been tested yet, remove this line to test at your own risk!')

    args = p.parse_args()
    setup_logging(args)
    logger = logging.getLogger(__name__)

    end_month = datetime.today() - relativedelta(months=int(args.months))
    end_month = end_month.strftime('%Y.%m')
    logger.info('month: {}'.format(end_month))

    es = Elasticsearch(args.nodes, timeout=120, max_retries=10, retry_on_timeout=True)

    index_prefix = 'cif.indicators'
    monthlies = es.indices.get_alias(index='{}-*.*'.format(index_prefix)).keys()
    to_archive = {}
    for m in monthlies:
        match = re.search(r"^cif\.indicators-(\d{4}\.\d{2})$", m)
        if match and match.group(1) < end_month:
            to_archive['{}-{}'.format(index_prefix, match.group(1))] = \
                '{}-{}'.format(index_prefix, match.group(1))

    # https://www.elastic.co/guide/en/elasticsearch/reference/1.4/docs-delete-by-query.html
    # http://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.delete_by_query
    # http://stackoverflow.com/questions/26808239/elasticsearch-python-api-delete-documents-by-query
    pprint(to_archive)

    yearlies = set()
    for c in to_archive:
        logger.info('archiving: {}'.format(c))
        match = re.search(r"^cif\.indicators-(\d{4})\.\d{2}$", c)
        yearly_index = 'cif.indicators-' + str(match.group(1))
        logger.debug(yearly_index)

        # check to see if the yearly bucket already exists
        if not es.indices.exists(index=yearly_index):
            logger.debug("building: %s" % yearly_index)
            idx = Index(yearly_index)
            idx.aliases(live={})
            idx.doc_type(Indicator)
            idx.settings(max_results_window=WINDOW_LIMIT)
            idx.create()
            es.indices.flush(yearly_index)

        yearlies.add(yearly_index)

        # aggregate the monthly index into the yearly bucket,
        # keyed on indicator, provider, group and tags
        data = {}
        # helpers.scan pages through the monthly index using the scroll API
        for hit in helpers.scan(es, index=c, scroll='60m', size=args.limit):
            d = hit['_source']
            key = (d['indicator'], d['provider'], d['group'], ','.join(sorted(d['tags'])))
            if key not in data:
                data[key] = d
            else:
                i = data[key]
                i['count'] += d['count']
                if i['lasttime'] < d['lasttime']:
                    i['lasttime'] = d['lasttime']
                if i['reporttime'] > d['reporttime']:
                    i['reporttime'] = d['reporttime']
                if i['firsttime'] > d['firsttime']:
                    i['firsttime'] = d['firsttime']
                if not i['message']:
                    i['message'] = []
                if d['message']:
                    i['message'].append(d['message'])

        if len(data) == 0:
            logger.info('nothing to archive...')
            continue

        actions = [{'_index': yearly_index, '_type': 'indicator', '_source': v}
                   for v in data.values()]

        # add to yearly
        if not args.dry_run:
            helpers.bulk(es, actions)

        logger.debug('flushing...')
        if es.indices.flush():
            logger.debug('removing %s' % c)
            # remove old index
            if not args.dry_run:
                es.indices.delete(index=c)

    # optimize yearlies
    for y in yearlies:
        logger.debug('optimizing: %s' % y)
        if not args.dry_run:
            es.indices.optimize(index=y)
        logger.debug('optimized: %s' % y)
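# Entry-point guard, assuming the script may also be run directly; the
# packaged cif-es-archive command presumably calls main() via its own
# console-script entry point.
if __name__ == '__main__':
    main()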