Example #1
class Rdfxml2Es:
    def __init__(self, file, frame, host, port, esindex, indctrl, bulksize,
                 devmode, filemode, outsubDir):
        """
        1) Initializes some attributes
        2) Checks if connection to ES node can be established
        3) Checks if ES index does not already exist
        4) If 2) and 3) are true, then create the index and type mappings

        :param file: The RDF-XML file
        :param frame: File containing the JSON-LD framing
        :param host: Host of ES node
        :param port: Port of ES node
        :param esindex: Name of ES index
        :param indctrl: Settings and mapping for ES
        :param bulksize: Size of bulk uploads
        :param devmode: Number of samples for performing performance
            tests on different bulk upload sizes
        :param filemode: If True, write bulk files to disk instead of
            indexing directly into Elasticsearch
        :param outsubDir: Directory where the output files are written
        :return: None
        """
        self.file = file
        self.frame = frame
        self.host = host
        self.port = port
        self.index = esindex
        self.indctrl = indctrl
        self.bulksize = bulksize
        self.bulknum = 0
        self.devmode = devmode
        self.filemode = filemode
        self.esdocs = list()
        self.outsubDir = outsubDir
        self.numberOfFilesInSubDir = 300
        self.openedFilesInSubDir = 0
        self.currentSubDir = 1
        self.writtenDocuments = 0
        if self.devmode > 0:
            self.doccounter = 0
        if self.filemode:
            self._openFile()
            #self.of = open('output.json', 'w')
        else:
            try:
                h1 = client.HTTPConnection(self.host, self.port)
                h1.connect()
                h1.close()
                self.of = Elasticsearch([{
                    'host': self.host,
                    'port': self.port
                }])
                if not self.of.indices.exists(index=self.index):
                    if self.indctrl is not None:
                        self.of.indices.create(index=self.index,
                                               body=self.loadjson(
                                                   self.indctrl))
                    else:
                        self.of.indices.create(index=self.index)
            except Exception as inst:
                exit("Error: " + inst.args[1])

    @staticmethod
    def loadjson(ifile):
        """
        Loads a file containing valid JSON-LD objects and removes comments
        :param ifile:
        :return: Object of type Dictionary
        """
        with open(ifile, 'r') as f:
            raw = f.read()
        jsonstr = jsmin(raw)
        return loads(jsonstr)

    @staticmethod
    def stripchars(string):
        """
        Removes tabs and newlines from string.
        :param string:
        :return: Cleaned string
        """
        return ''.join(re.split('\t+|\n+', string))

    def parsexml(self):
        """
        Parses XML and kicks off the transformation and indexing of the individual documents.
        Must be implemented in child classes
        :return: None
        """
        raise NotImplementedError

    def rdf2es(self, string, bibo):
        """
        Does the really interesting stuff: Transformation of the
        triples by subject and indexing in ES
        :param string: The RDF triples as a concatenated string.
        :param bibo: Is subject a bibo:Document?
        :return: Body for ES indexing
        """
        g = Graph().parse(data=string)
        jldstr = g.serialize(format='json-ld', indent=4)
        if bibo:
            esdoc = jsonld.compact(loads(jldstr.decode('utf-8')),
                                   self.loadjson(self.frame))
            doctype = 'document'
        else:
            esdoc = loads(jldstr.decode('utf-8'))
            esdoc = jsonld.frame(esdoc, self.loadjson(self.frame))['@graph'][0]
            esdoc['@context'] = self.loadjson(self.frame)['@context']
            doctype = 'bibliographicResource'
        docid = re.findall(r'\w{9}', esdoc['@id'])[0]
        if self.filemode:
            bulkfile = [{
                'index': {
                    '_index': self.index,
                    '_type': doctype,
                    '_id': docid
                }
            }, esdoc]
            return bulkfile
        else:
            esdoc.update({
                '_index': self.index,
                '_type': doctype,
                '_id': docid
            })
            return esdoc

    def bulkupload(self, string, bibo):
        """
        Creates a list of single JSON-LD documents and indexes them as bulk upload
        :param string: The RDF triples as a concatenated string.
        :param bibo: Is subject a bibo:Document?
        :return:
        """
        if not self.filemode:
            self.bulknum += 1
        self.esdocs.append(self.rdf2es(string, bibo))

        if self.filemode:
            # Output the content to file; in file mode the serialized content
            # should not be kept in memory
            for outer in self.esdocs:
                for inner in outer:
                    # json.dump is needed here because each entry is a
                    # dictionary, not a plain string
                    dump(inner, self.of)
                    self.writtenDocuments += 1
                    self.of.write('\n')
            # Possible improvement: flush only in bigger chunks
            #self.of.flush()
            del self.esdocs[:]
            if self.writtenDocuments >= self.bulksize:
                self._closeFile()
                self.writtenDocuments = 0
                self._openFile()

        elif self.bulknum >= self.bulksize:
            # Perform bulk upload
            helpers.bulk(client=self.of, actions=self.esdocs, stats_only=True)
            # Reset counter and empty list
            self.bulknum = 0
            del self.esdocs[:]

    def _openFile(self):

        subDir = self.outsubDir + os.sep + str(self.currentSubDir)

        if not os.path.isdir(subDir):
            os.mkdir(subDir)
        # Every time the script is started, the current subdir counter starts at 1
        # again, so we also need to check how many files are already stored in it
        elif self.openedFilesInSubDir >= self.numberOfFilesInSubDir or \
                len(os.listdir(subDir)) >= self.numberOfFilesInSubDir:
            self.currentSubDir += 1
            subDir = self.outsubDir + os.sep + str(self.currentSubDir)
            if not os.path.isdir(subDir):
                os.mkdir(subDir)
            self.openedFilesInSubDir = 0

        outfile = "es." + datetime.now().strftime(
            "%Y%m%d_%H%M%S") + "_" + str(datetime.now().microsecond) + ".json"

        # A gzip-compressed output file causes difficulties in combination with
        # json.dump, so the file is written uncompressed and gzipped on close
        absoluteFileName = "".join([subDir, os.sep, outfile])
        self.of = open(absoluteFileName, 'w')

        self.openedFilesInSubDir += 1

    def _closeFile(self):
        if self.of is not None:
            # The trailing newline is required by the bulk API
            self.of.write("\n")
            self.of.flush()
            name = self.of.name
            self.of.close()
            os.system("gzip " + name)
        self.of = None
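
parsexml() is left abstract above, so callers are expected to subclass Rdfxml2Es and feed the records to bulkupload() themselves. A minimal usage sketch under that assumption; the subclass, the file names (dump.rdf.xml, frame.jsonld), the index name and the output directory are illustrative, and file mode is chosen so that no running Elasticsearch node is needed:

import os

class SimpleRdfxml2Es(Rdfxml2Es):
    """Illustrative subclass: treats the whole input file as a single record."""

    def parsexml(self):
        # A real implementation would stream the dump record by record;
        # here the complete RDF-XML file is handed to bulkupload() in one go.
        with open(self.file, 'r') as f:
            self.bulkupload(f.read(), bibo=False)

os.makedirs('output', exist_ok=True)
converter = SimpleRdfxml2Es(file='dump.rdf.xml', frame='frame.jsonld',
                            host='localhost', port=9200, esindex='rdf',
                            indctrl=None, bulksize=1000, devmode=0,
                            filemode=True, outsubDir='output')
converter.parsexml()
converter._closeFile()   # flush and gzip the last, partially filled bulk file

Each call to bulkupload() appends bulk-API action/document pairs to the current output file and rolls over to a fresh file once bulksize documents have been written.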
Example #2
def main():

    p = ArgumentParser(
        description=textwrap.dedent('''\
        example usage:
            $ cif-es-archive
        '''),
        formatter_class=RawDescriptionHelpFormatter,
        prog='cif-es-archive'
    )

    # options
    p.add_argument("-v", "--verbose", action="store_true", help="logging level: INFO")
    p.add_argument('-d', '--debug', action="store_true", help="logging level: DEBUG")
    p.add_argument('-V', '--version', action='version', version=VERSION)
    p.add_argument('-m', '--months', help='how many months ago to archive [default %(default)s]', default=MONTHS)
    p.add_argument('--dry-run', action="store_true", help='dry run, do not delete')
    p.add_argument('--nodes', default=['localhost:9200'])
    p.add_argument('--limit', help='specify scroll batch limit [default %(default)s]', default=LIMIT)

    if os.getenv('CIF_ELASTICSEARCH_TEST') != '1':
        raise SystemError('This has NOT been tested yet, remove this line to test at your own risk!')

    args = p.parse_args()
    setup_logging(args)
    logger = logging.getLogger(__name__)

    end_month = (datetime.today() - relativedelta(months=int(args.months)))
    end_month = end_month.strftime('%Y.%m')

    logger.info('month: {}'.format(end_month))

    es = Elasticsearch(args.nodes, timeout=120, max_retries=10, retry_on_timeout=True)

    monthlies = es.indices.get_alias(index='{}-*.*'.format('cif.indicators')).keys()
    to_archive = {}
    for m in monthlies:
        match = re.search(r"^cif\.indicators-(\d{4}\.\d{2})$", m)
        if match and match.group(1) < end_month:
            # the matched name is already '<prefix>-YYYY.MM', so use it directly
            to_archive[m] = m

    # https://www.elastic.co/guide/en/elasticsearch/reference/1.4/docs-delete-by-query.html
    # http://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.delete_by_query
    # http://stackoverflow.com/questions/26808239/elasticsearch-python-api-delete-documents-by-query

    pprint(to_archive)
    yearlies = set()
    for c in to_archive:
        logger.info('archiving: {}'.format(c))

        match = re.search(r"^cif\.indicators-(\d{4})\.\d{2}$", c)
        i = 'cif.indicators-' + str(match.group(1))
        logger.debug(i)
        # check to see if yearly bucket exists?
        if not es.indices.exists(index=i):
            logger.debug("building: %s" % i)

            idx = Index(i)
            idx.aliases(live={})
            idx.doc_type(Indicator)
            idx.settings(max_results_window=WINDOW_LIMIT)
            idx.create()
            es.indices.flush(index=i)

            yearlies.add(i)

        # aggregate index into yearly bucket
        # based on provider, tags(?), indicator
        data = {}
        for hit in elasticsearch.helpers.scan(es, index=c, scroll='60m', size=args.limit):
            d = hit['_source']
            key = (d['indicator'], d['provider'], d['group'], ','.join(sorted(d['tags'])))

            if key not in data:
                data[key] = d
            else:
                rec = data[key]
                rec['count'] += d['count']

                if rec['lasttime'] < d['lasttime']:
                    rec['lasttime'] = d['lasttime']

                if rec['reporttime'] > d['reporttime']:
                    rec['reporttime'] = d['reporttime']

                if rec['firsttime'] > d['firsttime']:
                    rec['firsttime'] = d['firsttime']

                if not rec['message']:
                    rec['message'] = []

                if d['message']:
                    rec['message'].append(d['message'])

        if len(data) == 0:
            logger.info('nothing to archive...')
            continue

        actions = [{'_index': i, '_type': 'indicator', '_source': d} for d in data.values()]

        # add to yearly
        if not args.dry_run:
            helpers.bulk(es, actions)

        logger.debug('flushing...')
        if es.indices.flush():
            logger.debug('removing %s' % c)
            # remove old index
            if not args.dry_run:
                es.indices.delete(index=c)

    # optimize yearlies
    for y in yearlies:
        logger.debug('optimizing: %s' % y)
        if not args.dry_run:
            es.indices.optimize(index=y)

        logger.debug('optimized: %s' % y)
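
The scroll loop above collapses the monthly index into one record per (indicator, provider, group, tags) key: counts are summed, the first/report times keep the earliest value, the last time keeps the latest, and messages are collected into a list. A small self-contained sketch of that merge rule on plain dicts (the field values are made up):

def merge(agg, d):
    # Collapse record d into the aggregate agg: sum the counts, keep the
    # earliest firsttime/reporttime and the latest lasttime, collect messages.
    agg['count'] += d['count']
    agg['lasttime'] = max(agg['lasttime'], d['lasttime'])
    agg['firsttime'] = min(agg['firsttime'], d['firsttime'])
    agg['reporttime'] = min(agg['reporttime'], d['reporttime'])
    if not agg['message']:
        agg['message'] = []
    if d['message']:
        agg['message'].append(d['message'])
    return agg

a = {'count': 2, 'firsttime': '2017-01-01T00:00:00Z', 'lasttime': '2017-01-05T00:00:00Z',
     'reporttime': '2017-01-05T00:00:00Z', 'message': []}
b = {'count': 1, 'firsttime': '2017-01-03T00:00:00Z', 'lasttime': '2017-01-09T00:00:00Z',
     'reporttime': '2017-01-09T00:00:00Z', 'message': 'seen again'}
print(merge(a, b))   # count == 3, lasttime from b, firsttime/reporttime from a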