def __init__(self, config, trainingDataset, holdOutDataset):
    """Initialize the generation worker.

    Connects to Elasticsearch, records corpus/processor settings, counts the
    corpus documents, collects feature names, and opens the durable worker
    channel.

    :param config: project configuration dict (must contain "logger",
        "elasticsearch", "corpus", "processor" and "generator" sections)
    :param trainingDataset: training examples passed through to the worker
    :param holdOutDataset: hold-out examples passed through to the worker
    """
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.trainingDataset = trainingDataset
    self.holdOutDataset = holdOutDataset
    self.bagOfPhrases = {}
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.timeout = 6000000
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    # total number of documents in the corpus index
    count = self.esClient.count(index=self.corpusIndex, doc_type=self.corpusType, body={"query": {"match_all": {}}})
    self.corpusSize = count["count"]
    # Feature names come from the generator plus every processor module.
    # List comprehensions instead of map(): on Python 3, map() returns an
    # iterator and `list + map_object` raises TypeError.
    self.featureNames = [feature["name"] for feature in config["generator"]["features"]]
    for module in config["processor"]["modules"]:
        self.featureNames += [feature["name"] for feature in module["features"]]
    self.workerName = "bayzee.generation.worker"
    self.dispatchers = {}
    # creating worker
    self.worker = DurableChannel(self.workerName, config)
def __init__(self, config):
    """Set up the annotation worker.

    Builds the Elasticsearch client, copies corpus and processor settings
    from *config*, and opens the durable worker channel.

    :param config: project configuration dict with "logger",
        "elasticsearch", "corpus" and "processor" sections
    """
    self.config = config
    self.logger = config["logger"]
    # hoist the sub-sections once instead of repeating the lookups
    esConfig = config["elasticsearch"]
    corpusConfig = config["corpus"]
    processorConfig = config["processor"]
    self.esClient = Elasticsearch(esConfig["host"] + ":" + str(esConfig["port"]))
    self.corpusIndex = corpusConfig["index"]
    self.corpusType = corpusConfig["type"]
    self.corpusFields = corpusConfig["text_fields"]
    self.corpusSize = 0
    self.workerName = "bayzee.annotation.worker"
    self.timeout = 6000
    self.processorIndex = processorConfig["index"]
    self.processorType = processorConfig["type"]
    self.processorPhraseType = processorConfig["type"] + "__phrase"
    # per-corpus scratch index used during analysis
    self.analyzerIndex = self.corpusIndex + "__analysis__"
    self.worker = DurableChannel(self.workerName, config)
    self.dispatchers = {}
def __init__(self, config, trainingDataset, holdOutDataset, processingStartIndex, processingEndIndex):
    """Initialize the generation dispatcher.

    Records the processing window in *config*, derives a unique dispatcher
    channel name from that window, collects feature names, and opens the
    dispatch and control channels.

    :param config: project configuration dict (mutated: the processing
        window and the phrase doc-type are written back into it)
    :param trainingDataset: training examples kept for later dispatch
    :param holdOutDataset: hold-out examples kept for later dispatch
    :param processingStartIndex: first corpus page this dispatcher handles
    :param processingEndIndex: last corpus page, or None for "to the end"
    """
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.trainingDataset = trainingDataset
    self.holdOutDataset = holdOutDataset
    self.config["processingStartIndex"] = processingStartIndex
    self.config["processingEndIndex"] = processingEndIndex
    self.bagOfPhrases = {}
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.totalPhrasesDispatched = 0
    self.phrasesGenerated = 0
    self.phrasesNotGenerated = 0
    self.timeout = 86400000
    self.dispatcherName = "bayzee.generation.dispatcher"
    # suffix the channel name with the window so parallel dispatchers do not collide
    # ("is not None" instead of "!= None": identity test is the correct idiom)
    if processingEndIndex is not None:
        self.dispatcherName += "." + str(processingStartIndex) + "." + str(processingEndIndex)
    self.workerName = "bayzee.generation.worker"
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    self.processingPageSize = config["processingPageSize"]
    config["processor_phrase_type"] = self.processorPhraseType
    # Feature names come from the generator plus every processor module.
    # List comprehensions instead of map(): on Python 3, map() returns an
    # iterator and `list + map_object` raises TypeError.
    self.featureNames = [feature["name"] for feature in config["generator"]["features"]]
    for module in config["processor"]["modules"]:
        self.featureNames += [feature["name"] for feature in module["features"]]
    # creating generation dispatcher
    self.generationDispatcher = DurableChannel(self.dispatcherName, config, self.timeoutCallback)
    # creating control channel
    self.controlChannel = RemoteChannel(self.dispatcherName, config)
def __init__(self, config):
    """Set up the classification worker.

    Builds the Elasticsearch client, gathers the feature definitions from
    the generator and from every processor module, and opens the durable
    worker channel.

    :param config: project configuration dict with "logger",
        "elasticsearch", "processor" and "generator" sections
    """
    self.config = config
    self.logger = config["logger"]
    # assemble the ES endpoint from host and port
    esSection = config["elasticsearch"]
    self.esClient = Elasticsearch(esSection["host"] + ":" + str(esSection["port"]))
    # classifier state, filled in later by the worker loop
    self.trainD = None
    self.classifier = None
    self.phraseId = None
    self.phraseData = None
    processorSection = config["processor"]
    self.processorIndex = processorSection["index"]
    self.processorType = processorSection["type"]
    self.processorPhraseType = processorSection["type"] + "__phrase"
    # generator features first, then each module's features appended
    self.features = self.config["generator"]["features"]
    for processorModule in self.config["processor"]["modules"]:
        self.features = self.features + processorModule["features"]
    self.workerName = "bayzee.classification.worker"
    self.timeout = 600000
    self.dispatchers = {}
    # creating worker
    self.worker = DurableChannel(self.workerName, config)
def __init__(self, config, processingStartIndex, processingEndIndex):
    """Initialize the annotation dispatcher.

    Connects to Elasticsearch, records the processing window, and — when this
    is the first dispatcher of the run (processingStartIndex == 0) — rebuilds
    the shingle-analyzer index and, unless the config opts out via
    "annotateFromScratch", the processor index as well. Finally opens the
    dispatch and control channels.

    :param config: project configuration dict (mutated: the processing
        window and page size are written back into it)
    :param processingStartIndex: first corpus page this dispatcher handles
    :param processingEndIndex: last corpus page, or None for "to the end"
    """
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.bagOfPhrases = {}
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    self.processingPageSize = config["processingPageSize"]
    # per-corpus scratch index used during analysis
    self.analyzerIndex = self.corpusIndex + "__analysis__"
    self.config["processingStartIndex"] = processingStartIndex
    self.config["processingEndIndex"] = processingEndIndex
    self.config["processingPageSize"] = self.processingPageSize
    self.totalDocumentsDispatched = 0
    self.documentsAnnotated = 0
    self.documentsNotAnnotated = 0
    self.lastDispatcher = False
    self.endProcess = False
    self.dispatcherName = "bayzee.annotation.dispatcher"
    self.workerName = "bayzee.annotation.worker"
    self.timeout = 86400000
    # suffix the channel name with the window so parallel dispatchers do not collide
    # ("is not None" instead of "!= None": identity test is the correct idiom)
    if processingEndIndex is not None:
        self.dispatcherName += "." + str(processingStartIndex) + "." + str(processingEndIndex)
    # shingle analyzer used to extract candidate phrases from corpus text
    analyzerIndexSettings = {
        "index": {
            "analysis": {
                "analyzer": {
                    "analyzer_shingle": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["standard", "lowercase", "filter_shingle"]
                    }
                },
                "filter": {
                    "filter_shingle": {
                        "type": "shingle",
                        "max_shingle_size": config["generator"]["maxShingleSize"],
                        "min_shingle_size": config["generator"]["minShingleSize"],
                        # emit single tokens too when 1-grams are requested
                        "output_unigrams": (config["generator"]["minShingleSize"] == 1)
                    },
                    "filter_stop": {
                        "type": "stop"
                    }
                }
            }
        }
    }
    analyzerIndexTypeMapping = {
        "properties": {
            "phrase": {"type": "string"},
            "document_id": {"type": "string", "index": "not_analyzed"},
            "phrase__not_analyzed": {"type": "string", "index": "not_analyzed"}
        }
    }
    # total number of documents in the corpus index
    corpusSize = self.esClient.count(index=self.corpusIndex, doc_type=self.corpusType, body={"query": {"match_all": {}}})
    self.corpusSize = corpusSize["count"]
    # Feature names come from the generator plus every processor module.
    # List comprehensions instead of map(): on Python 3, map() returns an
    # iterator and `list + map_object` raises TypeError.
    self.featureNames = [feature["name"] for feature in config["generator"]["features"]]
    for module in config["processor"]["modules"]:
        self.featureNames += [feature["name"] for feature in module["features"]]
    # NOTE(review): nesting of the annotateFromScratch branch under
    # processingStartIndex == 0 is inferred from the collapsed source —
    # only the first dispatcher should wipe shared indexes; confirm against
    # the original file's indentation.
    if processingStartIndex == 0:
        if self.esClient.indices.exists(self.analyzerIndex):
            self.esClient.indices.delete(self.analyzerIndex)
        data = self.esClient.indices.create(self.analyzerIndex, analyzerIndexSettings)
        # rebuild the processor index unless the config explicitly opts out
        if "annotateFromScratch" not in self.config or self.config["annotateFromScratch"] == True:
            try:
                if self.esClient.indices.exists(self.config["processor"]["index"]):
                    self.esClient.indices.delete(self.config["processor"]["index"])
                self.esClient.indices.create(self.config["processor"]["index"])
                self.esClient.indices.put_mapping(index=self.config["processor"]["index"], doc_type=self.processorPhraseType, body=analyzerIndexTypeMapping)
                if self.esClient.indices.exists(self.analyzerIndex):
                    self.esClient.indices.delete(self.analyzerIndex)
                data = self.esClient.indices.create(self.analyzerIndex, analyzerIndexSettings)
            except Exception:
                # narrowed from a bare "except:" so SystemExit/KeyboardInterrupt
                # are no longer swallowed; behavior for real errors is unchanged
                error = sys.exc_info()
                self.logger.error("Error occurred during initialization of analyzer index: " + str(error))
                sys.exit(1)
        else:
            # give the index-creating run a moment before proceeding
            sleep(1)
    # dispatcher creation
    self.annotationDispatcher = DurableChannel(self.dispatcherName, config, self.timeoutCallback)
    # remote channel intialisation
    self.controlChannel = RemoteChannel(self.dispatcherName, config)