Example #1
    def __init__(self, config, trainingDataset, holdOutDataset):
        self.config = config
        self.logger = config["logger"]
        self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" +
                                      str(config["elasticsearch"]["port"]))
        self.trainingDataset = trainingDataset
        self.holdOutDataset = holdOutDataset
        self.bagOfPhrases = {}
        self.corpusIndex = config["corpus"]["index"]
        self.corpusType = config["corpus"]["type"]
        self.corpusFields = config["corpus"]["text_fields"]
        self.corpusSize = 0
        self.timeout = 6000000
        self.processorIndex = config["processor"]["index"]
        self.processorType = config["processor"]["type"]
        self.processorPhraseType = config["processor"]["type"] + "__phrase"
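        # count all documents in the corpus to record the corpus size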
        count = self.esClient.count(index=self.corpusIndex,
                                    doc_type=self.corpusType,
                                    body={"query": {
                                        "match_all": {}
                                    }})
        self.corpusSize = count["count"]
        self.featureNames = [f["name"] for f in config["generator"]["features"]]
        for module in config["processor"]["modules"]:
            self.featureNames += [f["name"] for f in module["features"]]

        self.workerName = "bayzee.generation.worker"
        self.dispatchers = {}

        # creating worker
        self.worker = DurableChannel(self.workerName, config)
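
The constructors on this page all read their settings from the same config dictionary. The sketch below shows the minimal shape those reads imply; the key names are taken from the snippets themselves, while the values are illustrative placeholders, not defaults from bayzee.

import logging

# Illustrative values only; the key names mirror the constructor reads above.
config = {
    "logger": logging.getLogger("bayzee"),
    "elasticsearch": {"host": "localhost", "port": 9200},
    "corpus": {
        "index": "corpus",
        "type": "document",
        "text_fields": ["title", "body"]
    },
    "processor": {
        "index": "processor",
        "type": "document",
        "modules": [{"features": [{"name": "module_feature"}]}]
    },
    "generator": {
        "features": [{"name": "generator_feature"}],
        "maxShingleSize": 3,
        "minShingleSize": 1
    },
    "processingPageSize": 100,
    "annotateFromScratch": True
}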
Example #2
  def __init__(self, config):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.corpusIndex = config["corpus"]["index"]
    self.corpusType = config["corpus"]["type"]
    self.corpusFields = config["corpus"]["text_fields"]
    self.corpusSize = 0
    self.workerName = "bayzee.annotation.worker"
    self.timeout = 6000
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    self.analyzerIndex = self.corpusIndex + "__analysis__"
    self.worker = DurableChannel(self.workerName, config)
    self.dispatchers = {}
Example #3
    def __init__(self, config, trainingDataset, holdOutDataset,
                 processingStartIndex, processingEndIndex):
        self.config = config
        self.logger = config["logger"]
        self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" +
                                      str(config["elasticsearch"]["port"]))
        self.trainingDataset = trainingDataset
        self.holdOutDataset = holdOutDataset
        self.config["processingStartIndex"] = processingStartIndex
        self.config["processingEndIndex"] = processingEndIndex
        self.bagOfPhrases = {}
        self.corpusIndex = config["corpus"]["index"]
        self.corpusType = config["corpus"]["type"]
        self.corpusFields = config["corpus"]["text_fields"]
        self.corpusSize = 0
        self.totalPhrasesDispatched = 0
        self.phrasesGenerated = 0
        self.phrasesNotGenerated = 0
        self.timeout = 86400000
        self.dispatcherName = "bayzee.generation.dispatcher"
        if processingEndIndex is not None:
            self.dispatcherName += "." + str(processingStartIndex) + "." + str(
                processingEndIndex)
        self.workerName = "bayzee.generation.worker"
        self.processorIndex = config["processor"]["index"]
        self.processorType = config["processor"]["type"]
        self.processorPhraseType = config["processor"]["type"] + "__phrase"
        self.processingPageSize = config["processingPageSize"]
        config["processor_phrase_type"] = self.processorPhraseType

        self.featureNames = [f["name"] for f in config["generator"]["features"]]
        for module in config["processor"]["modules"]:
            self.featureNames += [f["name"] for f in module["features"]]

        # creating generation dispatcher
        self.generationDispatcher = DurableChannel(self.dispatcherName, config,
                                                   self.timeoutCallback)

        # creating control channel
        self.controlChannel = RemoteChannel(self.dispatcherName, config)
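
This dispatcher passes self.timeoutCallback to DurableChannel, which suggests the channel invokes it when a dispatched message is not answered within self.timeout. A hypothetical sketch of such a callback follows; DurableChannel's actual callback signature is not shown on this page, so both the signature and the re-dispatch call are assumptions.

    # Hypothetical sketch only: the callback signature and the re-dispatch
    # call are assumptions, not bayzee's documented API.
    def timeoutCallback(self, message):
        self.logger.info("Phrase request timed out, re-dispatching: " + str(message))
        self.phrasesNotGenerated += 1
        # a real implementation might re-send the original message here, e.g.
        # self.generationDispatcher.send(message["content"], self.workerName)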
Example #4
  def __init__(self, config):
    self.config = config
    self.logger = config["logger"]
    self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" + str(config["elasticsearch"]["port"]))
    self.trainD = None
    self.classifier = None
    self.phraseId = None
    self.phraseData = None
    self.processorIndex = config["processor"]["index"]
    self.processorType = config["processor"]["type"]
    self.processorPhraseType = config["processor"]["type"] + "__phrase"
    self.features = self.config["generator"]["features"]
    for module in self.config["processor"]["modules"]:
      self.features = self.features + module["features"]

    self.workerName = "bayzee.classification.worker"
    self.timeout = 600000
    self.dispatchers = {}
    
    # creating worker
    self.worker = DurableChannel(self.workerName, config)
Example #5
    def __init__(self, config, processingStartIndex, processingEndIndex):
        self.config = config
        self.logger = config["logger"]
        self.esClient = Elasticsearch(config["elasticsearch"]["host"] + ":" +
                                      str(config["elasticsearch"]["port"]))
        self.bagOfPhrases = {}
        self.corpusIndex = config["corpus"]["index"]
        self.corpusType = config["corpus"]["type"]
        self.corpusFields = config["corpus"]["text_fields"]
        self.corpusSize = 0
        self.processorIndex = config["processor"]["index"]
        self.processorType = config["processor"]["type"]
        self.processorPhraseType = config["processor"]["type"] + "__phrase"
        self.processingPageSize = config["processingPageSize"]
        self.analyzerIndex = self.corpusIndex + "__analysis__"
        self.config["processingStartIndex"] = processingStartIndex
        self.config["processingEndIndex"] = processingEndIndex
        self.config["processingPageSize"] = self.processingPageSize
        self.totalDocumentsDispatched = 0
        self.documentsAnnotated = 0
        self.documentsNotAnnotated = 0
        self.lastDispatcher = False
        self.endProcess = False
        self.dispatcherName = "bayzee.annotation.dispatcher"
        self.workerName = "bayzee.annotation.worker"
        self.timeout = 86400000
        if processingEndIndex is not None:
            self.dispatcherName += "." + str(processingStartIndex) + "." + str(
                processingEndIndex)

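        # The shingle filter emits word n-grams between minShingleSize and
        # maxShingleSize tokens long; these shingles become the candidate phrases.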
        analyzerIndexSettings = {
            "index": {
                "analysis": {
                    "analyzer": {
                        "analyzer_shingle": {
                            "type": "custom",
                            "tokenizer": "standard",
                            "filter": ["standard", "lowercase", "filter_shingle"]
                        }
                    },
                    "filter": {
                        "filter_shingle": {
                            "type": "shingle",
                            "max_shingle_size": config["generator"]["maxShingleSize"],
                            "min_shingle_size": config["generator"]["minShingleSize"],
                            "output_unigrams": (config["generator"]["minShingleSize"] == 1)
                        },
                        "filter_stop": {
                            "type": "stop"
                        }
                    }
                }
            }
        }
        analyzerIndexTypeMapping = {
            "properties": {
                "phrase": {
                    "type": "string"
                },
                "document_id": {
                    "type": "string",
                    "index": "not_analyzed"
                },
                "phrase__not_analyzed": {
                    "type": "string",
                    "index": "not_analyzed"
                }
            }
        }
        corpusSize = self.esClient.count(index=self.corpusIndex,
                                         doc_type=self.corpusType,
                                         body={"query": {
                                             "match_all": {}
                                         }})
        self.corpusSize = corpusSize["count"]
        self.featureNames = [f["name"] for f in config["generator"]["features"]]
        for module in config["processor"]["modules"]:
            self.featureNames += [f["name"] for f in module["features"]]

        if processingStartIndex == 0:
            if self.esClient.indices.exists(self.analyzerIndex):
                self.esClient.indices.delete(self.analyzerIndex)
            data = self.esClient.indices.create(self.analyzerIndex,
                                                analyzerIndexSettings)

        if "annotateFromScratch" not in self.config or self.config[
                "annotateFromScratch"] == True:
            try:
                if self.esClient.indices.exists(
                        self.config["processor"]["index"]):
                    self.esClient.indices.delete(
                        self.config["processor"]["index"])
                self.esClient.indices.create(self.config["processor"]["index"])
                self.esClient.indices.put_mapping(
                    index=self.config["processor"]["index"],
                    doc_type=self.processorPhraseType,
                    body=analyzerIndexTypeMapping)
                if self.esClient.indices.exists(self.analyzerIndex):
                    self.esClient.indices.delete(self.analyzerIndex)
                data = self.esClient.indices.create(self.analyzerIndex,
                                                    analyzerIndexSettings)
            except Exception:
                error = sys.exc_info()
                self.logger.error(
                    "Error occurred during initialization of analyzer index: "
                    + str(error))
                sys.exit(1)
            else:
                sleep(1)

        # creating annotation dispatcher
        self.annotationDispatcher = DurableChannel(self.dispatcherName, config,
                                                   self.timeoutCallback)

        # remote channel initialisation
        self.controlChannel = RemoteChannel(self.dispatcherName, config)
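
Once the analyzer index exists, the shingle analyzer can be exercised directly to see which candidate phrases it would emit for a piece of text. Below is a minimal sketch, assuming the same pre-5.x elasticsearch-py client style the snippets above use (doc_type arguments, "string" mappings); in that client generation, analyzer and text are query parameters to indices.analyze, whereas newer clients expect them in the request body. The host and index name are placeholders.

from elasticsearch import Elasticsearch

esClient = Elasticsearch("localhost:9200")  # placeholder host
# "corpus__analysis__" stands in for self.analyzerIndex in the snippets above
result = esClient.indices.analyze(index="corpus__analysis__",
                                  analyzer="analyzer_shingle",
                                  text="naive bayes text classifier")
for token in result["tokens"]:
    print(token["token"])  # unigrams plus shingles up to maxShingleSize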