예제 #1
0
    def set_dependency_parser(self, config):
        """Select and instantiate the dependency parser described by ``config``.

        ``config`` must be a dict to have any effect; any other value is
        ignored and the method returns without touching the instance.

        The ``"name"`` entry is always stored in ``self.dependency_parser``.
        ``self.dependency_parser_instance`` is only assigned for these names:

        * ``"spacy"``: ``spacy.load(config["model"])``.
        * ``"corenlp"``: ``config["model"]`` and ``config["parser"]`` are
          prepended to the ``CLASSPATH`` environment variable, then a
          ``StanfordDependencyParser`` is built from ``config["model"]``.
        * ``"corenlp-server"``: a ``CoreNLPDependencyParser`` pointed at
          ``config["url"]``.

        Any other name leaves ``self.dependency_parser_instance`` as it was.
        """
        if not isinstance(config, dict):
            return

        parser_name = config["name"]
        helpers.cond_print("Dependency Parser: " + parser_name, self.verbose)
        self.dependency_parser = parser_name

        if parser_name == "spacy":
            # Sets the model and keeps the resulting spaCy NLP instance.
            # Example ways to load a model, taken from the spaCy docs:
            #   spacy.load("en")                    # shortcut link
            #   spacy.load("en_core_web_sm")        # package
            #   spacy.load("/path/to/en")           # unicode path
            #   spacy.load(Path("/path/to/en"))     # pathlib Path
            self.dependency_parser_instance = spacy.load(config["model"])
            return

        if parser_name == "corenlp":
            # Make sure CLASSPATH exists before it is inspected below.
            os.environ.setdefault('CLASSPATH', "")

            jar_paths = os.pathsep.join([config["model"], config["parser"]])
            current_classpath = os.environ['CLASSPATH']
            # Substring test: skip the prepend when this exact pair of
            # paths already appears somewhere in CLASSPATH.
            if jar_paths not in current_classpath:
                os.environ['CLASSPATH'] = os.pathsep.join(
                    [jar_paths, current_classpath])

            # TODO:- DEPRECATED
            self.dependency_parser_instance = StanfordDependencyParser(
                path_to_models_jar=config["model"], encoding='utf8')
            return

        if parser_name == "corenlp-server":
            # Requires the CoreNLPServer running in the background at the
            # configured URL (generally https://localhost:9000).
            # Start the server by running the following command in the JARs
            # directory:
            # `java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -annotators "tokenize,ssplit,pos,lemma,parse,sentiment" -port 9000 -timeout 30000`
            self.dependency_parser_instance = CoreNLPDependencyParser(
                url=config["url"])
예제 #2
0
 def set_attribute_datatype(self, attr_type_obj):
     """Override the recorded data type of one or more attributes.

     ``attr_type_obj`` maps attribute names to the data type to assign.
     Entries are handled in iteration order: each valid entry updates
     ``self.data_attribute_map[attribute]['dataType']`` and then calls
     ``self.populate_dataset_meta_for_attr`` for that attribute.

     A data type that is not one of ``constants.attribute_types.values()``
     prints the list of valid choices and terminates the process via
     ``sys.exit(error_codes.BAD_INPUT_ATTRIBUTE_DATA_TYPE)``. Entries
     processed before the invalid one have already been applied.
     """
     for attr_name, requested_type in attr_type_obj.items():
         # Reject unknown data types up front; sys.exit does not return.
         if requested_type not in constants.attribute_types.values():
             helpers.cond_print(
                 "Invalid Target DataType. Choose from " +
                 str(constants.attribute_types.values()),
                 debug=True)
             sys.exit(error_codes.BAD_INPUT_ATTRIBUTE_DATA_TYPE)

         # Record the new data type, then refresh the attribute's metadata.
         self.data_attribute_map[attr_name]['dataType'] = requested_type
         self.populate_dataset_meta_for_attr(attr_name, requested_type)
예제 #3
0
    def analyze_query(self, query_raw, dialog=False, debug=False):
        # type: (str, bool, bool) -> dict
        """Run the query-analysis pipeline and assemble the response.

        The stages run in a fixed order and each one stores its result on
        ``self``: query cleaning, attribute extraction, visualization-type
        extraction, task extraction, and visualization recommendation. The
        wall-clock time of every stage, plus their sum under ``'total'``,
        is recorded in ``self.execution_durations``.

        Args:
            query_raw: The query text exactly as supplied by the caller.
            dialog: When false, the previously extracted vis type, vis
                token, tasks, attributes and vis list are reset before the
                pipeline runs. When true they are left in place at the
                start of the call.
            debug: When true the full output dict is returned; otherwise it
                is first passed through ``helpers.delete_keys_from_dict``
                with ``constants.keys_to_delete_in_output``.

        Returns:
            The output dict described above. Its ``'status'`` is
            ``'SUCCESS'`` when ``self.vis_list`` is non-empty and
            ``'FAILURE'`` otherwise.
        """
        self.execution_durations = {}
        self.dialog = dialog
        self.debug = debug

        # A query that is not a follow-up starts from clean output state.
        if not dialog:
            self.extracted_vis_type = None
            self.extracted_vis_token = None
            self.extracted_tasks = OrderedDict()
            self.extracted_attributes = OrderedDict()
            self.vis_list = None

        # ---- Stage 1: clean and process the query ----
        self.query_raw = query_raw
        helpers.cond_print("Raw Query: " + self.query_raw, self.verbose)
        stage_started = time.time()
        query_genie = self.query_genie_instance
        self.query_processed = query_genie.process_query(self.query_raw)
        self.query_tokens = query_genie.clean_query_and_get_query_tokens(
            self.query_processed, self.reserve_words, self.ignore_words)
        joined_tokens = ' '.join(self.query_tokens)
        self.query_ngrams = query_genie.get_query_ngrams(joined_tokens)
        self.dependencies = query_genie.create_dependency_tree(
            self.query_processed)
        helpers.cond_print(
            "Processed Query: " + self.query_processed, self.verbose)
        self.execution_durations['clean_query'] = time.time() - stage_started

        # ---- Stage 2: detect explicit and implicit attributes ----
        stage_started = time.time()
        self.extracted_attributes = (
            self.attribute_genie_instance.extract_attributes(
                self.query_ngrams))
        attribute_names = list(self.extracted_attributes.keys())
        helpers.cond_print(
            "Final Extracted Attributes: " + str(attribute_names),
            self.verbose)
        self.execution_durations['extract_attributes'] = (
            time.time() - stage_started)

        # ---- Stage 3: detect explicit visualization utterances ----
        stage_started = time.time()
        vis_type, vis_token = self.vis_genie_instance.extract_vis_type(
            self.query_ngrams)
        self.extracted_vis_type = vis_type
        self.extracted_vis_token = vis_token
        self.execution_durations['extract_vis_type'] = (
            time.time() - stage_started)

        # ---- Stage 4: detect implicit and explicit tasks ----
        stage_started = time.time()
        task_genie = self.task_genie_instance
        explicit_tasks = task_genie.extract_explicit_tasks_from_dependencies(
            self.dependencies)

        # Filters from domain values.
        filtered_tasks = task_genie.extract_explicit_tasks_from_domain_value(
            explicit_tasks)

        # At this stage, which attributes are encodeable?
        encodeable_attributes = (
            self.attribute_genie_instance.get_encodeable_attributes())

        # Infer tasks based on the (encodeable) attribute datatypes.
        inferred_tasks = task_genie.extract_implicit_tasks_from_attributes(
            filtered_tasks, encodeable_attributes)

        # Ensure that the task keys of the generated map are not empty lists.
        self.extracted_tasks = task_genie.filter_empty_tasks(inferred_tasks)
        self.execution_durations['extract_tasks'] = (
            time.time() - stage_started)

        # ---- Stage 5: recommend visualizations ----
        stage_started = time.time()

        # Final list of encodeable attributes in the visualization.
        final_encodeable_attributes = (
            self.attribute_genie_instance
            .update_encodeable_attributes_based_on_tasks())

        self.vis_list = self.vis_genie_instance.get_vis_list(
            attribute_list=final_encodeable_attributes)
        self.execution_durations['get_vis_list'] = (
            time.time() - stage_started)
        # 'total' is added last, so it sums only the per-stage entries.
        self.execution_durations['total'] = sum(
            self.execution_durations.values())

        # ---- Prepare output ----
        status = 'SUCCESS' if len(self.vis_list) > 0 else 'FAILURE'
        output = {
            'status': status,
            'debug': {
                'execution_durations': self.execution_durations
            },
            'query_raw': self.query_raw,
            'query': self.query_processed,
            'dataset': self.data_url,
            'visList': self.vis_list,
            'attributeMap': self.extracted_attributes,
            'taskMap': self.extracted_tasks,
            'followUpQuery': self.dialog,
            'contextObj': None
        }

        if debug:
            return output
        return helpers.delete_keys_from_dict(
            output, keys=constants.keys_to_delete_in_output)