Example No. 1
    def _collect_phrases(self):
        """
        Collect phrases from input tables.

        After this call, self.total_phrases holds every Phrase object
        collected from the input tables.
        """
        logger.debug('Collecting phrases from input tables')
        for table in self.input_tables:
            self.total_phrases += table.collect_phrases()
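The loop above relies on each input table exposing a collect_phrases() method that returns Phrase objects. A minimal illustrative sketch of that contract, where InputTable and Phrase are hypothetical simplifications rather than the project's real classes:

    # Illustrative stand-ins for the contract _collect_phrases relies on.
    class Phrase:
        def __init__(self, raw_form):
            self.raw_form = raw_form
            self.tokens = raw_form.lower().split()
            self.vector = None

    class InputTable:
        def __init__(self, rows):
            self.rows = rows

        def collect_phrases(self):
            # One Phrase per row, so the caller can concatenate the lists
            # from every table into self.total_phrases.
            return [Phrase(row) for row in self.rows]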
Example No. 2
    def _assign_vectors_to_phrases(self):
        """
        Assign vectors to phrases after training.

        Since training is done at the token level, each phrase is assigned
        the average of its tokens' vectors.
        """
        logger.debug(
            'Converting vector space model from token level to phrase level')

        self.vector_space_model.create_phrase_model(self.total_phrases)
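A hedged sketch of the averaging described in the docstring, assuming token vectors can be looked up by token string; average_phrase_vector and token_vectors are illustrative names, not the project's API:

    import numpy as np

    def average_phrase_vector(phrase_tokens, token_vectors):
        # Average the vectors of the tokens that exist in the model.
        vectors = [token_vectors[t] for t in phrase_tokens if t in token_vectors]
        if not vectors:
            return None
        return np.mean(vectors, axis=0)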
Example No. 3
    def _phrases_to_tokens(self):
        """
        Create a list of tokens representing the phrase list.

        returns:
            list[list[str]]
        """
        logger.debug('Collecting phrase tokens for training')
        phrases_as_tokens = []
        for phrase in self.total_phrases:
            phrases_as_tokens.append(phrase.tokens)

        return phrases_as_tokens
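The loop amounts to a single list comprehension; a minimal equivalent, using total_phrases as an illustrative local name:

    # Equivalent to the loop in _phrases_to_tokens: one token list per phrase.
    phrases_as_tokens = [phrase.tokens for phrase in total_phrases]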
Example No. 4
    def _phrases_to_dict(self):
        """
        Create a dict mapping phrase strings to their Phrase objects.

        returns:
            dict{str:Phrase}
        """
        logger.debug(
            'Creating <phrase_str, phrase_ref> dict to fill output table')

        phrases_dict = {}
        for phrase in self.total_phrases:
            phrases_dict[phrase.raw_form] = phrase

        return phrases_dict
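The same mapping as a dict comprehension, again with total_phrases as an illustrative local name. Note that phrases sharing an identical raw_form collapse to a single entry, with the last one winning:

    # Equivalent to the loop in _phrases_to_dict.
    phrases_dict = {phrase.raw_form: phrase for phrase in total_phrases}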
Example No. 5
    def run(self, parsed_config):
        """
        Run the textractor module.

        params:
            parsed_config (dict)
        """
        logger.debug('Starting Textractor with args %s', parsed_config)

        self.read_input_tables(parsed_config['default']['filled_taxonomies'])
        self.train_vector_space(
            parsed_config['word2vec'],
            parsed_config['default']['vector_space_output'])
        self.read_output_table(parsed_config['default']['empty_taxonomies'])
        self.fill_output_tables(parsed_config['default']['vector_threshold'])
        self.write_output_tables(parsed_config['default']['output_taxonomies'])
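The lookups in run() imply a parsed_config shaped roughly as below. Only the keys are taken from the code above; the paths and values are placeholder assumptions:

    parsed_config = {
        'default': {
            'filled_taxonomies': 'data/filled_taxonomies.csv',
            'empty_taxonomies': 'data/empty_taxonomies.csv',
            'output_taxonomies': 'out/output_taxonomies.csv',
            'vector_space_output': 'out/vector_space.model',
            'vector_threshold': 0.75,
        },
        'word2vec': {
            # Training parameters forwarded to the Word2Vec wrapper;
            # the exact keys depend on the wrapper's implementation.
        },
    }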
Example No. 6
    def fill_output_tables(self, vector_threshold):
        """
        Fill output tables with new phrases.
        """
        logger.info('Filling output tables')

        phrases_dict = self._phrases_to_dict()

        logger.debug('Assigning early attractors using string similarity')
        self.output_table.assign_early_attractors(phrases_dict)

        logger.debug('Assigning remaining phrases using vector similarity')
        for phrase_obj in phrases_dict.values():
            self.output_table.assign_attractor(phrase_obj,
                                               self.vector_space_model,
                                               vector_threshold)
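Purely as illustration of the threshold check that assign_attractor presumably performs on vector similarity; cosine similarity and the vector arguments here are assumptions, not the output table's actual API:

    import numpy as np

    def cosine_similarity(vec_a, vec_b):
        return float(np.dot(vec_a, vec_b) /
                     (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))

    def passes_threshold(phrase_vec, attractor_vec, vector_threshold):
        # A phrase is only assigned when its similarity to the attractor
        # meets the configured threshold.
        return cosine_similarity(phrase_vec, attractor_vec) >= vector_threshold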
Example No. 7
    def _start_word2vec(self, phrases_as_tokens, train_params):
        """
        Start word2vec training.

        phrases_as_tokens represents each taxonomy entry as a series of tokens.
        Example:
        + Software Engineer
        - Software Developer
        - Backend Developer

        will be represented as: [[software, engineer], [software, developer], ..]

        params:
            phrases_as_tokens (list[list[str]])
            train_params (dict)
        """
        logger.debug('Starting word2vec')
        self.vector_space_model = word2vec.Word2Vec()
        self.vector_space_model.train(phrases_as_tokens, train_params)
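A hedged sketch of what the Word2Vec wrapper's train() might delegate to, assuming it wraps gensim; the wrapper's real internals are not shown in this example:

    from gensim.models import Word2Vec

    def train(phrases_as_tokens, train_params):
        # phrases_as_tokens is a list of token lists, one per phrase,
        # which matches gensim's expected "sentences" input.
        return Word2Vec(sentences=phrases_as_tokens, **train_params)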
Example No. 8
    def train_vector_space(self, train_params, output_path):
        """
        Train vector space model.

        params:
            train_params (dict)
            output_path (str)
        """
        if io.exists(output_path):
            logger.debug('Vector space already exists, loading %s',
                         output_path)
            self.vector_space_model = io.read(output_path)
        else:
            logger.info('Training vector space using word2vec')
            self._collect_phrases()
            phrases_as_tokens = self._phrases_to_tokens()
            self._start_word2vec(phrases_as_tokens, train_params)
            self._assign_vectors_to_phrases()
            logger.debug('Writing vector space to %s', output_path)

            io.write(self.vector_space_model, output_path)
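An example call, with hypothetical word2vec parameters (gensim 4.x style names); in practice the keys come from parsed_config['word2vec'], and textractor is an illustrative instance name:

    textractor.train_vector_space(
        train_params={'vector_size': 100, 'window': 5, 'min_count': 1},
        output_path='out/vector_space.model')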