def _collect_phrases(self):
    """Gather every phrase from the input tables into self.total_phrases.

    After this call self.total_phrases holds one Phrase object per entry
    found across all input tables (potentially a very large list).
    """
    logger.debug('Collecting phrase from input tables')
    gathered = [phrase
                for table in self.input_tables
                for phrase in table.collect_phrases()]
    self.total_phrases += gathered
def _assign_vectors_to_phrases(self):
    """Promote the trained token-level model to the phrase level.

    Training runs on individual tokens, so each phrase is given a vector
    derived from its tokens' vectors (their average).
    """
    logger.debug(
        'Converting vector space model from token level to phrase level')
    model = self.vector_space_model
    model.create_phrase_model(self.total_phrases)
def _phrases_to_tokens(self):
    """ Create a list of tokens representing the phrase list.

    returns:
        list[list[str]]: one token list per phrase in self.total_phrases
    """
    logger.debug('Collecting phrase tokens for training')
    # Comprehension replaces the manual append loop (idiomatic, ruff PERF401).
    return [phrase.tokens for phrase in self.total_phrases]
def _phrases_to_dict(self):
    """ Create a dict of phrases str and their object references.

    returns:
        dict{str:Phrase}: maps each phrase's raw_form to its Phrase object.
        Duplicate raw_forms keep the last occurrence, same as the loop did.
    """
    logger.debug(
        'Creating <phrase_str, phrase_ref> dict to fill output table')
    # Dict comprehension replaces the manual build loop (ruff PERF403).
    return {phrase.raw_form: phrase for phrase in self.total_phrases}
def run(self, parsed_config):
    """ Run the textractor module.

    Reads the filled taxonomies, trains (or loads) the vector space,
    then fills and writes the output taxonomies.

    params:
        parsed_config (dict): parsed configuration with 'default' and
            'word2vec' sections
    """
    # Lazy %-args: the string is only formatted when DEBUG logging is on.
    logger.debug('Starting Textractor with args %s', parsed_config)
    defaults = parsed_config['default']  # hoist the repeated section lookup
    self.read_input_tables(defaults['filled_taxonomies'])
    self.train_vector_space(
        parsed_config['word2vec'], defaults['vector_space_output'])
    self.read_output_table(defaults['empty_taxonomies'])
    self.fill_output_tables(defaults['vector_threshold'])
    self.write_output_tables(defaults['output_taxonomies'])
def fill_output_tables(self, vector_threshold):
    """Populate the output tables with the collected phrases.

    First assigns attractors by string similarity, then places the
    remaining phrases by vector similarity against the trained model.

    params:
        vector_threshold: minimum similarity for a vector-based assignment
    """
    logger.info('Filling output tables')
    phrases_dict = self._phrases_to_dict()
    logger.debug('Assigning early attractors using string similarity')
    self.output_table.assign_early_attractors(phrases_dict)
    logger.debug('Assigning remaining phrases using vector similarity')
    for phrase in phrases_dict.values():
        self.output_table.assign_attractor(
            phrase, self.vector_space_model, vector_threshold)
def _start_word2vec(self, phrases_as_tokens, train_params):
    """Train a fresh word2vec model on the phrase tokens.

    phrases_as_tokens flattens each taxonomy into a token series, e.g.
        + Software Engineer
        - Software Developer
        - Backend Developer
    becomes [software, engineer, software, developer, ..].

    params:
        phrases_as_tokens (list[list[str]])
        train_params (dict)
    """
    logger.debug('Starting word2vec')
    # Chained assignment: the attribute is set before training starts,
    # exactly as before, while 'model' gives a short local alias.
    self.vector_space_model = model = word2vec.Word2Vec()
    model.train(phrases_as_tokens, train_params)
def train_vector_space(self, train_params, output_path):
    """ Train vector space model.

    Loads a previously trained model from output_path when one exists;
    otherwise collects phrases, trains word2vec on their tokens, assigns
    phrase-level vectors, and caches the model at output_path.

    params:
        train_params (dict): word2vec training parameters
        output_path (str): cache location for the trained vector space
    """
    if io.exists(output_path):
        # Lazy %-args instead of eager '%' formatting in logger calls.
        logger.debug('Vector space already exists, loading %s', output_path)
        self.vector_space_model = io.read(output_path)
    else:
        logger.info('Training vector space using word2vec')
        self._collect_phrases()
        phrases_as_tokens = self._phrases_to_tokens()
        self._start_word2vec(phrases_as_tokens, train_params)
        self._assign_vectors_to_phrases()
        logger.debug('Writing vector space to %s', output_path)
        io.write(self.vector_space_model, output_path)