def load_yelp_data():
    business_handler = data_handler.DataHandler('yelp_business_database')
    user_handler = data_handler.DataHandler('yelp_user_database')
    load_businesses(business_handler)
    load_users(user_handler)
    load_reviews(business_handler=business_handler, user_handler=user_handler)
    load_tips(business_handler=business_handler, user_handler=user_handler)
    load_stars(user_handler)
def _write_to_db(new_entity):
    handler = data_handler.DataHandler()
    # if an entity by that name already exists, remove it
    if handler.get_entities({"name": new_entity["name"]}):
        handler.remove_entities({"name": new_entity["name"]})
    # add the new entity
    handler.create_entity(new_entity)
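# A small usage sketch for _write_to_db; the entity fields below are purely
# illustrative, not the schema the real loaders use. Because any existing
# entity with the same name is removed first, repeated calls act as an upsert.
example_entity = {'name': 'Jane Doe', 'description': 'illustrative record only'}
_write_to_db(example_entity)
_write_to_db(example_entity)  # replaces the earlier record instead of duplicating it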
def __init__(self, word_vec_size, num_compare_entities, db_name=SETTINGS['default_db']):
    self.word_vectors = KeyedVectors.load_word2vec_format(SETTINGS['word_vec_source'], binary=True)
    self.handler = data_handler.DataHandler(db_name)
    self.word_vec_size = word_vec_size
    self.num_compare_entities = num_compare_entities
    self.total_entity_count = self.handler.entity_count()
    self.entity_dict = self._create_entity_dict(self.handler)
def test_embeddings_with_ids(embeds, tasks=TASKS, data_gen=None, truncate=True,
                             embed_size=300, db='person2vec_database', callbacks=[]):
    handler = data_handler.DataHandler(db)
    # can pass a training_data_generator to save time, but, if none is passed, create one
    if not data_gen:
        data_gen = training_data_generator.EmbeddingDataGenerator(300, 4)
    if 'biz_type' in tasks:
        entities = _get_entities_from_db(handler, '_id')
    else:
        entities = _get_entities_from_db(handler)
    return _run_tasks(tasks, entities, embeds, truncate, data_gen, embed_size, callbacks)
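# Hypothetical call of the evaluator above. The toy DataFrame stands in for a
# table of learned entity embeddings indexed by each entity's database _id;
# the ids, values, and 3-dimensional embed_size are placeholders only.
import pandas as pd

toy_embeds = pd.DataFrame([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
                          index=['id_0', 'id_1'])
results = test_embeddings_with_ids(toy_embeds, embed_size=3)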
def test_word2vec(word2vec_object, tasks=TASKS, data_gen=None, embed_size=300):
    handler = data_handler.DataHandler()
    # can pass a training_data_generator to save time, but, if none is passed, create one
    if not data_gen:
        data_gen = training_data_generator.EmbeddingDataGenerator(300, 4)
    entities = _get_entities_from_db(handler)
    # drop entities whose names have no word2vec vector
    entities = entities.drop([name for name in entities.index.values
                              if _name_not_has_vec(name, data_gen)])
    word_vecs = _associate_names_with_word_vecs(entities, data_gen)
    # re-index the word vectors by database _id so they line up with the entities
    word_vecs.reset_index(inplace=True)
    word_vecs['_id'] = pandas.Series([_get_id_for_name(name, handler)
                                      for name in word_vecs['index']])
    word_vecs.set_index('index', inplace=True)
    word_vecs.set_index('_id', inplace=True)
    _run_tasks(tasks=tasks, entities=entities, embeds=word_vecs, data_gen=data_gen,
               truncate=False, embed_size=embed_size)
def _build_default_model(num_compare_entities=DEFAULT_SETTINGS['num_compare_entities'],
                         word_vec_size=DEFAULT_SETTINGS['word_vec_size']):
    # setting variables for size of incoming data
    handler = data_handler.DataHandler()
    num_total_entities = handler.entity_count()
    snip_size = DEFAULT_SETTINGS['snippet_size']
    embedding_size = DEFAULT_SETTINGS['embedding_size']

    # two inputs: a snippet of word vectors and the ids of the compared entities
    input_tensor_words = Input(shape=(snip_size, word_vec_size,), dtype='float32', name='word_input')
    input_tensor_entity = Input(shape=(num_compare_entities,), dtype='int32', name='entity_input')

    word_flatten_layer = Flatten()(input_tensor_words)
    entity_embedding_layer = Embedding(num_total_entities,
                                       embedding_size,
                                       input_length=num_compare_entities,
                                       name='entity_embedding')(input_tensor_entity)
    entity_embedding_layer = Flatten()(entity_embedding_layer)

    word_branch = Dense(1000, activation="relu", name='dense_sentence_layer')(word_flatten_layer)
    joint_embeds = Concatenate(name='joint_embeds')([word_branch, entity_embedding_layer])
    nex = Dense(1000, activation="relu", name='dense_consolidator')(joint_embeds)

    # softmax over the compared entities
    full_out = Dense(num_compare_entities, activation='softmax', name='final_output')(nex)

    model = Model([input_tensor_words, input_tensor_entity], full_out)
    opt = DEFAULT_SETTINGS['optimizer']
    loss = DEFAULT_SETTINGS['loss']
    model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])
    return model
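# A minimal smoke-test sketch for the model above, assuming a reachable
# database (DataHandler is called inside _build_default_model) and that
# DEFAULT_SETTINGS supplies the same sizes used to build the input tensors.
# The batch size of 2 and the zero entity ids are arbitrary illustration values.
import numpy as np

model = _build_default_model()
model.summary()

snip_size = DEFAULT_SETTINGS['snippet_size']
word_vec_size = DEFAULT_SETTINGS['word_vec_size']
num_compare = DEFAULT_SETTINGS['num_compare_entities']

dummy_words = np.random.rand(2, snip_size, word_vec_size).astype('float32')
dummy_entities = np.zeros((2, num_compare), dtype='int32')  # ids must be < handler.entity_count()
preds = model.predict([dummy_words, dummy_entities])
print(preds.shape)  # (2, num_compare): one softmax score per compared entity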
def load_yelp_data():
    business_handler = data_handler.DataHandler('yelp_business_database_small')
    load_businesses(business_handler)
    load_reviews(business_handler=business_handler)
    load_tips(business_handler=business_handler)
def main(db_name):
    handler = data_handler.DataHandler(db_name)
    snippet_creator.snippetize_db(handler)
    data_gen = training_data_generator.EmbeddingDataGenerator(db_name=db_name)
    model, data_gen = train.train_model(data_gen=data_gen)
    handler.save_embeddings_to_db(model, data_gen)
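# A possible command-line entry point for main(); the fallback database name
# is an assumption (it mirrors the default used by test_embeddings_with_ids),
# not something defined by the snippet above.
import sys

if __name__ == '__main__':
    db_name = sys.argv[1] if len(sys.argv) > 1 else 'person2vec_database'
    main(db_name)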
from person2vec.generators import training_data_generator
from person2vec import data_handler
import numpy as np

data_gen = training_data_generator.EmbeddingDataGenerator(300, 4)
handler = data_handler.DataHandler()

hello_vec = [
    -0.05419922, 0.01708984, -0.00527954, 0.33203125, -0.25, -0.01397705, -0.15039062, -0.265625, 0.01647949, 0.3828125,
    -0.03295898, -0.09716797, -0.16308594, -0.04443359, 0.00946045, 0.18457031, 0.03637695, 0.16601562, 0.36328125, -0.25585938,
    0.375, 0.171875, 0.21386719, -0.19921875, 0.13085938, -0.07275391, -0.02819824, 0.11621094, 0.15332031, 0.09082031,
    0.06787109, -0.0300293, -0.16894531, -0.20800781, -0.03710938, -0.22753906, 0.26367188, 0.012146, 0.18359375, 0.31054688,
    -0.10791016, -0.19140625, 0.21582031, 0.13183594, -0.03515625, 0.18554688, -0.30859375, 0.04785156, -0.10986328, 0.14355469,
    -0.43554688, -0.0378418, 0.10839844, 0.140625, -0.10595703, 0.26171875, -0.17089844, 0.39453125, 0.12597656, -0.27734375,
    -0.28125, 0.14746094, -0.20996094, 0.02355957, 0.18457031, 0.00445557, -0.27929688, -0.03637695, -0.29296875, 0.19628906,
    0.20703125, 0.2890625, -0.20507812, 0.06787109, -0.43164062, -0.10986328, -0.2578125, -0.02331543, 0.11328125, 0.23144531,
    -0.04418945, 0.10839844, -0.2890625, -0.09521484, -0.10351562, -0.0324707, 0.07763672, -0.13378906, 0.22949219, 0.06298828,
    0.08349609, 0.02929688, -0.11474609, 0.00534058, -0.12988281, 0.02514648, 0.08789062, 0.24511719, -0.11474609, -0.296875,
    -0.59375, -0.29492188, -0.13378906, 0.27734375, -0.04174805, 0.11621094, 0.28320312, 0.00241089, 0.13867188, -0.00683594,
    -0.30078125, 0.16210938, 0.01171875, -0.13867188, 0.48828125, 0.02880859, 0.02416992, 0.04736328, 0.05859375, -0.23828125,
    0.02758789, 0.05981445, -0.03857422, 0.06933594, 0.14941406, -0.10888672, -0.07324219, 0.08789062, 0.27148438, 0.06591797,
    -0.37890625, -0.26171875, -0.13183594, 0.09570312, -0.3125, 0.10205078, 0.03063965, 0.23632812,