def set_model(self):
    # load the pre-trained, normalized word2vec model from before adjective clustering
    self.org_model = gensim.models.KeyedVectors.load(
        self.pre_cluster_we_file, mmap='r').wv  # mmap the large matrix as read-only
    self.org_model.syn0norm = self.org_model.syn0

    # load the pre-trained, normalized word2vec model
    self.model = gensim.models.KeyedVectors.load(
        self.we_file, mmap='r').wv  # mmap the large matrix as read-only
    self.model.syn0norm = self.model.syn0

    # load adjectives that have a multi-sense representation
    adj_file_names = [
        f for f in os.listdir(adj_clusters_path)
        if os.path.isfile(os.path.join(adj_clusters_path, f))
    ]
    self.multi_sense_adj = dict.fromkeys(
        [os.path.splitext(f)[0].split('_')[0] for f in adj_file_names])
    logger.info("Total multi-sense adjectives = [{}]".format(
        len(self.multi_sense_adj)))

    # build the full vocabulary of words that have word vectors
    self.vocab = dict(self.model.vocab, **self.multi_sense_adj)
    logger.info("VOCAB SIZE = [{}]".format(len(self.vocab)))
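# A small sketch (with hypothetical file names) of the cluster-file naming convention
# the loader above relies on: one file per adjective sense cluster under
# adj_clusters_path, with the adjective appearing before the first underscore.
import os

for name in ['hot_0.pkl', 'hot_1.pkl', 'cold_0.pkl']:
    print(os.path.splitext(name)[0].split('_')[0])  # -> hot, hot, cold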
def get():
    try:
        sentence = request.args.get('avis', None)
    except Exception:
        abort(http_codes.SERVER_ERROR, "Erreur lors du chargement du modèle")

    logger.info("Analyzing {sentence}".format(sentence=sentence))
    opinions, summury = sentiment_analysis(model_tag, model_sa, flags,
                                           source_count, source_word2idx,
                                           sentence, fr_nlp, wiki_model)
    response = {
        'aspects': [{
            'target': opinion[0],
            'category': opinion[1],
            'from': opinion[2],
            'to': opinion[3],
            'sentiment': opinion[4],
            'exemple': opinion[5]
        } for opinion in opinions],
        'summury': [{
            'category': item[0],
            'sentiment': item[1]
        } for item in summury],
    }
    logger.info(response)
    return _success(response)
def online_training(self, epochs=EPHOCS):
    y_train = self.data.y_train
    x_train = self.data.x_train
    indices = list(range(y_train.shape[0]))
    for epoch in range(epochs):
        logger.info("Epoch: {}".format(epoch))
        random.shuffle(indices)
        for i in indices:
            x = Variable(torch.Tensor(x_train[[i]]))
            y = Variable(torch.Tensor(y_train[[i]]), requires_grad=False)
            # PyTorch modules expect a batch dimension; note that unsqueeze
            # returns a new tensor, so these calls have no effect unless the
            # result is assigned (x_train[[i]] already yields a (1, dim) row)
            x.unsqueeze(0)
            y.unsqueeze(0)

            # Forward pass: compute the predicted y by passing x through the model
            y_pred = self.nn_model(x)

            # Compute the loss
            loss = self.criterion(y_pred, y)

            # Zero gradients, perform a backward pass, and update the weights
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
    logger.info("Done online training")
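# A minimal, self-contained sketch of the kind of objects the loop above expects
# (nn_model, criterion, optimizer). The single linear layer, MSE loss, SGD learning
# rate and dimensionality are illustrative assumptions, not the project's actual
# model; it is also written against a recent PyTorch where plain tensors track
# gradients (the loop above uses the older Variable API).
import torch
import torch.nn as nn

dim = 300                                            # assumed embedding dimensionality
nn_model = nn.Linear(dim, dim, bias=False)           # maps adjective vectors to attribute vectors
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.01)

x = torch.randn(1, dim)                              # one sample, i.e. batch size 1
y = torch.randn(1, dim)
loss = criterion(nn_model(x), y)                     # forward pass and loss
optimizer.zero_grad()
loss.backward()                                      # backward pass
optimizer.step()                                     # single weight update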
def main(args):
    parser = argparse.ArgumentParser(description='Train word2vec model.')
    parser.add_argument('dev_file', help='dev input file')
    parser.add_argument('test_file', help='test input file')
    parser.add_argument('we_file', help='word embeddings normed model file')
    parser.add_argument(
        'org_we_file',
        help='path to the original we model file - before adjectives clustering')
    parser.add_argument('-s',
                        '--supervised',
                        default=False,
                        action='store_true',
                        help='also train and evaluate the supervised model')
    args = parser.parse_args(args)

    dev_triplets = read_HeiPLAS_data(args.dev_file)
    test_triplets = read_HeiPLAS_data(args.test_file)

    # load the pre-trained, normalized word2vec models
    we_wrapper = MultiSenseWE(args.org_we_file, args.we_file)
    we_wrapper.set_model()

    data_handler = DataHandler(we_wrapper)
    data_handler.run(dev_triplets, test_triplets)

    if args.supervised:
        model = SupervisedModel(data_handler)
        model.run()

    model = UnsupervisedModel(data_handler)
    model.run()
    logger.info("Done!")
def run(self, dev_triplets, test_triplets):
    logger.info("filter training samples")
    self.train, self.x_train, self.y_train = self.filter_data(dev_triplets)
    logger.info("filter test samples")
    self.test, self.x_test, self.y_test = self.filter_data(test_triplets)

    dev_attributes = set([triplet.attr for triplet in dev_triplets
                          if triplet.attr in self.we_wrapper.vocab])
    test_attributes = set([triplet.attr for triplet in test_triplets
                           if triplet.attr in self.we_wrapper.vocab])
    self.attributes = dev_attributes.union(test_attributes)
    self.attr_vecs = {attr: self.we_wrapper.word_vec(attr)
                      for attr in self.attributes}
def _get_settings_from_local_file(file_name='settings.json'):
    logger.info('Loading settings from local file...')
    file_path = Path(__file__).parent / file_name
    loaded_settings = {}
    if file_path.exists() and file_path.is_file():
        with file_path.open() as file:
            loaded_settings.update(json.load(file))
    else:
        logger.error(f'Failed loading {file_path.absolute()} file')
    return loaded_settings['parameters']
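# A sketch of the settings.json layout this helper expects: a top-level "parameters"
# object (the commit_hash key matches the module-level logging further on; any other
# parameters would sit alongside it). The value shown is a placeholder.
#
#   {
#       "parameters": {
#           "commit_hash": "abc1234"
#       }
#   }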
def _failure(exception, http_code=http_codes.SERVER_ERROR):
    try:
        # format_exc does not actually take an exception object; fall back to
        # logging the exception directly if this call fails
        exn = traceback.format_exc(exception)
        logger.info("EXCEPTION: {}".format(exn))
    except Exception:
        logger.info("EXCEPTION: {}".format(exception))

    try:
        data, code = exception.to_tuple()
        return make_reponse(data, code)
    except Exception:
        try:
            data = exception.to_dict()
            return make_reponse(data, exception.http)
        except Exception:
            return make_reponse(None, http_code)
def read():
    # Assume this file is located in commons
    # Assume that the file {root-project}/conf/conf.yml exists
    #
    # {root-project}
    #   - conf/
    #     - conf.yml
    #     - ...
    #   - commons/
    #     - configuration.py
    #     - ...
    SCRIPT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
    path = os.path.join(SCRIPT_DIR, 'conf/api-conf.yml')
    logger.info("Using conf file: {}".format(path))
    with open(path) as conf_file:
        return yaml.load(conf_file)
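# A minimal usage sketch: the dict returned by read() is consumed elsewhere as
# conf['host'], conf['port'] and conf['log']['level'] (see the __main__ block), so
# conf/api-conf.yml is assumed to define at least those keys.
conf = read()
logger.info("API configured for {}:{}".format(conf['host'], conf['port']))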
def get(): """ Retourne un Qui/Quoi/Ou/Proximité utilisable dans le cadre du portail PJ """ if not entity_models: abort(http_codes.BAD_REQUEST, "Aucun modèle de détection d'entité n'a été chargé") phrase, entity_model_name, redressage_active = None, None, True try: phrase = request.args.get('phrase', None) redressage = request.args.get('redressage', None) if redressage and redressage.lower() == "false": redressage_active = False # next(iter(dictionnary)) => renvoie la 'première' clé d'un dictionnaire entity_model_name = request.args.get('entity_model', next(iter(entity_models))) except Exception: abort(http_codes.SERVER_ERROR, "Erreur lors du chargement du modèle") logger.info("Analyse de {sentence} avec le modèle {entity}".format( sentence=phrase, entity=entity_model_name)) entity_model = None try: entity_model = entity_models.get(entity_model_name) except KeyError: abort( http_codes.BAD_REQUEST, "Le modele de prédiction d'entité {} n'existe pas.".format( entity_model_name)) entity_prediction = entity_model.predict(phrase, redressage_active) response = { 'qui': entity_prediction.to_dict()["qui"], 'quoi': entity_prediction.to_dict()["quoi"], 'ou': entity_prediction.to_dict()["ou"], 'proximite': entity_prediction.to_dict()["proximite"], 'analyse': entity_prediction.to_dict()["analyse"] } logger.info(response) return _success(response)
def filter_data(self, triplets):
    logger.info("before filtering missing words, samples: " + str(len(triplets)))
    filtered_data = [samp for samp in triplets
                     if samp.adj in self.we_wrapper.vocab
                     and samp.noun in self.we_wrapper.vocab
                     and samp.attr in self.we_wrapper.vocab]
    logger.info("after filtering missing words, samples: " + str(len(filtered_data)))

    x_matrix = np.array([self.we_wrapper.adj_vec_by_context(samp.adj, samp.noun)
                         for samp in filtered_data])
    y_matrix = np.array([self.we_wrapper.word_vec(samp.attr)
                         for samp in filtered_data])
    logger.info("x shape: " + str(x_matrix.shape))
    logger.info("y shape: " + str(y_matrix.shape))
    return filtered_data, x_matrix, y_matrix
def main(args):
    start_time = time.time()

    # Set up command line parameters.
    parser = argparse.ArgumentParser(description='Train word2vec model.')
    parser.add_argument(
        'input_file', help='input file path for the word embeddings training')
    args = parser.parse_args(args)

    free_cores = 1
    if PARALLEL_FLAG:
        cores = multiprocessing.cpu_count()
        uptime_data = os.popen("uptime").read().split()
        # take the load average of the last minute (third field from the end)
        load_avg = float(uptime_data[-3].strip(','))
        used_cores = math.ceil(load_avg / cores)
        free_cores = min(cores - used_cores, MAX_CORES_TO_USE)
    logger.info("running with {} threads".format(free_cores))

    sentences = LineSentence(args.input_file)
    model_name = args.input_file
    model_path = get_we_model_full_path(model_name)
    normed_model_path = get_normed_we_full_path(model_name)

    logger.info("Start training word2vec on file: {}".format(args.input_file))
    model = Word2Vec(sentences,
                     size=DIMENSION,
                     alpha=LEARNING_RATE,
                     window=CONTEXT_WINDOW,
                     workers=free_cores,
                     iter=EPOCHS)
    logger.info("done word2vec training")

    logger.info("saving model to: {}".format(model_path))
    model.save(model_path)

    logger.info("saving normalized model to: {}".format(normed_model_path))
    model.init_sims(replace=True)
    model.save(normed_model_path)

    end_time = time.time()
    hours, rem = divmod(end_time - start_time, 3600)
    minutes, seconds = divmod(rem, 60)
    logging.info("total training time {:0>2}:{:0>2}:{:05.2f}".format(
        int(hours), int(minutes), seconds))
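# A minimal sketch (assuming the same gensim API as above) of loading the saved,
# normalized model back and querying it. The path and query word are placeholders;
# because init_sims(replace=True) was called before saving, the stored vectors are
# already L2-normalized.
from gensim.models import Word2Vec

reloaded = Word2Vec.load('/path/to/normed_model')  # hypothetical path
print(reloaded.wv.most_similar('good', topn=5))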
def main(args):
    start_time = timeit.default_timer()

    parser = argparse.ArgumentParser(
        description='Generate adjective senses by clustering their nouns')
    parser.add_argument('sentences_input_file',
                        help='input file path - sentences format')
    parser.add_argument('word_embeddings_file',
                        help='word embeddings model file path')
    parser.add_argument(
        'sentences_output_file',
        help='output file path for the generated WE training file, after '
             'adjective clustering and labeling')
    parser.add_argument('-ss',
                        '--only_sub_set',
                        default=False,
                        action='store_true',
                        help='analyze only a subset of adjectives from the config file')
    parser.add_argument('-p',
                        '--outliers_clustering_by_patterns',
                        default=False,
                        action='store_true',
                        help='cluster DBSCAN outliers using patterns')
    args = parser.parse_args(args)

    logger.info('start')
    logger.info("loading word embedding model from {}".format(
        args.word_embeddings_file))
    we_model.load_model(args.word_embeddings_file)

    runner = AdjSensesClusteringRunner(args.sentences_input_file,
                                       args.sentences_output_file,
                                       args.only_sub_set,
                                       args.outliers_clustering_by_patterns)
    runner.run()

    stop = timeit.default_timer()
    print "DONE!"
    print "Total running time {}".format(stop - start_time)
# Get all the settings from the local settings file
settings = _get_settings_from_local_file()

# Log commit_hash and stage params
commit_hash = settings.get('commit_hash')
if commit_hash:
    logger.info(
        f'Running commit: {commit_hash} and loaded params for stage: {os.environ.get("stage")}'
    )
else:
    logger.info('No commit hash on settings')

# Add environment variables coming from serverless
# Service ID
settings['SERVICE'] = os.environ.get('serviceId')
# Stage
settings['STAGE'] = os.environ.get('stage')
def test(self):
    we_wrapper = self.data.we_wrapper
    weights = self.nn_model.linear_1.weight.data.numpy()
    x_test = self.data.x_test
    y_test = self.data.y_test
    attr_vecs = self.data.attr_vecs
    logger.info("attr_vecs size = {}".format(len(attr_vecs)))
    logger.info("x test shape: " + str(x_test.shape))
    logger.info("y_test shape: " + str(y_test.shape))
    logger.info("weights shape: {}".format(weights.shape))

    x_test_matrix = np.dot(weights, np.transpose(x_test))
    logger.info("x_test matrix shape = {}".format(x_test_matrix.shape))

    # check P@1 and P@K accuracy
    correct = 0.0
    top_5_correct = 0.0
    correct_pred = []
    false_pred = []
    results = []
    for i in xrange(0, x_test_matrix.shape[1]):
        y_pred = x_test_matrix[:, [i]]
        # cosine similarity reduces to a dot product for normalized vectors
        cosine_sims = {
            attr: np.dot(y_pred.T, attr_vecs[attr])
            for attr in attr_vecs.keys()
        }
        sorted_sims = dict(
            sorted(cosine_sims.iteritems(),
                   key=operator.itemgetter(1),
                   reverse=True)[:K])
        most_sim_attr = max(sorted_sims, key=lambda attr: sorted_sims[attr])
        if most_sim_attr == self.data.test[i].attr:
            correct += 1
            correct_pred.append(self.data.test[i])
        else:
            false_pred.append((self.data.test[i], most_sim_attr))
        if self.data.test[i].attr in sorted_sims.keys():
            top_5_correct += 1
        results.append((self.data.test[i], most_sim_attr))

    logger.info("supervised results")
    logger.info("correct: {} from total: {}. Accuracy: {}".format(
        correct, y_test.shape[0], correct / y_test.shape[0]))
    logger.info("top {} correct: {} from total: {}. Accuracy: {}".format(
        K, top_5_correct, y_test.shape[0], top_5_correct / y_test.shape[0]))

    with open(correct_predictions_file, 'w') as file:
        for item in correct_pred:
            print >> file, item
    with open(false_prediction_file, 'w') as file:
        for item in false_pred:
            output = ' '.join([str(item[0]), item[1].upper()])
            print >> file, output
    with open(test_results, 'w') as file:
        for item in results:
            print >> file, str(item[0])
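# A self-contained numpy sketch of the ranking metric used in the test methods above
# and below: with unit-length vectors, cosine similarity reduces to a dot product,
# P@1 checks whether the top-ranked attribute is the gold one, and P@K whether the
# gold one appears in the top K. The attribute names and vectors are toy values.
import numpy as np

attrs = ['TEMPERATURE', 'SIZE', 'SPEED']
attr_matrix = np.eye(3)                    # toy unit-length attribute vectors, one per row
y_pred = np.array([0.9, 0.1, 0.0])
y_pred /= np.linalg.norm(y_pred)           # normalize the predicted vector as well

sims = attr_matrix.dot(y_pred)             # cosine similarities against every attribute
ranked = [attrs[i] for i in sims.argsort()[::-1]]
gold = 'TEMPERATURE'
p_at_1 = float(ranked[0] == gold)          # 1.0 in this toy example
p_at_k = float(gold in ranked[:2])         # here K = 2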
def test(self):
    x_test = self.data.x_test
    y_test = self.data.y_test
    attr_vecs = self.data.attr_vecs
    logger.info("attr_vecs size = {}".format(len(attr_vecs)))
    logger.info("x test shape: " + str(x_test.shape))
    logger.info("y_test shape: " + str(y_test.shape))

    correct = 0.0
    correct_in_K = 0.0
    predictions = []
    unique_attributes = attr_vecs.keys()
    attr_vecs_ordered = np.array([
        self.data.we_wrapper.word_vec(attr) for attr in unique_attributes
    ]).squeeze()

    for test in self.data.test:
        adj_label = self.data.we_wrapper.get_adj_name(test.adj, test.noun)
        adj_vec = self.data.we_wrapper.adj_vec_by_context(test.adj, test.noun)
        sim = np.dot(adj_vec, attr_vecs_ordered.T)
        # rank all attributes by similarity, most similar first
        all_attr_idx = sim.argsort()[-244:][::-1]
        attr_all_preds = [unique_attributes[i] for i in all_attr_idx]
        adj_preds = attr_all_preds[:K]
        correct_pred_idx = attr_all_preds.index(test.attr)
        predictions.append((AdjNounAttribute(test.adj, test.noun, test.attr),
                            adj_preds[0], adj_label, correct_pred_idx))
        if adj_preds[0] == test.attr:
            correct += 1
        if test.attr in adj_preds:
            correct_in_K += 1

    with open(unsupervised_results, 'w') as file:
        for item in predictions:
            string = ' '.join(
                [str(item[0]), item[1].upper(), item[2], str(item[3])])
            print >> file, string

    logger.info("----unsupervised results-----")
    logger.info("correct = {}, total: {}, accuracy: {}".format(
        correct, len(self.data.test), correct / len(self.data.test)))
    logger.info("correct_in_{} = {}, total: {}, accuracy: {}".format(
        K, correct_in_K, len(self.data.test),
        correct_in_K / len(self.data.test)))
def access_log():
    logger.info("{0} {1}".format(request.method, request.path))
def make_reponse(p_object=None, status_code=200):
    """
    Builds a Response object from p_object and a status code
    """
    if p_object is None and status_code == 404:
        p_object = {
            "status": {
                "status_content": [{
                    "code": "404 - Not Found",
                    "message": "Resource not found"
                }]
            }
        }
    json_response = jsonify(p_object)
    json_response.status_code = status_code
    json_response.content_type = 'application/json;charset=utf-8'
    json_response.headers['Cache-Control'] = 'max-age=3600'
    return json_response


if __name__ == "__main__":
    # Run the HTTP REST stack
    logger.info("Run api on {}:{}".format(conf['host'], conf['port']))
    app.run(host=conf['host'],
            port=int(conf['port']),
            debug=conf['log']['level'] == "DEBUG")
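# A minimal usage sketch of make_reponse, kept as comments so nothing runs on import;
# jsonify needs a Flask application context, and the payload here is a placeholder:
#
#   with app.test_request_context():
#       resp = make_reponse({"status": "ok"})
#       assert resp.status_code == 200
#       assert resp.headers['Cache-Control'] == 'max-age=3600'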