def get_params():
    # Get default parameters and adjust with user's input
    params = c19_parameters.Parameters(
        database=c19_parameters.Database(
            local_path="/home/dynomante/projects/covid-19-kaggle/local_exec/articles_database_v14_02052020_test.sqlite"),
        embedding=c19_parameters.Embedding(
            local_path="/home/dynomante/projects/covid-19-kaggle/w2v_parquet_file_new_version.parquet"))
    try:
        params.query.cosine_similarity_threshold = float(request.form["sim_threshold"])
    except KeyError:
        pass
    try:
        params.query.minimum_sentences_kept = int(request.form["n_sentence"])
    except KeyError:
        pass
    try:
        params.query.number_of_clusters = int(request.form["number_cluster"])
    except KeyError:
        pass
    try:
        params.query.min_feature_per_cluster = int(request.form["feature_per_cluster"])
    except KeyError:
        pass
    return params
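# Hedged usage sketch (assumption, not part of the original source): get_params()
# reads its overrides from Flask's request.form, so it is presumably called from a
# POST handler. The app object, the "/query" route and the handler name below are
# hypothetical; only get_params() and the form field names come from the function
# above. It also assumes imports such as `from flask import request` and
# `from c19 import parameters as c19_parameters` at the top of the script.
from flask import Flask

app = Flask(__name__)


@app.route("/query", methods=["POST"])
def handle_query():
    params = get_params()
    # Echo one of the effective parameters back to the caller as a minimal response.
    return {"cosine_similarity_threshold": params.query.cosine_similarity_threshold}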
def main(query):
    # Get parameters
    params = parameters.Parameters(database=parameters.Database(
        local_path="local_exec/articles_database_v8_07042020.sqlite",
        kaggle_data_path="local_exec/kaggle_data"))

    # Load pre-trained word vectors
    embedding_model = embedding.Embedding(
        parquet_embedding_path=params.embedding.local_path,
        embeddings_dimension=params.embedding.dimensions,
        sentence_embedding_method=params.embedding.word_aggregation_method,
        weight_vectors=params.embedding.weight_with_tfidf)

    # Get sentence data (including vector) from the sentence table
    sentences = query_matching.get_sentences_data(
        db_path=params.database.local_path)

    # Find the K closest sentences to the query
    closest_sentences = query_matching.get_k_closest_sentences(
        db_path=params.database.local_path,
        query=query,
        sentences=sentences,
        embedding_model=embedding_model,
        k=params.query.top_k_sentences)
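# Hedged entry point (assumption, not part of the original script): main() takes the
# free-text question as its only argument, so a minimal invocation could look like
# this. The example question is the one used elsewhere in the project.
if __name__ == "__main__":
    main("What do we know about Chloroquine to treat covid-19 induced by coronavirus?")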
def main():
    # Get parameters
    params = parameters.Parameters(first_launch=True)

    # Load all articles (title, abstract and body) into the 'article' table.
    database_utilities.create_db_and_load_articles(
        db_path=params.database.local_path,
        kaggle_data_path=params.database.kaggle_data_path,
        first_launch=params.first_launch,
        load_body=params.preprocessing.load_text_body)

    # Load pre-trained word vectors
    embedding_model = embedding.Embedding(
        parquet_embedding_path=params.embedding.local_path,
        embeddings_dimension=params.embedding.dimensions,
        sentence_embedding_method=params.embedding.word_aggregation_method,
        weight_vectors=params.embedding.weight_with_tfidf)

    # Pre-process and vectorise all sentences
    text_preprocessing.pre_process_and_vectorize_texts(
        embedding_model=embedding_model,
        db_path=params.database.local_path,
        first_launch=params.first_launch,
        stem_words=params.preprocessing.stem_words,
        remove_num=params.preprocessing.remove_numeric)
def main():
    params = parameters.Parameters(database=parameters.Database(
        local_path="articles_database_v8_07042020.sqlite",
        kaggle_data_path="kaggle_data"))

    # Load all articles (title, abstract and body) into the 'article' table.
    database_utilities.create_db_and_load_articles(
        db_path=params.database.local_path,
        kaggle_data_path=params.database.kaggle_data_path,
        first_launch=params.first_launch,
        only_newest=params.database.only_newest,
        only_covid=params.database.only_covid,
        enable_data_cleaner=params.database.enable_data_cleaner)

    # Pre-process all sentences (no embedding)
    text_preprocessing.pre_process_and_vectorize_texts(
        embedding_model=None,
        db_path=params.database.local_path,
        first_launch=params.first_launch,
        stem_words=params.preprocessing.stem_words,
        remove_num=params.preprocessing.remove_numeric,
        batch_size=params.preprocessing.batch_size,
        max_body_sentences=params.preprocessing.max_body_sentences)

    # Word2Vec hyper-parameters follow: https://www.aclweb.org/anthology/W16-2922.pdf
    w2v_params = {
        "sg": 1,
        "hs": 1,
        "sample": 1e-5,
        "negative": 10,
        "min_count": 20,
        "size": 100,
        "window": 7,
        "seed": 42,
        "workers": os.cpu_count(),
        "iter": 10
    }

    # Train W2V and TF-IDF, then save the merged vectors as a parquet file (DF.parquet)
    word2vec = word2vec_utilities.W2V(params.database.local_path,
                                      tfidf_path="TFIDF.pkl",
                                      w2v_path="W2V.bin",
                                      w2v_params=w2v_params,
                                      parquet_output_path="DF.parquet")
    word2vec.train()
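# Hedged sketch (assumption, not part of the original script): the W2V wrapper above
# presumably forwards w2v_params to gensim's Word2Vec. The "size" and "iter" keyword
# names match the gensim 3.x API (gensim 4+ renamed them to vector_size and epochs).
# The toy corpus is illustrative only, and min_count is lowered to 1 so the tiny
# vocabulary is not filtered out entirely.
from gensim.models import Word2Vec

toy_corpus = [["chloroquine", "treats", "covid"], ["coronavirus", "causes", "covid"]]
toy_model = Word2Vec(sentences=toy_corpus, sg=1, hs=1, sample=1e-5, negative=10,
                     min_count=1, size=100, window=7, seed=42, iter=10)
print(toy_model.wv.vector_size)  # 100, i.e. the "size" hyper-parameter above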
def main():
    params = parameters.Parameters(
        first_launch=True,
        database=parameters.Database(
            local_path="/home/dynomante/projects/covid-19-kaggle/local_exec/articles_database_v13_01052020.sqlite",
            kaggle_data_path="/home/dynomante/projects/covid-19-kaggle/local_exec/kaggle_data",
            only_newest=False,
            only_covid=False),
        preprocessing=parameters.PreProcessing(max_body_sentences=0,
                                               stem_words=False),
        # embedding=parameters.Embedding(
        #     local_path="resources/global_df_w2v_tfidf.parquet")
    )

    # Load all articles (title, abstract and body) into the 'article' table.
    database_utilities.create_db_and_load_articles(
        db_path=params.database.local_path,
        kaggle_data_path=params.database.kaggle_data_path,
        first_launch=params.first_launch,
        only_newest=params.database.only_newest,
        only_covid=params.database.only_covid,
        enable_data_cleaner=params.database.enable_data_cleaner)

    # Load pre-trained word vectors
    # embedding_model = embedding.Embedding(
    #     parquet_embedding_path=params.embedding.local_path,
    #     embeddings_dimension=params.embedding.dimension,
    #     sentence_embedding_method=params.embedding.word_aggregation_method,
    #     weight_vectors=params.embedding.weight_with_tfidf)
    embedding_model = None

    # Pre-process and vectorise all sentences
    text_preprocessing.pre_process_and_vectorize_texts(
        embedding_model=embedding_model,
        db_path=params.database.local_path,
        first_launch=params.first_launch,
        stem_words=params.preprocessing.stem_words,
        remove_num=params.preprocessing.remove_numeric,
        batch_size=params.preprocessing.batch_size,
        max_body_sentences=params.preprocessing.max_body_sentences)
        weight_vectors=params.embedding.weight_with_tfidf)

    # Get sentence data (including vector) from the sentence table
    all_db_sentences = query_matching.get_sentences_data(
        db_path=params.database.local_path)

    return embedding_model, all_db_sentences


if __name__ == "__main__":
    params = parameters.Parameters(
        database=parameters.Database(
            local_path="/home/dynomante/projects/covid-19-kaggle/local_exec/articles_database_v14_02052020_test.sqlite",
            kaggle_data_path="/home/dynomante/projects/covid-19-kaggle/local_exec/kaggle_data"),
        embedding=parameters.Embedding(
            local_path="/home/dynomante/projects/covid-19-kaggle/w2v_parquet_file_new_version.parquet"))

    embedding_model, all_db_sentences = prepare_data(params)

    query = "What do we know about Chloroquine to treat covid-19 induced by coronavirus?"

    closest_sentences_df = query_matching.get_k_closest_sentences(
        query=query,
        all_sentences=all_db_sentences,
        embedding_model=embedding_model,
        minimal_number_of_sentences=params.query.minimum_sentences_kept,
        similarity_threshold=params.query.cosine_similarity_threshold)
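    # Hedged follow-up sketch (assumption, not part of the original script):
    # get_k_closest_sentences appears to return a pandas DataFrame (hence the _df
    # suffix), so a minimal sanity check on the matching step could be:
    print(f"{len(closest_sentences_df)} sentences kept for query: {query}")
    print(closest_sentences_df.head())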
This script is an example of how to loop over multiple queries to generate a MD report.
"""
import json
from copy import deepcopy
from time import time

from c19 import (clusterise_sentences, display_output, embedding, parameters,
                 query_matching)

queries_path = "resources/queries.json"
with open(queries_path) as json_file:
    queries_data = json.load(json_file)

params = parameters.Parameters(
    database=parameters.Database(
        local_path="local_exec/articles_database_v12_16042020.sqlite",
        kaggle_data_path="local_exec/kaggle_data"),
    embedding=parameters.Embedding(
        local_path="resources/global_df_w2v_tfidf.parquet"))

# Load pre-trained word vectors
embedding_model = embedding.Embedding(
    parquet_embedding_path=params.embedding.local_path,
    embeddings_dimension=params.embedding.dimension,
    sentence_embedding_method=params.embedding.word_aggregation_method,
    weight_vectors=params.embedding.weight_with_tfidf)

# Load sentences from SQLite
all_db_sentences_original = query_matching.get_sentences_data(
    db_path=params.database.local_path)

# Loop over re-formulated queries
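# Hedged loop skeleton (assumption, not part of the original script): the structure
# of queries.json and the per-query processing are not shown above, so this only
# illustrates iterating over the loaded JSON with a fresh copy of the sentence data
# for each query, which is presumably why deepcopy and time are imported.
for query in queries_data:
    start_time = time()
    all_db_sentences = deepcopy(all_db_sentences_original)
    # ... match, cluster and report on `query` here ...
    print(f"Processed query in {round(time() - start_time, 2)}s: {query}")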