Exemplo n.º 1
0
def get_params():
    """Build the run parameters, overriding query defaults with form values.

    Starts from a ``c19_parameters.Parameters`` object configured with the
    local database and embedding paths, then overrides query settings with
    whichever of these fields are present in ``request.form``:

    * ``sim_threshold``       -> ``query.cosine_similarity_threshold`` (float)
    * ``n_sentence``          -> ``query.minimum_sentences_kept`` (int)
    * ``number_cluster``      -> ``query.number_of_clusters`` (int)
    * ``feature_per_cluster`` -> ``query.min_feature_per_cluster`` (int)

    A field that is missing, blank or not parseable as a number is ignored,
    so the value already held by ``params.query`` is kept.

    Returns:
        The ``c19_parameters.Parameters`` instance with the overrides applied.
    """
    # Get default parameters and adjust with user's input
    params = c19_parameters.Parameters(
        database=c19_parameters.Database(
            local_path="/home/dynomante/projects/covid-19-kaggle/local_exec/articles_database_v14_02052020_test.sqlite"),
        embedding=c19_parameters.Embedding(
            local_path="/home/dynomante/projects/covid-19-kaggle/w2v_parquet_file_new_version.parquet")
    )

    # (form field name, attribute on params.query, converter)
    overrides = (
        ("sim_threshold", "cosine_similarity_threshold", float),
        ("n_sentence", "minimum_sentences_kept", int),
        ("number_cluster", "number_of_clusters", int),
        ("feature_per_cluster", "min_feature_per_cluster", int),
    )

    for field_name, attribute, convert in overrides:
        try:
            setattr(params.query, attribute, convert(request.form[field_name]))
        except (KeyError, ValueError):
            # KeyError: the field was not submitted.
            # ValueError: the field was submitted but is blank or not numeric
            # (an empty text input is sent as ""), which float()/int() reject.
            # Either way the existing value is left untouched.
            continue

    return params
Exemplo n.º 2
0
def main(query):
    """Load the embedding model and sentence data, then match ``query``.

    Args:
        query: Passed unchanged as the ``query`` argument of
            ``query_matching.get_k_closest_sentences``.
    """
    # Parameters: only the database locations are overridden here.
    database_config = parameters.Database(
        local_path="local_exec/articles_database_v8_07042020.sqlite",
        kaggle_data_path="local_exec/kaggle_data")
    params = parameters.Parameters(database=database_config)

    # Pre-trained word vectors, configured from the embedding parameters.
    # NOTE(review): this reads ``params.embedding.dimensions`` (plural) --
    # confirm the attribute name against the Parameters class in use.
    embedding_config = params.embedding
    embedding_model = embedding.Embedding(
        parquet_embedding_path=embedding_config.local_path,
        embeddings_dimension=embedding_config.dimensions,
        sentence_embedding_method=embedding_config.word_aggregation_method,
        weight_vectors=embedding_config.weight_with_tfidf)

    # Sentence data (including vector) read from the sentence table.
    db_path = params.database.local_path
    sentences = query_matching.get_sentences_data(db_path=db_path)

    # The K sentences closest to the query.
    closest_sentences = query_matching.get_k_closest_sentences(
        db_path=db_path,
        query=query,
        sentences=sentences,
        embedding_model=embedding_model,
        k=params.query.top_k_sentences)
Exemplo n.º 3
0
def main():
    """Create the article database, then pre-process and vectorise its texts.

    Runs three steps in order: load the articles into the database, load the
    pre-trained word vectors, and pre-process/vectorise all sentences using
    that embedding model.
    """
    # Parameters for a first launch; everything else keeps its default.
    params = parameters.Parameters(first_launch=True)
    db_path = params.database.local_path

    # Step 1: load all articles (title, abstract and body) into the
    # 'article' table.
    database_utilities.create_db_and_load_articles(
        db_path=db_path,
        kaggle_data_path=params.database.kaggle_data_path,
        first_launch=params.first_launch,
        load_body=params.preprocessing.load_text_body)

    # Step 2: load the pre-trained word vectors.
    # NOTE(review): this reads ``params.embedding.dimensions`` (plural) --
    # confirm the attribute name against the Parameters class in use.
    embedding_config = params.embedding
    embedding_model = embedding.Embedding(
        parquet_embedding_path=embedding_config.local_path,
        embeddings_dimension=embedding_config.dimensions,
        sentence_embedding_method=embedding_config.word_aggregation_method,
        weight_vectors=embedding_config.weight_with_tfidf)

    # Step 3: pre-process and vectorise all sentences.
    preprocessing_config = params.preprocessing
    text_preprocessing.pre_process_and_vectorize_texts(
        embedding_model=embedding_model,
        db_path=db_path,
        first_launch=params.first_launch,
        stem_words=preprocessing_config.stem_words,
        remove_num=preprocessing_config.remove_numeric)
Exemplo n.º 4
0
def main():
    """Build the article database, pre-process it and train word vectors.

    Steps, in order:

    1. Load the articles into the SQLite database.
    2. Pre-process all sentences without an embedding model
       (``embedding_model=None``).
    3. Train W2V and TFIDF through ``word2vec_utilities.W2V`` and save the
       result as ``DF.parquet`` (with ``TFIDF.pkl`` and ``W2V.bin`` given as
       the TFIDF and W2V paths).
    """
    params = parameters.Parameters(database=parameters.Database(
        local_path="articles_database_v8_07042020.sqlite",
        kaggle_data_path="kaggle_data"))

    # Load all articles (title, abstract and body) into the 'article' table.
    database_utilities.create_db_and_load_articles(
        db_path=params.database.local_path,
        kaggle_data_path=params.database.kaggle_data_path,
        first_launch=params.first_launch,
        only_newest=params.database.only_newest,
        only_covid=params.database.only_covid,
        enable_data_cleaner=params.database.enable_data_cleaner)

    # Pre-process all sentences (no embedding)
    text_preprocessing.pre_process_and_vectorize_texts(
        embedding_model=None,
        db_path=params.database.local_path,
        first_launch=params.first_launch,
        stem_words=params.preprocessing.stem_words,
        remove_num=params.preprocessing.remove_numeric,
        batch_size=params.preprocessing.batch_size,
        max_body_sentences=params.preprocessing.max_body_sentences)

    # Param have been set up with: https://www.aclweb.org/anthology/W16-2922.pdf
    # NOTE(review): if W2V forwards this dict to gensim's Word2Vec, "size" and
    # "iter" are the pre-4.0 keyword names -- confirm against the installed
    # gensim version.
    w2v_params = {
        "sg": 1,
        "hs": 1,
        "sample": 1e-5,
        "negative": 10,
        "min_count": 20,
        "size": 100,
        "window": 7,
        "seed": 42,
        # os.cpu_count() returns None when the CPU count cannot be
        # determined; fall back to a single worker instead of passing None.
        "workers": os.cpu_count() or 1,
        "iter": 10
    }

    # Train and save W2V and TFIDF as a parquet file DF.parquet
    word2vec = word2vec_utilities.W2V(params.database.local_path,
                                      tfidf_path="TFIDF.pkl",
                                      w2v_path="W2V.bin",
                                      w2v_params=w2v_params,
                                      parquet_output_path="DF.parquet")
    word2vec.train()
Exemplo n.º 5
0
def main():
    """Create the article database and pre-process its texts.

    No embedding model is loaded in this variant: ``None`` is passed as
    ``embedding_model`` to the pre-processing step.
    """
    database_config = parameters.Database(
        local_path="/home/dynomante/projects/covid-19-kaggle/local_exec/articles_database_v13_01052020.sqlite",
        kaggle_data_path="/home/dynomante/projects/covid-19-kaggle/local_exec/kaggle_data",
        only_newest=False,
        only_covid=False)
    preprocessing_config = parameters.PreProcessing(max_body_sentences=0,
                                                    stem_words=False)
    # An embedding override is intentionally left out:
    #     embedding=parameters.Embedding(
    #         local_path="resources/global_df_w2v_tfidf.parquet")
    params = parameters.Parameters(first_launch=True,
                                   database=database_config,
                                   preprocessing=preprocessing_config)

    # Step 1: load all articles (title, abstract and body) into the
    # 'article' table.
    database = params.database
    database_utilities.create_db_and_load_articles(
        db_path=database.local_path,
        kaggle_data_path=database.kaggle_data_path,
        first_launch=params.first_launch,
        only_newest=database.only_newest,
        only_covid=database.only_covid,
        enable_data_cleaner=database.enable_data_cleaner)

    # Loading the pre-trained word vectors is disabled; the call would be:
    #     embedding_model = embedding.Embedding(
    #         parquet_embedding_path=params.embedding.local_path,
    #         embeddings_dimension=params.embedding.dimension,
    #         sentence_embedding_method=params.embedding.word_aggregation_method,
    #         weight_vectors=params.embedding.weight_with_tfidf)
    embedding_model = None

    # Step 2: pre-process and vectorise all sentences.
    preprocessing = params.preprocessing
    text_preprocessing.pre_process_and_vectorize_texts(
        embedding_model=embedding_model,
        db_path=params.database.local_path,
        first_launch=params.first_launch,
        stem_words=preprocessing.stem_words,
        remove_num=preprocessing.remove_numeric,
        batch_size=preprocessing.batch_size,
        max_body_sentences=preprocessing.max_body_sentences)
Exemplo n.º 6
0
        weight_vectors=params.embedding.weight_with_tfidf)

    # Get sentence data (including vector) from sentence table
    all_db_sentences = query_matching.get_sentences_data(
        db_path=params.database.local_path)

    return embedding_model, all_db_sentences


if __name__ == "__main__":

    # Script entry point: configure the paths, load the model and sentences
    # through prepare_data(), then match one sample query.
    database_config = parameters.Database(
        local_path="/home/dynomante/projects/covid-19-kaggle/local_exec/articles_database_v14_02052020_test.sqlite",
        kaggle_data_path="/home/dynomante/projects/covid-19-kaggle/local_exec/kaggle_data")
    embedding_config = parameters.Embedding(
        local_path="/home/dynomante/projects/covid-19-kaggle/w2v_parquet_file_new_version.parquet")
    params = parameters.Parameters(database=database_config,
                                   embedding=embedding_config)

    # prepare_data() returns the embedding model and the sentence data.
    embedding_model, all_db_sentences = prepare_data(params)

    query = "What do we know about Chloroquine to treat covid-19 induced by coronavirus?"

    # Sentences closest to the query, using the thresholds held in
    # params.query.
    query_config = params.query
    closest_sentences_df = query_matching.get_k_closest_sentences(
        query=query,
        all_sentences=all_db_sentences,
        embedding_model=embedding_model,
        minimal_number_of_sentences=query_config.minimum_sentences_kept,
        similarity_threshold=query_config.cosine_similarity_threshold)
Exemplo n.º 7
0
This script is an example of how to loop over multiple queries to generate an MD report.
"""
import json
from copy import deepcopy
from time import time

from c19 import (clusterise_sentences, display_output, embedding, parameters,
                 query_matching)

# Queries to process, read from a JSON resource file.
queries_path = "resources/queries.json"
# JSON text is UTF-8 by specification; state the encoding explicitly so the
# file is decoded the same way whatever the platform's default encoding is.
with open(queries_path, encoding="utf-8") as json_file:
    queries_data = json.load(json_file)

# Run parameters: database and embedding locations are overridden here.
params = parameters.Parameters(
    database=parameters.Database(
        local_path="local_exec/articles_database_v12_16042020.sqlite",
        kaggle_data_path="local_exec/kaggle_data"),
    embedding=parameters.Embedding(
        local_path="resources/global_df_w2v_tfidf.parquet"))

# Load pre-trained word vectors
embedding_model = embedding.Embedding(
    parquet_embedding_path=params.embedding.local_path,
    embeddings_dimension=params.embedding.dimension,
    sentence_embedding_method=params.embedding.word_aggregation_method,
    weight_vectors=params.embedding.weight_with_tfidf)

# Load sentences from SQLite
all_db_sentences_original = query_matching.get_sentences_data(
    db_path=params.database.local_path)

# Loop over re-formulated queries