def main():
    """Launch the Bugzilla collection-creation script once per (project, year)."""
    # Execution start timestamp, used to report total runtime at the end.
    initial_time = time.time()
    # Module logger.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Load the per-environment configuration file.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # Abort if another process is already running this module.
    check_same_python_module_already_running(os.path.split(__file__))
    for project in os.environ["BUGZILLA_PROJECT_NAME"].split(","):
        first_year = int(os.environ['BUGZILLA_FIRST_CREATION_YEAR'])
        last_year = int(os.environ['BUGZILLA_LAST_CREATION_YEAR'])
        for year in range(first_year, last_year):
            cmd = (f"python CreateBugzillaMongoDBCollection.py --p {project}"
                   f" --db {os.environ['BUGZILLA_MONGODB_DATABASE_NAME']}"
                   f" --c {project.lower()} --y {year} --sm 1 --em 12")
            # Run the command in the Windows console.
            check_output(cmd, shell=True)
            print(cmd)
    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutos")
def main():
    """Create an index on the 'creation_ts' field of every corpus collection.

    Delegates the actual index creation to a per-corpus external command
    built by ``get_command``.
    """
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the parameters.
    input_params = get_input_params()
    log.info("Creating index on all collections ...")
    # Fall back to the CORPUS_NAME environment variable when no corpus list was supplied.
    corpus_list = input_params['corpus_list'] if '' != input_params['corpus_list'] else \
        os.environ["CORPUS_NAME"].split(",")
    for corpus in corpus_list:
        # FIX: removed unused local 'tic = time.time()' — the per-corpus elapsed
        # time was never logged, so the variable was dead.
        cmd = get_command(os.name, corpus, 'normalized_clear', 'creation_ts', -1)
        # Run command.
        log.info(f"Running command: '{cmd}'")
        os.system(cmd)
    final_time = time.time()
    log.info(
        f"Creating index on all collections total execution time = {((final_time - initial_time) / 60)} minutes"
    )
    log.info("MODULE EXECUTED.")
def main():
    """Compute monthly and yearly Jira issue counts per project and store them in MongoDB.

    For every configured project, counts issues created in each month from
    JIRA_FIRST_CREATION_YEAR up to the current year and inserts one statistics
    document per project into the statistics collection.
    """
    # Logger for this module.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the per-environment configuration file.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # MongoDB collection where the statistics are stored.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ["JIRA_MONGODB_DATABASE_NAME"]]
    col = db[os.environ["JIRA_STATISTICS_MONGODB_COLLECTION_NAME"]]
    for project in os.environ["JIRA_PROJECT_NAME"].split(","):
        # FIX: the statistics dict is now re-created for every project. Previously a
        # single dict was shared across the project loop, so each inserted document
        # also contained the year counts of all previously processed projects.
        statistics_dict = {"project": project}
        total_bugs = 0
        for year in range(int(os.environ["JIRA_FIRST_CREATION_YEAR"]),
                          datetime.datetime.now().year + 1):
            statistics_dict[year] = {}
            statistics_dict[year]["_total"] = 0
            total_bugs_year = 0
            for month in range(1, 13):
                # Half-open window [year-month-01, first day of the next month).
                # December rolls over into January of the following year.
                max_year = year
                max_month = month + 1
                if max_month > 12:
                    max_month = 1
                    max_year += 1
                issues = get_issues_by_date_range(
                    project=project,
                    min_created_date=f"{year}-{str(month).zfill(2)}-01",
                    max_creation_date=f"{max_year}-{str(max_month).zfill(2)}-01",
                    # FIX: was 'max_results=-False', which evaluates to 0; -1
                    # requests all matching issues.
                    max_results=-1,
                    fields="id"
                )
                # Total number of issues for the analysed month.
                statistics_dict[year][month] = len(issues)
                total_bugs_year += len(issues)
            # Total number of issues for the analysed year.
            statistics_dict[year]["_total"] = total_bugs_year
            log.info(json.dumps(statistics_dict))
            total_bugs += total_bugs_year
        # Total number of issues for the project.
        statistics_dict["_total"] = total_bugs
        # Store the statistics for the project. The JSON round-trip converts the
        # integer year keys into strings, which MongoDB requires for field names.
        statistics_json = json.dumps(statistics_dict)
        col.insert_one(json.loads(statistics_json))
def main():
    """Retrieve Gerrit issues for every configured project and creation year."""
    # Execution start timestamp.
    initial_time = time.time()
    # Abort if another process is already running this module.
    check_same_python_module_already_running(os.path.split(__file__))
    for project in os.environ["GERRIT_PROJECT_NAME"].split(","):
        first_year = int(os.environ['GERRIT_FIRST_CREATION_YEAR'])
        last_year = int(os.environ['GERRIT_LAST_CREATION_YEAR'])
        for year in range(first_year, last_year):
            tic = time.time()
            log.info(f"Retrieving Gerrit issues for year '{year}' ...")
            cmd = get_command(os.name, project, year)
            # Run command.
            log.info(f"Running command: '{cmd}'")
            os.system(cmd)
            log.info(
                f"Retrieving Gerrit issues execution time = {((time.time() - tic) / 60)} minutes"
            )
    final_time = time.time()
    log.info(
        f"Retrieving Gerrit issues total execution time = {((final_time - initial_time) / 60)} minutes"
    )
def main():
    """Drop every per-(project, year) Gerrit collection that exists in the database."""
    # Execution start timestamp.
    initial_time = time.time()
    # Abort if another process is already running this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # MongoDB handles.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ['GERRIT_DB_NAME']]
    log.debug(
        f"Existent collections in '{db.name}': {str(db.list_collection_names())}"
    )
    for project in os.environ["GERRIT_PROJECT_NAME"].split(","):
        for year in range(int(os.environ['GERRIT_FIRST_CREATION_YEAR']),
                          int(os.environ['GERRIT_LAST_CREATION_YEAR'])):
            col = db[f"{project.lower()}_{year}_{year + 1}"]
            # Skip names that do not exist on the server.
            if col.name not in db.list_collection_names():
                continue
            log.info(f"Dropping collection {col.name} ...")
            db.drop_collection(col.name)
    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutes")
def main():
    """Train — or load a previously trained — Tree-LSTM duplicate-detection model
    for the Bugzilla 'clear' collection, with a fixed hyper-parameter set."""
    # Logger for this module.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    log.info("INICIO DE LA EJECUCIÓN")
    # Load the per-environment configuration file.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Parse the command-line arguments for their side effects; the parsed values
    # are not used in this driver.
    get_input_params()
    # FIX: removed unused locals — 'initial_time' was never logged, and the
    # 'dup' / 'output_dir' bindings were never read.
    TreeLstmDuplicateTrain(corpus='bugzilla',
                           collection='clear',
                           attention=False,
                           attention_size=10,
                           glove_size=100,
                           hidden_size=100,
                           max_input=200,
                           batch_size=1,
                           optimizer='ADAM',
                           learning_rate=0.001,
                           update_embeddings=True,
                           patience=5).load_or_run()
def main():
    """Encode the pairs labels of every corpus by launching one external command per corpus."""
    # Execution start timestamp.
    initial_time = time.time()
    # Abort if another process is already running this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the parameters.
    input_params = get_input_params()
    log.info(f"Encoding all pairs labels ...")
    # Fall back to the CORPUS_NAME environment variable when no list was supplied.
    if '' != input_params['corpus_list']:
        corpus_list = input_params['corpus_list']
    else:
        corpus_list = os.environ["CORPUS_NAME"].split(",")
    for corpus in corpus_list:
        log.info(f"Encoding pairs label for corpus: '{corpus}' ...")
        tic = time.time()
        pairs_collections = ','.join(get_pairs_collection_name(corpus))
        cmd = get_command(os.name, corpus, pairs_collections)
        # Run command.
        log.info(f"Running command: '{cmd}'")
        os.system(cmd)
        log.info(
            f"Encoding pairs label total execution time = {((time.time() - tic) / 60)} minutes"
        )
    final_time = time.time()
    log.info(
        f"Encoding all pairs labels total execution time = {((final_time - initial_time) / 60)} minutes"
    )
    log.info(f"MODULE EXECUTED.")
def main():
    """Create a single-field ascending or descending index on the configured collection."""
    # Execution start timestamp.
    initial_time = time.time()
    # Abort if another process is already running this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    # Human-readable direction for the log messages.
    if input_params['order'] == 1:
        order = 'ascending'
    else:
        order = 'descending'
    log.info(
        f"Creating {order} index on field: '{input_params['db_name']}.{input_params['collection_name']}."
        f"{input_params['field_name']}' ...")
    # Resolve the pymongo sort direction once.
    direction = DESCENDING if input_params['order'] == -1 else ASCENDING
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[input_params['db_name']]
    col = db[input_params['collection_name']]
    col.create_index([(input_params['field_name'], direction)])
    final_time = time.time()
    log.info(
        f"Creating {order} index on field: '{input_params['field_name']}' total execution time = "
        f"{((final_time - initial_time) / 60)} minutes")
def main():
    """Import the per-year Jira JSON dumps of every project into a single
    '<project>_all' MongoDB collection via mongoimport."""
    # Execution start timestamp.
    initial_time = time.time()
    # Module logger.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Load the per-environment configuration file.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # Abort if another process is already running this module.
    check_same_python_module_already_running(os.path.split(__file__))
    for project in os.environ["JIRA_PROJECT_NAME"].split(","):
        # One JSON dump file per creation year.
        for year in range(2001, 2021):
            cmd = (f"mongoimport /host {os.environ['MONGO_HOST_IP']} /port {os.environ['MONGO_PORT']}"
                   f" /username {os.environ['MONGO_USERNAME']} /password {os.environ['MONGO_PASSWORD']}"
                   f" /authenticationDatabase admin /authenticationMechanism SCRAM-SHA-1"
                   f" /db {os.environ['JIRA_MONGODB_DATABASE_NAME']}"
                   f" /collection {project.lower()}_all"
                   f" /file {Path(ROOT_DIR) / 'data' / project.lower()}_{year}_{year + 1}.json")
            # Run the command in the Windows console (note the '/option' flag style).
            check_output(cmd, shell=True)
            print(cmd)
    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutos")
def main():
    """Drop every per-(project, year) Jira collection present in the database."""
    # Execution start timestamp.
    initial_time = time.time()
    # Module logger.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Load the per-environment configuration file.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # Abort if another process is already running this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # MongoDB handles.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ['JIRA_MONGODB_DATABASE_NAME']]
    log.debug(f"Colecciones exitentes en '{db.name}': {str(db.list_collection_names())}")
    for project in os.environ["JIRA_PROJECT_NAME"].split(","):
        for year in range(2001, 2021):
            col = db[f"{project.lower()}_{year}_{year + 1}"]
            # Skip names that do not exist on the server.
            if col.name not in db.list_collection_names():
                continue
            db.drop_collection(col.name)
    final_time = time.time()
    log.info(f"Total execution time = {((final_time - initial_time) / 60)} minutos")
def main():
    """Open a projected cursor over a source collection as preparation for an
    '<collection>_embeddings' target collection.

    NOTE(review): the cursor is created but never iterated and the target
    collection is never written — this module looks unfinished.
    """
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Logger for this module.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Load the per-environment configuration file.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Parse the input arguments.
    input_params = get_input_params()
    # MongoDB handles: source collection and the '<name>_embeddings' target.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[input_params.db_name]
    source_collection = db[input_params.collection_name]
    target_collection = db[f"{input_params.collection_name}_embeddings"]
    log.debug(
        f"Colecciones exitentes en '{db.name}': {str(db.list_collection_names())}"
    )
    # Drop the target first when requested, so the module can be re-run cleanly.
    if target_collection.name in db.list_collection_names(
    ) and input_params.drop_collection:
        db.drop_collection(target_collection.name)
    # NOTE(review): '$field' rename values (e.g. "creation_ts": "$creation_time")
    # are aggregation-pipeline syntax; in a find() projection they are unlikely to
    # behave as renames — confirm against the MongoDB server version in use.
    cursor = source_collection.find({}, {
        "creation_ts": "$creation_time",
        "short_desc": "$summary",
        "bug_status": "$status",
        "bug_id": "$id",
        "dup_id": "$dupe_of",
        "resolution": 1,
        "version": 1,
        "product": 1,
        "priority": 1,
        "component": 1,
        "delta_ts": 1,
        "bug_severity": "$severity",
        # First comment only: used as the issue description.
        "description": "$comments.0",
        "normalized_short_desc": 1,
        "normalized_description": 1,
        "comments": {
            "$slice": 1
        }
    })
    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutos")
def main():
    """Fetch Bugzilla bugs month by month for one (project, year) and store them
    in a '<collection>_<year>_<year+1>' MongoDB collection."""
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Logger for this module.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Load the per-environment configuration file.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Parse the input arguments ('mongo_params' and 'bz_api_params' groups).
    input_params = get_input_params()
    # Target collection is named '<collection>_<year>_<year+1>'.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[input_params['mongo_params'].db_name]
    col = db[
        f"{input_params['mongo_params'].collection_name}"
        f"_{input_params['bz_api_params'].year}_{input_params['bz_api_params'].year + 1}"]
    log.debug(
        f"Colecciones exitentes en '{db.name}': {str(db.list_collection_names())}"
    )
    # Optionally drop an existing collection so the run starts from scratch.
    if col.name in db.list_collection_names(
    ) and input_params['mongo_params'].drop_collection:
        db.drop_collection(col.name)
    # Fetch the issues month by month through the Bugzilla API. 'max_year' only
    # advances when December rolls over into January of the following year.
    max_year = input_params['bz_api_params'].year
    for month in range(input_params['bz_api_params'].start_month,
                       input_params['bz_api_params'].end_month + 1):
        max_month = month + 1
        if max_month > 12:
            max_month = 1
            max_year += 1
        # Half-open creation window [year-month-01, first day of next month).
        bugs = get_bugzilla_bugs_by_date_range(
            project=input_params['bz_api_params'].project,
            min_creation_ts=
            f"{input_params['bz_api_params'].year}-{str(month).zfill(2)}-01",
            max_creation_ts=f"{max_year}-{str(max_month).zfill(2)}-01",
            max_results=input_params['bz_api_params'].query_limit,
            include_fields=input_params['bz_api_params'].include_fields,
            get_comments=input_params['bz_api_params'].get_comments)
        # Persist the month's batch before fetching the next one.
        save_issues_to_mongodb(mongodb_collection=col, issues=bugs)
    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutos")
def main():
    """Join the 'eclipse_base' collection with its NLP-processed summary and
    description collections (on 'id') and store the result in 'eclipse_base_clear'."""
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Logger for this module.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the per-environment configuration file.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # MongoDB handles.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["bugzilla"]
    eclipse_base_col = db["eclipse_base"]
    # Exclude only the MongoDB '_id' so dataframes can be merged on 'id'.
    fields = {"_id": 0}
    eclipse_base_data = eclipse_base_col.find({}, fields)  # .limit(20)
    # Expand the cursor and build the DataFrame.
    eclipse_base = pd.DataFrame(list(eclipse_base_data))
    # NOTE(review): the 'summary' dataframe reads 'clear_description_nlp_2' and the
    # 'description' dataframe reads 'clear_short_desc_nlp' — the variable names and
    # collection names look swapped; confirm which collection holds which text.
    eclipse_base_summary_col = db["clear_description_nlp_2"]
    eclipse_base_summary_data = eclipse_base_summary_col.find({}, fields)
    eclipse_base_summary = pd.DataFrame(list(eclipse_base_summary_data))
    eclipse_base_data_description_col = db["clear_short_desc_nlp"]
    eclipse_base_data_description_data = eclipse_base_data_description_col.find(
        {}, fields)
    eclipse_base_data_description = pd.DataFrame(
        list(eclipse_base_data_description_data))
    # Merge the NLP dataframes together, then merge against the base collection.
    # pd.merge defaults to an inner join, so only ids present in both sides survive.
    eclipse_base_summary_description = pd.merge(eclipse_base_summary,
                                                eclipse_base_data_description,
                                                on='id')
    eclipse_base_clear = pd.merge(eclipse_base,
                                  eclipse_base_summary_description,
                                  on='id')
    # Store the merged dataframe back into MongoDB.
    db["eclipse_base_clear"].insert_many(eclipse_base_clear.to_dict('records'))
    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutos")
def main():
    """Concatenate the per-year Gerrit collections into one collection per project."""
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    # Mongo client. GERRIT_DB_NAME env var takes precedence over the parameter.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME',
                                       input_params['db_name'])]
    # Snapshot of the existing collection names, used for membership checks below.
    gerrit_collections = db.list_collection_names()
    log.debug(
        f"Existent collections in '{db.name}': {str(db.list_collection_names())}"
    )
    for project in os.environ["GERRIT_PROJECT_NAME"].split(","):
        df_list = []
        for year in range(int(os.environ['GERRIT_FIRST_CREATION_YEAR']),
                          int(os.environ['GERRIT_LAST_CREATION_YEAR'])):
            # NOTE(review): the source collection name is built only from
            # input_params['collection_name'] and does not include 'project', so
            # every project reads the same per-year collections — confirm this is
            # intended for multi-project configurations.
            col = db[f"{input_params['collection_name']}"
                     f"_{year}_{year + 1}"]
            if col.name in gerrit_collections:
                tic = time.time()
                log.info(f"Retrieving Gerrit issues for year '{year}' ...")
                data = col.find({}, {'_id': 0})
                df = pd.DataFrame(list(data))
                log.info(f"Gerrit issues for year '{year}': {df.shape[0]}")
                df_list.append(df)
                log.info(
                    f"Retrieving Gerrit issues for year '{year}' execution time = {((time.time() - tic) / 60)} minutes"
                )
        # Combine all years and replace the project's consolidated collection.
        df_concatenated = pd.concat(df_list)
        table_dict = df_concatenated.to_dict("records")
        if project.lower() in gerrit_collections:
            db.drop_collection(project.lower())
        db[project.lower()].insert_many(table_dict)
    final_time = time.time()
    log.info(
        f"Retrieving Gerrit issues total execution time = {((final_time - initial_time) / 60)} minutes"
    )
def main():
    """Run the Java NLP pipeline (parse trees / embeddings / coherence) over a
    MongoDB text column by launching the configured Java class."""
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Logger.
    log = set_logger()
    log.debug(f"\n[START OF EXECUTION]")
    load_environment_variables()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Input parameters ('nlp_params', 'mongo_params' and 'filter_params' groups).
    input_params = get_input_params()
    # Defines javac executable.
    # NOTE(review): 'java.exe' ties this module to Windows; other platforms would
    # need the bare 'java' binary.
    java_exe = Path(os.environ['JAVA_HOME']) / 'bin' / 'java.exe'
    # Command: all connection, filtering and NLP options are passed as CLI flags
    # to the Java class.
    cmd = f"{java_exe} -cp {get_java_classpath()} {input_params['nlp_params'].java_class_name} " \
          f"-host {input_params['mongo_params'].host} " \
          f"-port {input_params['mongo_params'].port} " \
          f"-dbName {input_params['mongo_params'].db_name} " \
          f"-collName {input_params['mongo_params'].collection_name} " \
          f"-startYear {input_params['filter_params'].start_year} " \
          f"-endYear {input_params['filter_params'].end_year} " \
          f"-textColumnName {input_params['filter_params'].column_name} " \
          f"-maxNumTokens {input_params['nlp_params'].max_num_tokens} " \
          f"-parserModel {input_params['nlp_params'].parser_model} " \
          f"-createTrees {input_params['nlp_params'].get_trees} " \
          f"-calcEmbeddings {input_params['nlp_params'].get_embeddings} " \
          f"-calcCoherence {input_params['nlp_params'].get_coherence}"
    log.info(f"Running command: '{cmd}'")
    # Run command (blocking).
    os.system(cmd)
    log.info(f"\n[END OF EXECUTION]")
    final_time = time.time()
    log.info(f"Total execution time = {((final_time - initial_time) / 60)} minutes")
def main():
    """Filter the pre-trained word embeddings for every (corpus, model, size) combination."""
    # Execution start timestamp.
    initial_time = time.time()
    # Abort if another process is already running this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the parameters.
    input_params = get_input_params()
    log.info(f"Filtering all word embeddings ...")

    def _from_params_or_env(key, env_var):
        # Fall back to the comma-separated environment variable when the input
        # parameter is the empty string.
        value = input_params[key]
        return value if '' != value else os.environ[env_var].split(",")

    corpus_list = _from_params_or_env('corpus_list', "CORPUS_NAME")
    embeddings_model_list = _from_params_or_env('embeddings_model_list', "EMBEDDINGS_MODEL")
    embeddings_size_list = _from_params_or_env('embeddings_size_list', "EMBEDDINGS_SIZE")
    for corpus in corpus_list:
        log.info(f"Filtering word embeddings for corpus: '{corpus}' ...")
        for model in embeddings_model_list:
            log.info(f"Filtering word embeddings for model: '{model}' ...")
            for size in embeddings_size_list:
                tic = time.time()
                log.info(
                    f"Filtering pre-trained word embeddings of size: '{int(size)}' ..."
                )
                cmd = get_command(os.name, corpus, model, int(size), True)
                # Run command.
                log.info(f"Running command: '{cmd}'")
                os.system(cmd)
                log.info(
                    f"Filtering pre-trained word embeddings total execution time "
                    f"= {((time.time() - tic) / 60)} minutes")
    final_time = time.time()
    log.info(
        f"Filtering all word embeddings total execution time = {((final_time - initial_time) / 60)} minutes"
    )
    log.info(f"MODULE EXECUTED.")
def main():
    """Build the balanced and unbalanced datasets for every (corpus, task) combination."""
    # Execution start timestamp.
    initial_time = time.time()
    # Abort if another process is already running this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the parameters.
    input_params = get_input_params()
    log.info(f"Building all datasets ...")

    def _split_params_or_env(key, env_var):
        # The input parameter (comma-separated) wins; otherwise fall back to the
        # comma-separated environment variable.
        value = input_params[key]
        return value.split(",") if '' != value else os.environ[env_var].split(",")

    task_list = _split_params_or_env('task_list', "TASK_NAME")
    corpus_list = _split_params_or_env('corpus_list', "CORPUS_NAME")
    for corpus in corpus_list:
        log.info(f"Building datasets for corpus: '{corpus}' ...")
        for task in task_list:
            tic = time.time()
            log.info(f"Building datasets for task: '{task}' ...")
            # Unbalanced (False) first, then balanced (True).
            cmd1 = get_command(os.name, task, corpus, False)
            cmd2 = get_command(os.name, task, corpus, True)
            log.info(f"Running command: '{cmd1}'")
            os.system(cmd1)
            log.info(f"Building unbalanced dataset total execution time = {((time.time() - tic) / 60)} minutes")
            tic = time.time()
            log.info(f"Running command: '{cmd2}'")
            os.system(cmd2)
            log.info(f"Building balanced dataset total execution time = {((time.time() - tic) / 60)} minutes")
    final_time = time.time()
    log.info(f"Building all datasets total execution time = {((final_time - initial_time) / 60)} minutes")
    log.info(f"MODULE EXECUTED.")
def main():
    """Extract Bugzilla bug ids from Gerrit change subjects, merge the file lists
    of changes that reference the same bug, and store the result in MongoDB."""
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    log.info(f"Processing Gerrit issues ...")
    # MongoDB data. GERRIT_DB_NAME env var takes precedence over the parameter.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME',
                                       input_params['db_name'])]
    col = db[input_params['collection_name']]
    data = col.find({}, {'_id': 0})
    df = pd.DataFrame(list(data))
    # Sentinel: rows that never get a bug id keep -1 and are filtered out below.
    df['bug_id'] = -1
    # Bug ids already assigned to some row.
    bug_id_list = []
    loop = tqdm(range(df.shape[0]))
    for i in loop:
        # Two subject styles: '[12345]'-like bracketed ids and 'Bug 12345' text.
        matched_1 = re.search(r'\[\s*\+?(-?\d+)\s*]', df.loc[i, 'subject'])
        matched_2 = re.search(r'[Bb][Uu][Gg]\s[0-9]+', df.loc[i, 'subject'])
        is_match = bool(matched_1) + bool(matched_2)
        if bool(is_match):
            # 'flag' records which pattern supplied the id ('Bug NNN' wins).
            flag = True
            res = re.findall(r'[Bb][Uu][Gg]\s[0-9]+', df.iloc[i]['subject'])
            if not res:
                flag = False
                res = re.findall(r'\[\s*\+?(-?\d+)\s*]', df.iloc[i]['subject'])
            # '[4:]' strips the 4-character 'Bug ' / 'bug ' prefix (3 letters +
            # one whitespace from the regex).
            bug_id = int(res[0][4:]) if flag else int(res[0])
            if bug_id in bug_id_list:
                # The bug already has a row: merge this change's file list into it.
                idx = df.index[df['bug_id'] == bug_id].tolist()
                if len(idx) > 1:
                    raise ValueError(
                        f"There are more than one bug with 'bug_id' = {bug_id}"
                    )
                previous_file_list = get_filtered_file_list(
                    df.iloc[idx[0]]['file_list'])
                current_file_list = get_filtered_file_list(
                    df.iloc[i]['file_list'])
                # NOTE(review): going through set() deduplicates but makes the
                # stored file order non-deterministic.
                file_list = set(previous_file_list + current_file_list)
                df.at[idx[0], 'file_list'] = list(file_list)
            else:
                # First time this bug id is seen: claim this row for it.
                bug_id_list.append(bug_id)
                df.at[i, 'bug_id'] = bug_id
                df.at[i, 'file_list'] = get_filtered_file_list(
                    df.iloc[i]["file_list"])
    # Drop collection if already exists.
    if input_params['output_collection_name'] in db.list_collection_names():
        log.info(
            f"Dropping collection {input_params['output_collection_name']} ..."
        )
        db.drop_collection(input_params['output_collection_name'])
    # Split dataframe in batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df.shape[0] / input_params['batch_size'])
    # Only rows that received a bug id are inserted.
    batches = np.array_split(df.loc[df['bug_id'] != -1], num_batches)
    # Insert documents with bug_id in MongoDB.
    inserted_docs_number = 0
    for batch in batches:
        log.info(f"Inserting documents ...")
        inserted_documents = db[
            input_params['output_collection_name']].insert_many(
                batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)
    final_time = time.time()
    log.info(
        f"Processing Gerrit issues total execution time = {((final_time - initial_time) / 60)} minutes"
    )
def main():
    """Find pairs of issues whose file lists are similar (Jaccard score above a
    threshold) or near-similar (within 0.25 below it) and store the annotated
    dataframe in MongoDB.

    NOTE(review): the pairwise comparison is O(n^2) and the 'not in' membership
    tests on the pair lists are O(n) each — fine for small collections, slow for
    large ones.
    """
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    log.info(f"Finding similar issues ...")
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    # Load Gerrit data.
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME',
                                       input_params['db_name'])]
    collection = db[input_params['collection_name']]
    data = collection.find({}, {'_id': 0})
    df = pd.DataFrame(list(data))
    # Check empty Dataframe.
    if 0 == df.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{db.name}.{collection.name}' collection.")
    # Initialize one independent empty list per row.
    df['sim_bugs'] = [[] for i in range(len(df))]
    df['near_bugs'] = [[] for i in range(len(df))]
    # Global pair registries used to avoid emitting both (a, b) and (b, a).
    all_sim_bugs = []
    all_near_bugs = []
    for i in tqdm(range(df.shape[0])):
        # Compare each row with all rows.
        for j in range(df.shape[0]):
            if j == i:
                continue
            bug_anc = df.at[i, 'bug_id']
            bug_pos = df.loc[j, 'bug_id']
            jaccard_similarity = jaccard_score(df.loc[i, 'file_list'],
                                               df.loc[j, 'file_list'])
            # Similar pair: score at or above the threshold.
            if jaccard_similarity >= float(
                    input_params['similarity_threshold']):
                if [bug_anc, bug_pos] not in all_sim_bugs and [
                        bug_pos, bug_anc
                ] not in all_sim_bugs:
                    df.at[i, 'sim_bugs'].append(int(bug_pos))
                    all_sim_bugs.append([bug_anc, bug_pos])
            # Near pair: score within 0.25 below the threshold.
            if float(input_params['similarity_threshold']) - 0.25 <= jaccard_similarity < \
                    float(input_params['similarity_threshold']):
                if [bug_anc, bug_pos] not in all_near_bugs and [
                        bug_pos, bug_anc
                ] not in all_near_bugs:
                    df.at[i, 'near_bugs'].append(int(bug_pos))
                    all_near_bugs.append([bug_anc, bug_pos])
    # Split dataframe in batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df.shape[0] / input_params['batch_size'])
    batches = np.array_split(df, num_batches)
    output_collection = db[input_params['output_collection_name']]
    # Drop collection if already exists.
    if output_collection.name in db.list_collection_names():
        log.info(
            f"Dropping collection '{db.name}.{output_collection.name}' ...")
        db.drop_collection(output_collection.name)
    inserted_docs_number = 0
    for batch in batches:
        log.info(f"Inserting documents ...")
        inserted_documents = output_collection.insert_many(
            batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)
    log.info(f"Inserted documents: {inserted_docs_number}")
    final_time = time.time()
    log.info(
        f"Finding similar issues total execution time = {((final_time - initial_time) / 60)} minutes"
    )
def main():
    """Generate issue pairs from indirect duplicate relations and store them in MongoDB.

    Reads the full issue collection and a first-step pairs collection, computes the
    set of issues that appear in no pair, derives new pairs via
    ``check_indirect_relations`` and bulk-inserts them into the output collection.
    """
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    log.info(f"Finding duplicate issues ...")
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    # Load Bugzilla data.
    db = mongodb_client[input_params['input_db_name']]
    # Collection with all issues (duplicated and non-duplicated); only 'bug_id'
    # is projected since that is all this module needs.
    original_collection = db[input_params['original_collection_name']]
    query = {}
    projection = {'_id': 0, 'bug_id': 1}
    log.info(f"Reading data from '{db.name}.{original_collection.name}' ...")
    original_data = original_collection.find(query, projection)
    df_original = pd.DataFrame(list(original_data))
    # Check empty Dataframe.
    if 0 == df_original.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{db.name}.{original_collection.name}' collection.")
    # List with all bug_id.
    original_bug_id_list = df_original['bug_id'].to_list()
    # First-step pairs produced by the previous stage of the pipeline.
    collection_pairs_first_step = db[input_params['input_collection_name']]
    query = {}
    projection = {'_id': 0}
    data_pairs_first_step = collection_pairs_first_step.find(query, projection)
    df_pairs_first_step = pd.DataFrame(list(data_pairs_first_step))
    # Check empty Dataframe.
    if 0 == df_pairs_first_step.shape[0]:
        raise ValueError(
            f"No documents have been retrieved from "
            f"'{db.name}.{collection_pairs_first_step.name}' collection.")
    bug1_list = df_pairs_first_step['bug1'].to_list()
    bug2_list = df_pairs_first_step['bug2'].to_list()
    # Issues that take part in no pair at all.
    no_duplicate_bug_id_list = set(original_bug_id_list) - set(
        bug1_list) - set(bug2_list)
    # Derive new pairs from indirect relations between the first-step pairs.
    pairs_from_indirect_relations = check_indirect_relations(
        df_pairs_first_step, list(no_duplicate_bug_id_list))
    log.info(f"Pairs generated: {len(pairs_from_indirect_relations)}")
    # Dataframe pairs.
    df_pairs = pd.DataFrame(pairs_from_indirect_relations)
    # Split dataframe in batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df_pairs.shape[0] / input_params['batch_size'])
    batches = np.array_split(df_pairs, num_batches)
    output_collection = db[input_params['output_collection_name']]
    # Drop collection if already exists.
    if output_collection.name in db.list_collection_names():
        log.info(
            f"Dropping collection '{db.name}.{output_collection.name}' ...")
        db.drop_collection(output_collection.name)
    inserted_docs_number = 0
    for batch in batches:
        log.info(f"Inserting documents ...")
        inserted_documents = output_collection.insert_many(
            batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)
    log.info(f"Inserted documents: {inserted_docs_number}")
    final_time = time.time()
    # FIX: the final message previously said "Finding similar issues ..." — a
    # copy-paste from the similarity module; this module finds duplicate issues.
    log.info(
        f"Finding duplicate issues total execution time = {((final_time - initial_time) / 60)} minutes"
    )
def main():
    """Run the short-description NLP cleaning pipeline over resolved issues of the
    'eclipse.clear' collection and store the result in 'clear_short_desc_nlp'.

    NOTE(review): 'punctuaction' is a typo, but it is baked into runtime column
    names that downstream collections may rely on, so it is left untouched here.
    """
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Logger for this module.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the per-environment configuration file.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # MongoDB handles.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["eclipse"]
    col = db["clear"]
    # Fields selected for the research-project tasks.
    fields = {
        "_id": 0,
        "bug_id": 1,
        # "product": 1,
        # "description": 1,
        # "bug_severity": 1,
        # "dup_id": 1,
        "short_desc": 1,
        # "priority": 1,
        # "version": 1,
        # "component": 1,
        # "delta_ts": 1,
        "bug_status": 1
        # "creation_ts": 1,
        # "resolution": 1
    }
    # Only resolved issues are used.
    query = {
        '$or': [{
            'bug_status': {
                '$eq': 'RESOLVED'
            }
        }, {
            'bug_status': {
                '$eq': 'VERIFIED'
            }
        }]
    }
    data = col.find(query, fields)  # .limit(20)
    # Expand the cursor and build the DataFrame.
    clear_nlp = pd.DataFrame(list(data))
    print(clear_nlp)
    nltk.download("stopwords", quiet=True)
    # Cleaning pipeline: each step consumes the previous step's column, so all
    # intermediate stages are kept as separate columns.
    clear_nlp["short_desc_split_alpha"] = clear_nlp["short_desc"].apply(
        lambda x: clean_doc_split(x))
    clear_nlp["short_desc_lower"] = clear_nlp["short_desc_split_alpha"].apply(
        lambda x: clean_doc_lower(x))
    clear_nlp["short_desc_punctuaction"] = clear_nlp["short_desc_lower"].apply(
        lambda x: clean_doc_punctuaction(x))
    clear_nlp["short_desc_trim"] = clear_nlp["short_desc_punctuaction"].apply(
        lambda x: clean_doc_trim(x))
    clear_nlp["short_desc_isalpha"] = clear_nlp["short_desc_trim"].apply(
        lambda x: clean_doc_isalpha(x))
    clear_nlp["short_desc_stop_words"] = clear_nlp["short_desc_isalpha"].apply(
        lambda x: clean_doc_stopW(x))
    clear_nlp["short_desc_diacritic"] = clear_nlp[
        "short_desc_stop_words"].apply(lambda x: clean_doc_diacri(x))
    clear_nlp["short_desc_lemmatizer"] = clear_nlp[
        "short_desc_diacritic"].apply(lambda x: clean_doc_lem(x))
    # Drop the 'bug_status' column because a later merge against the original
    # collection would duplicate it.
    clear_nlp.drop('bug_status', axis=1, inplace=True)
    # Store the dataframe in MongoDB.
    db["clear_short_desc_nlp"].insert_many(clear_nlp.to_dict('records'))
    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutos")
def main():
    """Extract resolved Eclipse issues from 'bugzilla.eclipse_all', taking the
    first comment of each issue as its description, and store the projection
    in the 'bugzilla.eclipse_base' collection.
    """
    # Execution start time, used to report the total running time at the end.
    initial_time = time.time()

    # Module logger.
    logger = log4p.GetLogger(__name__)
    log = logger.logger

    # Abort if another process running this module already exists.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load the environment configuration file.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)

    # Source MongoDB collection.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["bugzilla"]
    col = db["eclipse_all"]

    # Fields selected for the research-project tasks; 'description' is taken
    # from the unwound comment text.
    fields = {
        "_id": 0,
        "assigned_to": 1,
        "assigned_to_detail": 1,
        "classification": 1,
        "component": 1,
        "creation_time": 1,
        "creator": 1,
        "creator_detail": 1,
        "dupe_of": 1,
        "id": 1,
        "op_sys": 1,
        "platform": 1,
        "priority": 1,
        "product": 1,
        "resolution": 1,
        "severity": 1,
        "status": 1,
        "summary": 1,
        "version": 1,
        "description": "$comments.text"
    }
    aggregation_project = {"$project": fields}

    # Only resolved issues with a usable comment list are kept, and only the
    # first comment (count == 0), which holds the issue description.
    # BUGFIX: the original passed the *strings* 'true' and 'null' to '$exists'
    # and '$ne', so the null check compared against the literal string "null"
    # instead of a null value. Booleans/None are what the operators expect.
    aggregation_match = {"$match": {
        '$and': [
            {
                '$or': [
                    {'status': {'$eq': 'RESOLVED'}},
                    {'status': {'$eq': 'VERIFIED'}},
                    {'status': {'$eq': 'CLOSED'}}
                ]
            },
            {'comments': {'$exists': True}},
            {'comments': {'$ne': None}},
            {'comments': {'$ne': ""}},
            {'comments': {'$not': {'$size': 0}}},
            {'comments.count': {'$eq': 0}}
        ]
    }}
    aggregation_unwind = {"$unwind": "$comments"}

    data = col.aggregate([
        aggregation_unwind,
        aggregation_match,
        aggregation_project
    ])

    # Expand the cursor and build the DataFrame.
    eclipse_base = pd.DataFrame(list(data))

    # Store the DataFrame in MongoDB.
    db["eclipse_base"].insert_many(eclipse_base.to_dict('records'))

    final_time = time.time()
    log.info(f"Total execution time = {((final_time - initial_time) / 60)} minutos")
def main():
    """Run 'UpdateVectorizedMongoDBCollection.py' for every configured corpus
    database and every year in the requested range to update the NLP fields.
    """
    # Execution start time, used to report the total running time at the end.
    initial_time = time.time()

    # Logger.
    log = set_logger()
    log.debug(f"\n[START OF EXECUTION]")

    load_environment_variables()

    # Abort if another process running this module already exists.
    check_same_python_module_already_running(os.path.split(__file__))

    # Years range and feature switches.
    input_params = get_input_params()

    # Databases to process: an explicit corpus overrides the environment list.
    databases = [input_params['corpus']] if input_params['corpus'] != '' \
        else os.environ["EMBEDDING_MONGODB_DATABASE_NAME"].split(",")

    # Java class driving the update.
    java_class_name = "UpdateMongoDBNLPFields"

    # Derived switches: embeddings and coherence both require trees.
    build_trees = input_params['get-trees']
    build_embeddings = input_params['get-embeddings'] and build_trees
    build_coherence = input_params['get-coherence'] and build_trees

    model_param = "--pm corenlp" if build_coherence else "--pm srparser"
    trees_param = "--get-trees" if build_trees else "--no-get-trees"
    # NOTE: the trailing space of "--get-embeddings " is kept on purpose so
    # the assembled command line is byte-identical to the original one.
    embeddings_param = "--get-embeddings " if build_embeddings else "--no-get-embeddings"
    coherence_param = "--get-coherence" if build_coherence else "--no-get-coherence"

    # Python executable used to spawn the worker script.
    python_exe = os.environ.get('PYTHON_EXECUTABLE', sys.executable)

    tokens_initial_time = time.time()
    log.info(f"Updating NLP fields ...")
    for db in databases:
        log.info(f"\nProcessing database: '{db}'.")
        for year in range(input_params['start_year'], input_params['end_year']):
            log.info(f"\n[FOR LOOP] Processing years: {year} - {year + 1}")
            cmd = " ".join([
                f"{python_exe} UpdateVectorizedMongoDBCollection.py",
                f"--jcn {java_class_name}",
                f"--mh {os.environ['MONGO_HOST_IP']}",
                f"--mp {os.environ['MONGO_PORT']}",
                f"--db {db}",
                f"--c {os.environ['EMBEDDING_MONGODB_COLLECTION_NAME']}",
                f"--cl {os.environ['EMBEDDING_MONGODB_COLUMN_NAME']}",
                f"--sy {year}",
                f"--ey {year + 1}",
                f"--mnt {os.environ['EMBEDDING_MONGODB_MAX_NUM_TOKENS']}",
                f"{model_param} {trees_param} {embeddings_param} {coherence_param}",
            ])
            # Run command.
            log.info(f"Running command: '{cmd}'.")
            os.system(cmd)
    log.info(f"Updating NLP fields total execution time = {((time.time() - tokens_initial_time) / 60)} minutes")

    log.debug(f"\n[END OF EXECUTION]")
    final_time = time.time()
    log.info(f"Total execution time = {((final_time - initial_time) / 60)} minutes")
def main():
    """Load a sample of resolved Eclipse issues from 'bugzilla.eclipse_all'
    and encode the 'priority' field as numeric category codes
    ('priority_cod').
    """
    # Execution start time, used to report the total running time at the end.
    initial_time = time.time()

    # Module logger.
    logger = log4p.GetLogger(__name__)
    log = logger.logger

    # Abort if another process running this module already exists.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load the environment configuration file.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)

    # Source MongoDB collection.
    mongodb_client: MongoClient = get_default_mongo_client()
    col = mongodb_client["bugzilla"]["eclipse_all"]

    # Fields selected for the research-project tasks; 'description' is taken
    # from the unwound comment text.
    fields = {
        "_id": 0,
        "assigned_to": 1,
        "assigned_to_detail": 1,
        "classification": 1,
        "component": 1,
        "creation_time": 1,
        "creator": 1,
        "creator_detail": 1,
        "dupe_of": 1,
        "id": 1,
        "op_sys": 1,
        "platform": 1,
        "priority": 1,
        "product": 1,
        "resolution": 1,
        "severity": 1,
        "status": 1,
        "summary": 1,
        "version": 1,
        "description": "$comments.text"
    }

    # Pipeline: unwind the comments, keep resolved issues whose unwound
    # comment is the first one (the description), sample 20 documents, then
    # project the selected fields.
    data = col.aggregate([
        {"$unwind": "$comments"},
        {"$match": {
            '$and': [
                {
                    '$or': [
                        {'status': {'$eq': 'RESOLVED'}},
                        {'status': {'$eq': 'VERIFIED'}},
                        {'status': {'$eq': 'CLOSED'}}
                    ]
                },
                {'comments.count': {'$eq': 0}}
            ]
        }},
        {"$limit": 20},
        {"$project": fields}
    ])

    # Expand the cursor and build the DataFrame.
    eclipse_base = pd.DataFrame(list(data))
    print(eclipse_base.head(10))

    # Step 1: convert the column to the pandas 'category' dtype.
    eclipse_base.priority = eclipse_base.priority.astype('category')
    print(eclipse_base.head(10))

    # Step 2: materialise the numeric code of each category.
    eclipse_base['priority_cod'] = eclipse_base['priority'].cat.codes
    print(eclipse_base.head(10))

    final_time = time.time()
    log.info(f"Total execution time = {((final_time - initial_time) / 60)} minutos")
def main():
    """Download Gerrit changes month by month for a given year and store them
    in a MongoDB collection named '<collection_name>_<year>_<year + 1>'.
    """
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    # Mongo client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME',
                                       input_params['db_name'])]
    year = input_params['year']
    # Target collection: one collection per processed year.
    col = db[f"{input_params['collection_name']}"
             f"_{year}_{year + 1}"]
    log.debug(
        f"Existent collections in '{db.name}': {str(db.list_collection_names())}"
    )
    # Recreate the target collection from scratch if it already exists.
    if col.name in db.list_collection_names():
        db.drop_collection(col.name)
    # Get Gerrit Issues by month.
    max_year = year
    for month in range(input_params['start_month'],
                       input_params['end_month'] + 1):
        # Upper bound of the date range; December rolls over to January of the
        # next year.
        max_month = month + 1
        if max_month > 12:
            max_month = 1
            max_year += 1
        after = str(datetime.datetime(input_params['year'], month, 1))
        before = str(datetime.datetime(max_year, max_month, 1))
        # Pagination offset for the Gerrit REST API ('S' query parameter).
        s = 0
        # Per-month accumulators: one list per output column.
        change_id, project, status, date, subject, author, committer, commit_msg = [], [], [], [], [], [], [], []
        file_list = []
        skip = ''
        flag = True
        log.info(f"Retrieving issues in range: {after} - {before}")
        # Page through the API until a short/empty page is returned or the
        # last element of a full page has no '_more_changes' marker
        # (presumably the API caps responses at 100 changes — TODO confirm).
        while flag:
            issues_by_month = get_data_from_gerrit_rest_api(
                os.environ['GERRIT_API_URL'], after, before, skip=skip)
            log.info(f"Read issues: {len(issues_by_month)}")
            if len(issues_by_month) < 1:
                print('No issues for month {} and year {}'.format(month, year))
                flag = False
            else:
                for issue in issues_by_month:
                    change_id.append(issue['change_id'])
                    project.append(issue['project'])
                    status.append(issue['status'])
                    date.append(issue['updated'])
                    subject.append(issue['subject'])
                    # Commit metadata is taken from the current revision.
                    current_rev = issue['current_revision']
                    author.append(issue['revisions'][current_rev]['commit']
                                  ['author']['email'])
                    committer.append(issue['revisions'][current_rev]['commit']
                                     ['committer']['email'])
                    commit_msg.append(
                        issue['revisions'][current_rev]['commit']['message'])
                    file_list.append(
                        list(issue['revisions'][current_rev]['files'].keys()))
                if len(issues_by_month) == 100:
                    # Full page: the last element carries '_more_changes' when
                    # another page is available; its absence (KeyError) ends
                    # the pagination loop.
                    last_dict = issues_by_month[99]
                    try:
                        if last_dict['_more_changes']:
                            log.info(f"There are more issues to read")
                            s = s + 100
                            skip = '&S=' + str(s)
                    except KeyError:
                        flag = False
                        print('Issues for month {} and year {}: {}'.format(
                            month, year, len(change_id)))
                else:
                    print('Issues for month {} and year {}: {}'.format(
                        month, year, len(change_id)))
                    flag = False
        # Build one DataFrame per month and persist it in the collection.
        dict_of_reg = {
            'change_id': change_id,
            'project': project,
            'status': status,
            'date': date,
            'subject': subject,
            'author': author,
            'committer': committer,
            'commit_msg': commit_msg,
            'file_list': file_list
        }
        df = pd.DataFrame(dict_of_reg)
        issues = df.to_dict("records")
        save_issues_to_mongodb(mongodb_collection=col, issues=issues)
    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutes")
"""Perform hyperparemeters fit""" import os import time from syn.helpers.hyperparams import get_input_params from syn.helpers.logging import set_logger from syn.helpers.system import check_same_python_module_already_running from syn.model.build.common.task import ConsensusFit log = set_logger() if __name__ == "__main__": initial_time = time.time() # Check if there is a running process that contains the name of this module. check_same_python_module_already_running(os.path.split(__file__)) log.info(f"Fitting hyperparameters ...") # Load parameter space. input_param_space = get_input_params() assert input_param_space is not None, f"No param space provided." fitter = ConsensusFit( database_name='tasks', collection_name='experiments', corpus='openOffice', tasks_objectives={ 'duplicity': 'accuracy', 'prioritization': 'jaccard_micro' },
def main():
    """Merge Gerrit issues with Bugzilla issues on 'bug_id' and store the join
    in the configured output collection (in the Gerrit database).

    Raises:
        ValueError: if either source collection yields no documents.
    """
    # Execution start time, used to report the total running time at the end.
    initial_time = time.time()

    # Abort if another process running this module already exists.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load parameters.
    input_params = get_input_params()

    log.info(f"Merging Gerrit and Bugzilla issues ...")

    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()

    # Load Gerrit data.
    log.info(f"Loading Gerrit issues ...")
    tic = time.time()
    gerrit_db = mongodb_client[os.environ.get('GERRIT_DB_NAME', input_params['gerrit_db_name'])]
    gerrit_collection = gerrit_db[input_params['gerrit_collection_name']]
    gerrit_data = gerrit_collection.find({}, {'_id': 0})
    df_gerrit = pd.DataFrame(list(gerrit_data))
    log.info(f"Loading Gerrit issues total time: {(time.time() - tic) / 60} minutes.")

    # Check empty Dataframe.
    if 0 == df_gerrit.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{gerrit_db.name}.{gerrit_collection.name}' collection.")

    # Load Bugzilla data (only 'bug_id' is needed for the join).
    log.info(f"Loading Bugzilla issues ...")
    tic = time.time()
    bugzilla_db = mongodb_client[os.environ.get('BUGZILLA_MONGODB_DATABASE_NAME', input_params['bugzilla_db_name'])]
    bugzilla_collection = bugzilla_db[input_params['bugzilla_collection_name']]
    bugzilla_data = bugzilla_collection.find({}, {'_id': 0, 'bug_id': 1})
    df_bugzilla = pd.DataFrame(list(bugzilla_data))
    # BUGFIX: this message previously read "Loading Gerrit issues total time"
    # (copy-paste from the Gerrit branch above).
    log.info(f"Loading Bugzilla issues total time: {(time.time() - tic) / 60} minutes.")

    # Check empty Dataframe.
    if 0 == df_bugzilla.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{bugzilla_db.name}.{bugzilla_collection.name}' collection.")

    # Join on column 'bug_id' (inner join: only issues present in both sides).
    log.info(f"Joining Gerrit and Bugzilla Dataframes ...")
    tic = time.time()
    df_joined = df_gerrit.merge(df_bugzilla, left_on='bug_id', right_on='bug_id')
    log.info(f"Joining Gerrit and Bugzilla Dataframes total time: {(time.time() - tic) / 60} minutes.")

    # Split dataframe in batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df_joined.shape[0] / input_params['batch_size'])
    batches = np.array_split(df_joined, num_batches)

    output_collection = gerrit_db[input_params['output_collection_name']]
    # Drop the output collection if it already exists.
    if input_params['output_collection_name'] in gerrit_db.list_collection_names():
        log.info(f"Dropping collection {input_params['output_collection_name']} ...")
        gerrit_db.drop_collection(input_params['output_collection_name'])

    inserted_docs_number = 0
    tic = time.time()
    for batch in batches:
        log.info(f"Inserting documents ...")
        inserted_documents = output_collection.insert_many(batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)
    log.info(f"Inserted documents: {inserted_docs_number}")
    log.info(f"Inserting documents total time: {(time.time() - tic) / 60} minutes.")

    final_time = time.time()
    log.info(f"Merging Gerrit and Bugzilla issues total execution time = {((final_time - initial_time) / 60)} minutes")
def main():
    """Join the base 'clear' collection with its 'short_desc' and
    'description' NLP collections on 'bug_id' and store the result in
    'eclipse.clear_nlp'.
    """
    # Execution start time, used to report the total running time at the end.
    initial_time = time.time()

    # Module logger.
    logger = log4p.GetLogger(__name__)
    log = logger.logger

    # Abort if another process running this module already exists.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load the environment configuration file.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)

    # MongoDB database holding all three collections.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["eclipse"]

    # Exclude only the MongoDB internal identifier.
    fields = {
        "_id": 0,
    }
    # Only resolved issues are used from the base collection.
    query = {
        '$or': [
            {'bug_status': {'$eq': 'RESOLVED'}},
            {'bug_status': {'$eq': 'VERIFIED'}},
        ]
    }

    # Expand each cursor into its own DataFrame.
    clear = pd.DataFrame(list(db["clear"].find(query, fields)))
    clear_short_desc_nlp = pd.DataFrame(
        list(db["clear_short_desc_nlp"].find({}, fields)))
    clear_description_nlp = pd.DataFrame(
        list(db["clear_description_nlp_2"].find({}, fields)))

    # Join the two NLP frames first, then attach the original issue fields.
    nlp_merged = pd.merge(clear_short_desc_nlp, clear_description_nlp,
                          on='bug_id')
    clear_nlp = pd.merge(clear, nlp_merged, on='bug_id')

    # Store the DataFrame in MongoDB.
    db["clear_nlp"].insert_many(clear_nlp.to_dict('records'))

    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutos")
def main():
    """Generate pairs of similar (or near) issues from a Gerrit collection and
    store them in the configured output collection.

    Raises:
        ValueError: if the source collection yields no documents.
    """
    # Execution start time, used to report the total running time at the end.
    initial_time = time.time()

    # Abort if another process running this module already exists.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load parameters.
    input_params = get_input_params()

    log.info(f"Finding similar issues ...")

    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()

    # Load Gerrit data.
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME', input_params['db_name'])]
    collection = db[input_params['collection_name']]
    data = collection.find({}, {'_id': 0})
    df = pd.DataFrame(list(data))

    # Check empty Dataframe.
    if 0 == df.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{db.name}.{collection.name}' collection.")

    # Column holding the related-issue ids depends on the requested mode.
    similar_column_name = 'sim_bugs' if not input_params[
        'near_issues'] else 'near_bugs'

    # For each issue that has similar issues generate a pair of similar issues
    # and a pair of non-similar issues.
    pairs = generate_pairs(df, similar_column_name)
    log.info(f"Pairs generated: {len(pairs)}")

    # Dataframe pairs.
    df_pairs = pd.DataFrame(pairs)

    # Split dataframe in batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df_pairs.shape[0] / input_params['batch_size'])
    batches = np.array_split(df_pairs, num_batches)

    output_db = mongodb_client[input_params['output_db_name']]
    output_collection = output_db[input_params['output_similar_collection_name']] if not input_params['near_issues'] \
        else output_db[input_params['output_near_collection_name']]

    # Drop the output collection if it already exists.
    # BUGFIX: the original checked membership in 'output_db' but dropped the
    # collection from 'db' (the *input* database) and logged 'db.name', so a
    # stale output collection was never dropped whenever output_db != db.
    if output_collection.name in output_db.list_collection_names():
        log.info(
            f"Dropping collection '{output_db.name}.{output_collection.name}' ...")
        output_db.drop_collection(output_collection.name)

    inserted_docs_number = 0
    for batch in batches:
        log.info(f"Inserting documents ...")
        inserted_documents = output_collection.insert_many(
            batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)
    log.info(f"Inserted documents: {inserted_docs_number}")

    final_time = time.time()
    log.info(
        f"Finding similar issues total execution time = {((final_time - initial_time) / 60)} minutes"
    )
def main():
    """Apply the NLP cleaning pipeline to the 'summary' field of the
    'bugzilla.eclipse_base' collection and persist the result in
    'eclipse_base_summary_clear'.
    """
    # Execution start time, used to report the total running time at the end.
    initial_time = time.time()

    # Module logger.
    logger = log4p.GetLogger(__name__)
    log = logger.logger

    # Abort if another process running this module already exists.
    check_same_python_module_already_running(os.path.split(__file__))

    # Load the environment configuration file.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)

    # Source MongoDB collection.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["bugzilla"]
    col = db["eclipse_base"]

    # Projection: only the issue id and its summary are needed.
    fields = {
        "_id": 0,
        "id": 1,
        "summary": 1
    }
    cursor = col.find({}, fields)

    # Expand the cursor and build the DataFrame.
    clear_nlp = pd.DataFrame(list(cursor))
    print(clear_nlp)

    nltk.download("stopwords", quiet=True)

    # NLP cleaning pipeline: every step reads the column produced by the
    # previous one ('punctuaction' keeps the historical column spelling).
    cleaning_steps = (
        ("summary", "summary_split_alpha", clean_doc_split),
        ("summary_split_alpha", "summary_lower", clean_doc_lower),
        ("summary_lower", "summary_punctuaction", clean_doc_punctuaction),
        ("summary_punctuaction", "summary_trim", clean_doc_trim),
        ("summary_trim", "summary_isalpha", clean_doc_isalpha),
        ("summary_isalpha", "summary_stop_words", clean_doc_stopW),
        ("summary_stop_words", "summary_diacritic", clean_doc_diacri),
        ("summary_diacritic", "summary_lemmatizer", clean_doc_lem),
    )
    for source_column, target_column, cleaner in cleaning_steps:
        clear_nlp[target_column] = clear_nlp[source_column].apply(cleaner)

    # Store the DataFrame in MongoDB.
    db["eclipse_base_summary_clear"].insert_many(clear_nlp.to_dict('records'))

    final_time = time.time()
    log.info(
        f"Total execution time = {((final_time - initial_time) / 60)} minutos")