def _db_store(self):
    """Store the task in the db."""
    log.info("Storing data in MongoDB ...")
    initial_time = time.time()
    client: MongoClient = get_default_mongo_client()
    query = {'task_id': self.task_id}
    log.info(f"query: {query}")
    document = {
        '$set': {
            'task_id': self.task_id,
            'task_name': self.task_name,
            'task_action': self.task_action
        }
    }
    result = client[self.save_dbname][self.save_collection].update_one(
        query,       # Filter document.
        document,    # Update document.
        upsert=True  # Insert the document if it does not exist yet.
    )
    log.info(f"Matched documents with 'task_id' equal to '{self.task_id}': {result.matched_count}")
    log.info(f"Modified documents with 'task_id' equal to '{self.task_id}': {result.modified_count}")
    final_time = time.time()
    log.info(f"Storing data in MongoDB total time: {(final_time - initial_time) / 60} minutes")
def get_attention_vector_raw_data(db_name, col_name, categorical=True, column=None):
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    log.debug("\n[EXECUTION START]")
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    mongodb_client: MongoClient = get_default_mongo_client() if os.environ['WORK_ENVIRONMENT'] == 'aws' \
        else get_default_local_mongo_client()
    db = mongodb_client[db_name]
    col = db[col_name]
    if not categorical:
        query, fields = get_raw_data_query_and_projection()
        fields["constituents_embeddings_description1"] = 1
        fields["constituents_embeddings_description2"] = 1
        # Only keep documents whose embeddings arrays are non-empty.
        query["constituents_embeddings_description1.0"] = {'$exists': True}
        query["constituents_embeddings_description2.0"] = {'$exists': True}
    else:
        query, fields = get_raw_data_query_and_projection_categorical(column=column)
        fields["constituents_embeddings"] = 1
        query["constituents_embeddings.0"] = {'$exists': True}
    return col.find(query, fields)
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ['JIRA_MONGODB_DATABASE_NAME']]
    log.debug(f"Existing collections in '{db.name}': {str(db.list_collection_names())}")
    # Drop every per-project, per-year collection that exists.
    for project in os.environ["JIRA_PROJECT_NAME"].split(","):
        for year in range(2001, 2021):
            col = db[f"{project.lower()}_{year}_{year + 1}"]
            if col.name in db.list_collection_names():
                db.drop_collection(col.name)
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ['GERRIT_DB_NAME']]
    log.debug(f"Existing collections in '{db.name}': {str(db.list_collection_names())}")
    # Drop every per-project, per-year collection that exists.
    for project in os.environ["GERRIT_PROJECT_NAME"].split(","):
        for year in range(int(os.environ['GERRIT_FIRST_CREATION_YEAR']),
                          int(os.environ['GERRIT_LAST_CREATION_YEAR'])):
            col = db[f"{project.lower()}_{year}_{year + 1}"]
            if col.name in db.list_collection_names():
                log.info(f"Dropping collection {col.name} ...")
                db.drop_collection(col.name)
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # Initialize the MongoDB parameters used to store the statistics.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ["JIRA_MONGODB_DATABASE_NAME"]]
    col = db[os.environ["JIRA_STATISTICS_MONGODB_COLLECTION_NAME"]]
    # Initialize the result.
    statistics_dict = {}
    for project in os.environ["JIRA_PROJECT_NAME"].split(","):
        statistics_dict["project"] = project
        total_bugs = 0
        for year in range(int(os.environ["JIRA_FIRST_CREATION_YEAR"]),
                          datetime.datetime.now().year + 1):
            statistics_dict[year] = {}
            statistics_dict[year]["_total"] = 0
            total_bugs_year = 0
            for month in range(1, 13):
                max_year = year
                max_month = month + 1
                if max_month > 12:
                    max_month = 1
                    max_year += 1
                issues = get_issues_by_date_range(
                    project=project,
                    min_created_date=f"{year}-{str(month).zfill(2)}-01",
                    max_creation_date=f"{max_year}-{str(max_month).zfill(2)}-01",
                    max_results=-1,  # No result limit (the original '-False' evaluated to 0).
                    fields="id"
                )
                # Total number of issues for the analyzed month.
                statistics_dict[year][month] = len(issues)
                total_bugs_year += len(issues)
            # Total number of issues for the analyzed year.
            statistics_dict[year]["_total"] = total_bugs_year
            log.info(json.dumps(statistics_dict))
            # Total number of issues for the project.
            total_bugs += total_bugs_year
        statistics_dict["_total"] = total_bugs
        # Store the statistics for the project.
        statistics_json = json.dumps(statistics_dict)
        col.insert_one(json.loads(statistics_json))
def load_dataset_from_mongodb(
        task: str = 'duplicity',
        database_name: str = 'bugzilla',
        collection_name: str = 'normalized_clear',
        query_limit: int = 0
) -> pd.DataFrame:
    log.info(f"Reading data from MongoDB: '{database_name}.{collection_name}'")
    tic = time.time()
    # MongoClient connection.
    mongodb_client: MongoClient = get_default_mongo_client() if os.environ['WORK_ENVIRONMENT'] == 'aws' \
        else MongoClient(host='localhost', port=27017)
    db = mongodb_client[database_name]
    col = db[collection_name]
    if col.name not in db.list_collection_names():
        raise errors.CollectionInvalid(f"Collection '{db.name}.{col.name}' not found. "
                                       f"Make sure your collection name is correct.")
    # Queries the MongoDB collection.
    query = get_task_dataset_query(task)
    log.info(f"Query filter document: {query}")
    projection = get_task_dataset_projection(task)
    log.info(f"Projection document: {projection}")
    mongodb_data = col.find(query, projection).limit(query_limit)
    # Expands the cursor and builds the DataFrame.
    df = pd.DataFrame(list(mongodb_data))
    log.info(f"Read documents from MongoDB: {len(df.index)}")
    log.info(f"Reading data from MongoDB total time: {(time.time() - tic) / 60} minutes")
    return df
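# A minimal usage sketch for load_dataset_from_mongodb, assuming the default
# 'bugzilla.normalized_clear' collection exists and WORK_ENVIRONMENT is set;
# the 'duplicity' task name comes from the defaults above.
if __name__ == "__main__":
    df_duplicity = load_dataset_from_mongodb(
        task='duplicity',
        database_name='bugzilla',
        collection_name='normalized_clear',
        query_limit=1000  # In PyMongo, limit(0) means no limit, so 0 keeps everything.
    )
    print(df_duplicity.head())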
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    order = 'ascending' if input_params['order'] == 1 else 'descending'
    log.info(f"Creating {order} index on field: '{input_params['db_name']}.{input_params['collection_name']}."
             f"{input_params['field_name']}' ...")
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[input_params['db_name']]
    col = db[input_params['collection_name']]
    col.create_index([
        (input_params['field_name'], DESCENDING if input_params['order'] == -1 else ASCENDING)
    ])
    final_time = time.time()
    log.info(f"Creating {order} index on field: '{input_params['field_name']}' total execution time = "
             f"{(final_time - initial_time) / 60} minutes")
def encode_and_save_assigned_to(dataset, corpus, n_developers):
    df = pd.DataFrame(columns=['assigned_to'])
    column_value_counts = dataset['label'].value_counts()
    df['assigned_to'] = column_value_counts.keys().to_list()
    # Assign numerical codes and store them in another column.
    df["assigned_to_code"] = df['assigned_to'].index
    log.info("Assigned to codes: ")
    log.info(df[:n_developers])
    # Mongo client.
    mongodb_client: MongoClient = get_default_mongo_client() if os.environ['WORK_ENVIRONMENT'] == 'aws' \
        else MongoClient(host='localhost', port=27017)
    db = mongodb_client[corpus]
    col = db["assigned_to_codes"]
    if col.name in db.list_collection_names():
        db.drop_collection(col.name)
    log.info("Inserting documents ...")
    inserted_documents = col.insert_many(df.to_dict("records"))
    log.info(f"Inserted documents: {len(inserted_documents.inserted_ids)}")
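# Illustrative call for encode_and_save_assigned_to, assuming a dataset whose
# 'label' column holds developer identifiers; the 'bugzilla' corpus name is an
# assumption for this sketch, not taken from the original module.
if __name__ == "__main__":
    df_demo = pd.DataFrame({'label': ['dev_a', 'dev_b', 'dev_a', 'dev_c', 'dev_a']})
    encode_and_save_assigned_to(dataset=df_demo, corpus='bugzilla', n_developers=3)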
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Initialize the variables that will store the input arguments.
    input_params = get_input_params()
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[input_params.db_name]
    source_collection = db[input_params.collection_name]
    target_collection = db[f"{input_params.collection_name}_embeddings"]
    log.debug(f"Existing collections in '{db.name}': {str(db.list_collection_names())}")
    if target_collection.name in db.list_collection_names() and input_params.drop_collection:
        db.drop_collection(target_collection.name)
    # The field renames in this projection use aggregation expressions, which
    # 'find' projections support from MongoDB 4.4 onwards.
    cursor = source_collection.find({}, {
        "creation_ts": "$creation_time",
        "short_desc": "$summary",
        "bug_status": "$status",
        "bug_id": "$id",
        "dup_id": "$dupe_of",
        "resolution": 1,
        "version": 1,
        "product": 1,
        "priority": 1,
        "component": 1,
        "delta_ts": 1,
        "bug_severity": "$severity",
        "description": "$comments.0",
        "normalized_short_desc": 1,
        "normalized_description": 1,
        "comments": {"$slice": 1}
    })
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Initialize the variables that will store the input arguments.
    input_params = get_input_params()
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[input_params['mongo_params'].db_name]
    col = db[f"{input_params['mongo_params'].collection_name}"
             f"_{input_params['bz_api_params'].year}_{input_params['bz_api_params'].year + 1}"]
    log.debug(f"Existing collections in '{db.name}': {str(db.list_collection_names())}")
    if col.name in db.list_collection_names() and input_params['mongo_params'].drop_collection:
        db.drop_collection(col.name)
    # For each month of the year, retrieve the issues using the Bugzilla API.
    max_year = input_params['bz_api_params'].year
    for month in range(input_params['bz_api_params'].start_month,
                       input_params['bz_api_params'].end_month + 1):
        max_month = month + 1
        if max_month > 12:
            max_month = 1
            max_year += 1
        bugs = get_bugzilla_bugs_by_date_range(
            project=input_params['bz_api_params'].project,
            min_creation_ts=f"{input_params['bz_api_params'].year}-{str(month).zfill(2)}-01",
            max_creation_ts=f"{max_year}-{str(max_month).zfill(2)}-01",
            max_results=input_params['bz_api_params'].query_limit,
            include_fields=input_params['bz_api_params'].include_fields,
            get_comments=input_params['bz_api_params'].get_comments)
        save_issues_to_mongodb(mongodb_collection=col, issues=bugs)
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["bugzilla"]
    eclipse_base_col = db["eclipse_base"]
    # Fields selected for the studies addressed in the research project.
    fields = {"_id": 0}
    eclipse_base_data = eclipse_base_col.find({}, fields)  # .limit(20)
    # Expand the cursor and build the DataFrame.
    eclipse_base = pd.DataFrame(list(eclipse_base_data))
    eclipse_base_summary_col = db["clear_description_nlp_2"]
    eclipse_base_summary_data = eclipse_base_summary_col.find({}, fields)
    eclipse_base_summary = pd.DataFrame(list(eclipse_base_summary_data))
    eclipse_base_data_description_col = db["clear_short_desc_nlp"]
    eclipse_base_data_description_data = eclipse_base_data_description_col.find({}, fields)
    eclipse_base_data_description = pd.DataFrame(list(eclipse_base_data_description_data))
    # Merge the DataFrames.
    eclipse_base_summary_description = pd.merge(eclipse_base_summary,
                                                eclipse_base_data_description,
                                                on='id')
    eclipse_base_clear = pd.merge(eclipse_base,
                                  eclipse_base_summary_description,
                                  on='id')
    # Store the DataFrame in MongoDB.
    db["eclipse_base_clear"].insert_many(eclipse_base_clear.to_dict('records'))
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def _db_load(self) -> dict:
    """Load the task from the db."""
    log.info(f"Loading data (task_id: '{self.task_id}') from '{self.save_dbname}.{self.save_collection}' ...")
    initial_time = time.time()
    client: MongoClient = get_default_mongo_client()
    task = client[self.save_dbname][self.save_collection].find_one(
        {'task_id': self.task_id}, {'_id': 0})
    final_time = time.time()
    log.info(f"Loading data from MongoDB total time: {(final_time - initial_time) / 60} minutes")
    return task
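# A self-contained round-trip sketch of the upsert/find_one pattern used by
# _db_store and _db_load, assuming a MongoDB instance on localhost; the
# 'tasks_db.tasks' namespace below is a placeholder, not the real one.
from pymongo import MongoClient

client = MongoClient(host='localhost', port=27017)
col = client['tasks_db']['tasks']
col.update_one({'task_id': '42'},
               {'$set': {'task_id': '42', 'task_name': 'demo', 'task_action': 'fit'}},
               upsert=True)
task = col.find_one({'task_id': '42'}, {'_id': 0})
print(task)  # {'task_id': '42', 'task_name': 'demo', 'task_action': 'fit'}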
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    # Mongo client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME', input_params['db_name'])]
    gerrit_collections = db.list_collection_names()
    log.debug(f"Existing collections in '{db.name}': {str(gerrit_collections)}")
    for project in os.environ["GERRIT_PROJECT_NAME"].split(","):
        df_list = []
        for year in range(int(os.environ['GERRIT_FIRST_CREATION_YEAR']),
                          int(os.environ['GERRIT_LAST_CREATION_YEAR'])):
            col = db[f"{input_params['collection_name']}_{year}_{year + 1}"]
            if col.name in gerrit_collections:
                tic = time.time()
                log.info(f"Retrieving Gerrit issues for year '{year}' ...")
                data = col.find({}, {'_id': 0})
                df = pd.DataFrame(list(data))
                log.info(f"Gerrit issues for year '{year}': {df.shape[0]}")
                df_list.append(df)
                log.info(f"Retrieving Gerrit issues for year '{year}' execution time = "
                         f"{(time.time() - tic) / 60} minutes")
        # Concatenate the yearly DataFrames and store them in a single per-project collection.
        df_concatenated = pd.concat(df_list)
        table_dict = df_concatenated.to_dict("records")
        if project.lower() in gerrit_collections:
            db.drop_collection(project.lower())
        db[project.lower()].insert_many(table_dict)
    final_time = time.time()
    log.info(f"Retrieving Gerrit issues total execution time = {(final_time - initial_time) / 60} minutes")
def task_hyper_fit_0(tasks, corpus, date_range_train, date_range_test, common_kwargs):
    """
    Fit the hyperparameters of a set of run tasks (from the task module).

    Args:
        tasks (dict): A mapping from task names to their relevant metric.
        corpus (str): The corpus used to select the tasks to take into account.
        date_range_train (2-tuple of 3-tuple of int): Training date range to take into account.
        date_range_test (2-tuple of 3-tuple of int): Test date range to take into account.
        common_kwargs (list of list of str): List of hyperparameters to adjust in common.

    Returns:
        - Tuple of 2-tuple: Pairs of parameters (with their names flattened) and their values.
        - dict of str to (dict, float): A mapping from task names to their configuration and the score.
    """
    client: MongoClient = get_default_mongo_client() if os.environ['WORK_ENVIRONMENT'] == 'aws' \
        else get_default_local_mongo_client()
    task_names = list(tasks.keys())
    data = list(client["incidences"]["tasks"].find({
        'type': {"$in": task_names},
        'kwargs.corpus': corpus,
        'kwargs.date_range_train': date_range_train,
        'kwargs.date_range_test': date_range_test,
    }))
    results = [[(d["kwargs"], d["results"][tasks[d["type"]]]) for d in data if d["type"] == task_type]
               for task_type in task_names]
    common_rank = rank_common_parameters(results, common_kwargs)
    best = common_rank[0][0]
    return best, {
        task: rank_specific_parameters(result, best)[0]
        for result, task in zip(results, tasks)
    }
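# A hedged example call for task_hyper_fit_0, assuming documents matching these
# arguments already exist in 'incidences.tasks'; the task names, metrics and
# hyperparameter paths below are illustrative only.
best_common, per_task = task_hyper_fit_0(
    tasks={'duplicity': 'f1_macro', 'assignation': 'accuracy'},
    corpus='bugzilla',
    date_range_train=((2010, 1, 1), (2017, 12, 31)),
    date_range_test=((2018, 1, 1), (2019, 12, 31)),
    common_kwargs=[['model', 'embedding_size'], ['model', 'learning_rate']]
)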
def get_constituency_tree_raw_data(db_name, col_name, categorical=True):
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    log.debug("\n[EXECUTION START]")
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    mongodb_client: MongoClient = get_default_mongo_client() if os.environ['WORK_ENVIRONMENT'] == 'aws' \
        else get_default_local_mongo_client()
    db = mongodb_client[db_name]
    col = db[col_name]
    if not categorical:
        query, fields = get_raw_data_query_and_projection()
    else:
        query, fields = get_raw_data_query_and_projection_categorical()
    return col.find(query, fields)
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["bugzilla"]
    col = db["eclipse_base"]
    # Fields selected for the studies addressed in the research project.
    fields = {
        "_id": 0,
        "id": 1,
        "summary": 1
        # Other available fields: "assigned_to", "assigned_to_detail",
        # "classification", "component", "creation_time", "creator",
        # "creator_detail", "dupe_of", "op_sys", "platform", "priority",
        # "product", "resolution", "severity", "status", "version", "description".
    }
    data = col.find({}, fields)  # .limit(20)
    # Expand the cursor and build the DataFrame.
    clear_nlp = pd.DataFrame(list(data))
    print(clear_nlp)
    nltk.download("stopwords", quiet=True)
    # Text-normalization pipeline over the 'summary' field.
    clear_nlp["summary_split_alpha"] = clear_nlp["summary"].apply(clean_doc_split)
    clear_nlp["summary_lower"] = clear_nlp["summary_split_alpha"].apply(clean_doc_lower)
    clear_nlp["summary_punctuaction"] = clear_nlp["summary_lower"].apply(clean_doc_punctuaction)
    clear_nlp["summary_trim"] = clear_nlp["summary_punctuaction"].apply(clean_doc_trim)
    clear_nlp["summary_isalpha"] = clear_nlp["summary_trim"].apply(clean_doc_isalpha)
    clear_nlp["summary_stop_words"] = clear_nlp["summary_isalpha"].apply(clean_doc_stopW)
    clear_nlp["summary_diacritic"] = clear_nlp["summary_stop_words"].apply(clean_doc_diacri)
    clear_nlp["summary_lemmatizer"] = clear_nlp["summary_diacritic"].apply(clean_doc_lem)
    # Store the DataFrame in MongoDB.
    db["eclipse_base_summary_clear"].insert_many(clear_nlp.to_dict('records'))
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
labels_value_counts = df_labels['label'].value_counts()
log.info(f"Number of distinct label values: {labels_value_counts.shape[0]}")
df_distinct_labels = pd.DataFrame(
    data=labels_value_counts.keys().to_list(),
    columns=['label']
)
# Convert the type of the label column to 'category'.
df_distinct_labels['label'] = df_distinct_labels['label'].astype('category')
# Assign numerical codes and store them in another column.
df_distinct_labels['label_code'] = df_distinct_labels['label'].cat.codes
# Mongo client.
mongodb_client: MongoClient = get_default_mongo_client() if os.environ['WORK_ENVIRONMENT'] == 'aws' \
    else MongoClient(host='localhost', port=27017)
db = mongodb_client[input_params['corpus']]
col = db[f"{input_params['task']}_task_labels"]
log.debug(f"Existing collections in '{db.name}': {str(db.list_collection_names())}")
if col.name in db.list_collection_names():
    db.drop_collection(col.name)
log.info("Inserting documents ...")
inserted_documents = col.insert_many(df_distinct_labels.to_dict("records"))
log.info(f"Inserted documents: {len(inserted_documents.inserted_ids)}")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    log.info("Finding similar issues ...")
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    # Load Gerrit data.
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME', input_params['db_name'])]
    collection = db[input_params['collection_name']]
    data = collection.find({}, {'_id': 0})
    df = pd.DataFrame(list(data))
    # Check for an empty DataFrame.
    if 0 == df.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{db.name}.{collection.name}' collection.")
    similar_column_name = 'sim_bugs' if not input_params['near_issues'] else 'near_bugs'
    # For each issue that has similar issues, generate a pair of similar issues
    # and a pair of non-similar issues.
    pairs = generate_pairs(df, similar_column_name)
    log.info(f"Pairs generated: {len(pairs)}")
    # DataFrame of pairs.
    df_pairs = pd.DataFrame(pairs)
    # Split the DataFrame into batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df_pairs.shape[0] / input_params['batch_size'])
    batches = np.array_split(df_pairs, num_batches)
    output_db = mongodb_client[input_params['output_db_name']]
    output_collection = output_db[input_params['output_similar_collection_name']] if not input_params['near_issues'] \
        else output_db[input_params['output_near_collection_name']]
    # Drop the collection on the output database if it already exists.
    if output_collection.name in output_db.list_collection_names():
        log.info(f"Dropping collection '{output_db.name}.{output_collection.name}' ...")
        output_db.drop_collection(output_collection.name)
    inserted_docs_number = 0
    for batch in batches:
        log.info("Inserting documents ...")
        inserted_documents = output_collection.insert_many(batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)
    log.info(f"Inserted documents: {inserted_docs_number}")
    final_time = time.time()
    log.info(f"Finding similar issues total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    log.info("Finding similar issues ...")
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    # Load Gerrit data.
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME', input_params['db_name'])]
    collection = db[input_params['collection_name']]
    data = collection.find({}, {'_id': 0})
    df = pd.DataFrame(list(data))
    # Check for an empty DataFrame.
    if 0 == df.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{db.name}.{collection.name}' collection.")
    # Initialize empty list columns.
    df['sim_bugs'] = [[] for _ in range(len(df))]
    df['near_bugs'] = [[] for _ in range(len(df))]
    # Compare every pair of rows.
    all_sim_bugs = []
    all_near_bugs = []
    similarity_threshold = float(input_params['similarity_threshold'])
    for i in tqdm(range(df.shape[0])):
        for j in range(df.shape[0]):
            if j == i:
                continue
            bug_anc = df.at[i, 'bug_id']
            bug_pos = df.loc[j, 'bug_id']
            jaccard_similarity = jaccard_score(df.loc[i, 'file_list'], df.loc[j, 'file_list'])
            if jaccard_similarity >= similarity_threshold:
                if [bug_anc, bug_pos] not in all_sim_bugs and [bug_pos, bug_anc] not in all_sim_bugs:
                    df.at[i, 'sim_bugs'].append(int(bug_pos))
                    all_sim_bugs.append([bug_anc, bug_pos])
            # 'Near' pairs fall within 0.25 below the similarity threshold.
            if similarity_threshold - 0.25 <= jaccard_similarity < similarity_threshold:
                if [bug_anc, bug_pos] not in all_near_bugs and [bug_pos, bug_anc] not in all_near_bugs:
                    df.at[i, 'near_bugs'].append(int(bug_pos))
                    all_near_bugs.append([bug_anc, bug_pos])
    # Split the DataFrame into batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df.shape[0] / input_params['batch_size'])
    batches = np.array_split(df, num_batches)
    output_collection = db[input_params['output_collection_name']]
    # Drop the collection if it already exists.
    if output_collection.name in db.list_collection_names():
        log.info(f"Dropping collection '{db.name}.{output_collection.name}' ...")
        db.drop_collection(output_collection.name)
    inserted_docs_number = 0
    for batch in batches:
        log.info("Inserting documents ...")
        inserted_documents = output_collection.insert_many(batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)
    log.info(f"Inserted documents: {inserted_docs_number}")
    final_time = time.time()
    log.info(f"Finding similar issues total execution time = {(final_time - initial_time) / 60} minutes")
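# The jaccard_score used above is assumed to be a set-based helper over file
# path lists rather than sklearn.metrics.jaccard_score, which expects binary
# label vectors. A minimal sketch of such a helper:
def jaccard_score(file_list_a, file_list_b):
    """Jaccard similarity between two file lists: |A & B| / |A | B|."""
    set_a, set_b = set(file_list_a), set(file_list_b)
    if not set_a and not set_b:
        return 0.0
    return len(set_a & set_b) / len(set_a | set_b)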
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    log.info("Processing Gerrit issues ...")
    # MongoDB data.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME', input_params['db_name'])]
    col = db[input_params['collection_name']]
    data = col.find({}, {'_id': 0})
    df = pd.DataFrame(list(data))
    df['bug_id'] = -1
    # Extract bug IDs from subjects matching either '[<number>]' or 'Bug <number>'.
    bug_id_list = []
    loop = tqdm(range(df.shape[0]))
    for i in loop:
        matched_1 = re.search(r'\[\s*\+?(-?\d+)\s*]', df.loc[i, 'subject'])
        matched_2 = re.search(r'[Bb][Uu][Gg]\s[0-9]+', df.loc[i, 'subject'])
        is_match = bool(matched_1) + bool(matched_2)
        if bool(is_match):
            flag = True
            res = re.findall(r'[Bb][Uu][Gg]\s[0-9]+', df.iloc[i]['subject'])
            if not res:
                flag = False
                res = re.findall(r'\[\s*\+?(-?\d+)\s*]', df.iloc[i]['subject'])
            # 'Bug 1234' keeps the digits after the 4-character prefix; '[1234]' is already digits.
            bug_id = int(res[0][4:]) if flag else int(res[0])
            if bug_id in bug_id_list:
                # The bug was already seen: merge the file lists of both changes.
                idx = df.index[df['bug_id'] == bug_id].tolist()
                if len(idx) > 1:
                    raise ValueError(f"There is more than one bug with 'bug_id' = {bug_id}")
                previous_file_list = get_filtered_file_list(df.iloc[idx[0]]['file_list'])
                current_file_list = get_filtered_file_list(df.iloc[i]['file_list'])
                file_list = set(previous_file_list + current_file_list)
                df.at[idx[0], 'file_list'] = list(file_list)
            else:
                bug_id_list.append(bug_id)
                df.at[i, 'bug_id'] = bug_id
                df.at[i, 'file_list'] = get_filtered_file_list(df.iloc[i]["file_list"])
    # Drop the collection if it already exists.
    if input_params['output_collection_name'] in db.list_collection_names():
        log.info(f"Dropping collection {input_params['output_collection_name']} ...")
        db.drop_collection(input_params['output_collection_name'])
    # Split the DataFrame into batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df.shape[0] / input_params['batch_size'])
    batches = np.array_split(df.loc[df['bug_id'] != -1], num_batches)
    # Insert the documents that have a bug_id into MongoDB.
    inserted_docs_number = 0
    for batch in batches:
        log.info("Inserting documents ...")
        inserted_documents = db[input_params['output_collection_name']].insert_many(batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)
    final_time = time.time()
    log.info(f"Processing Gerrit issues total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["eclipse"]
    col = db["clear"]
    # Fields selected for the studies addressed in the research project.
    fields = {
        "_id": 0,
        "bug_id": 1,
        "short_desc": 1,
        "bug_status": 1
        # Other available fields: "product", "description", "bug_severity",
        # "dup_id", "priority", "version", "component", "delta_ts",
        # "creation_ts", "resolution".
    }
    # Only resolved issues will be used.
    query = {'$or': [{'bug_status': {'$eq': 'RESOLVED'}},
                     {'bug_status': {'$eq': 'VERIFIED'}}]}
    data = col.find(query, fields)  # .limit(20)
    # Expand the cursor and build the DataFrame.
    clear_nlp = pd.DataFrame(list(data))
    print(clear_nlp)
    nltk.download("stopwords", quiet=True)
    # Text-normalization pipeline over the 'short_desc' field.
    clear_nlp["short_desc_split_alpha"] = clear_nlp["short_desc"].apply(clean_doc_split)
    clear_nlp["short_desc_lower"] = clear_nlp["short_desc_split_alpha"].apply(clean_doc_lower)
    clear_nlp["short_desc_punctuaction"] = clear_nlp["short_desc_lower"].apply(clean_doc_punctuaction)
    clear_nlp["short_desc_trim"] = clear_nlp["short_desc_punctuaction"].apply(clean_doc_trim)
    clear_nlp["short_desc_isalpha"] = clear_nlp["short_desc_trim"].apply(clean_doc_isalpha)
    clear_nlp["short_desc_stop_words"] = clear_nlp["short_desc_isalpha"].apply(clean_doc_stopW)
    clear_nlp["short_desc_diacritic"] = clear_nlp["short_desc_stop_words"].apply(clean_doc_diacri)
    clear_nlp["short_desc_lemmatizer"] = clear_nlp["short_desc_diacritic"].apply(clean_doc_lem)
    # Drop the 'bug_status' column because a merge with the original collection will be done later.
    clear_nlp.drop('bug_status', axis=1, inplace=True)
    # Store the DataFrame in MongoDB.
    db["clear_short_desc_nlp"].insert_many(clear_nlp.to_dict('records'))
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def get_mongo_client(environment):
    # Return a local client for the 'local' environment, otherwise the default one.
    if environment == 'local':
        return MongoClient(host='localhost', port=27017)
    return get_default_mongo_client()
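# Usage sketch for get_mongo_client, reusing the WORK_ENVIRONMENT convention
# seen in the other modules; defaulting to 'local' here is an assumption for
# this example, not part of the original code.
client = get_mongo_client(os.environ.get('WORK_ENVIRONMENT', 'local'))
print(client.list_database_names())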
if __name__ == "__main__":
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    log.info("Updating all normalized_clear years ...")
    # Load the parameters.
    input_params = get_input_params()
    assert input_params is not None, "No params provided."
    # Mongo client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[input_params['corpus']]
    col_name = 'normalized_clear_updated' if input_params['closed_states'] else 'normalized_clear_all_states'
    col = db[col_name]
    if input_params['drop_collection'] and col.name in db.list_collection_names():
        log.info(f"Dropping collection '{db.name}.{col.name}'")
        db.drop_collection(col.name)
    # Defines the per-year command with the Python executable for this OS.
    for year in range(2000, 2021):
        cmd = get_command(os.name, input_params['corpus'], year, input_params['closed_states'])
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["bugzilla"]
    col = db["eclipse_all"]
    # Fields selected for the studies addressed in the research project.
    fields = {
        "_id": 0,
        "assigned_to": 1,
        "assigned_to_detail": 1,
        "classification": 1,
        "component": 1,
        "creation_time": 1,
        "creator": 1,
        "creator_detail": 1,
        "dupe_of": 1,
        "id": 1,
        "op_sys": 1,
        "platform": 1,
        "priority": 1,
        "product": 1,
        "resolution": 1,
        "severity": 1,
        "status": 1,
        "summary": 1,
        "version": 1,
        "description": "$comments.text"
    }
    aggregation_project = {"$project": fields}
    # Only resolved issues with a non-empty first comment will be used.
    aggregation_match = {"$match": {
        '$and': [
            {'$or': [
                {'status': {'$eq': 'RESOLVED'}},
                {'status': {'$eq': 'VERIFIED'}},
                {'status': {'$eq': 'CLOSED'}}
            ]},
            {'comments': {'$exists': 'true'}},
            {'comments': {'$ne': 'null'}},
            {'comments': {'$ne': ""}},
            {'comments': {'$not': {'$size': 0}}},
            {'comments.count': {'$eq': 0}}
        ]
    }}
    aggregation_unwind = {"$unwind": "$comments"}
    aggregation_limit = {"$limit": 20}
    data = col.aggregate([
        aggregation_unwind,
        aggregation_match,
        # aggregation_limit,
        aggregation_project
    ])
    # Expand the cursor and build the DataFrame.
    eclipse_base = pd.DataFrame(list(data))
    # Store the DataFrame in MongoDB.
    db["eclipse_base"].insert_many(eclipse_base.to_dict('records'))
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["bugzilla"]
    col = db["eclipse_all"]
    # Fields selected for the studies addressed in the research project.
    fields = {
        "_id": 0,
        "assigned_to": 1,
        "assigned_to_detail": 1,
        "classification": 1,
        "component": 1,
        "creation_time": 1,
        "creator": 1,
        "creator_detail": 1,
        "dupe_of": 1,
        "id": 1,
        "op_sys": 1,
        "platform": 1,
        "priority": 1,
        "product": 1,
        "resolution": 1,
        "severity": 1,
        "status": 1,
        "summary": 1,
        "version": 1,
        "description": "$comments.text"
    }
    aggregation_project = {"$project": fields}
    # Only resolved issues will be used.
    aggregation_match = {"$match": {
        '$and': [
            {'$or': [
                {'status': {'$eq': 'RESOLVED'}},
                {'status': {'$eq': 'VERIFIED'}},
                {'status': {'$eq': 'CLOSED'}}
            ]},
            {'comments.count': {'$eq': 0}}
        ]
    }}
    aggregation_unwind = {"$unwind": "$comments"}
    aggregation_limit = {"$limit": 20}
    data = col.aggregate([
        aggregation_unwind,
        aggregation_match,
        aggregation_limit,
        aggregation_project
    ])
    # Expand the cursor and build the DataFrame.
    eclipse_base = pd.DataFrame(list(data))
    print(eclipse_base.head(10))
    # 1st: convert the variable to the 'category' type.
    eclipse_base.priority = eclipse_base.priority.astype('category')
    print(eclipse_base.head(10))
    # 2nd: categorize.
    eclipse_base['priority_cod'] = eclipse_base['priority'].cat.codes
    print(eclipse_base.head(10))
    # Equivalent sklearn LabelEncoder recipe:
    # 1 - Create an instance: le = sklearn.preprocessing.LabelEncoder()
    # 2 - Fit it to your data: le.fit(labels) (here, "labels = data[:,0]" was the predictions column).
    # 3 - Get the column with the transformed values: labels = le.transform(labels)
    # 4 - You can keep the original values to establish a mapping: class_names = le.classes_
    # df["creation_time_year"] = df["creation_time"].str[:4]
    # df["last_change_time_year"] = df["last_change_time"].str[:4]
    # df["resolution_string"] = df["resolution"].apply(lambda y: "EMPTY_FIELD" if len(y) == 0 else y)
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    log.info("Finding duplicate issues ...")
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    # Load Bugzilla data.
    db = mongodb_client[input_params['input_db_name']]
    # Collection with all issues (duplicated and non-duplicated).
    original_collection = db[input_params['original_collection_name']]
    query = {}
    projection = {'_id': 0, 'bug_id': 1}
    log.info(f"Reading data from '{db.name}.{original_collection.name}' ...")
    original_data = original_collection.find(query, projection)
    df_original = pd.DataFrame(list(original_data))
    # Check for an empty DataFrame.
    if 0 == df_original.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{db.name}.{original_collection.name}' collection.")
    # List with all bug_id values.
    original_bug_id_list = df_original['bug_id'].to_list()
    collection_pairs_first_step = db[input_params['input_collection_name']]
    query = {}
    projection = {'_id': 0}
    data_pairs_first_step = collection_pairs_first_step.find(query, projection)
    df_pairs_first_step = pd.DataFrame(list(data_pairs_first_step))
    # Check for an empty DataFrame.
    if 0 == df_pairs_first_step.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{db.name}.{collection_pairs_first_step.name}' collection.")
    bug1_list = df_pairs_first_step['bug1'].to_list()
    bug2_list = df_pairs_first_step['bug2'].to_list()
    no_duplicate_bug_id_list = set(original_bug_id_list) - set(bug1_list) - set(bug2_list)
    # Generate additional duplicate pairs from indirect relations.
    pairs_from_indirect_relations = check_indirect_relations(
        df_pairs_first_step, list(no_duplicate_bug_id_list))
    log.info(f"Pairs generated: {len(pairs_from_indirect_relations)}")
    # DataFrame of pairs.
    df_pairs = pd.DataFrame(pairs_from_indirect_relations)
    # Split the DataFrame into batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df_pairs.shape[0] / input_params['batch_size'])
    batches = np.array_split(df_pairs, num_batches)
    output_collection = db[input_params['output_collection_name']]
    # Drop the collection if it already exists.
    if output_collection.name in db.list_collection_names():
        log.info(f"Dropping collection '{db.name}.{output_collection.name}' ...")
        db.drop_collection(output_collection.name)
    inserted_docs_number = 0
    for batch in batches:
        log.info("Inserting documents ...")
        inserted_documents = output_collection.insert_many(batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)
    log.info(f"Inserted documents: {inserted_docs_number}")
    final_time = time.time()
    log.info(f"Finding duplicate issues total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["eclipse"]
    clear_col = db["clear"]
    # Fields selected for the studies addressed in the research project.
    fields = {
        "_id": 0,
    }
    # Only resolved issues will be used.
    query = {'$or': [{'bug_status': {'$eq': 'RESOLVED'}},
                     {'bug_status': {'$eq': 'VERIFIED'}}]}
    clear_data = clear_col.find(query, fields)  # .limit(20)
    # Expand the cursor and build the DataFrame.
    clear = pd.DataFrame(list(clear_data))
    clear_short_desc_nlp_col = db["clear_short_desc_nlp"]
    clear_short_desc_nlp_data = clear_short_desc_nlp_col.find({}, fields)
    clear_short_desc_nlp = pd.DataFrame(list(clear_short_desc_nlp_data))
    clear_description_nlp_col = db["clear_description_nlp_2"]
    clear_description_nlp_data = clear_description_nlp_col.find({}, fields)
    clear_description_nlp = pd.DataFrame(list(clear_description_nlp_data))
    # Merge the DataFrames.
    clear_short_desc_description_nlp = pd.merge(clear_short_desc_nlp,
                                                clear_description_nlp,
                                                on='bug_id')
    clear_nlp = pd.merge(clear, clear_short_desc_description_nlp, on='bug_id')
    # Store the DataFrame in MongoDB.
    db["clear_nlp"].insert_many(clear_nlp.to_dict('records'))
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    # Mongo client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME', input_params['db_name'])]
    year = input_params['year']
    col = db[f"{input_params['collection_name']}_{year}_{year + 1}"]
    log.debug(f"Existing collections in '{db.name}': {str(db.list_collection_names())}")
    if col.name in db.list_collection_names():
        db.drop_collection(col.name)
    # Get the Gerrit issues month by month.
    max_year = year
    for month in range(input_params['start_month'], input_params['end_month'] + 1):
        max_month = month + 1
        if max_month > 12:
            max_month = 1
            max_year += 1
        after = str(datetime.datetime(input_params['year'], month, 1))
        before = str(datetime.datetime(max_year, max_month, 1))
        s = 0
        change_id, project, status, date, subject, author, committer, commit_msg = [], [], [], [], [], [], [], []
        file_list = []
        skip = ''
        flag = True
        log.info(f"Retrieving issues in range: {after} - {before}")
        # Page through the Gerrit REST API until no more changes are reported.
        while flag:
            issues_by_month = get_data_from_gerrit_rest_api(
                os.environ['GERRIT_API_URL'], after, before, skip=skip)
            log.info(f"Read issues: {len(issues_by_month)}")
            if len(issues_by_month) < 1:
                print('No issues for month {} and year {}'.format(month, year))
                flag = False
            else:
                for issue in issues_by_month:
                    change_id.append(issue['change_id'])
                    project.append(issue['project'])
                    status.append(issue['status'])
                    date.append(issue['updated'])
                    subject.append(issue['subject'])
                    current_rev = issue['current_revision']
                    author.append(issue['revisions'][current_rev]['commit']['author']['email'])
                    committer.append(issue['revisions'][current_rev]['commit']['committer']['email'])
                    commit_msg.append(issue['revisions'][current_rev]['commit']['message'])
                    file_list.append(list(issue['revisions'][current_rev]['files'].keys()))
                if len(issues_by_month) == 100:
                    last_dict = issues_by_month[99]
                    try:
                        if last_dict['_more_changes']:
                            log.info("There are more issues to read")
                            s = s + 100
                            skip = '&S=' + str(s)
                    except KeyError:
                        flag = False
                        print('Issues for month {} and year {}: {}'.format(month, year, len(change_id)))
                else:
                    print('Issues for month {} and year {}: {}'.format(month, year, len(change_id)))
                    flag = False
        dict_of_reg = {
            'change_id': change_id,
            'project': project,
            'status': status,
            'date': date,
            'subject': subject,
            'author': author,
            'committer': committer,
            'commit_msg': commit_msg,
            'file_list': file_list
        }
        df = pd.DataFrame(dict_of_reg)
        issues = df.to_dict("records")
        save_issues_to_mongodb(mongodb_collection=col, issues=issues)
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    log.info("Merging Gerrit and Bugzilla issues ...")
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    # Load Gerrit data.
    log.info("Loading Gerrit issues ...")
    tic = time.time()
    gerrit_db = mongodb_client[os.environ.get('GERRIT_DB_NAME', input_params['gerrit_db_name'])]
    gerrit_collection = gerrit_db[input_params['gerrit_collection_name']]
    gerrit_data = gerrit_collection.find({}, {'_id': 0})
    df_gerrit = pd.DataFrame(list(gerrit_data))
    log.info(f"Loading Gerrit issues total time: {(time.time() - tic) / 60} minutes.")
    # Check for an empty DataFrame.
    if 0 == df_gerrit.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{gerrit_db.name}.{gerrit_collection.name}' collection.")
    # Load Bugzilla data.
    log.info("Loading Bugzilla issues ...")
    tic = time.time()
    bugzilla_db = mongodb_client[os.environ.get('BUGZILLA_MONGODB_DATABASE_NAME', input_params['bugzilla_db_name'])]
    bugzilla_collection = bugzilla_db[input_params['bugzilla_collection_name']]
    bugzilla_data = bugzilla_collection.find({}, {'_id': 0, 'bug_id': 1})
    df_bugzilla = pd.DataFrame(list(bugzilla_data))
    log.info(f"Loading Bugzilla issues total time: {(time.time() - tic) / 60} minutes.")
    # Check for an empty DataFrame.
    if 0 == df_bugzilla.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{bugzilla_db.name}.{bugzilla_collection.name}' collection.")
    # Join on the 'bug_id' column.
    log.info("Joining Gerrit and Bugzilla DataFrames ...")
    tic = time.time()
    df_joined = df_gerrit.merge(df_bugzilla, left_on='bug_id', right_on='bug_id')
    log.info(f"Joining Gerrit and Bugzilla DataFrames total time: {(time.time() - tic) / 60} minutes.")
    # Split the DataFrame into batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df_joined.shape[0] / input_params['batch_size'])
    batches = np.array_split(df_joined, num_batches)
    output_collection = gerrit_db[input_params['output_collection_name']]
    # Drop the collection if it already exists.
    if input_params['output_collection_name'] in gerrit_db.list_collection_names():
        log.info(f"Dropping collection {input_params['output_collection_name']} ...")
        gerrit_db.drop_collection(input_params['output_collection_name'])
    inserted_docs_number = 0
    tic = time.time()
    for batch in batches:
        log.info("Inserting documents ...")
        inserted_documents = output_collection.insert_many(batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)
    log.info(f"Inserted documents: {inserted_docs_number}")
    log.info(f"Inserting documents total time: {(time.time() - tic) / 60} minutes.")
    final_time = time.time()
    log.info(f"Merging Gerrit and Bugzilla issues total execution time = {(final_time - initial_time) / 60} minutes")