def build_similarity_dataset(
        dataset: pd.DataFrame = None,
        corpus: str = 'bugzilla',
        collection_name: str = 'similar_pairs'
) -> pd.DataFrame:
    # Load the similar pairs dataframe.
    df_similar_pairs = load_dataframe_from_mongodb(
        database_name=corpus,
        collection_name=collection_name
    )

    # Change the bug_id column type to numeric.
    dataset['bug_id'] = pd.to_numeric(dataset['bug_id'])

    # Join on columns bug1 and bug_id.
    df_pairs_bug1_dataset_bug_id = df_similar_pairs.merge(
        dataset, left_on='bug1', right_on='bug_id')

    # Join on columns bug2 and bug_id.
    result = df_pairs_bug1_dataset_bug_id.merge(
        dataset, left_on='bug2', right_on='bug_id', suffixes=('_left', '_right'))
    result.drop(['bug1', 'bug2'], axis=1, inplace=True)

    # Rename column 'dec' to 'label'.
    result.rename(columns={"dec": "label"}, errors="raise", inplace=True)
    return result
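
# Illustrative usage sketch (not part of the original module): it assumes a MongoDB instance
# with a 'bugzilla' database and a populated 'similar_pairs' collection, plus a hypothetical
# `preprocessed_df` dataframe with one row per bug and a numeric-compatible 'bug_id' column.
def _example_build_similarity_dataset(preprocessed_df: pd.DataFrame) -> pd.DataFrame:
    df_similarity = build_similarity_dataset(dataset=preprocessed_df,
                                             corpus='bugzilla',
                                             collection_name='similar_pairs')
    # Each row now holds the '_left'/'_right' features of both bugs plus the 'label' column.
    log.info(f"Similarity label distribution:\n{df_similarity['label'].value_counts()}")
    return df_similarity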
def read_dataset_from_mongodb(
        database_name: str = 'bugzilla',
        collection_name: str = 'duplicity_task_train_dataset',
        query: dict = None,
        projection: dict = None,
        trees_columns: list = None,
        attention_vectors_columns: list = None,
        structured_data_columns: list = None,
        query_limit: int = 0) -> List[Tuple[Union[list, Any], ...]]:
    # Read the MongoDB collection.
    df = load_dataframe_from_mongodb(database_name=database_name,
                                     collection_name=collection_name,
                                     query=query,
                                     projection=projection,
                                     query_limit=query_limit)

    log.info("Generating trees ...")

    # Fall back to the default column names when none are provided.
    trees_columns = trees_columns if trees_columns else ['trees']
    attention_vectors_columns = attention_vectors_columns \
        if attention_vectors_columns else ['attention_vectors']
    structured_data_columns = structured_data_columns \
        if structured_data_columns else ['structured_data']

    rows = []
    loop = tqdm(range(df.shape[0]), desc='rows')
    for i in loop:
        row_elements = []
        # Columns with trees.
        for column_name in trees_columns:
            row_elements.append(get_trees_from_mongodb(df.at[i, column_name]))
        # Columns with attention vectors.
        for column_name in attention_vectors_columns:
            row_elements.append(df.at[i, column_name])
        # Columns with structured data vectors.
        for column_name in structured_data_columns:
            row_elements.append(df.at[i, column_name])
        # Column with the label.
        row_elements.append(df.at[i, 'label'])
        rows.append(tuple(row_elements))
    return rows
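
# Illustrative usage sketch (not part of the original module): it assumes documents in
# 'bugzilla.duplicity_task_train_dataset' contain 'trees', 'attention_vectors',
# 'structured_data' and 'label' fields; with the default column names each row is a
# four-element tuple.
def _example_read_dataset_from_mongodb() -> None:
    rows = read_dataset_from_mongodb(database_name='bugzilla',
                                     collection_name='duplicity_task_train_dataset',
                                     query_limit=100)
    # Tuples follow the column order: trees, attention vectors, structured data, label.
    trees, attention_vectors, structured_data, label = rows[0]
    log.info(f"Read {len(rows)} rows; first label: {label}")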
def get_similarity_dataset(
        dataset: pd.DataFrame = None,
        corpus: str = '',
        near_issues: bool = False,
        query_limit: int = 0
) -> pd.DataFrame:
    collection_name = 'near_pairs' if near_issues else 'similar_pairs'
    df_similar_pairs = load_dataframe_from_mongodb(
        database_name=corpus,
        collection_name=collection_name
    )

    # Sort by creation_ts.
    df = df_similar_pairs.sort_values('creation_ts')
    if query_limit > 0:
        df = df.head(query_limit).copy()

    dataset_merged = df.merge(dataset,
                              left_on='bug_id',
                              right_on='bug_id',
                              suffixes=('_left', '_right'))
    return dataset_merged
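
# Illustrative usage sketch (not part of the original module): it assumes the 'near_pairs'
# collection exists in the corpus database and that the hypothetical `preprocessed_df`
# shares the 'bug_id' join key.
def _example_get_similarity_dataset(preprocessed_df: pd.DataFrame) -> pd.DataFrame:
    # Use the 'near_pairs' collection and keep only the 1000 oldest pairs.
    return get_similarity_dataset(dataset=preprocessed_df,
                                  corpus='bugzilla',
                                  near_issues=True,
                                  query_limit=1000)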
def get_best_models(self, num_models=1):
    query = self._get_experiments_query()
    projection = self._get_best_models_projection()

    # All results are retrieved; the limit is applied after sorting.
    df = load_dataframe_from_mongodb(database_name=self.save_dbname,
                                     collection_name=self.save_collection,
                                     query=query,
                                     projection=projection,
                                     query_limit=0)
    if df.empty:
        log.error(
            f"No metrics saved in '{self.save_dbname}.{self.save_collection}' for task '{self.task}' "
            f"and corpus '{self.corpus}'.")
        raise ValueError(
            'Ensure database name, collection name, task name and corpus are correct.'
        )

    # Explode MongoDB fields.
    df_model_meta_file = df['model_meta_file']
    df_metrics = pd.json_normalize(df['metrics'])
    exploded_df = pd.concat(
        [df['task_id'], df_model_meta_file, df_metrics], axis=1)

    # Check that the objective metric exists.
    df_metrics_columns = df_metrics.columns
    self._metric_name_exits(self.objective, list(df_metrics_columns))

    # Sort by the objective metric.
    exploded_df = exploded_df.sort_values(by=self.objective, ascending=False)

    # Limit the number of returned models.
    if num_models > 0:
        exploded_df = exploded_df.head(num_models)

    # Return only task_id and model_meta_file.
    exploded_df.drop(list(df_metrics_columns), axis=1, inplace=True)
    return exploded_df.to_dict('records')
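
# Illustrative usage sketch (not part of the original module). The enclosing class is not
# shown here, so `tracker` is a hypothetical, already-configured instance with `save_dbname`,
# `save_collection`, `task`, `corpus` and `objective` set:
#
#   best = tracker.get_best_models(num_models=3)
#   for entry in best:
#       log.info(f"task_id={entry['task_id']}, model_meta_file={entry['model_meta_file']}")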
def get_pairs_dataset(
        dataset: pd.DataFrame = None,
        task: str = '',
        corpus: str = '',
        query_limit: int = 0
) -> pd.DataFrame:
    projection = get_task_dataset_projection(task)

    # Query only non-rejected documents.
    query = {'rejected': False}
    df_task_dataset = load_dataframe_from_mongodb(
        database_name=corpus,
        collection_name='normalized_clear',
        query=query,
        projection=projection,
        sort_by='creation_ts',
        query_limit=query_limit
    )
    df_task_dataset['bug_id'] = pd.to_numeric(df_task_dataset['bug_id'])

    dataset_merged = dataset.merge(df_task_dataset,
                                   how='cross',
                                   suffixes=('_left', '_right'))
    return dataset_merged
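
# Illustrative usage sketch (not part of the original module): the cross join pairs every row
# of the hypothetical `preprocessed_df` with every retrieved 'normalized_clear' document, so a
# query_limit keeps the result size manageable; the 'duplicity' task name is an assumption.
def _example_get_pairs_dataset(preprocessed_df: pd.DataFrame) -> pd.DataFrame:
    return get_pairs_dataset(dataset=preprocessed_df,
                             task='duplicity',
                             corpus='bugzilla',
                             query_limit=500)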
def build_duplicity_dataset(dataset: pd.DataFrame = None,
                            corpus: str = 'bugzilla') -> pd.DataFrame:
    # Load the pairs dataframe.
    df_pairs = load_dataframe_from_mongodb(
        database_name=corpus,
        collection_name='pairs'
    )

    # Check for duplicated pairs.
    log.info("Looking for duplicates in pair 'bug1' - 'bug2' ...")
    df_pairs['bug1-bug2'] = df_pairs.apply(lambda x: f"{x['bug1']}-{x['bug2']}", axis=1)
    df_pairs['bug2-bug1'] = df_pairs.apply(lambda x: f"{x['bug2']}-{x['bug1']}", axis=1)
    log.info(f"Rows before dropping duplicates: {df_pairs.shape[0]}")
    df_pairs.drop_duplicates(subset='bug1-bug2', keep=False, inplace=True)
    log.info(f"Rows after dropping duplicates: {df_pairs.shape[0]}")

    log.info("Looking for pairs where 'bug1' - 'bug2' equals 'bug2' - 'bug1' ...")
    df_pairs['duplicated'] = df_pairs.apply(lambda x: x['bug1-bug2'] == x['bug2-bug1'], axis=1)
    log.info(f"Rows with duplicated pairs: {df_pairs[df_pairs['duplicated']].shape[0]}")
    df_pairs_final = df_pairs[~df_pairs['duplicated']].copy()
    log.info(f"Rows after dropping all types of duplicates: {df_pairs_final.shape[0]}")

    # Change the bug_id column type to numeric.
    dataset['bug_id'] = pd.to_numeric(dataset['bug_id'])

    # Join on columns bug1 and bug_id.
    df_pairs_bug1_dataset_bug_id = df_pairs_final.merge(dataset,
                                                        left_on='bug1',
                                                        right_on='bug_id')

    # Join on columns bug2 and bug_id.
    result = df_pairs_bug1_dataset_bug_id.merge(dataset,
                                                left_on='bug2',
                                                right_on='bug_id',
                                                suffixes=('_left', '_right'))
    result.drop(['bug1', 'bug2'], axis=1, inplace=True)

    # Rename column 'dec' to 'label'.
    result.rename(columns={"dec": "label"}, errors="raise", inplace=True)
    return result
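
# Illustrative usage sketch (not part of the original module): it assumes a 'pairs' collection
# with 'bug1', 'bug2' and 'dec' fields, and a hypothetical `preprocessed_df` with one row per
# bug and a 'bug_id' column.
def _example_build_duplicity_dataset(preprocessed_df: pd.DataFrame) -> pd.DataFrame:
    df_duplicity = build_duplicity_dataset(dataset=preprocessed_df, corpus='bugzilla')
    # 'dec' is exposed as 'label'; the per-bug features carry '_left'/'_right' suffixes.
    log.info(f"Duplicity dataset shape: {df_duplicity.shape}")
    return df_duplicity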
log.info(f"Label column name for task '{input_params['task']}' and corpus '{input_params['corpus']}':" f" {label_column_name}") # Load normalized_clear collection. projection = {'_id': 0, 'label': f"${label_column_name}"} collection_name = get_label_collection_name(input_params['task'], input_params['corpus']) if 'similarity' == input_params['task']: if not input_params['near_issues']: collection_name = f"similar_{collection_name}" else: collection_name = f"near_{collection_name}" df_labels = load_dataframe_from_mongodb( database_name=input_params['corpus'], collection_name=collection_name, projection=projection ) # Group_by label column. labels_value_counts = df_labels['label'].value_counts() log.info(f"Number of distinct label values: {labels_value_counts.shape[0]}") df_distinct_labels = pd.DataFrame( data=labels_value_counts.keys().to_list(), columns=['label'] ) # converting type of label column to 'category' df_distinct_labels['label'] = df_distinct_labels['label'].astype('category')
    'STRUCTURED_DATA_COLUMN_NAMES'].split(',')
if len(structured_data_column_name) == 0:
    raise ValueError('No structured data column names defined.')
log.info(f"Structured data column names: {structured_data_column_name}")

# Load the normalized_clear collection.
projection = {'_id': 0}
for column in structured_data_column_name:
    projection[column] = 1
log.info(f"projection: {projection}")
df_structured_data = load_dataframe_from_mongodb(
    database_name=input_params['corpus'],
    collection_name=input_params['collection_name'],
    projection=projection)

# Mongo client.
mongodb_client: MongoClient = get_default_mongo_client()
db = mongodb_client[input_params['corpus']]

# Group by each structured data column.
for column in structured_data_column_name:
    df = pd.DataFrame(columns=[column])
    column_value_counts = df_structured_data[column].value_counts()
    log.info(
        f"Number of distinct values in column '{column}': {column_value_counts.shape[0]}"
    )
"creation_ts": { "$gte": datetime.datetime(input_params['year'], 1, 1).strftime('%Y-%m-%d'), "$lt": datetime.datetime(input_params['year'] + 1, 1, 1).strftime('%Y-%m-%d') } } if input_params['closed_states']: query['bug_status'] = {"$in": ["CLOSED", "RESOLVED", "VERIFIED"]} projection = {'_id': 0, 'bug_id': 1, 'assigned_to': 1} df_clear = load_dataframe_from_mongodb( database_name=input_params['corpus'], collection_name='clear', query=query, projection=projection) # Check empty Dataframe. if 0 == df_clear.shape[0]: raise ValueError( f"No documents have been retrieved from '{input_params['corpus']}.clear' collection for the " f"year {input_params['year']}") # Mongo client. mongodb_client: MongoClient = get_default_mongo_client() db = mongodb_client[input_params['corpus']] col_name = 'normalized_clear_updated' if input_params[ 'closed_states'] else 'normalized_clear_all_states'
def _aggregate_metrics(self,
                       metric_name: list = None,
                       sort_by: list = None,
                       num_models: int = 0,
                       fields: list = None) -> dict:
    """Aggregate the metrics of all experiments stored in MongoDB.

    Args:
        metric_name: (list) metric names.
        sort_by: (list) columns to sort by.
        num_models: (int) maximum number of models kept after sorting.
        fields: (list) extra columns to keep in the summary alongside the metrics.

    Example:
        _aggregate_metrics(
            ['accuracy', 'precision_micro', 'recall_micro', 'f1_micro'],
            ['accuracy'],
            1
        )
    """
    query = self._get_experiments_query()
    projection = self._get_experiments_projection()

    # All results are retrieved; the limit is applied after sorting.
    df = load_dataframe_from_mongodb(database_name=self.save_dbname,
                                     collection_name=self.save_collection,
                                     query=query,
                                     projection=projection,
                                     query_limit=0)
    if df.empty:
        log.error(
            f"No metrics saved in '{self.save_dbname}.{self.save_collection}' for task '{self.task}' "
            f"and corpus '{self.corpus}'.")
        raise ValueError(
            'Ensure database name, collection name, task name and corpus are correct.'
        )

    # Explode MongoDB fields.
    df_dataset = pd.json_normalize(df['dataset'])
    df_scheduler = pd.json_normalize(df['scheduler'])
    df_model = pd.json_normalize(df['model'])
    df_metrics = pd.json_normalize(df['metrics'])
    exploded_df = pd.concat(
        [df['task_id'], df_dataset, df_scheduler, df_model, df_metrics],
        axis=1)

    # Sort by the requested columns.
    if sort_by is not None and len(sort_by) > 0:
        for col in sort_by:
            assert col in exploded_df.columns, log.error(
                f"Column '{col}' does not exist in saved experiments.")
        exploded_df = exploded_df.sort_values(by=sort_by, ascending=False)

    # Limit the number of models.
    if num_models > 0:
        exploded_df = exploded_df.head(num_models)

    # Check that the metric names exist.
    df_metrics_columns = df_metrics.columns
    self._metric_name_exits(metric_name, list(df_metrics_columns))

    # Keep only the requested fields and metrics.
    excluded_fields = []
    if fields is not None:
        for column in exploded_df.columns:
            if column not in fields + metric_name:
                excluded_fields.append(column)
        exploded_df.drop(list(excluded_fields), axis=1, inplace=True)
        metric_name = None

    # Select fields and metrics for the summary.
    exploded_df_records = exploded_df.to_dict('records')
    result = self._get_summary_fields(exploded_df_records, metric_name,
                                      list(df_metrics_columns))
    return result
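
# Illustrative usage sketch (not part of the original module). The enclosing class is not
# shown here, so `tracker` is a hypothetical, already-configured instance; the call mirrors
# the docstring example while also restricting the summary to the 'task_id' field:
#
#   summary = tracker._aggregate_metrics(
#       metric_name=['accuracy', 'precision_micro', 'recall_micro', 'f1_micro'],
#       sort_by=['accuracy'],
#       num_models=1,
#       fields=['task_id'])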
if __name__ == "__main__": # Stores the execution start time to calculate the time it takes for the module to execute. initial_time = time.time() # Check if there is a running process that contains the name of this module. check_same_python_module_already_running(os.path.split(__file__)) # Load the parameters. input_params = get_input_params() assert input_params is not None, f"No params provided." log.info(f"Building 'normalized_clear' collection ...") # Load clear collection. df_clear = load_dataframe_from_mongodb( database_name=input_params['db_name'], collection_name=input_params['collection_name']) # Check empty Dataframe. if 0 == df_clear.shape[0]: raise ValueError( f"No documents have been retrieved from " f"'{input_params['db_name']}.{input_params['collection_name']}' collection." ) df_normalized_clear = df_clear.copy() # Normalize short description. df_normalized_clear['normalized_short_desc'] = df_normalized_clear[ 'short_desc'].apply( lambda x: normalize_incidence(x, to_lower_case=True))