def _db_store(self):
    """Store the task in the db."""
    log.info("Storing data in MongoDB ...")
    initial_time = time.time()
    client: MongoClient = get_default_mongo_client()
    query = {'task_id': self.task_id}
    log.info(f"query: {query}")
    document = {
        '$set': {
            'task_id': self.task_id,
            'task_name': self.task_name,
            'task_action': self.task_action
        }
    }
    result = client[self.save_dbname][self.save_collection].update_one(
        query,       # Filter document.
        document,    # Update document.
        upsert=True  # Insert the document if it does not exist yet.
    )
    log.info(f"Matched documents with 'task_id' equal to '{self.task_id}': {result.matched_count}")
    log.info(f"Modified documents with 'task_id' equal to '{self.task_id}': {result.modified_count}")
    final_time = time.time()
    log.info(f"Storing data in MongoDB total time: {(final_time - initial_time) / 60} minutes")
def get_attention_vector_raw_data(db_name, col_name, categorical=True, column=None):
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    log.debug("\n[EXECUTION START]")
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    mongodb_client: MongoClient = get_default_mongo_client() if os.environ['WORK_ENVIRONMENT'] == 'aws' \
        else get_default_local_mongo_client()
    db = mongodb_client[db_name]
    col = db[col_name]
    if not categorical:
        query, fields = get_raw_data_query_and_projection()
        fields["constituents_embeddings_description1"] = 1
        fields["constituents_embeddings_description2"] = 1
        # Only keep documents whose embeddings arrays are non-empty.
        query["constituents_embeddings_description1.0"] = {'$exists': True}
        query["constituents_embeddings_description2.0"] = {'$exists': True}
    else:
        query, fields = get_raw_data_query_and_projection_categorical(column=column)
        fields["constituents_embeddings"] = 1
        query["constituents_embeddings.0"] = {'$exists': True}
    return col.find(query, fields)
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ['JIRA_MONGODB_DATABASE_NAME']]
    log.debug(f"Existing collections in '{db.name}': {str(db.list_collection_names())}")
    # Drop every per-project, per-year collection that exists.
    for project in os.environ["JIRA_PROJECT_NAME"].split(","):
        for year in range(2001, 2021):
            col = db[f"{project.lower()}_{year}_{year + 1}"]
            if col.name in db.list_collection_names():
                db.drop_collection(col.name)
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ['GERRIT_DB_NAME']]
    log.debug(f"Existing collections in '{db.name}': {str(db.list_collection_names())}")
    # Drop every per-project, per-year collection that exists.
    for project in os.environ["GERRIT_PROJECT_NAME"].split(","):
        for year in range(int(os.environ['GERRIT_FIRST_CREATION_YEAR']),
                          int(os.environ['GERRIT_LAST_CREATION_YEAR'])):
            col = db[f"{project.lower()}_{year}_{year + 1}"]
            if col.name in db.list_collection_names():
                log.info(f"Dropping collection {col.name} ...")
                db.drop_collection(col.name)
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # Initialize the MongoDB parameters used to store the statistics.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ["JIRA_MONGODB_DATABASE_NAME"]]
    col = db[os.environ["JIRA_STATISTICS_MONGODB_COLLECTION_NAME"]]
    # Initialize the result.
    statistics_dict = {}
    for project in os.environ["JIRA_PROJECT_NAME"].split(","):
        statistics_dict["project"] = project
        total_bugs = 0
        for year in range(int(os.environ["JIRA_FIRST_CREATION_YEAR"]),
                          datetime.datetime.now().year + 1):
            statistics_dict[year] = {}
            statistics_dict[year]["_total"] = 0
            total_bugs_year = 0
            for month in range(1, 13):
                max_year = year
                max_month = month + 1
                if max_month > 12:
                    max_month = 1
                    max_year += 1
                issues = get_issues_by_date_range(
                    project=project,
                    min_created_date=f"{year}-{str(month).zfill(2)}-01",
                    max_creation_date=f"{max_year}-{str(max_month).zfill(2)}-01",
                    max_results=-1,  # No result limit (the original '-False' evaluated to 0).
                    fields="id"
                )
                # Total number of issues for the analyzed month.
                statistics_dict[year][month] = len(issues)
                total_bugs_year += len(issues)
            # Total number of issues for the analyzed year.
            statistics_dict[year]["_total"] = total_bugs_year
            log.info(json.dumps(statistics_dict))
            # Total number of issues for the project.
            total_bugs += total_bugs_year
        statistics_dict["_total"] = total_bugs
        # Store the statistics for the project.
        statistics_json = json.dumps(statistics_dict)
        col.insert_one(json.loads(statistics_json))
def load_dataset_from_mongodb(
        task: str = 'duplicity',
        database_name: str = 'bugzilla',
        collection_name: str = 'normalized_clear',
        query_limit: int = 0
) -> pd.DataFrame:
    log.info(f"Reading data from MongoDB: '{database_name}.{collection_name}'")
    tic = time.time()
    # MongoClient connection.
    mongodb_client: MongoClient = get_default_mongo_client() if os.environ['WORK_ENVIRONMENT'] == 'aws' \
        else MongoClient(host='localhost', port=27017)
    db = mongodb_client[database_name]
    col = db[collection_name]
    if col.name not in db.list_collection_names():
        raise errors.CollectionInvalid(f"Collection '{db.name}.{col.name}' not found. "
                                       f"Make sure your collection name is correct.")
    # Queries the MongoDB collection.
    query = get_task_dataset_query(task)
    log.info(f"Query filter document: {query}")
    projection = get_task_dataset_projection(task)
    log.info(f"Projection document: {projection}")
    mongodb_data = col.find(query, projection).limit(query_limit)
    # Expands the cursor and builds the DataFrame.
    df = pd.DataFrame(list(mongodb_data))
    log.info(f"Read documents from MongoDB: {len(df.index)}")
    log.info(f"Reading data from MongoDB total time: {(time.time() - tic) / 60} minutes")
    return df
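# A minimal usage sketch for load_dataset_from_mongodb, assuming the default
# 'bugzilla.normalized_clear' collection exists and WORK_ENVIRONMENT is set;
# the 'duplicity' task name comes from the defaults above.
if __name__ == "__main__":
    df_duplicity = load_dataset_from_mongodb(
        task='duplicity',
        database_name='bugzilla',
        collection_name='normalized_clear',
        query_limit=1000  # In PyMongo, limit(0) means no limit, so 0 keeps everything.
    )
    print(df_duplicity.head())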
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    order = 'ascending' if input_params['order'] == 1 else 'descending'
    log.info(f"Creating {order} index on field: '{input_params['db_name']}.{input_params['collection_name']}."
             f"{input_params['field_name']}' ...")
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[input_params['db_name']]
    col = db[input_params['collection_name']]
    col.create_index([
        (input_params['field_name'], DESCENDING if input_params['order'] == -1 else ASCENDING)
    ])
    final_time = time.time()
    log.info(f"Creating {order} index on field: '{input_params['field_name']}' total execution time = "
             f"{(final_time - initial_time) / 60} minutes")
def encode_and_save_assigned_to(dataset, corpus, n_developers):
    df = pd.DataFrame(columns=['assigned_to'])
    column_value_counts = dataset['label'].value_counts()
    df['assigned_to'] = column_value_counts.keys().to_list()
    # Assign numerical codes and store them in another column.
    df["assigned_to_code"] = df['assigned_to'].index
    log.info("Assigned to codes: ")
    log.info(df[:n_developers])
    # Mongo client.
    mongodb_client: MongoClient = get_default_mongo_client() if os.environ['WORK_ENVIRONMENT'] == 'aws' \
        else MongoClient(host='localhost', port=27017)
    db = mongodb_client[corpus]
    col = db["assigned_to_codes"]
    if col.name in db.list_collection_names():
        db.drop_collection(col.name)
    log.info("Inserting documents ...")
    inserted_documents = col.insert_many(df.to_dict("records"))
    log.info(f"Inserted documents: {len(inserted_documents.inserted_ids)}")
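# Illustrative call for encode_and_save_assigned_to, assuming a dataset whose
# 'label' column holds developer identifiers; the 'bugzilla' corpus name is an
# assumption for this sketch, not taken from the original module.
if __name__ == "__main__":
    df_demo = pd.DataFrame({'label': ['dev_a', 'dev_b', 'dev_a', 'dev_c', 'dev_a']})
    encode_and_save_assigned_to(dataset=df_demo, corpus='bugzilla', n_developers=3)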
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Initialize the variables that will store the input arguments.
    input_params = get_input_params()
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[input_params.db_name]
    source_collection = db[input_params.collection_name]
    target_collection = db[f"{input_params.collection_name}_embeddings"]
    log.debug(f"Existing collections in '{db.name}': {str(db.list_collection_names())}")
    if target_collection.name in db.list_collection_names() and input_params.drop_collection:
        db.drop_collection(target_collection.name)
    # The field renames in this projection use aggregation expressions, which
    # 'find' projections support from MongoDB 4.4 onwards.
    cursor = source_collection.find({}, {
        "creation_ts": "$creation_time",
        "short_desc": "$summary",
        "bug_status": "$status",
        "bug_id": "$id",
        "dup_id": "$dupe_of",
        "resolution": 1,
        "version": 1,
        "product": 1,
        "priority": 1,
        "component": 1,
        "delta_ts": 1,
        "bug_severity": "$severity",
        "description": "$comments.0",
        "normalized_short_desc": 1,
        "normalized_description": 1,
        "comments": {"$slice": 1}
    })
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Initialize the variables that will store the input arguments.
    input_params = get_input_params()
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[input_params['mongo_params'].db_name]
    col = db[f"{input_params['mongo_params'].collection_name}"
             f"_{input_params['bz_api_params'].year}_{input_params['bz_api_params'].year + 1}"]
    log.debug(f"Existing collections in '{db.name}': {str(db.list_collection_names())}")
    if col.name in db.list_collection_names() and input_params['mongo_params'].drop_collection:
        db.drop_collection(col.name)
    # For each month of the year, retrieve the issues using the Bugzilla API.
    max_year = input_params['bz_api_params'].year
    for month in range(input_params['bz_api_params'].start_month,
                       input_params['bz_api_params'].end_month + 1):
        max_month = month + 1
        if max_month > 12:
            max_month = 1
            max_year += 1
        bugs = get_bugzilla_bugs_by_date_range(
            project=input_params['bz_api_params'].project,
            min_creation_ts=f"{input_params['bz_api_params'].year}-{str(month).zfill(2)}-01",
            max_creation_ts=f"{max_year}-{str(max_month).zfill(2)}-01",
            max_results=input_params['bz_api_params'].query_limit,
            include_fields=input_params['bz_api_params'].include_fields,
            get_comments=input_params['bz_api_params'].get_comments)
        save_issues_to_mongodb(mongodb_collection=col, issues=bugs)
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["bugzilla"]
    eclipse_base_col = db["eclipse_base"]
    # Fields selected for the studies addressed in the research project.
    fields = {"_id": 0}
    eclipse_base_data = eclipse_base_col.find({}, fields)  # .limit(20)
    # Expand the cursor and build the DataFrame.
    eclipse_base = pd.DataFrame(list(eclipse_base_data))
    eclipse_base_summary_col = db["clear_description_nlp_2"]
    eclipse_base_summary_data = eclipse_base_summary_col.find({}, fields)
    eclipse_base_summary = pd.DataFrame(list(eclipse_base_summary_data))
    eclipse_base_data_description_col = db["clear_short_desc_nlp"]
    eclipse_base_data_description_data = eclipse_base_data_description_col.find({}, fields)
    eclipse_base_data_description = pd.DataFrame(list(eclipse_base_data_description_data))
    # Merge the DataFrames.
    eclipse_base_summary_description = pd.merge(eclipse_base_summary,
                                                eclipse_base_data_description,
                                                on='id')
    eclipse_base_clear = pd.merge(eclipse_base,
                                  eclipse_base_summary_description,
                                  on='id')
    # Store the DataFrame in MongoDB.
    db["eclipse_base_clear"].insert_many(eclipse_base_clear.to_dict('records'))
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def _db_load(self) -> dict:
    """Load the task from the db."""
    log.info(f"Loading data (task_id: '{self.task_id}') from '{self.save_dbname}.{self.save_collection}' ...")
    initial_time = time.time()
    client: MongoClient = get_default_mongo_client()
    task = client[self.save_dbname][self.save_collection].find_one(
        {'task_id': self.task_id}, {'_id': 0})
    final_time = time.time()
    log.info(f"Loading data from MongoDB total time: {(final_time - initial_time) / 60} minutes")
    return task
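# A self-contained round-trip sketch of the upsert/find_one pattern used by
# _db_store and _db_load, assuming a MongoDB instance on localhost; the
# 'tasks_db.tasks' namespace below is a placeholder, not the real one.
from pymongo import MongoClient

client = MongoClient(host='localhost', port=27017)
col = client['tasks_db']['tasks']
col.update_one({'task_id': '42'},
               {'$set': {'task_id': '42', 'task_name': 'demo', 'task_action': 'fit'}},
               upsert=True)
task = col.find_one({'task_id': '42'}, {'_id': 0})
print(task)  # {'task_id': '42', 'task_name': 'demo', 'task_action': 'fit'}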
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    # Mongo client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME', input_params['db_name'])]
    gerrit_collections = db.list_collection_names()
    log.debug(f"Existing collections in '{db.name}': {str(gerrit_collections)}")
    for project in os.environ["GERRIT_PROJECT_NAME"].split(","):
        df_list = []
        for year in range(int(os.environ['GERRIT_FIRST_CREATION_YEAR']),
                          int(os.environ['GERRIT_LAST_CREATION_YEAR'])):
            col = db[f"{input_params['collection_name']}_{year}_{year + 1}"]
            if col.name in gerrit_collections:
                tic = time.time()
                log.info(f"Retrieving Gerrit issues for year '{year}' ...")
                data = col.find({}, {'_id': 0})
                df = pd.DataFrame(list(data))
                log.info(f"Gerrit issues for year '{year}': {df.shape[0]}")
                df_list.append(df)
                log.info(f"Retrieving Gerrit issues for year '{year}' execution time = "
                         f"{(time.time() - tic) / 60} minutes")
        # Concatenate the yearly DataFrames and store them in a single per-project collection.
        df_concatenated = pd.concat(df_list)
        table_dict = df_concatenated.to_dict("records")
        if project.lower() in gerrit_collections:
            db.drop_collection(project.lower())
        db[project.lower()].insert_many(table_dict)
    final_time = time.time()
    log.info(f"Retrieving Gerrit issues total execution time = {(final_time - initial_time) / 60} minutes")
def task_hyper_fit_0(tasks, corpus, date_range_train, date_range_test, common_kwargs):
    """
    Fit the hyperparameters of a set of run tasks (from the task module).

    Args:
        tasks (dict): A mapping from task names to their relevant metric.
        corpus (str): The corpus used to select the tasks to take into account.
        date_range_train (2-tuple of 3-tuple of int): Training date range to take into account.
        date_range_test (2-tuple of 3-tuple of int): Test date range to take into account.
        common_kwargs (list of list of str): List of hyperparameters to adjust in common.

    Returns:
        - Tuple of 2-tuple: Pairs of parameters (with their names flattened) and their values.
        - dict of str to (dict, float): A mapping from task names to their configuration and the score.
    """
    client: MongoClient = get_default_mongo_client() if os.environ['WORK_ENVIRONMENT'] == 'aws' \
        else get_default_local_mongo_client()
    task_names = list(tasks.keys())
    data = list(client["incidences"]["tasks"].find({
        'type': {"$in": task_names},
        'kwargs.corpus': corpus,
        'kwargs.date_range_train': date_range_train,
        'kwargs.date_range_test': date_range_test,
    }))
    results = [[(d["kwargs"], d["results"][tasks[d["type"]]]) for d in data if d["type"] == task_type]
               for task_type in task_names]
    common_rank = rank_common_parameters(results, common_kwargs)
    best = common_rank[0][0]
    return best, {
        task: rank_specific_parameters(result, best)[0]
        for result, task in zip(results, tasks)
    }
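# A hedged example call for task_hyper_fit_0, assuming documents matching these
# arguments already exist in 'incidences.tasks'; the task names, metrics and
# hyperparameter paths below are illustrative only.
best_common, per_task = task_hyper_fit_0(
    tasks={'duplicity': 'f1_macro', 'assignation': 'accuracy'},
    corpus='bugzilla',
    date_range_train=((2010, 1, 1), (2017, 12, 31)),
    date_range_test=((2018, 1, 1), (2019, 12, 31)),
    common_kwargs=[['model', 'embedding_size'], ['model', 'learning_rate']]
)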
def get_constituency_tree_raw_data(db_name, col_name, categorical=True):
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    log.debug("\n[EXECUTION START]")
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    mongodb_client: MongoClient = get_default_mongo_client() if os.environ['WORK_ENVIRONMENT'] == 'aws' \
        else get_default_local_mongo_client()
    db = mongodb_client[db_name]
    col = db[col_name]
    if not categorical:
        query, fields = get_raw_data_query_and_projection()
    else:
        query, fields = get_raw_data_query_and_projection_categorical()
    return col.find(query, fields)
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["bugzilla"]
    col = db["eclipse_base"]
    # Fields selected for the studies addressed in the research project.
    fields = {
        "_id": 0,
        "id": 1,
        "summary": 1
        # Other available fields: "assigned_to", "assigned_to_detail",
        # "classification", "component", "creation_time", "creator",
        # "creator_detail", "dupe_of", "op_sys", "platform", "priority",
        # "product", "resolution", "severity", "status", "version", "description".
    }
    data = col.find({}, fields)  # .limit(20)
    # Expand the cursor and build the DataFrame.
    clear_nlp = pd.DataFrame(list(data))
    print(clear_nlp)
    nltk.download("stopwords", quiet=True)
    # Text-normalization pipeline over the 'summary' field.
    clear_nlp["summary_split_alpha"] = clear_nlp["summary"].apply(clean_doc_split)
    clear_nlp["summary_lower"] = clear_nlp["summary_split_alpha"].apply(clean_doc_lower)
    clear_nlp["summary_punctuaction"] = clear_nlp["summary_lower"].apply(clean_doc_punctuaction)
    clear_nlp["summary_trim"] = clear_nlp["summary_punctuaction"].apply(clean_doc_trim)
    clear_nlp["summary_isalpha"] = clear_nlp["summary_trim"].apply(clean_doc_isalpha)
    clear_nlp["summary_stop_words"] = clear_nlp["summary_isalpha"].apply(clean_doc_stopW)
    clear_nlp["summary_diacritic"] = clear_nlp["summary_stop_words"].apply(clean_doc_diacri)
    clear_nlp["summary_lemmatizer"] = clear_nlp["summary_diacritic"].apply(clean_doc_lem)
    # Store the DataFrame in MongoDB.
    db["eclipse_base_summary_clear"].insert_many(clear_nlp.to_dict('records'))
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
labels_value_counts = df_labels['label'].value_counts()
log.info(f"Number of distinct label values: {labels_value_counts.shape[0]}")
df_distinct_labels = pd.DataFrame(
    data=labels_value_counts.keys().to_list(),
    columns=['label']
)
# Convert the type of the label column to 'category'.
df_distinct_labels['label'] = df_distinct_labels['label'].astype('category')
# Assign numerical codes and store them in another column.
df_distinct_labels['label_code'] = df_distinct_labels['label'].cat.codes
# Mongo client.
mongodb_client: MongoClient = get_default_mongo_client() if os.environ['WORK_ENVIRONMENT'] == 'aws' \
    else MongoClient(host='localhost', port=27017)
db = mongodb_client[input_params['corpus']]
col = db[f"{input_params['task']}_task_labels"]
log.debug(f"Existing collections in '{db.name}': {str(db.list_collection_names())}")
if col.name in db.list_collection_names():
    db.drop_collection(col.name)
log.info("Inserting documents ...")
inserted_documents = col.insert_many(df_distinct_labels.to_dict("records"))
log.info(f"Inserted documents: {len(inserted_documents.inserted_ids)}")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    log.info("Finding similar issues ...")
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    # Load Gerrit data.
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME', input_params['db_name'])]
    collection = db[input_params['collection_name']]
    data = collection.find({}, {'_id': 0})
    df = pd.DataFrame(list(data))
    # Check for an empty DataFrame.
    if 0 == df.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{db.name}.{collection.name}' collection.")
    similar_column_name = 'sim_bugs' if not input_params['near_issues'] else 'near_bugs'
    # For each issue that has similar issues, generate a pair of similar issues
    # and a pair of non-similar issues.
    pairs = generate_pairs(df, similar_column_name)
    log.info(f"Pairs generated: {len(pairs)}")
    # DataFrame of pairs.
    df_pairs = pd.DataFrame(pairs)
    # Split the DataFrame into batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df_pairs.shape[0] / input_params['batch_size'])
    batches = np.array_split(df_pairs, num_batches)
    output_db = mongodb_client[input_params['output_db_name']]
    output_collection = output_db[input_params['output_similar_collection_name']] if not input_params['near_issues'] \
        else output_db[input_params['output_near_collection_name']]
    # Drop the collection on the output database if it already exists.
    if output_collection.name in output_db.list_collection_names():
        log.info(f"Dropping collection '{output_db.name}.{output_collection.name}' ...")
        output_db.drop_collection(output_collection.name)
    inserted_docs_number = 0
    for batch in batches:
        log.info("Inserting documents ...")
        inserted_documents = output_collection.insert_many(batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)
    log.info(f"Inserted documents: {inserted_docs_number}")
    final_time = time.time()
    log.info(f"Finding similar issues total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    log.info("Finding similar issues ...")
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    # Load Gerrit data.
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME', input_params['db_name'])]
    collection = db[input_params['collection_name']]
    data = collection.find({}, {'_id': 0})
    df = pd.DataFrame(list(data))
    # Check for an empty DataFrame.
    if 0 == df.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{db.name}.{collection.name}' collection.")
    # Initialize empty list columns.
    df['sim_bugs'] = [[] for _ in range(len(df))]
    df['near_bugs'] = [[] for _ in range(len(df))]
    # Compare every pair of rows.
    all_sim_bugs = []
    all_near_bugs = []
    similarity_threshold = float(input_params['similarity_threshold'])
    for i in tqdm(range(df.shape[0])):
        for j in range(df.shape[0]):
            if j == i:
                continue
            bug_anc = df.at[i, 'bug_id']
            bug_pos = df.loc[j, 'bug_id']
            jaccard_similarity = jaccard_score(df.loc[i, 'file_list'], df.loc[j, 'file_list'])
            if jaccard_similarity >= similarity_threshold:
                if [bug_anc, bug_pos] not in all_sim_bugs and [bug_pos, bug_anc] not in all_sim_bugs:
                    df.at[i, 'sim_bugs'].append(int(bug_pos))
                    all_sim_bugs.append([bug_anc, bug_pos])
            # 'Near' pairs fall within 0.25 below the similarity threshold.
            if similarity_threshold - 0.25 <= jaccard_similarity < similarity_threshold:
                if [bug_anc, bug_pos] not in all_near_bugs and [bug_pos, bug_anc] not in all_near_bugs:
                    df.at[i, 'near_bugs'].append(int(bug_pos))
                    all_near_bugs.append([bug_anc, bug_pos])
    # Split the DataFrame into batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df.shape[0] / input_params['batch_size'])
    batches = np.array_split(df, num_batches)
    output_collection = db[input_params['output_collection_name']]
    # Drop the collection if it already exists.
    if output_collection.name in db.list_collection_names():
        log.info(f"Dropping collection '{db.name}.{output_collection.name}' ...")
        db.drop_collection(output_collection.name)
    inserted_docs_number = 0
    for batch in batches:
        log.info("Inserting documents ...")
        inserted_documents = output_collection.insert_many(batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)
    log.info(f"Inserted documents: {inserted_docs_number}")
    final_time = time.time()
    log.info(f"Finding similar issues total execution time = {(final_time - initial_time) / 60} minutes")
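# The jaccard_score used above is assumed to be a set-based helper over file
# path lists rather than sklearn.metrics.jaccard_score, which expects binary
# label vectors. A minimal sketch of such a helper:
def jaccard_score(file_list_a, file_list_b):
    """Jaccard similarity between two file lists: |A & B| / |A | B|."""
    set_a, set_b = set(file_list_a), set(file_list_b)
    if not set_a and not set_b:
        return 0.0
    return len(set_a & set_b) / len(set_a | set_b)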
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    log.info("Processing Gerrit issues ...")
    # MongoDB data.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME', input_params['db_name'])]
    col = db[input_params['collection_name']]
    data = col.find({}, {'_id': 0})
    df = pd.DataFrame(list(data))
    df['bug_id'] = -1
    # Extract bug IDs from subjects matching either '[<number>]' or 'Bug <number>'.
    bug_id_list = []
    loop = tqdm(range(df.shape[0]))
    for i in loop:
        matched_1 = re.search(r'\[\s*\+?(-?\d+)\s*]', df.loc[i, 'subject'])
        matched_2 = re.search(r'[Bb][Uu][Gg]\s[0-9]+', df.loc[i, 'subject'])
        is_match = bool(matched_1) + bool(matched_2)
        if bool(is_match):
            flag = True
            res = re.findall(r'[Bb][Uu][Gg]\s[0-9]+', df.iloc[i]['subject'])
            if not res:
                flag = False
                res = re.findall(r'\[\s*\+?(-?\d+)\s*]', df.iloc[i]['subject'])
            # 'Bug 1234' keeps the digits after the 4-character prefix; '[1234]' is already digits.
            bug_id = int(res[0][4:]) if flag else int(res[0])
            if bug_id in bug_id_list:
                # The bug was already seen: merge the file lists of both changes.
                idx = df.index[df['bug_id'] == bug_id].tolist()
                if len(idx) > 1:
                    raise ValueError(f"There is more than one bug with 'bug_id' = {bug_id}")
                previous_file_list = get_filtered_file_list(df.iloc[idx[0]]['file_list'])
                current_file_list = get_filtered_file_list(df.iloc[i]['file_list'])
                file_list = set(previous_file_list + current_file_list)
                df.at[idx[0], 'file_list'] = list(file_list)
            else:
                bug_id_list.append(bug_id)
                df.at[i, 'bug_id'] = bug_id
                df.at[i, 'file_list'] = get_filtered_file_list(df.iloc[i]["file_list"])
    # Drop the collection if it already exists.
    if input_params['output_collection_name'] in db.list_collection_names():
        log.info(f"Dropping collection {input_params['output_collection_name']} ...")
        db.drop_collection(input_params['output_collection_name'])
    # Split the DataFrame into batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df.shape[0] / input_params['batch_size'])
    batches = np.array_split(df.loc[df['bug_id'] != -1], num_batches)
    # Insert the documents that have a bug_id into MongoDB.
    inserted_docs_number = 0
    for batch in batches:
        log.info("Inserting documents ...")
        inserted_documents = db[input_params['output_collection_name']].insert_many(batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)
    final_time = time.time()
    log.info(f"Processing Gerrit issues total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["eclipse"]
    col = db["clear"]
    # Fields selected for the studies addressed in the research project.
    fields = {
        "_id": 0,
        "bug_id": 1,
        "short_desc": 1,
        "bug_status": 1
        # Other available fields: "product", "description", "bug_severity",
        # "dup_id", "priority", "version", "component", "delta_ts",
        # "creation_ts", "resolution".
    }
    # Only resolved issues will be used.
    query = {'$or': [{'bug_status': {'$eq': 'RESOLVED'}},
                     {'bug_status': {'$eq': 'VERIFIED'}}]}
    data = col.find(query, fields)  # .limit(20)
    # Expand the cursor and build the DataFrame.
    clear_nlp = pd.DataFrame(list(data))
    print(clear_nlp)
    nltk.download("stopwords", quiet=True)
    # Text-normalization pipeline over the 'short_desc' field.
    clear_nlp["short_desc_split_alpha"] = clear_nlp["short_desc"].apply(clean_doc_split)
    clear_nlp["short_desc_lower"] = clear_nlp["short_desc_split_alpha"].apply(clean_doc_lower)
    clear_nlp["short_desc_punctuaction"] = clear_nlp["short_desc_lower"].apply(clean_doc_punctuaction)
    clear_nlp["short_desc_trim"] = clear_nlp["short_desc_punctuaction"].apply(clean_doc_trim)
    clear_nlp["short_desc_isalpha"] = clear_nlp["short_desc_trim"].apply(clean_doc_isalpha)
    clear_nlp["short_desc_stop_words"] = clear_nlp["short_desc_isalpha"].apply(clean_doc_stopW)
    clear_nlp["short_desc_diacritic"] = clear_nlp["short_desc_stop_words"].apply(clean_doc_diacri)
    clear_nlp["short_desc_lemmatizer"] = clear_nlp["short_desc_diacritic"].apply(clean_doc_lem)
    # Drop the 'bug_status' column because a merge with the original collection will be done later.
    clear_nlp.drop('bug_status', axis=1, inplace=True)
    # Store the DataFrame in MongoDB.
    db["clear_short_desc_nlp"].insert_many(clear_nlp.to_dict('records'))
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def get_mongo_client(environment):
    # Return a local client for the 'local' environment, otherwise the default one.
    if environment == 'local':
        return MongoClient(host='localhost', port=27017)
    return get_default_mongo_client()
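# Usage sketch for get_mongo_client, reusing the WORK_ENVIRONMENT convention
# seen in the other modules; defaulting to 'local' here is an assumption for
# this example, not part of the original code.
client = get_mongo_client(os.environ.get('WORK_ENVIRONMENT', 'local'))
print(client.list_database_names())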
if __name__ == "__main__":
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    log.info("Updating all normalized_clear years ...")
    # Load the parameters.
    input_params = get_input_params()
    assert input_params is not None, "No params provided."
    # Mongo client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[input_params['corpus']]
    col_name = 'normalized_clear_updated' if input_params['closed_states'] else 'normalized_clear_all_states'
    col = db[col_name]
    if input_params['drop_collection'] and col.name in db.list_collection_names():
        log.info(f"Dropping collection '{db.name}.{col.name}'")
        db.drop_collection(col.name)
    # Defines the per-year command with the Python executable for this OS.
    for year in range(2000, 2021):
        cmd = get_command(os.name, input_params['corpus'], year, input_params['closed_states'])
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["bugzilla"]
    col = db["eclipse_all"]
    # Fields selected for the studies addressed in the research project.
    fields = {
        "_id": 0,
        "assigned_to": 1,
        "assigned_to_detail": 1,
        "classification": 1,
        "component": 1,
        "creation_time": 1,
        "creator": 1,
        "creator_detail": 1,
        "dupe_of": 1,
        "id": 1,
        "op_sys": 1,
        "platform": 1,
        "priority": 1,
        "product": 1,
        "resolution": 1,
        "severity": 1,
        "status": 1,
        "summary": 1,
        "version": 1,
        "description": "$comments.text"
    }
    aggregation_project = {"$project": fields}
    # Only resolved issues with a non-empty first comment will be used.
    aggregation_match = {"$match": {
        '$and': [
            {'$or': [
                {'status': {'$eq': 'RESOLVED'}},
                {'status': {'$eq': 'VERIFIED'}},
                {'status': {'$eq': 'CLOSED'}}
            ]},
            {'comments': {'$exists': 'true'}},
            {'comments': {'$ne': 'null'}},
            {'comments': {'$ne': ""}},
            {'comments': {'$not': {'$size': 0}}},
            {'comments.count': {'$eq': 0}}
        ]
    }}
    aggregation_unwind = {"$unwind": "$comments"}
    aggregation_limit = {"$limit": 20}
    data = col.aggregate([
        aggregation_unwind,
        aggregation_match,
        # aggregation_limit,
        aggregation_project
    ])
    # Expand the cursor and build the DataFrame.
    eclipse_base = pd.DataFrame(list(data))
    # Store the DataFrame in MongoDB.
    db["eclipse_base"].insert_many(eclipse_base.to_dict('records'))
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["bugzilla"]
    col = db["eclipse_all"]
    # Fields selected for the studies addressed in the research project.
    fields = {
        "_id": 0,
        "assigned_to": 1,
        "assigned_to_detail": 1,
        "classification": 1,
        "component": 1,
        "creation_time": 1,
        "creator": 1,
        "creator_detail": 1,
        "dupe_of": 1,
        "id": 1,
        "op_sys": 1,
        "platform": 1,
        "priority": 1,
        "product": 1,
        "resolution": 1,
        "severity": 1,
        "status": 1,
        "summary": 1,
        "version": 1,
        "description": "$comments.text"
    }
    aggregation_project = {"$project": fields}
    # Only resolved issues will be used.
    aggregation_match = {"$match": {
        '$and': [
            {'$or': [
                {'status': {'$eq': 'RESOLVED'}},
                {'status': {'$eq': 'VERIFIED'}},
                {'status': {'$eq': 'CLOSED'}}
            ]},
            {'comments.count': {'$eq': 0}}
        ]
    }}
    aggregation_unwind = {"$unwind": "$comments"}
    aggregation_limit = {"$limit": 20}
    data = col.aggregate([
        aggregation_unwind,
        aggregation_match,
        aggregation_limit,
        aggregation_project
    ])
    # Expand the cursor and build the DataFrame.
    eclipse_base = pd.DataFrame(list(data))
    print(eclipse_base.head(10))
    # 1st: convert the variable to the 'category' type.
    eclipse_base.priority = eclipse_base.priority.astype('category')
    print(eclipse_base.head(10))
    # 2nd: categorize.
    eclipse_base['priority_cod'] = eclipse_base['priority'].cat.codes
    print(eclipse_base.head(10))
    # Equivalent sklearn LabelEncoder recipe:
    # 1 - Create an instance: le = sklearn.preprocessing.LabelEncoder()
    # 2 - Fit it to your data: le.fit(labels) (here, "labels = data[:,0]" was the predictions column).
    # 3 - Get the column with the transformed values: labels = le.transform(labels)
    # 4 - You can keep the original values to establish a mapping: class_names = le.classes_
    # df["creation_time_year"] = df["creation_time"].str[:4]
    # df["last_change_time_year"] = df["last_change_time"].str[:4]
    # df["resolution_string"] = df["resolution"].apply(lambda y: "EMPTY_FIELD" if len(y) == 0 else y)
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    log.info("Finding duplicate issues ...")
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    # Load Bugzilla data.
    db = mongodb_client[input_params['input_db_name']]
    # Collection with all issues (duplicated and non-duplicated).
    original_collection = db[input_params['original_collection_name']]
    query = {}
    projection = {'_id': 0, 'bug_id': 1}
    log.info(f"Reading data from '{db.name}.{original_collection.name}' ...")
    original_data = original_collection.find(query, projection)
    df_original = pd.DataFrame(list(original_data))
    # Check for an empty DataFrame.
    if 0 == df_original.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{db.name}.{original_collection.name}' collection.")
    # List with all bug_id values.
    original_bug_id_list = df_original['bug_id'].to_list()
    collection_pairs_first_step = db[input_params['input_collection_name']]
    query = {}
    projection = {'_id': 0}
    data_pairs_first_step = collection_pairs_first_step.find(query, projection)
    df_pairs_first_step = pd.DataFrame(list(data_pairs_first_step))
    # Check for an empty DataFrame.
    if 0 == df_pairs_first_step.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{db.name}.{collection_pairs_first_step.name}' collection.")
    bug1_list = df_pairs_first_step['bug1'].to_list()
    bug2_list = df_pairs_first_step['bug2'].to_list()
    no_duplicate_bug_id_list = set(original_bug_id_list) - set(bug1_list) - set(bug2_list)
    # Generate additional duplicate pairs from indirect relations.
    pairs_from_indirect_relations = check_indirect_relations(
        df_pairs_first_step, list(no_duplicate_bug_id_list))
    log.info(f"Pairs generated: {len(pairs_from_indirect_relations)}")
    # DataFrame of pairs.
    df_pairs = pd.DataFrame(pairs_from_indirect_relations)
    # Split the DataFrame into batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df_pairs.shape[0] / input_params['batch_size'])
    batches = np.array_split(df_pairs, num_batches)
    output_collection = db[input_params['output_collection_name']]
    # Drop the collection if it already exists.
    if output_collection.name in db.list_collection_names():
        log.info(f"Dropping collection '{db.name}.{output_collection.name}' ...")
        db.drop_collection(output_collection.name)
    inserted_docs_number = 0
    for batch in batches:
        log.info("Inserting documents ...")
        inserted_documents = output_collection.insert_many(batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)
    log.info(f"Inserted documents: {inserted_docs_number}")
    final_time = time.time()
    log.info(f"Finding duplicate issues total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Define the logger to be used.
    logger = log4p.GetLogger(__name__)
    log = logger.logger
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load the configuration file for the environment.
    env_path = Path(ROOT_DIR) / 'config' / (SYN_ENV + '.env')
    load_dotenv(dotenv_path=env_path)
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client["eclipse"]
    clear_col = db["clear"]
    # Fields selected for the studies addressed in the research project.
    fields = {
        "_id": 0,
    }
    # Only resolved issues will be used.
    query = {'$or': [{'bug_status': {'$eq': 'RESOLVED'}},
                     {'bug_status': {'$eq': 'VERIFIED'}}]}
    clear_data = clear_col.find(query, fields)  # .limit(20)
    # Expand the cursor and build the DataFrame.
    clear = pd.DataFrame(list(clear_data))
    clear_short_desc_nlp_col = db["clear_short_desc_nlp"]
    clear_short_desc_nlp_data = clear_short_desc_nlp_col.find({}, fields)
    clear_short_desc_nlp = pd.DataFrame(list(clear_short_desc_nlp_data))
    clear_description_nlp_col = db["clear_description_nlp_2"]
    clear_description_nlp_data = clear_description_nlp_col.find({}, fields)
    clear_description_nlp = pd.DataFrame(list(clear_description_nlp_data))
    # Merge the DataFrames.
    clear_short_desc_description_nlp = pd.merge(clear_short_desc_nlp,
                                                clear_description_nlp,
                                                on='bug_id')
    clear_nlp = pd.merge(clear, clear_short_desc_description_nlp, on='bug_id')
    # Store the DataFrame in MongoDB.
    db["clear_nlp"].insert_many(clear_nlp.to_dict('records'))
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    # Mongo client.
    mongodb_client: MongoClient = get_default_mongo_client()
    db = mongodb_client[os.environ.get('GERRIT_DB_NAME', input_params['db_name'])]
    year = input_params['year']
    col = db[f"{input_params['collection_name']}_{year}_{year + 1}"]
    log.debug(f"Existing collections in '{db.name}': {str(db.list_collection_names())}")
    if col.name in db.list_collection_names():
        db.drop_collection(col.name)
    # Get the Gerrit issues month by month.
    max_year = year
    for month in range(input_params['start_month'], input_params['end_month'] + 1):
        max_month = month + 1
        if max_month > 12:
            max_month = 1
            max_year += 1
        after = str(datetime.datetime(input_params['year'], month, 1))
        before = str(datetime.datetime(max_year, max_month, 1))
        s = 0
        change_id, project, status, date, subject, author, committer, commit_msg = [], [], [], [], [], [], [], []
        file_list = []
        skip = ''
        flag = True
        log.info(f"Retrieving issues in range: {after} - {before}")
        # Page through the Gerrit REST API until no more changes are reported.
        while flag:
            issues_by_month = get_data_from_gerrit_rest_api(
                os.environ['GERRIT_API_URL'], after, before, skip=skip)
            log.info(f"Read issues: {len(issues_by_month)}")
            if len(issues_by_month) < 1:
                print('No issues for month {} and year {}'.format(month, year))
                flag = False
            else:
                for issue in issues_by_month:
                    change_id.append(issue['change_id'])
                    project.append(issue['project'])
                    status.append(issue['status'])
                    date.append(issue['updated'])
                    subject.append(issue['subject'])
                    current_rev = issue['current_revision']
                    author.append(issue['revisions'][current_rev]['commit']['author']['email'])
                    committer.append(issue['revisions'][current_rev]['commit']['committer']['email'])
                    commit_msg.append(issue['revisions'][current_rev]['commit']['message'])
                    file_list.append(list(issue['revisions'][current_rev]['files'].keys()))
                if len(issues_by_month) == 100:
                    last_dict = issues_by_month[99]
                    try:
                        if last_dict['_more_changes']:
                            log.info("There are more issues to read")
                            s = s + 100
                            skip = '&S=' + str(s)
                    except KeyError:
                        flag = False
                        print('Issues for month {} and year {}: {}'.format(month, year, len(change_id)))
                else:
                    print('Issues for month {} and year {}: {}'.format(month, year, len(change_id)))
                    flag = False
        dict_of_reg = {
            'change_id': change_id,
            'project': project,
            'status': status,
            'date': date,
            'subject': subject,
            'author': author,
            'committer': committer,
            'commit_msg': commit_msg,
            'file_list': file_list
        }
        df = pd.DataFrame(dict_of_reg)
        issues = df.to_dict("records")
        save_issues_to_mongodb(mongodb_collection=col, issues=issues)
    final_time = time.time()
    log.info(f"Total execution time = {(final_time - initial_time) / 60} minutes")
def main():
    # Stores the execution start time to calculate the time it takes for the module to execute.
    initial_time = time.time()
    # Check if there is a running process that contains the name of this module.
    check_same_python_module_already_running(os.path.split(__file__))
    # Load parameters.
    input_params = get_input_params()
    log.info("Merging Gerrit and Bugzilla issues ...")
    # MongoDB client.
    mongodb_client: MongoClient = get_default_mongo_client()
    # Load Gerrit data.
    log.info("Loading Gerrit issues ...")
    tic = time.time()
    gerrit_db = mongodb_client[os.environ.get('GERRIT_DB_NAME', input_params['gerrit_db_name'])]
    gerrit_collection = gerrit_db[input_params['gerrit_collection_name']]
    gerrit_data = gerrit_collection.find({}, {'_id': 0})
    df_gerrit = pd.DataFrame(list(gerrit_data))
    log.info(f"Loading Gerrit issues total time: {(time.time() - tic) / 60} minutes.")
    # Check for an empty DataFrame.
    if 0 == df_gerrit.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{gerrit_db.name}.{gerrit_collection.name}' collection.")
    # Load Bugzilla data.
    log.info("Loading Bugzilla issues ...")
    tic = time.time()
    bugzilla_db = mongodb_client[os.environ.get('BUGZILLA_MONGODB_DATABASE_NAME', input_params['bugzilla_db_name'])]
    bugzilla_collection = bugzilla_db[input_params['bugzilla_collection_name']]
    bugzilla_data = bugzilla_collection.find({}, {'_id': 0, 'bug_id': 1})
    df_bugzilla = pd.DataFrame(list(bugzilla_data))
    log.info(f"Loading Bugzilla issues total time: {(time.time() - tic) / 60} minutes.")
    # Check for an empty DataFrame.
    if 0 == df_bugzilla.shape[0]:
        raise ValueError(f"No documents have been retrieved from "
                         f"'{bugzilla_db.name}.{bugzilla_collection.name}' collection.")
    # Join on the 'bug_id' column.
    log.info("Joining Gerrit and Bugzilla DataFrames ...")
    tic = time.time()
    df_joined = df_gerrit.merge(df_bugzilla, left_on='bug_id', right_on='bug_id')
    log.info(f"Joining Gerrit and Bugzilla DataFrames total time: {(time.time() - tic) / 60} minutes.")
    # Split the DataFrame into batches to avoid pymongo.errors.CursorNotFound.
    num_batches = math.ceil(df_joined.shape[0] / input_params['batch_size'])
    batches = np.array_split(df_joined, num_batches)
    output_collection = gerrit_db[input_params['output_collection_name']]
    # Drop the collection if it already exists.
    if input_params['output_collection_name'] in gerrit_db.list_collection_names():
        log.info(f"Dropping collection {input_params['output_collection_name']} ...")
        gerrit_db.drop_collection(input_params['output_collection_name'])
    inserted_docs_number = 0
    tic = time.time()
    for batch in batches:
        log.info("Inserting documents ...")
        inserted_documents = output_collection.insert_many(batch.to_dict("records"))
        inserted_docs_number += len(inserted_documents.inserted_ids)
    log.info(f"Inserted documents: {inserted_docs_number}")
    log.info(f"Inserting documents total time: {(time.time() - tic) / 60} minutes.")
    final_time = time.time()
    log.info(f"Merging Gerrit and Bugzilla issues total execution time = {(final_time - initial_time) / 60} minutes")