Пример #1
0
def all_promoted_patterns():
    """Return a DataFrame of the ``db_ap`` rows that use a promoted pattern.

    Reads the ``promoted_patterns`` field of every document in ``db_onto``,
    flattens the collected values with the module's ``flatten`` helper and
    selects the ``db_ap`` documents whose ``ctx_pattern`` is one of them and
    whose ``counter`` is greater than 1.

    NOTE(review): ``db_onto`` and ``db_ap`` look like pymongo collections and
    ``flatten`` is assumed to flatten arbitrarily nested lists -- confirm
    against their definitions.
    """
    ontology_docs = db_onto.find(projection={
        'promoted_patterns': True,
        '_id': False
    })

    # Each projected document contributes the list of its field values.
    nested_values = [list(doc.values()) for doc in ontology_docs]
    patterns = list(flatten(nested_values))

    criteria = {
        "ctx_pattern": {"$in": patterns},
        "counter": {"$gt": 1},
    }
    matching_rows = list(db_ap.find(criteria))

    return pd.DataFrame(matching_rows)
Пример #2
0
def all_promoted_instances():
    """Return a DataFrame of the ``db_ap`` rows that use a promoted instance.

    Reads the ``promoted_instances`` field of every document in ``db_onto``,
    flattens the collected values with the module's ``flatten`` helper and
    selects the ``db_ap`` documents whose ``noun_phrase`` is one of them and
    whose ``counter`` is greater than 1.

    NOTE(review): ``db_onto`` and ``db_ap`` look like pymongo collections and
    ``flatten`` is assumed to flatten arbitrarily nested lists -- confirm
    against their definitions.
    """
    ontology_docs = db_onto.find(projection={
        'promoted_instances': True,
        '_id': False
    })

    # Each projected document contributes the list of its field values.
    nested_values = [list(doc.values()) for doc in ontology_docs]
    instances = list(flatten(nested_values))

    criteria = {
        "noun_phrase": {"$in": instances},
        "counter": {"$gt": 1},
    }
    matching_rows = list(db_ap.find(criteria))

    return pd.DataFrame(matching_rows)
Пример #3
0
def promote_instances(category, iteration, max_promotions, limit, T,
                      df_all_promoted_patterns):
    """Promote new instances for ``category`` and store them in ``db_onto``.

    Steps:

    1. Extract: fetch the ``db_ap`` rows whose ``ctx_pattern`` is one of the
       patterns stored at ``category['promoted_patterns'][iteration - 1]``,
       whose ``noun_phrase`` is not already among the category's promoted
       instances and whose ``counter`` is greater than 1.
    2. Count: sum ``counter`` per ``noun_phrase`` for those rows (keeping
       the ``limit`` largest sums) as ``count_pos``; sum ``counter`` per
       ``noun_phrase`` over the rows of ``df_all_promoted_patterns`` whose
       ``ctx_pattern`` belongs neither to this category nor to its mutex
       exception categories as ``count_neg``.
    3. Filter: keep noun phrases with ``count_pos >= count_neg * T`` and
       ``count_pos >= 2``.
    4. Rank and promote: sort by ``count_pos`` descending and keep the
       first ``max_promotions``.

    The result is written to ``promoted_instances.<iteration>`` of the
    category's document.  When step 1 yields no rows, nothing is written.

    Args:
        category: ontology document; the keys ``category_name``,
            ``promoted_patterns``, ``promoted_instances`` and
            ``mutex_exceptions`` are read.
        iteration: index of the current iteration.
        max_promotions: maximum number of instances to promote.
        limit: maximum number of positive candidates considered.
        T: multiplier applied to ``count_neg`` in the first filter criterion.
        df_all_promoted_patterns: DataFrame with at least the columns
            ``ctx_pattern``, ``noun_phrase`` and ``counter``.

    Returns:
        List of promoted noun phrases; empty when step 1 finds nothing.

    NOTE(review): ``flatten`` is assumed to flatten arbitrarily nested
    lists -- confirm against its definition.
    """
    previous_patterns = category['promoted_patterns'][iteration - 1]
    known_instances = list(flatten(category['promoted_instances']))
    own_patterns = list(flatten(category['promoted_patterns']))

    # Patterns promoted by the categories listed as mutex exceptions.
    exception_docs = db_onto.find(
        {'category_name': {"$in": category['mutex_exceptions']}})
    exception_patterns = list(
        flatten([doc['promoted_patterns'] for doc in exception_docs]))

    # Extraction step: rows that co-occur with the patterns promoted in the
    # previous iteration, ignoring instances that were already promoted.
    positive_rows = list(
        db_ap.find({
            "ctx_pattern": {"$in": previous_patterns},
            "noun_phrase": {"$nin": known_instances},
            "counter": {"$gt": 1},
        }))

    # No positive candidate was found: nothing to promote or to store.
    if not positive_rows:
        return []

    # Occurrences per candidate with the positive patterns (top `limit`).
    positive_totals = (
        pd.DataFrame(positive_rows).groupby('noun_phrase')['counter'].sum())
    count_pos = (positive_totals.sort_values(ascending=False)
                 .head(limit)
                 .rename('count_pos'))

    # Occurrences per noun phrase with the negative patterns, i.e. promoted
    # patterns of categories other than this one and its mutex exceptions.
    excluded_patterns = own_patterns + exception_patterns
    is_negative = ~df_all_promoted_patterns['ctx_pattern'].isin(
        excluded_patterns)
    count_neg = (df_all_promoted_patterns[is_negative]
                 .groupby('noun_phrase')['counter'].sum()
                 .rename('count_neg'))

    # Outer-align both counts on noun phrase; a missing count becomes 0.
    joined = pd.concat([count_pos, count_neg], axis=1, sort=False).fillna(0)
    meets_ratio = joined['count_pos'] >= joined['count_neg'] * T  # criterion 1
    meets_minimum = joined['count_pos'] >= 2  # criterion 2
    joined['filter_check'] = meets_ratio & meets_minimum

    accepted = joined[joined['filter_check']]  # filter step
    ranked = accepted.sort_values(by='count_pos', ascending=False)  # rank step
    promoted = ranked.head(max_promotions)  # promote step
    new_instances = list(promoted.index.values)

    # Update the ontology with the promoted instances.
    target_field = 'promoted_instances.' + str(iteration)
    db_onto.update_one(
        {'category_name': category['category_name']},
        {'$set': {target_field: new_instances}})

    return new_instances
Пример #4
0
def main():
    """Run the promotion loop for the configured number of iterations.

    Iteration 0 promotes the seeds of every category through
    ``pr.promote_seeds``.  Every later iteration ``i`` walks all categories
    and, for each one, promotes instances and then patterns, printing one
    comma-separated line per step::

        <instance|pattern>,<iteration>,<category name>,<promoted>,<seconds>

    ``db.close()`` is called when the loop finishes and also when an
    iteration raises, so the handle is not left open on failure.
    """
    num_iter = cpl_conf.num_iter  # number of iterations
    max_p = cpl_conf.max_p_promotions  # max pattern promotions per iteration
    max_i = cpl_conf.max_i_promotions  # max instance promotions per iteration
    limit = cpl_conf.limit  # max number of positive candidates for promotion
    T = cpl_conf.T  # multiplier of promotion threshold

    try:
        # Load category metadata; the cursor is rewound at every iteration.
        categories_init = db_onto.find(
            projection=['category_name', 'seed_instances', 'seed_ctx_pattern'])

        for i in range(num_iter):
            # Computed once per iteration and shared by all categories.
            df_all_promoted_instances = pr.all_promoted_instances()
            df_all_promoted_patterns = pr.all_promoted_patterns()

            categories_init.rewind()

            if i == 0:  # first iteration: only the seeds are promoted
                pr.promote_seeds(categories_init)
                continue

            for c_init in categories_init:
                category_name = c_init['category_name']

                # Load the full category document.  It is read once here and
                # is not re-read between the two promotion steps below.
                category = db_onto.find_one({'category_name': category_name})

                # Only when the category has at least i pattern entries.
                if i <= len(category['promoted_patterns']):
                    start = time.time()
                    pi = pr.promote_instances(category, i, max_i, limit, T,
                                              df_all_promoted_patterns)
                    end = time.time()
                    print('instance',
                          i,
                          category_name,
                          len(pi),
                          end - start,
                          sep=',')

                # Only when the category has at least i instance entries.
                if i <= len(category['promoted_instances']):
                    start = time.time()
                    pp = pr.promote_patterns(category, i, max_p, limit, T,
                                             df_all_promoted_instances)
                    end = time.time()
                    print('pattern',
                          i,
                          category_name,
                          len(pp),
                          end - start,
                          sep=',')
    finally:
        # Release the database handle even if an iteration failed.
        db.close()