예제 #1
0
def sampling_cate_topics(regions, size, g_percentages):
    """Sample category topics that are shared by all the given regions.

    Check-ins of every region are loaded and concatenated, and the set of
    category ids (``cid``) present in *every* region is computed.  For each
    ``z_category`` group, the shared categories are passed to
    ``stratified_samples`` and each sampled category yields one topic row
    per region.  Finally one topic row per ``z_category`` per region is
    added.

    Args:
        regions: re-iterable collection (it is iterated several times) of
            mappings providing ``'value'`` (passed to
            ``KnowledgeBase.fromMongo``) and ``'name'`` (stored in the
            ``'region'`` field of each topic).
        size: sampling budget; ``size / 9`` is handed to
            ``stratified_samples`` for every ``z_category`` group
            (presumably 9 is the number of top-level categories -- TODO
            confirm).
        g_percentages: forwarded unchanged to ``stratified_samples``.

    Returns:
        A ``pandas.DataFrame`` created with ``TOPIC_SCHEMA`` columns holding
        the sampled category topics followed by the ``z_category`` topics.
        It is empty when ``regions`` is empty.
    """
    topics = pd.DataFrame(columns=TOPIC_SCHEMA)
    checkins = None
    cate_set = set()
    for r in regions:
        kbase = KnowledgeBase.fromMongo(db.checkin, r['value'])
        region_cates = set(kbase.checkins['cid'].unique())
        if checkins is None:
            # First region: seed the accumulators.  Intersecting with the
            # initial empty set here would leave cate_set empty forever.
            cate_set = region_cates
            checkins = kbase.checkins
        else:
            # Later regions: keep only the categories seen in every region.
            cate_set &= region_cates
            checkins = checkins.append(kbase.checkins, ignore_index=True)
    if checkins is None:
        # No regions were supplied, so there is nothing to sample from.
        return topics
    _LOGGER.info('%d checkins loaded for cate_topics', len(checkins))
    # Keep one row per (place, user) pair.
    # NOTE(review): `cols=` is the keyword used by old pandas releases; newer
    # releases call it `subset=` -- confirm against the pinned pandas version.
    checkins.drop_duplicates(cols=['pid', 'user'], inplace=True)
    for zcate, group in checkins.groupby('z_category'):
        # One "cid<TAB>name" entry per remaining check-in row whose category
        # is shared by all regions (entries are intentionally not unique).
        cidgroup = [
            cid + '\t' + cname
            for cid, cname in group[['cid', 'category']].values
            if cid in cate_set
        ]
        for gid, g in enumerate(
                stratified_samples(cidgroup, g_percentages, size / 9)):
            for s in g:
                cid, cname = s.split('\t')
                # Every sampled category becomes a topic in each region.
                for r in regions:
                    topics = topics.append([{
                        'topic_id': CATE_ID.next(),
                        'topic': cname,
                        'region': r['name'],
                        'associate_id': cid,
                        'zcategory': zcate,
                        'group': gid
                    }])
    _LOGGER.info('# CATE_topics: %d', len(topics))
    # Add the top-level categories themselves as topics, once per region.
    for zcate, group in checkins.groupby('z_category'):
        for r in regions:
            topics = topics.append([{
                'topic_id': ZCATE_ID.next(),
                'topic': zcate,
                'region': r['name'],
                # The first zcid of the group is used as the identifier.
                'associate_id': group['zcid'].values[0],
                'zcategory': zcate
            }])
    _LOGGER.info('# Total Cate_topics: %d', len(topics))
    return topics
예제 #2
0
def sampling_cate_topics(regions, size, g_percentages):
    """Sample category topics common to every region in ``regions``.

    The check-ins of all regions are merged into one frame while the set of
    category ids (``cid``) occurring in each and every region is collected.
    Per ``z_category`` group, entries of those common categories are given
    to ``stratified_samples``; each sampled category produces one topic row
    for every region.  Afterwards a topic row for each ``z_category`` is
    appended for every region.

    Args:
        regions: collection of mappings with the keys ``'value'`` (handed to
            ``KnowledgeBase.fromMongo``) and ``'name'`` (written to the
            ``'region'`` field).  It is iterated more than once, so it must
            not be a one-shot iterator.
        size: sampling budget; each ``z_category`` group receives
            ``size / 9`` (the meaning of 9 is not visible here -- presumably
            the number of top-level categories, TODO confirm).
        g_percentages: passed through to ``stratified_samples``.

    Returns:
        A ``pandas.DataFrame`` built on ``TOPIC_SCHEMA`` containing the
        category topics followed by the ``z_category`` topics; empty when
        no regions are given.
    """
    topics = pd.DataFrame(columns=TOPIC_SCHEMA)
    checkins = None
    cate_set = set()
    for r in regions:
        kbase = KnowledgeBase.fromMongo(db.checkin, r['value'])
        cids = set(kbase.checkins['cid'].unique())
        if checkins is None:
            # The first region initialises the intersection; starting the
            # `&=` from an empty set would always yield an empty result.
            cate_set = cids
            checkins = kbase.checkins
        else:
            # Narrow down to categories present in all regions so far.
            cate_set &= cids
            checkins = checkins.append(kbase.checkins, ignore_index=True)
    if checkins is None:
        # Empty `regions`: nothing was loaded, return the empty frame.
        return topics
    _LOGGER.info('%d checkins loaded for cate_topics', len(checkins))
    # De-duplicate so that each user counts once per place.
    # NOTE(review): newer pandas spells this keyword `subset=` instead of
    # `cols=` -- confirm which pandas version this file targets.
    checkins.drop_duplicates(cols=['pid', 'user'], inplace=True)
    for zcate, group in checkins.groupby('z_category'):
        # "cid<TAB>name" per remaining check-in row of a common category.
        cidgroup = [cid + '\t' + cname
                    for cid, cname in group[['cid', 'category']].values
                    if cid in cate_set]
        for gid, g in enumerate(stratified_samples(cidgroup,
                                                   g_percentages,
                                                   size / 9)):
            for s in g:
                cid, cname = s.split('\t')
                # Emit the sampled category once for each region.
                for r in regions:
                    topics = topics.append([{'topic_id': CATE_ID.next(),
                                             'topic': cname,
                                             'region': r['name'],
                                             'associate_id': cid,
                                             'zcategory': zcate,
                                             'group': gid}])
    _LOGGER.info('# CATE_topics: %d', len(topics))
    # The top-level categories are topics as well, one per region.
    for zcate, group in checkins.groupby('z_category'):
        for r in regions:
            topics = topics.append([{'topic_id': ZCATE_ID.next(),
                                     'topic': zcate,
                                     'region': r['name'],
                                     # first zcid of the group is the id
                                     'associate_id': group['zcid'].values[0],
                                     'zcategory': zcate}])
    _LOGGER.info('# Total Cate_topics: %d', len(topics))
    return topics
예제 #3
0
def sampling_poi_topics(region, size, g_percentages):
    """Draw POI topics for a single region.

    The region's check-ins are loaded, reduced to one row per
    ``('pid', 'user')`` pair and grouped by ``z_category``.  The POI
    entries of each group are passed to ``stratified_samples`` together
    with ``g_percentages`` and ``size / 9``; every sampled POI becomes one
    topic row tagged with the index of the sample group it came from.

    ``region`` must provide the keys ``'value'`` and ``'name'``.  Returns
    the ``pandas.DataFrame`` of topics created with ``TOPIC_SCHEMA``.
    """
    topics = pd.DataFrame(columns=TOPIC_SCHEMA)
    kbase = KnowledgeBase.fromMongo(db.checkin, region['value'])
    kbase.checkins.drop_duplicates(cols=['pid', 'user'], inplace=True)
    for zcate, members in kbase.checkins.groupby('z_category'):
        # One "pid<TAB>place" entry per remaining check-in row.
        entries = []
        for pid, pname in members[['pid', 'place']].values:
            entries.append(pid + '\t' + pname)
        strata = stratified_samples(entries, g_percentages, size / 9)
        for gid, stratum in enumerate(strata):
            for entry in stratum:
                pid, pname = entry.split('\t')
                row = {
                    'topic_id': POI_ID.next(),
                    'topic': pname,
                    'region': region['name'],
                    'associate_id': pid,
                    'zcategory': zcate,
                    'group': gid,
                }
                topics = topics.append([row])
    _LOGGER.info('# POI_topics: %d', len(topics))
    return topics
예제 #4
0
def sampling_poi_topics(region, size, g_percentages):
    """Sample place (POI) topics of one region from the database.

    After loading the check-ins for ``region['value']`` and dropping
    duplicate ``('pid', 'user')`` rows, each ``z_category`` group is turned
    into a list of "pid<TAB>place" strings and handed to
    ``stratified_samples`` (with ``g_percentages`` and ``size / 9``).  One
    topic row is appended per sampled string; its ``'group'`` field is the
    position of the sample group in the ``stratified_samples`` result.

    Returns the resulting ``pandas.DataFrame`` (columns initialised from
    ``TOPIC_SCHEMA``).
    """
    topics = pd.DataFrame(columns=TOPIC_SCHEMA)
    kbase = KnowledgeBase.fromMongo(db.checkin, region['value'])
    kbase.checkins.drop_duplicates(cols=['pid', 'user'], inplace=True)
    by_zcate = kbase.checkins.groupby('z_category')
    for zcate, rows in by_zcate:
        id_name_pairs = rows[['pid', 'place']].values
        candidates = [pid + '\t' + pname for pid, pname in id_name_pairs]
        sampled_groups = stratified_samples(candidates, g_percentages,
                                            size / 9)
        gid = 0
        for picked in sampled_groups:
            for item in picked:
                # Recover the id and the name packed into the entry.
                pid, pname = item.split('\t')
                topics = topics.append([{
                    'topic_id': POI_ID.next(),
                    'topic': pname,
                    'region': region['name'],
                    'associate_id': pid,
                    'zcategory': zcate,
                    'group': gid
                }])
            gid += 1
    _LOGGER.info('# POI_topics: %d', len(topics))
    return topics