예제 #1
0
def construct_test_set(attribute):
    """Dump a down-sampled test set for ``attribute`` as sparse text rows.

    Every document of ``Connection().jd.test_users`` is visited. A user's
    label is the position of the first ``1`` in ``user['profile'][attribute]``.
    A user is skipped when that lookup fails, when ``random.random()`` exceeds
    ``balance_params[label]``, or when none of the user's ``mentions`` keys is
    present in the feature map loaded from the module-level
    ``feature_file_name``.

    Two files are written under ``RAW_DATA_DIR + 'label2trainset/'``:
    ``<attribute>_test.data`` holds one ``<label> <index>:<value> ...`` row per
    kept user (pairs ordered by feature index), and
    ``<attribute>_test_uids.data`` holds the matching ``_id`` values, one per
    line in the same order.

    Args:
        attribute: key into each user's ``profile`` dict; also used to build
            the output file names.
    """
    all_features = get_features(feature_file_name=feature_file_name)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print(balance_params)
    bar = progress_bar(collection.count())
    data_path = RAW_DATA_DIR + 'label2trainset/%s_test.data' % attribute
    uid_path = RAW_DATA_DIR + 'label2trainset/%s_test_uids.data' % attribute
    # Context managers guarantee both files are flushed and closed, even if
    # iteration over the collection fails part-way through.
    with open(data_path, 'w') as fout, open(uid_path, 'w') as uid_output:
        for index, user in enumerate(collection.find()):
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # Best effort: users without a usable label are ignored.
                continue
            if random.random() > balance_params[label]:
                continue
            features = user['mentions']
            sorted_feature = sorted(
                ((all_features[f], features[f])
                 for f in features if f in all_features),
                key=lambda pair: pair[0])
            if not sorted_feature:
                continue
            row = ''.join(' %s:%f' % pair for pair in sorted_feature)
            fout.write('%d%s\n' % (label, row))
            uid_output.write('%s\n' % user['_id'])
            # The progress bar only advances for users that were written.
            bar.draw(index + 1)
예제 #2
0
def construct_test_set(attribute):
    """Dump a down-sampled test set for ``attribute`` as sparse text rows.

    Every document of ``Connection().jd.test_users`` is visited. Per user the
    raw features are the counts of ``user['products']`` overlaid with the
    entries of ``user['mentions']`` (a mention replaces a product count with
    the same key). The label is the position of the first ``1`` in
    ``user['profile'][attribute]``. A user is skipped when that lookup fails,
    when ``random.random()`` exceeds ``balance_params[label]``, or when none
    of the raw features is present in the map loaded from
    ``base_dir + '/features/mention.feature'``.

    Output goes to ``self_training_file_dir + 'test_<attribute>.data'``, one
    ``<label> <index>:<value> ...`` row per kept user, pairs ordered by
    feature index and values formatted as integers.

    Args:
        attribute: key into each user's ``profile`` dict; also used to build
            the output file name.
    """
    all_features = get_features(
        feature_file_name=base_dir + '/features/mention.feature')
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print(balance_params)
    bar = progress_bar(collection.count())
    data_path = self_training_file_dir + 'test_%s.data' % attribute
    # The context manager guarantees the file is flushed and closed, even if
    # iteration over the collection fails part-way through.
    with open(data_path, 'w') as fout:
        for index, user in enumerate(collection.find()):
            # Features are assembled before the label check, as in the
            # original, so a user lacking 'products'/'mentions' still raises.
            features = dict(Counter(user['products']))
            features.update(user['mentions'])
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # Best effort: users without a usable label are ignored.
                continue
            if random.random() > balance_params[label]:
                continue
            sorted_feature = sorted(
                ((all_features[f], features[f])
                 for f in features if f in all_features),
                key=lambda pair: pair[0])
            if not sorted_feature:
                continue
            row = ''.join(' %s:%d' % pair for pair in sorted_feature)
            fout.write('%d%s\n' % (label, row))
            # The progress bar only advances for users that were written.
            bar.draw(index + 1)
예제 #3
0
def construct_test_set(attribute):
    """Dump a down-sampled test set for ``attribute`` as sparse text rows.

    Every document of ``Connection().jd.test_users`` is visited. Per user the
    raw features are the counts of ``user['products']`` overlaid with the
    entries of ``user['mentions']`` (a mention replaces a product count with
    the same key). The label is the position of the first ``1`` in
    ``user['profile'][attribute]``. A user is skipped when that lookup fails,
    when ``random.random()`` exceeds ``balance_params[label]``, or when none
    of the raw features is present in the map loaded from
    ``base_dir + '/features/mention.feature'``.

    Output goes to ``self_training_file_dir + 'test_<attribute>.data'``, one
    ``<label> <index>:<value> ...`` row per kept user, pairs ordered by
    feature index and values formatted as integers.

    Args:
        attribute: key into each user's ``profile`` dict; also used to build
            the output file name.
    """
    all_features = get_features(
        feature_file_name=base_dir + '/features/mention.feature')
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print(balance_params)
    bar = progress_bar(collection.count())
    data_path = self_training_file_dir + 'test_%s.data' % attribute
    # The context manager guarantees the file is flushed and closed, even if
    # iteration over the collection fails part-way through.
    with open(data_path, 'w') as fout:
        for index, user in enumerate(collection.find()):
            # Features are assembled before the label check, as in the
            # original, so a user lacking 'products'/'mentions' still raises.
            features = dict(Counter(user['products']))
            features.update(user['mentions'])
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # Best effort: users without a usable label are ignored.
                continue
            if random.random() > balance_params[label]:
                continue
            sorted_feature = sorted(
                ((all_features[f], features[f])
                 for f in features if f in all_features),
                key=lambda pair: pair[0])
            if not sorted_feature:
                continue
            row = ''.join(' %s:%d' % pair for pair in sorted_feature)
            fout.write('%d%s\n' % (label, row))
            # The progress bar only advances for users that were written.
            bar.draw(index + 1)
def construct_test_set(attribute):
    """Dump a down-sampled test set with mention, product and review features.

    Three feature maps are loaded with ``get_features``: the one from the
    module-level ``feature_file_name``, ``mention_1.feature`` and
    ``review.feature``. Each later map receives the previous one as
    ``existent_features`` (presumably so indices do not collide -- confirm in
    ``get_features``).

    Every document of ``Connection().jd.test_users`` is visited. The label is
    the position of the first ``1`` in ``user['profile'][attribute]``. A user
    is skipped when that lookup fails, when ``random.random()`` exceeds
    ``balance_params[label]``, or when no feature survives the lookups.

    Two files are written under ``RAW_DATA_DIR + 'iterate_label2trainset/'``:
    ``<attribute>_test.data`` with one ``<label> <index>:<value> ...`` row per
    kept user (pairs ordered by feature index) and
    ``<attribute>_test_uids.data`` with the matching ``_id`` per line.

    Args:
        attribute: key into each user's ``profile`` dict; also used to build
            the output file names.
    """
    all_features = get_features(feature_file=feature_file_name)
    all_features_1 = get_features(
        feature_file=base_dir + '/features/mention_1.feature',
        existent_features=all_features)
    review_features = get_features(
        feature_file=base_dir + '/features/review.feature',
        existent_features=all_features_1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print(balance_params)
    bar = progress_bar(collection.count())
    data_path = (
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test.data' % attribute)
    uid_path = (
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test_uids.data' % attribute)
    # Context managers guarantee both files are flushed and closed, even if
    # iteration over the collection fails part-way through.
    with open(data_path, 'w') as fout, open(uid_path, 'w') as uid_output:
        for index, user in enumerate(collection.find()):
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # Best effort: users without a usable label are ignored.
                continue
            if random.random() > balance_params[label]:
                continue

            features = combine_features(user['mentions_0'],
                                        Counter(user['products']))
            sorted_feature = [(all_features[f], features[f])
                              for f in features if f in all_features]

            # Keys of 'mentions_1_1' are looked up with a '_1' suffix.
            for f, v in user['mentions_1_1'].items():
                f = f + '_1'
                if f in all_features_1:
                    sorted_feature.append((all_features_1[f], v))

            for f, v in Counter(user['review']).items():
                if f in review_features:
                    sorted_feature.append((review_features[f], v))

            if not sorted_feature:
                continue

            # Diagnostic only: report when two features share an index.
            keys = [pair[0] for pair in sorted_feature]
            if len(keys) != len(set(keys)):
                print(Counter(keys).values())

            sorted_feature.sort(key=lambda pair: pair[0])
            row = ''.join(' %s:%f' % pair for pair in sorted_feature)
            fout.write('%d%s\n' % (label, row))
            uid_output.write('%s\n' % user['_id'])
            # The progress bar only advances for users that were written.
            bar.draw(index + 1)
예제 #5
0
def statistics(attribute,
               threshold=-1,
               feature_file_name=base_dir + '/features/mention.feature',
               show=False):
    """Estimate how each feature splits between label 0 and label 1.

    Every document of ``Connection().jd.test_users`` is visited. The label is
    the position of the first ``1`` in ``user['profile'][attribute]``; users
    for which that lookup fails are skipped, as are users with fewer than 10
    distinct features (``mentions`` keys plus distinct ``products``). For the
    remaining users, each feature listed in the feature file is counted once
    per user under the user's label. Only two label slots exist, so a label
    index above 1 raises ``IndexError``.

    Features whose total count is below ``threshold`` are dropped. Counts are
    then divided by the number of counted users of each label, and finally
    each pair is rescaled to sum to 1. A zero denominator (a label with no
    users, or a feature that was never seen) leaves the affected values at
    ``0.0`` instead of raising ``ZeroDivisionError``.

    Args:
        attribute: key into each user's ``profile`` dict.
        threshold: minimum total user count for a feature to be kept.
        feature_file_name: path handed to ``get_features``.
        show: when true, print the 50 most one-sided features instead of
            returning the table.

    Returns:
        ``{feature: [share_label_0, share_label_1]}`` when ``show`` is false;
        ``None`` (after printing) when ``show`` is true.
    """
    collection = Connection().jd.test_users
    # Unused while balance sampling is disabled; the call is retained in case
    # get_balance_params has side effects -- TODO confirm and drop.
    balance_params = get_balance_params(attribute, collection)
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    distribute = dict((f, [0., 0.]) for f in all_features)
    labels_distribute = [0., 0.]
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception:
            # Best effort: users without a usable label are ignored.
            continue
        features = dict(user['mentions'])
        features.update(Counter(user['products']))
        if len(features) < 10:
            continue
        # Presence is counted, not the feature's value.
        for f in features:
            if f in distribute:
                distribute[f][label] += 1
        labels_distribute[label] += 1
        bar.draw(index)

    # Rebuild rather than pop while iterating over the dict's keys.
    distribute = dict((f, counts) for f, counts in distribute.items()
                      if sum(counts) >= threshold)
    print(labels_distribute)

    for counts in distribute.values():
        # Per-label frequency; a label with no users keeps its 0.0 counts.
        for label in (0, 1):
            if labels_distribute[label]:
                counts[label] /= labels_distribute[label]
        # Rescale to sum to 1; never-seen features stay at [0.0, 0.0].
        total = sum(counts)
        if total:
            counts[0] /= total
            counts[1] /= total

    if not show:
        return distribute

    def smoothed_share_0(counts):
        # Share of label 0 with 0.1 added to numerator and denominator.
        return (counts[0] + 0.1) / (sum(counts) + 0.1)

    ranked = sorted(distribute.items(),
                    key=lambda d: abs(1 - 2 * smoothed_share_0(d[1])),
                    reverse=True)
    print('')
    for name, counts in ranked[:50]:
        share = smoothed_share_0(counts)
        print('%s 0:%0.3f 1:%0.3f' % (name.encode('utf8'), share, 1 - share))
예제 #6
0
def construct_test_set(attribute):
    """Dump a down-sampled test set with mention/product and vector features.

    Two feature maps are loaded with ``get_features``: the one from the
    module-level ``feature_file_name`` and ``mention_1.feature``, the latter
    with ``start_index`` set to one past the largest index of the former.

    Every document of ``Connection().jd.test_users`` is visited. The label is
    the position of the first ``1`` in ``user['profile'][attribute]``. A user
    is skipped when that lookup fails, when ``random.random()`` exceeds
    ``balance_params[label]``, or when no feature is produced. When present,
    the absolute values of ``user['user_product_vector_from_deepwalk']`` are
    appended at consecutive indices after the largest ``mention_1`` index.

    Output goes to ``RAW_DATA_DIR + 'multi_clf/<attribute>_test.data'``, one
    ``<label> <index>:<value> ...`` row per kept user, pairs ordered by
    feature index.

    Args:
        attribute: key into each user's ``profile`` dict; also used to build
            the output file name.
    """
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(
        feature_file_name=base_dir + '/features/mention_1.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print(balance_params)
    bar = progress_bar(collection.count())
    data_path = RAW_DATA_DIR + 'multi_clf/%s_test.data' % attribute
    # Loop-invariant; computed on first use so the original's behaviour is
    # kept when no user carries a deepwalk vector.
    vector_start = None
    # The context manager guarantees the file is flushed and closed, even if
    # iteration over the collection fails part-way through.
    with open(data_path, 'w') as fout:
        for index, user in enumerate(collection.find()):
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # Best effort: users without a usable label are ignored.
                continue
            if random.random() > balance_params[label]:
                continue

            features = combine_features(user['mentions_0'],
                                        Counter(user['products']))
            sorted_feature = [(all_features[f], features[f])
                              for f in features if f in all_features]

            # NOTE(review): 'mentions_1_1' is overwritten with an empty dict
            # right before it is read, so this loop never adds anything. Kept
            # as-is to leave the emitted data unchanged -- confirm whether the
            # override is a deliberate ablation.
            user['mentions_1_1'] = {}
            for f, v in user['mentions_1_1'].items():
                f = f + '_1'
                if f in all_features_1:
                    sorted_feature.append((all_features_1[f], v))

            if 'user_product_vector_from_deepwalk' in user:
                if vector_start is None:
                    vector_start = max(all_features_1.values()) + 1
                vector = user['user_product_vector_from_deepwalk']
                sorted_feature.extend(
                    (i + vector_start, abs(v)) for i, v in enumerate(vector))

            if not sorted_feature:
                continue

            # Diagnostic only: report when two features share an index.
            keys = [pair[0] for pair in sorted_feature]
            if len(keys) != len(set(keys)):
                print(Counter(keys).values())

            sorted_feature.sort(key=lambda pair: pair[0])
            row = ''.join(' %s:%f' % pair for pair in sorted_feature)
            fout.write('%d%s\n' % (label, row))
            # The progress bar only advances for users that were written.
            bar.draw(index + 1)
def construct_test_set(attribute):
    """Dump a down-sampled test set with mention, product and review features.

    Three feature maps are loaded with ``get_features``: the one from the
    module-level ``feature_file_name``, ``mention_1.feature`` and
    ``review.feature``. Each later map receives the previous one as
    ``existent_features`` (presumably so indices do not collide -- confirm in
    ``get_features``).

    Every document of ``Connection().jd.test_users`` is visited. The label is
    the position of the first ``1`` in ``user['profile'][attribute]``. A user
    is skipped when that lookup fails, when ``random.random()`` exceeds
    ``balance_params[label]``, or when no feature survives the lookups.

    Two files are written under ``RAW_DATA_DIR + 'iterate_label2trainset/'``:
    ``<attribute>_test.data`` with one ``<label> <index>:<value> ...`` row per
    kept user (pairs ordered by feature index) and
    ``<attribute>_test_uids.data`` with the matching ``_id`` per line.

    Args:
        attribute: key into each user's ``profile`` dict; also used to build
            the output file names.
    """
    all_features = get_features(feature_file=feature_file_name)
    all_features_1 = get_features(
        feature_file=base_dir + '/features/mention_1.feature',
        existent_features=all_features)
    review_features = get_features(
        feature_file=base_dir + '/features/review.feature',
        existent_features=all_features_1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print(balance_params)
    bar = progress_bar(collection.count())
    data_path = (
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test.data' % attribute)
    uid_path = (
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test_uids.data' % attribute)
    # Context managers guarantee both files are flushed and closed, even if
    # iteration over the collection fails part-way through.
    with open(data_path, 'w') as fout, open(uid_path, 'w') as uid_output:
        for index, user in enumerate(collection.find()):
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # Best effort: users without a usable label are ignored.
                continue
            if random.random() > balance_params[label]:
                continue

            features = combine_features(user['mentions_0'],
                                        Counter(user['products']))
            sorted_feature = [(all_features[f], features[f])
                              for f in features if f in all_features]

            # Keys of 'mentions_1_1' are looked up with a '_1' suffix.
            for f, v in user['mentions_1_1'].items():
                f = f + '_1'
                if f in all_features_1:
                    sorted_feature.append((all_features_1[f], v))

            for f, v in Counter(user['review']).items():
                if f in review_features:
                    sorted_feature.append((review_features[f], v))

            if not sorted_feature:
                continue

            # Diagnostic only: report when two features share an index.
            keys = [pair[0] for pair in sorted_feature]
            if len(keys) != len(set(keys)):
                print(Counter(keys).values())

            sorted_feature.sort(key=lambda pair: pair[0])
            row = ''.join(' %s:%f' % pair for pair in sorted_feature)
            fout.write('%d%s\n' % (label, row))
            uid_output.write('%s\n' % user['_id'])
            # The progress bar only advances for users that were written.
            bar.draw(index + 1)
예제 #8
0
def construct_test_set(attribute):
    """Dump a down-sampled test set with mention/product and vector features.

    Two feature maps are loaded with ``get_features``: the one from the
    module-level ``feature_file_name`` and ``mention_1.feature``, the latter
    with ``start_index`` set to one past the largest index of the former.

    Every document of ``Connection().jd.test_users`` is visited. The label is
    the position of the first ``1`` in ``user['profile'][attribute]``. A user
    is skipped when that lookup fails, when ``random.random()`` exceeds
    ``balance_params[label]``, or when no feature is produced. When present,
    the absolute values of ``user['user_product_vector_from_deepwalk']`` are
    appended at consecutive indices after the largest ``mention_1`` index.

    Output goes to ``RAW_DATA_DIR + 'multi_clf/<attribute>_test.data'``, one
    ``<label> <index>:<value> ...`` row per kept user, pairs ordered by
    feature index.

    Args:
        attribute: key into each user's ``profile`` dict; also used to build
            the output file name.
    """
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(
        feature_file_name=base_dir + '/features/mention_1.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print(balance_params)
    bar = progress_bar(collection.count())
    data_path = RAW_DATA_DIR + 'multi_clf/%s_test.data' % attribute
    # Loop-invariant; computed on first use so the original's behaviour is
    # kept when no user carries a deepwalk vector.
    vector_start = None
    # The context manager guarantees the file is flushed and closed, even if
    # iteration over the collection fails part-way through.
    with open(data_path, 'w') as fout:
        for index, user in enumerate(collection.find()):
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # Best effort: users without a usable label are ignored.
                continue
            if random.random() > balance_params[label]:
                continue

            features = combine_features(user['mentions_0'],
                                        Counter(user['products']))
            sorted_feature = [(all_features[f], features[f])
                              for f in features if f in all_features]

            # NOTE(review): 'mentions_1_1' is overwritten with an empty dict
            # right before it is read, so this loop never adds anything. Kept
            # as-is to leave the emitted data unchanged -- confirm whether the
            # override is a deliberate ablation.
            user['mentions_1_1'] = {}
            for f, v in user['mentions_1_1'].items():
                f = f + '_1'
                if f in all_features_1:
                    sorted_feature.append((all_features_1[f], v))

            if 'user_product_vector_from_deepwalk' in user:
                if vector_start is None:
                    vector_start = max(all_features_1.values()) + 1
                vector = user['user_product_vector_from_deepwalk']
                sorted_feature.extend(
                    (i + vector_start, abs(v)) for i, v in enumerate(vector))

            if not sorted_feature:
                continue

            # Diagnostic only: report when two features share an index.
            keys = [pair[0] for pair in sorted_feature]
            if len(keys) != len(set(keys)):
                print(Counter(keys).values())

            sorted_feature.sort(key=lambda pair: pair[0])
            row = ''.join(' %s:%f' % pair for pair in sorted_feature)
            fout.write('%d%s\n' % (label, row))
            # The progress bar only advances for users that were written.
            bar.draw(index + 1)
예제 #9
0
def statistics(attribute,
               threshold=-1,
               feature_file_name=base_dir + '/features/mention.feature',
               show=False):
    """Estimate how each feature splits between label 0 and label 1.

    Every document of ``Connection().jd.test_users`` is visited. The label is
    the position of the first ``1`` in ``user['profile'][attribute]``; users
    for which that lookup fails are skipped, as are users with fewer than 10
    distinct features (``mentions`` keys plus distinct ``products``). For the
    remaining users, each feature listed in the feature file is counted once
    per user under the user's label. Only two label slots exist, so a label
    index above 1 raises ``IndexError``.

    Features whose total count is below ``threshold`` are dropped. Counts are
    then divided by the number of counted users of each label, and finally
    each pair is rescaled to sum to 1. A zero denominator (a label with no
    users, or a feature that was never seen) leaves the affected values at
    ``0.0`` instead of raising ``ZeroDivisionError``.

    Args:
        attribute: key into each user's ``profile`` dict.
        threshold: minimum total user count for a feature to be kept.
        feature_file_name: path handed to ``get_features``.
        show: when true, print the 50 most one-sided features instead of
            returning the table.

    Returns:
        ``{feature: [share_label_0, share_label_1]}`` when ``show`` is false;
        ``None`` (after printing) when ``show`` is true.
    """
    collection = Connection().jd.test_users
    # Unused while balance sampling is disabled; the call is retained in case
    # get_balance_params has side effects -- TODO confirm and drop.
    balance_params = get_balance_params(attribute, collection)
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    distribute = dict((f, [0., 0.]) for f in all_features)
    labels_distribute = [0., 0.]
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception:
            # Best effort: users without a usable label are ignored.
            continue
        features = dict(user['mentions'])
        features.update(Counter(user['products']))
        if len(features) < 10:
            continue
        # Presence is counted, not the feature's value.
        for f in features:
            if f in distribute:
                distribute[f][label] += 1
        labels_distribute[label] += 1
        bar.draw(index)

    # Rebuild rather than pop while iterating over the dict's keys.
    distribute = dict((f, counts) for f, counts in distribute.items()
                      if sum(counts) >= threshold)
    print(labels_distribute)

    for counts in distribute.values():
        # Per-label frequency; a label with no users keeps its 0.0 counts.
        for label in (0, 1):
            if labels_distribute[label]:
                counts[label] /= labels_distribute[label]
        # Rescale to sum to 1; never-seen features stay at [0.0, 0.0].
        total = sum(counts)
        if total:
            counts[0] /= total
            counts[1] /= total

    if not show:
        return distribute

    def smoothed_share_0(counts):
        # Share of label 0 with 0.1 added to numerator and denominator.
        return (counts[0] + 0.1) / (sum(counts) + 0.1)

    ranked = sorted(distribute.items(),
                    key=lambda d: abs(1 - 2 * smoothed_share_0(d[1])),
                    reverse=True)
    print('')
    for name, counts in ranked[:50]:
        share = smoothed_share_0(counts)
        print('%s 0:%0.3f 1:%0.3f' % (name.encode('utf8'), share, 1 - share))
def construct_test_set(attribute):
    """Dump the full labelled test set with product/mention/review features.

    Five feature maps are loaded with ``get_features`` (product, mention,
    review, mention_1, mention_2); each later map receives the previous one
    as ``existent_features`` (presumably so indices do not collide -- confirm
    in ``get_features``).

    Every document of ``Connection().jd.test_users`` is visited. The label is
    the position of the first ``1`` in ``user['profile'][attribute]``; users
    for which that lookup fails are skipped. Balance sampling is disabled
    here: ``balance_params`` is computed and printed but not applied. A row is
    written even when a user ends up with no features.

    Two files are written under ``RAW_DATA_DIR + 'iterate_label2trainset/'``:
    ``<attribute>_test.data`` with one ``<label> <index>:<value> ...`` row per
    labelled user (pairs ordered by feature index) and
    ``<attribute>_test_uids.data`` with the matching ``_id`` per line.

    Args:
        attribute: key into each user's ``profile`` dict; also used to build
            the output file names.
    """
    product_features = get_features(
        feature_file=base_dir + '/features/product.feature')
    mention_features = get_features(
        feature_file=base_dir + '/features/mention.feature',
        existent_features=product_features)
    review_features = get_features(
        feature_file=base_dir + '/features/review.feature',
        existent_features=mention_features)
    mention_features_1 = get_features(
        feature_file=base_dir + '/features/mention_1.feature',
        existent_features=review_features)
    mention_features_2 = get_features(
        feature_file=base_dir + '/features/mention_2.feature',
        existent_features=mention_features_1)

    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    # Two spaces after the colon reproduce the original print statement.
    print('Balance params:  %s' % (balance_params,))
    bar = progress_bar(collection.count())
    data_path = (
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test.data' % attribute)
    uid_path = (
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test_uids.data' % attribute)
    # Context managers guarantee both files are flushed and closed, even if
    # iteration over the collection fails part-way through.
    with open(data_path, 'w') as fout, open(uid_path, 'w') as uid_output:
        for index, user in enumerate(collection.find()):
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # Best effort: users without a usable label are ignored.
                continue

            x = []
            x.extend((product_features[f], v)
                     for f, v in Counter(user['products']).items()
                     if f in product_features)
            x.extend((mention_features[f], v)
                     for f, v in user['mentions_0'].items()
                     if f in mention_features)
            x.extend((review_features[f], v)
                     for f, v in Counter(user['review']).items()
                     if f in review_features)

            # NOTE(review): 'mentions_1' and 'mentions_2' are overwritten with
            # empty dicts right before they are read, so these two groups
            # never contribute features. Kept as-is to leave the emitted data
            # unchanged -- confirm whether the overrides are deliberate.
            user['mentions_1'] = {}
            x.extend((mention_features_1[f + '_1'], v)
                     for f, v in user['mentions_1'].items()
                     if f + '_1' in mention_features_1)
            user['mentions_2'] = {}
            x.extend((mention_features_2[f], v)
                     for f, v in user['mentions_2'].items()
                     if f in mention_features_2)

            x.sort(key=lambda pair: pair[0])
            str_x = ' '.join('%s:%f' % pair for pair in x)

            fout.write('%d %s\n' % (label, str_x))
            uid_output.write('%s\n' % (user['_id']))
            bar.draw(index + 1)
def construct_test_set(attribute):
    """Dump the full labelled test set with product/mention/review features.

    Five feature maps are loaded with ``get_features`` (product, mention,
    review, mention_1, mention_2); each later map receives the previous one
    as ``existent_features`` (presumably so indices do not collide -- confirm
    in ``get_features``).

    Every document of ``Connection().jd.test_users`` is visited. The label is
    the position of the first ``1`` in ``user['profile'][attribute]``; users
    for which that lookup fails are skipped. Balance sampling is disabled
    here: ``balance_params`` is computed and printed but not applied. A row is
    written even when a user ends up with no features.

    Two files are written under ``RAW_DATA_DIR + 'iterate_label2trainset/'``:
    ``<attribute>_test.data`` with one ``<label> <index>:<value> ...`` row per
    labelled user (pairs ordered by feature index) and
    ``<attribute>_test_uids.data`` with the matching ``_id`` per line.

    Args:
        attribute: key into each user's ``profile`` dict; also used to build
            the output file names.
    """
    product_features = get_features(
        feature_file=base_dir + '/features/product.feature')
    mention_features = get_features(
        feature_file=base_dir + '/features/mention.feature',
        existent_features=product_features)
    review_features = get_features(
        feature_file=base_dir + '/features/review.feature',
        existent_features=mention_features)
    mention_features_1 = get_features(
        feature_file=base_dir + '/features/mention_1.feature',
        existent_features=review_features)
    mention_features_2 = get_features(
        feature_file=base_dir + '/features/mention_2.feature',
        existent_features=mention_features_1)

    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    # Two spaces after the colon reproduce the original print statement.
    print('Balance params:  %s' % (balance_params,))
    bar = progress_bar(collection.count())
    data_path = (
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test.data' % attribute)
    uid_path = (
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test_uids.data' % attribute)
    # Context managers guarantee both files are flushed and closed, even if
    # iteration over the collection fails part-way through.
    with open(data_path, 'w') as fout, open(uid_path, 'w') as uid_output:
        for index, user in enumerate(collection.find()):
            try:
                label = user['profile'][attribute].index(1)
            except Exception:
                # Best effort: users without a usable label are ignored.
                continue

            x = []
            x.extend((product_features[f], v)
                     for f, v in Counter(user['products']).items()
                     if f in product_features)
            x.extend((mention_features[f], v)
                     for f, v in user['mentions_0'].items()
                     if f in mention_features)
            x.extend((review_features[f], v)
                     for f, v in Counter(user['review']).items()
                     if f in review_features)

            # NOTE(review): 'mentions_1' and 'mentions_2' are overwritten with
            # empty dicts right before they are read, so these two groups
            # never contribute features. Kept as-is to leave the emitted data
            # unchanged -- confirm whether the overrides are deliberate.
            user['mentions_1'] = {}
            x.extend((mention_features_1[f + '_1'], v)
                     for f, v in user['mentions_1'].items()
                     if f + '_1' in mention_features_1)
            user['mentions_2'] = {}
            x.extend((mention_features_2[f], v)
                     for f, v in user['mentions_2'].items()
                     if f in mention_features_2)

            x.sort(key=lambda pair: pair[0])
            str_x = ' '.join('%s:%f' % pair for pair in x)

            fout.write('%d %s\n' % (label, str_x))
            uid_output.write('%s\n' % (user['_id']))
            bar.draw(index + 1)