def construct_train_set(attribute,training_count):
    '''
    The format of labeled_feature_file is as the same as mallet
    '''
    all_features=get_features(feature_file=feature_file_name)
    all_features_1=get_features(feature_file=base_dir+'/features/mention_1.feature',existent_features=all_features)
    review_featuers=get_features(feature_file=base_dir+'/features/review.feature',existent_features=all_features_1)
    labeled_feature_file=open('%s/review_constraint_%s.constraints'%(labeled_feature_file_dir,attribute))
    label_arbiter=LabelArbiter(labeled_feature_file='%s/review_constraint_%s.constraints'%(labeled_feature_file_dir,attribute))
    labeled_features=dict()
    for line in labeled_feature_file:
        line=line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')]=map(lambda d:float(d.split(':')[1]),line[1:])
    collection=Connection().jd.train_users

    bar=progress_bar(collection.count())
    confidence=[]
    for index,user in enumerate(collection.find()):
        label_distributed=[1,1]
        for f,value in combine_features(user['mentions'],Counter('products')).items():
            if f in labeled_features:
                label_distributed[0]*=labeled_features[f][0]*value
                label_distributed[1]*=labeled_features[f][1]*value
        s=1.0*sum(label_distributed)
        if not s==0:
            label_distributed[0]/=s
            label_distributed[1]/=s
        label_distributed=label_arbiter.get_label_distribute(combine_features(user['mentions'],Counter('products')))
        if label_distributed[0]>label_distributed[1]:
            label=0
        elif label_distributed[0]<label_distributed[1]:
            label=1
        else:
            label=-1

        features=combine_features(user['mentions_0'],Counter(user['products']))
        sorted_feature=[]
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f],features[f]))

        user['mentions_1_1']={}
        for f,v in user['mentions_1_1'].items():
            f=f+'_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f],v))

        for f,v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            sorted_feature.append((review_featuers[f],v))

        keys=map(lambda d:d[0], sorted_feature)
        if not len(keys)==len(set(keys)):
            print Counter(keys).values()
        sorted_feature=sorted(sorted_feature,key=lambda d:d[0])
        str_features=' '.join(map(lambda f:'%s:%f'%f,sorted_feature))
        confidence.append(
                (user['_id'],
                    label,
                    abs(label_distributed[0]-label_distributed[1]),
                    str_features,
                    sum(user['mentions'].values()),
                    ))
        bar.draw(index+1)

    confidence0=filter(lambda d:d[1]==0,confidence)
    confidence0=sorted(confidence0,key=lambda d:d[2],reverse=True)
    confidence1=filter(lambda d:d[1]==1,confidence)
    confidence1=sorted(confidence1,key=lambda d:d[2],reverse=True)
    confidence2=filter(lambda d:d[1]==-1,confidence)
    confidence2=sorted(confidence2,key=lambda d:d[4],reverse=True)

    dimention=min(len(confidence0),len(confidence1),training_count/2)
    confidence0=confidence0[:dimention]
    confidence1=confidence1[:dimention]
    confidence2=confidence2[:dimention]


    fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train.data'%attribute,'w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_uids.data'%attribute,'w')
    for d in confidence0+confidence1:
        fout.write('%d %s\n'%(d[1],d[3]))
        uid_output.write('%s\n'%d[0])

    fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_unlabel.data'%attribute,'w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_unlabel_uids.data'%attribute,'w')
    for d in confidence2:
        fout.write('%d %s\n'%(d[1],d[3]))
        uid_output.write('%s\n'%d[0])
def construct_train_set(attribute,training_count):
    '''
    Build labeled and unlabeled training-set files for one attribute,
    letting LabelArbiter.arbitrate_label decide each user's label.

    attribute       -- attribute name; selects the constraint file and
                       names the four output files.
    training_count  -- cap on the labeled training set: at most
                       training_count/2 users are kept per class.

    Users in the test set (get_test_uids) are skipped. Writes
    %s_train.data / %s_train_uids.data and
    %s_train_unlabel.data / %s_train_unlabel_uids.data under
    RAW_DATA_DIR+'iterate_label2trainset/'.

    NOTE(review): this redefines construct_train_set and shadows the
    earlier definition above -- confirm which one should be live.
    '''
    product_features=get_features(feature_file=base_dir+'/features/product.feature')
    mention_features=get_features(feature_file=base_dir+'/features/mention.feature',existent_features=product_features)
    review_featuers=get_features(feature_file=base_dir+'/features/review.feature',existent_features=mention_features)
    mention_features_1=get_features(feature_file=base_dir+'/features/mention_1.feature',existent_features=review_featuers)
    mention_features_2=get_features(feature_file=base_dir+'/features/mention_2.feature',existent_features=mention_features_1)
    test_uids=get_test_uids()

    labeled_feature_file='%s/review_constraint_%s.constraints'%(labeled_feature_file_dir,attribute)
    label_arbiter=LabelArbiter(labeled_feature_file=labeled_feature_file)
    collection=Connection().jd.train_users
    bar=progress_bar(collection.count())
    guess=[]
    for index,user in enumerate(collection.find()):
        if user['_id'] in test_uids:
            continue
        # NOTE(review): Counter('products') counts the characters of the
        # literal string 'products'; the commented-out line shows the
        # likely intended Counter(user['products']) -- confirm.
        #features=combine_dict(user['mentions_0'],Counter(user['products']))
        features=combine_dict(user['mentions_0'],Counter('products'))
        label,confidence=label_arbiter.arbitrate_label(features)
        # (feature_index, value) pairs for every known feature.
        x=[]

        #user['products']=[]
        for f,v in Counter(user['products']).items():
            if f not in product_features:
                continue
            x.append((product_features[f],v))

        #user['mentions_0']={}
        for f,v in user['mentions_0'].items():
            if f not in mention_features:
                continue
            x.append((mention_features[f],v))

        #user['review']=[]
        for f,v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            x.append((review_featuers[f],v))

        # NOTE(review): 'mentions_1' is emptied right before iterating it,
        # so this loop never appends anything -- looks like an ablation
        # switch left on; confirm before removing.
        user['mentions_1']={}
        for f,v in user['mentions_1'].items():
            f=f+'_1'
            if f not in mention_features_1:
                continue
            x.append((mention_features_1[f],v))

        # NOTE(review): same dead pattern as 'mentions_1' above.
        user['mentions_2']={}
        for f,v in user['mentions_2'].items():
            if f not in mention_features_2:
                continue
            x.append((mention_features_2[f],v))

        x=sorted(x,key=lambda d:d[0])
        str_x=' '.join(map(lambda f:'%s:%f'%f,x))
        guess.append(
                (user['_id'],
                    label,
                    abs(confidence),
                    str_x,
                    sum(user['mentions'].values()),
                    ))
        bar.draw(index+1)

    # Rank labeled users by confidence, undecided ones by mention volume.
    data0=filter(lambda d:d[1]==0,guess)
    data0=sorted(data0,key=lambda d:d[2],reverse=True)
    data1=filter(lambda d:d[1]==1,guess)
    data1=sorted(data1,key=lambda d:d[2],reverse=True)
    data2=filter(lambda d:d[1]==-1,guess)
    data2=sorted(data2,key=lambda d:d[4],reverse=True)

    # Balance the two labeled classes and respect the requested size.
    dimention=min(len(data0),len(data1),training_count/2)

    data0=data0[:dimention]
    data1=data1[:dimention]
    data2=data2[:dimention]

    # `with` guarantees the output files are flushed and closed (they were
    # previously opened and leaked, with fout/uid_output rebound while the
    # first pair of handles was still open).
    with open(RAW_DATA_DIR+'iterate_label2trainset/%s_train.data'%attribute,'w') as fout,\
            open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_uids.data'%attribute,'w') as uid_output:
        for d in data0+data1:
            fout.write('%d %s\n'%(d[1],d[3]))
            uid_output.write('%s\n'%d[0])

    with open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_unlabel.data'%attribute,'w') as fout,\
            open(RAW_DATA_DIR+'iterate_label2trainset/%s_train_unlabel_uids.data'%attribute,'w') as uid_output:
        for d in data2:
            fout.write('%d %s\n'%(d[1],d[3]))
            uid_output.write('%s\n'%d[0])
def construct_train_set(attribute, training_count):
    '''
    The format of labeled_feature_file is as the same as mallet
    '''
    all_features = get_features(feature_file=feature_file_name)
    all_features_1 = get_features(feature_file=base_dir +
                                  '/features/mention_1.feature',
                                  existent_features=all_features)
    review_featuers = get_features(feature_file=base_dir +
                                   '/features/review.feature',
                                   existent_features=all_features_1)
    labeled_feature_file = open('%s/review_constraint_%s.constraints' %
                                (labeled_feature_file_dir, attribute))
    label_arbiter = LabelArbiter(
        labeled_feature_file='%s/review_constraint_%s.constraints' %
        (labeled_feature_file_dir, attribute))
    labeled_features = dict()
    for line in labeled_feature_file:
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])
    collection = Connection().jd.train_users

    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        label_distributed = [1, 1]
        for f, value in combine_features(user['mentions'],
                                         Counter('products')).items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        s = 1.0 * sum(label_distributed)
        if not s == 0:
            label_distributed[0] /= s
            label_distributed[1] /= s
        label_distributed = label_arbiter.get_label_distribute(
            combine_features(user['mentions'], Counter('products')))
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            label = -1

        features = combine_features(user['mentions_0'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))

        user['mentions_1_1'] = {}
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))

        for f, v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            sorted_feature.append((review_featuers[f], v))

        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature))
        confidence.append((
            user['_id'],
            label,
            abs(label_distributed[0] - label_distributed[1]),
            str_features,
            sum(user['mentions'].values()),
        ))
        bar.draw(index + 1)

    confidence0 = filter(lambda d: d[1] == 0, confidence)
    confidence0 = sorted(confidence0, key=lambda d: d[2], reverse=True)
    confidence1 = filter(lambda d: d[1] == 1, confidence)
    confidence1 = sorted(confidence1, key=lambda d: d[2], reverse=True)
    confidence2 = filter(lambda d: d[1] == -1, confidence)
    confidence2 = sorted(confidence2, key=lambda d: d[4], reverse=True)

    dimention = min(len(confidence0), len(confidence1), training_count / 2)
    confidence0 = confidence0[:dimention]
    confidence1 = confidence1[:dimention]
    confidence2 = confidence2[:dimention]

    fout = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_train.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_train_uids.data' % attribute,
        'w')
    for d in confidence0 + confidence1:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])

    fout = open(
        RAW_DATA_DIR +
        'iterate_label2trainset/%s_train_unlabel.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR +
        'iterate_label2trainset/%s_train_unlabel_uids.data' % attribute, 'w')
    for d in confidence2:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
def construct_train_set(attribute, training_count):
    '''
    Build labeled and unlabeled training-set files for one attribute,
    letting LabelArbiter.arbitrate_label decide each user's label.

    attribute       -- attribute name; selects the constraint file and
                       names the four output files.
    training_count  -- cap on the labeled training set: at most
                       training_count/2 users are kept per class.

    Users in the test set (get_test_uids) are skipped. Writes
    %s_train.data / %s_train_uids.data and
    %s_train_unlabel.data / %s_train_unlabel_uids.data under
    RAW_DATA_DIR+'iterate_label2trainset/'.

    NOTE(review): this is the last of several definitions of the same
    name in this module and is the one that wins at import time --
    confirm the earlier duplicates are intentional dead code.
    '''
    product_features = get_features(feature_file=base_dir +
                                    '/features/product.feature')
    mention_features = get_features(feature_file=base_dir +
                                    '/features/mention.feature',
                                    existent_features=product_features)
    review_featuers = get_features(feature_file=base_dir +
                                   '/features/review.feature',
                                   existent_features=mention_features)
    mention_features_1 = get_features(feature_file=base_dir +
                                      '/features/mention_1.feature',
                                      existent_features=review_featuers)
    mention_features_2 = get_features(feature_file=base_dir +
                                      '/features/mention_2.feature',
                                      existent_features=mention_features_1)
    test_uids = get_test_uids()

    labeled_feature_file = '%s/review_constraint_%s.constraints' % (
        labeled_feature_file_dir, attribute)
    label_arbiter = LabelArbiter(labeled_feature_file=labeled_feature_file)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    guess = []
    for index, user in enumerate(collection.find()):
        if user['_id'] in test_uids:
            continue
        # NOTE(review): Counter('products') counts the characters of the
        # literal string 'products'; the commented-out line shows the
        # likely intended Counter(user['products']) -- confirm.
        #features=combine_dict(user['mentions_0'],Counter(user['products']))
        features = combine_dict(user['mentions_0'], Counter('products'))
        label, confidence = label_arbiter.arbitrate_label(features)
        # (feature_index, value) pairs for every known feature.
        x = []

        #user['products']=[]
        for f, v in Counter(user['products']).items():
            if f not in product_features:
                continue
            x.append((product_features[f], v))

        #user['mentions_0']={}
        for f, v in user['mentions_0'].items():
            if f not in mention_features:
                continue
            x.append((mention_features[f], v))

        #user['review']=[]
        for f, v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            x.append((review_featuers[f], v))

        # NOTE(review): 'mentions_1' is emptied right before iterating it,
        # so this loop never appends anything -- looks like an ablation
        # switch left on; confirm before removing.
        user['mentions_1'] = {}
        for f, v in user['mentions_1'].items():
            f = f + '_1'
            if f not in mention_features_1:
                continue
            x.append((mention_features_1[f], v))

        # NOTE(review): same dead pattern as 'mentions_1' above.
        user['mentions_2'] = {}
        for f, v in user['mentions_2'].items():
            if f not in mention_features_2:
                continue
            x.append((mention_features_2[f], v))

        x = sorted(x, key=lambda d: d[0])
        str_x = ' '.join(map(lambda f: '%s:%f' % f, x))
        guess.append((
            user['_id'],
            label,
            abs(confidence),
            str_x,
            sum(user['mentions'].values()),
        ))
        bar.draw(index + 1)

    # Rank labeled users by confidence, undecided ones by mention volume.
    data0 = filter(lambda d: d[1] == 0, guess)
    data0 = sorted(data0, key=lambda d: d[2], reverse=True)
    data1 = filter(lambda d: d[1] == 1, guess)
    data1 = sorted(data1, key=lambda d: d[2], reverse=True)
    data2 = filter(lambda d: d[1] == -1, guess)
    data2 = sorted(data2, key=lambda d: d[4], reverse=True)

    # Balance the two labeled classes and respect the requested size.
    dimention = min(len(data0), len(data1), training_count / 2)

    data0 = data0[:dimention]
    data1 = data1[:dimention]
    data2 = data2[:dimention]

    # `with` guarantees the output files are flushed and closed (they were
    # previously opened and leaked, with fout/uid_output rebound while the
    # first pair of handles was still open).
    with open(RAW_DATA_DIR + 'iterate_label2trainset/%s_train.data' %
              attribute, 'w') as fout, \
            open(RAW_DATA_DIR + 'iterate_label2trainset/%s_train_uids.data' %
                 attribute, 'w') as uid_output:
        for d in data0 + data1:
            fout.write('%d %s\n' % (d[1], d[3]))
            uid_output.write('%s\n' % d[0])

    with open(RAW_DATA_DIR + 'iterate_label2trainset/%s_train_unlabel.data' %
              attribute, 'w') as fout, \
            open(RAW_DATA_DIR +
                 'iterate_label2trainset/%s_train_unlabel_uids.data' %
                 attribute, 'w') as uid_output:
        for d in data2:
            fout.write('%d %s\n' % (d[1], d[3]))
            uid_output.write('%s\n' % d[0])