def output_graph_matrix():
    from pymongo import Connection
    users = Connection().user_profilling.users
    graph = Connection().user_profilling.graph_embedding
    print graph.count()
    bar = get_progressive_bar(users.count())
    x = []
    y = []
    finish_count = 0
    uids = []
    for user in users.find({'int_id': {
            '$exists': True
    }}, {
            'information': 1,
            'int_id': 1
    }):
        finish_count += 1
        print finish_count
        #bar.cursor.restore()
        #bar.draw(value=finish_count)
        user_embedding = graph.find_one({'_id': user['int_id']})
        if user_embedding is None:
            print user_embedding
            continue
        gender = user['information']['gender']
        if gender == 'f':
            y.append(0)
        else:
            y.append(1)
        x.append(user_embedding['embedding'])
        uids.append(user['information']['uid'])
    #dump_train_valid_test(x,y,'gender_graph.data')
    dump_user_vector(x, y, uids, 'user_graph_vector.data')
Example #2
def insert_LINE_vector(file_name=RAW_DATA_DIR + 'normalize2.data'):
    vectors = dict()
    fin = open(file_name)
    line = fin.readline().strip().split(' ')
    count, dimension = int(line[0]), int(line[1])
    bar = progress_bar(count)
    for index in xrange(count):
        line = fin.readline()
        line = line.strip().split(' ')
        vector = map(float, line[1:])
        vectors[line[0]] = vector
        bar.draw(index + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        if user['_id'] not in vectors:
            # unseen users get a zero vector in memory, but no DB update is written
            vectors[user['_id']] = [0.] * dimension
            continue
        collection.update(
            {'_id': user['_id']},
            {'$set': {
                'user_product_vector_from_line': vectors[user['_id']]
            }})
        bar.draw(index + 1)
    collection = Connection().jd.test_users
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        if user['_id'] not in vectors:
            continue
        collection.update(
            {'_id': user['_id']},
            {'$set': {
                'user_product_vector_from_line': vectors[user['_id']]
            }})
        bar.draw(index + 1)
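# Note: pymongo removed Connection in 3.0. A minimal sketch of the same update
# loop against the modern API (MongoClient, update_one); the collection and
# field names are carried over from the example above.
def insert_LINE_vector_pymongo3(vectors, dimension):
    from pymongo import MongoClient
    collection = MongoClient().jd.train_users
    for user in collection.find():
        # fall back to a zero vector for users missing from the embedding file
        vector = vectors.get(user['_id'], [0.] * dimension)
        collection.update_one(
            {'_id': user['_id']},
            {'$set': {'user_product_vector_from_line': vector}})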
Example #4
class AllTest(unittest2.TestCase):
    def setUp(self):
        self.col0 = Connection("127.0.0.1", 27017)["algolab-test"]["rg0"]
        self.col1 = Connection("127.0.0.1", 27017)["algolab-test"]["rg1"]
        self.col2 = Connection("127.0.0.1", 27017)["algolab-test"]["rg2"]
        self.col0.drop()
        self.col1.drop()
        self.col2.drop()

        create_rg(npoints[2], self.col0, distance_function=edist)
        create_rg(npoints[5], self.col0, distance_function=edist)

        create_rg(npoints[2], self.col1, distance_function=edist)
        create_rg(npoints[5], self.col1, distance_function=edist)
        create_rg(npoints[3], self.col1, distance_function=edist)
        create_rg(npoints[4], self.col1, distance_function=edist)

    def test_rdp(self):
        segments = S(self.col0).segments
        for seg in segments:
            sloc = locs_for(seg, self.col0)
            create_rg(rdp(sloc, 0), self.col2)

        self.assertEqual(self.col2.count(), 8)

    def test_rdp2(self):
        segments = S(self.col1).segments

        for seg in segments:
            sloc = locs_for(seg, self.col1)
            create_rg(rdp(sloc, 0), self.col2)

        self.assertEqual(self.col2.count(), 11)

    def test_rdp3(self):
        segments = S(self.col1).segments
        for seg in segments:
            sloc = locs_for(seg, self.col1)
            create_rg(rdp(sloc, 100000), self.col2)

        self.assertEqual(self.col2.count(), 8)

    def test_anglered(self):
        segments = S(self.col1).segments
        for seg in segments:
            sloc = locs_for(seg, self.col1)
            create_rg(anglereduce(sloc, 1), self.col2)

        self.assertEqual(self.col2.count(), 8)

    def test_anglered2(self):
        segments = S(self.col1).segments
        for seg in segments:
            sloc = locs_for(seg, self.col1)
            create_rg(anglereduce(sloc, 180), self.col2)

        self.assertEqual(self.col2.count(), 11)
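# rdp above is the Ramer-Douglas-Peucker polyline simplification exercised by
# these tests. A minimal reference sketch, assuming 2-D (x, y) points and the
# Euclidean distance used as edist in setUp:
def rdp_sketch(points, epsilon):
    if len(points) < 3:
        return list(points)
    (x1, y1), (x2, y2) = points[0], points[-1]

    def dist(p):
        # perpendicular distance from p to the chord (x1, y1)-(x2, y2)
        px, py = p
        den = ((y2 - y1) ** 2 + (x2 - x1) ** 2) ** 0.5
        if den == 0:
            return ((px - x1) ** 2 + (py - y1) ** 2) ** 0.5
        return abs((y2 - y1) * px - (x2 - x1) * py + x2 * y1 - y2 * x1) / den

    index = max(range(1, len(points) - 1), key=lambda i: dist(points[i]))
    if dist(points[index]) > epsilon:
        # keep the farthest point and recurse on both halves
        return (rdp_sketch(points[:index + 1], epsilon)[:-1] +
                rdp_sketch(points[index:], epsilon))
    return [points[0], points[-1]]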
Example #5
def output_vector(entity_name):
    collection = Connection().jd[entity_name]
    bar = progress_bar(collection.count())
    mentions = get_mentions()
    fout = open(RAW_DATA_DIR + '%s_init_vec.data' % entity_name, 'w')
    fout.write('%d %d\n' % (collection.count(), len(mentions)))
    for index, entity in enumerate(collection.find()):
        reviews = ' '.join(set(map(lambda r: r[1], entity['records'])))
        vector = map(lambda m: reviews.count(m), mentions)
        if numpy.any(vector):
            fout.write(
                '%s %s\n' %
                (entity['_id'], ' '.join(map(lambda d: str(d), vector))))
        bar.draw(index + 1)
Example #6
def generate_name_feature():
    from pymongo import Connection
    lastnames = [
        name.replace('\n', '').decode('utf8') for name in open('./lastname')
    ]
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    corpus = []
    finish_count = 0
    y = []
    for user in users.find():
        name = user['screen_name']
        normal_name = ''
        for n in name:
            if n[0] in lastnames:
                normal_name = n
        if len(normal_name) < 2:
            continue
        corpus.append(normal_name[1:])
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    feature_selection_df(corpus)
def output_user_user_propagate_vectors(order):
    from pymongo import Connection
    all_x=[]
    index=0
    # progress-bar setup
    users=Connection().jd.weibo_users
    vectors=load_user_user_graph_propagate_vector(order)
    total_count=users.count()
    bar=progress_bar(total_count)
    finish_count=0
    uids=[]
    for user in users.find():
        try:
            vector=vectors[int(user['_id'])]
        except (KeyError, ValueError):
            continue
        if not vector.any():
            continue
        #y=get_location_class(user['location'],key_map)
        all_x.append(vector)
        uids.append(user['_id'])
        index+=1
        finish_count+=1
        bar.draw(value=finish_count)
    all_x=numpy.array(all_x)
    dump_user_vector(all_x,uids,'jd_user_user_propagate'+str(order))
def output_description_matrix():
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(min_df=1)
    from pymongo import Connection
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    corpus = []
    finish_count = 0
    y = []
    for user in users.find():
        if 'descriptions' not in user['information']:
            continue
        description = user['information']['descriptions']
        corpus.append(get_str_description(description))
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
        if user['information']['gender'] == 'm':
            y.append(1)
        else:
            y.append(0)
    x = vectorizer.fit_transform(corpus)
    all_data_x = x.toarray()
    all_data_y = numpy.array(y)
    dump_train_valid_test(all_data_x, all_data_y, 'gender_description.data')
def output_goods_class_matrix(order=0):
    from pymongo import Connection
    feature_map={}
    f=open('./features/item_class_order_%d.feature'%order).readlines()
    tmp_feature=[]
    for index,line in enumerate(f):
        tmp_feature.append(line.decode('utf8').split(' ')[0])
    for index,f in enumerate(tmp_feature):
        feature_map[f]=index
    all_x=[]
    index=0
    # progress-bar setup
    users=Connection().jd.weibo_users
    total_count=users.count()
    bar=progress_bar(total_count)
    finish_count=0
    uids=[]
    for user in users.find():
        features=[]
        behaviors=user['behaviors']
        for behavior in behaviors:
            feature=behavior['item_class'][order-1]
            features.append(feature)
        vector=get_one_hot_vector(features,feature_map)
        if not vector.any():
            continue
        all_x.append(vector)
        uids.append(user['_id'])
        index+=1
        finish_count+=1
        bar.draw(value=finish_count)
    all_x=numpy.array(all_x)
    dump_user_vector(all_x,uids,'jd_item_class_order_'+str(order))
    return
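# get_one_hot_vector is not shown on this page. Judging from its callers (a
# dense numpy vector indexed by feature_map and tested with vector.any()), a
# plausible reconstruction is a per-feature count vector:
import numpy

def get_one_hot_vector(features, feature_map):
    vector = numpy.zeros(len(feature_map))
    for f in features:
        if f in feature_map:
            vector[feature_map[f]] += 1
    return vector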
Example #10
def output_shopping_tf_matrix(feature_length=3):
    from pymongo import Connection
    all_x=[]
    index=0
    # progress-bar setup
    users=Connection().jd.weibo_users
    total_count=users.count()
    bar=progress_bar(total_count)
    finish_count=0
    uids=[]
    count_male=0
    for user in users.find():
        vector=numpy.zeros((feature_length))
        tf=dict()
        for behavior in user['behaviors']:
            timestamp=behavior['timestamp']
            tf[timestamp]=tf.get(timestamp,0)+1
        if len(tf)<feature_length:
            continue
        tf=sorted(tf.iteritems(), key=lambda d:d[1], reverse=True)
        for i in range(0,feature_length):
            vector[i]=tf[i][1]
        all_x.append(vector)
        uids.append(user['_id'])
        index+=1
        finish_count+=1
        bar.draw(value=finish_count)
    all_x=numpy.array(all_x)
    # all_y is never collected in this snippet, so dump the unlabeled vectors
    dump_user_vector(all_x,uids,'jd_user_simple')
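# The hand-rolled tf dict above is what collections.Counter provides directly;
# most_common already returns (key, count) pairs sorted by count, descending:
from collections import Counter

def top_timestamp_counts(user, feature_length):
    tf = Counter(b['timestamp'] for b in user['behaviors'])
    return tf.most_common(feature_length)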
Example #11
def output_sentence_embedding_matrix(file_name1,file_name2):
    from pymongo import Connection
    all_x=[]
    index=0
    embedding=doc2vec_embedding(file_name1)
    #embedding=load_doc2vec_embedding(file_name1)
    # progress-bar setup
    users=Connection().jd.weibo_users
    total_count=users.count()
    bar=progress_bar(total_count)
    finish_count=0
    uids=[]
    count_male=0
    for user in users.find():
        try:
            vector=embedding['USER_%d'%user['jd_id']]
        except KeyError:
            continue
        #if y==-1:
        #    continue
        all_x.append(vector)
        uids.append(user['_id'])
        index+=1
        finish_count+=1
        bar.draw(value=finish_count)
    all_x=numpy.array(all_x)
    #return dump_train_valid_test(all_x, all_y, 'jd_user_embedding')
    #dump_user_vector(all_x, all_y, uids, 'jd_user_embedding_with_item_class')
    dump_user_vector(all_x, uids, file_name2)
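# doc2vec_embedding is not shown here, but the 'USER_%d' keys look like gensim
# Doc2Vec document tags. A hypothetical loader using the gensim API (docvecs
# supports tag lookup in gensim < 4; newer versions expose model.dv instead):
def doc2vec_embedding_sketch(file_name):
    from gensim.models import Doc2Vec
    model = Doc2Vec.load(file_name)
    return model.docvecs  # e.g. model.docvecs['USER_42']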
Example #12
def construct_test_set(attribute):
    all_features = get_features(feature_file_name=feature_file_name)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'label2trainset/%s_test.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'label2trainset/%s_test_uids.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except (KeyError, ValueError):
            continue
        if random.random() > balance_params[label]:
            continue
        features = user['mentions']
        #features=Counter(user['products'])
        #features=combine_features(user['mentions'],Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
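# The ' label index:value ...' lines written above are the sparse
# SVMlight/libsvm format, so scikit-learn can read the file back directly:
def load_test_set(attribute):
    from sklearn.datasets import load_svmlight_file
    path = RAW_DATA_DIR + 'label2trainset/%s_test.data' % attribute
    x, y = load_svmlight_file(path)  # x is a scipy.sparse matrix, y the labels
    return x, y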
Example #13
def construct_test_set(attribute):
    all_features = get_features(feature_file_name=base_dir +
                                '/features/mention.feature')
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(self_training_file_dir + 'test_%s.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        features = dict(Counter(user['products']))
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        try:
            label = user['profile'][attribute].index(1)
        except (KeyError, ValueError):
            continue
        if random.random() > balance_params[label]:
            continue
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
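# get_balance_params is not listed on this page, but statistics_after_train
# further down computes the same quantity inline: for each label, the sampling
# probability that downsamples every class to the size of the rarest one.
def get_balance_params_sketch(attribute, collection):
    from collections import Counter
    counts = Counter()
    for user in collection.find():
        try:
            counts[user['profile'][attribute].index(1)] += 1
        except (KeyError, ValueError):
            continue
    smallest = min(counts.values())
    return dict((label, 1.0 * smallest / c) for label, c in counts.items())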
Example #14
def construct_train_set(attribute, training_count):
    '''
    The format of labeled_feature_file is the same as mallet's.
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    labeled_feature_file = open('%s/review_constraint_%s.constraints' %
                                (labeled_feature_file_dir, attribute))
    labeled_features = dict()
    for line in labeled_feature_file:
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])
    collection = Connection().jd.train_users

    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        label_distributed = [1, 1]
        for f, value in user['mentions'].items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        s = 1.0 * sum(label_distributed)
        label_distributed[0] /= s
        label_distributed[1] /= s
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            continue
        features = user['mentions']
        #features=Counter(user['products'])
        #features=combine_features(user['mentions'],Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature))
        confidence.append(
            (user['_id'], label,
             abs(label_distributed[0] - label_distributed[1]), str_features))
        bar.draw(index + 1)

    #confidence=sorted(confidence,key=lambda d:d[2],reverse=True)
    confidence0 = filter(lambda d: d[1] == 0, confidence)
    confidence1 = filter(lambda d: d[1] == 1, confidence)
    #dimention=min(len(confidence0),len(confidence1),training_count/2)
    #confidence0=confidence0#[:dimention]
    #confidence1=confidence1#[:dimention]
    print len(confidence0), len(confidence1)
    fout = open(RAW_DATA_DIR + 'label2trainset/%s_train.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'label2trainset/%s_train_uids.data' % attribute, 'w')
    #for d in confidence0+confidence1:
    for d in confidence:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
Example #15
def construct_all_data():
    '''
    The format of labeled_feature_file is the same as mallet's.
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'label2trainset/all_train.data', 'w')
    uid_output = open(RAW_DATA_DIR + 'label2trainset/all_train_uids.data', 'w')
    for index, user in enumerate(collection.find()):
        features = dict(Counter(user['products']))
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        label = 0
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
Example #16
def construct_test_data(attribute):
    collection = Connection().jd.test_users
    all_features = get_features(feature_file_name=feature_file_name)
    review_features = get_features(feature_file_name=base_dir +
                                   '/features/review.feature',
                                   start_index=max(all_features.values()) + 1)
    data = []
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        features = combine_features(user['mentions'],
                                    Counter(user['products']))
        try:
            y = user['profile'][attribute].index(1)
        except (KeyError, ValueError):
            continue
        x = dict()
        for f in features:
            if f not in all_features:
                continue
            x[all_features[f]] = features[f]

        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]] = v
        data.append([uid, y, x])
        bar.draw(index + 1)
    #data=balance(data,target_index=1)
    output(RAW_DATA_DIR + 'mallet/mallet_test_%s.data' % attribute, data)
def construct_all_data():
    '''
    The format of labeled_feature_file is the same as mallet's.
    '''
    all_features=get_features(feature_file_name=feature_file_name)
    all_features_1=get_features(feature_file_name=base_dir+'/features/mention_1.feature',start_index=max(all_features.values())+1)
    collection=Connection().jd.train_users
    bar=progress_bar(collection.count())
    fout=open(RAW_DATA_DIR+'iterate_label2trainset/all_train.data','w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/all_train_uids.data','w')
    for index,user in enumerate(collection.find()):
        label=0
        fout.write('%d'%label)
        uid_output.write('%s\n'%user['_id'])
        features=combine_features(user['mentions_1'],Counter(user['products']))
        sorted_feature=[]
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f],features[f]))
        for f,v in user['mentions_1_1'].items():
            f=f+'_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f],v))
        sorted_feature=sorted(sorted_feature,key=lambda d:d[0])
        for f in sorted_feature:
            fout.write(' %s:%d'%f)
        fout.write('\n')
        bar.draw(index+1)
Example #19
def construct_train_data():
    import random
    all_features = get_features(feature_file_name=feature_file_name)
    review_features = get_features(feature_file_name=base_dir +
                                   '/features/review.feature',
                                   start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    data = []
    uids = get_test_uids()
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        if uid in uids:
            continue
        features = combine_features(user['mentions'],
                                    Counter(user['products']))
        x = dict()
        for f in features:
            if f not in all_features:
                continue
            x[all_features[f]] = features[f]
        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            x[review_features[f]] = v
        y = random.randint(0, 1)  # placeholder label; the output format requires one
        data.append([uid, y, x])
        bar.draw(index + 1)
    output(RAW_DATA_DIR + 'mallet/mallet_train.data', data)
Example #23
def construct_all_data():
    '''
    The format of labeled_feature_file is the same as mallet's.
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(feature_file_name=base_dir +
                                  '/features/mention_1.feature',
                                  start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'mylabel2trainset/all_train.data', 'w')
    uid_output = open(RAW_DATA_DIR + 'mylabel2trainset/all_train_uids.data',
                      'w')
    for index, user in enumerate(collection.find()):
        label = 0
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        features = combine_features(user['mentions_1'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
Example #24
def get_tf():
    from helper import get_progressive_bar
    from pymongo import Connection
    users=Connection().user_profilling.users
    lastnames=[name.replace('\n','').decode('utf8') for name in open('./lastname')]
    bar=get_progressive_bar(users.count())
    finish_count=0
    tf=dict()
    for user in users.find():
        name=user['screen_name']
        finish_count+=1
        for n in name:
            # keep only 2-3 character names starting with a known last name
            # (the original 'len(n)>3 and len(n)<2' could never be True)
            if n[0] not in lastnames or len(n)>3 or len(n)<2:
                continue
            if user['information']['gender']=='m':
                gender=1
            else:
                gender=0
            for w in n[1:]:
                if w not in tf:
                    tf[w]=[0,0]
                tf[w][gender]+=1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    return tf
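# tf maps a given-name character to a [non-male, male] count pair (gender
# index 0 is the non-male bucket above); output_name_matrix_of_two_words later
# turns this into a per-character probability feature:
def p_not_male(tf, char):
    counts = tf[char]  # raises KeyError for characters never seen in training
    return 1.0 * counts[0] / sum(counts)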
Example #25
def output_review_star_matrix(feature_length=1000):
    from pymongo import Connection
    feature_map={}
    for i in range(0,6):
        feature_map[str(i)]=i
    all_x=[]
    index=0
    # progress-bar setup
    users=Connection().jd.weibo_users
    total_count=users.count()
    bar=progress_bar(total_count)
    finish_count=0
    uids=[]
    for user in users.find():
        features=[]
        for behavior in user['behaviors']:
            feature=str(behavior['review']['review_stars'])
            features.append(feature)
        if not features:
            continue
        vector=get_one_hot_vector(features, feature_map)
        if not vector.any():
            continue
        #y=get_location_class(user['location'],key_map)
        all_x.append(vector)
        uids.append(user['_id'])
        index+=1
        finish_count+=1
        bar.draw(value=finish_count)
    all_x=numpy.array(all_x)
    dump_user_vector(all_x,uids,'jd_review_star')
Example #27
def output_review_embedding_matrix():
    from helper import get_mentions
    from pymongo import Connection
    from my_vector_reader import read_vectors
    all_x=[]
    # progress-bar setup
    users=Connection().jd.weibo_users
    bar=progress_bar(users.count())
    finish_count=0
    uids=[]
    mentions=get_mentions()
    #review_vocab,review_embedding=read_vectors('/mnt/data1/adoni/jd_data/vectors/word_vectors.data','utf8')
    review_vocab,review_embedding=read_vectors('../myword2vec/word_vectors.data','utf8')
    mentions=filter(lambda d:d in review_vocab,mentions)
    mention_embedding=map(lambda x:review_embedding[review_vocab.index(x)],mentions)
    vector_size=len(mention_embedding[0])
    for user in users.find():
        x=numpy.zeros(vector_size)
        review=' '.join(map(lambda d:d['review']['review_general'],user['behaviors']))
        for index,mention in enumerate(mentions):
            count=review.count(mention)
            x+=count*mention_embedding[index]
        if not x.any():
            continue
        all_x.append(x)
        uids.append(user['_id'])
        finish_count+=1
        bar.draw(value=finish_count)
    all_x=numpy.array(all_x)
    dump_user_vector(all_x,uids,'user_review_embedding')
Example #28
def age_distribute():
    from small_utils.progress_bar import progress_bar
    from pymongo import Connection
    from collections import Counter
    collection=Connection().jd.test_users
    weibo_collection=Connection().jd.weibo_users
    linked_jd_ids=dict()
    ages=[]
    for line in open('/mnt/data1/adoni/data/linked_uids.data'):
        linked_jd_ids[line[:-1].split(' ')[1]]=line.split(' ')[0]
    bar=progress_bar(collection.count())
    for index,user in enumerate(collection.find()):
        if sum(user['profile']['age'])==0:
            continue
        weibo_id=linked_jd_ids[user['_id']]
        weibo_user=weibo_collection.find_one({'_id':weibo_id})
        if weibo_user is None:
            continue
        age=2015-int(weibo_user['birthday'].split(u'年')[0])
        if age>50 or age<10:
            continue
        ages.append(age)
        if age<30:
            user['profile']['age']=[1,0]
        else:
            user['profile']['age']=[0,1]
        collection.update({'_id':user['_id']},{'$set':{'profile':user['profile']}})
        bar.draw(index)
    s=sum(Counter(ages).values())
    ages=sorted(Counter(ages).items(),key=lambda d:d[0])
    ss=0.
    for age in ages:
        ss+=age[1]
        print age[0],(ss)/s
Example #30
def output_simple_matrix(feature_length=10000):
    from pymongo import Connection
    from collections import Counter
    feature_map={}
    f=open('./features/product.feature').readlines()
    for i in range(0,len(f)):
        if feature_length is not None and i>=feature_length:
            break
        feature_map[f[i].decode('utf8').split(' ')[0]]=i
    all_x=[]
    index=0
    # progress-bar setup
    users=Connection().jd.weibo_users
    total_count=users.count()
    bar=progress_bar(total_count)
    finish_count=0
    uids=[]
    for user in users.find():
        features=[]
        for behavior in user['behaviors']:
            feature=str(int(behavior['item']))
            features.append(feature)
        vector=get_one_hot_light_vector(features, feature_map)
        if len(vector)==0:
            continue
        all_x.append(vector)
        uids.append(user['_id'])
        index+=1
        finish_count+=1
        bar.draw(value=finish_count)
    all_x=numpy.array(all_x)
    dump_user_vector(all_x,uids,'jd_user_simple',dimention=len(feature_map))
    return all_x,uids
def output_name_matrix_of_two_words():
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [
        name.replace('\n', '').decode('utf8') for name in open('./lastname')
    ]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = pickle.load(open('./tf.data'))
    x = []
    y = []
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        if finish_count > 5000:
            break
        for n in name:
            # n[1] and n[2] below require exactly a three-character name
            # (the original 'len(n) > 3 and len(n) < 3' could never be True)
            if n[0] not in lastnames or len(n) != 3:
                continue
            try:
                x0 = 1.0 * tf[n[1]][0] / sum(tf[n[1]])
                x1 = 1.0 * tf[n[2]][0] / sum(tf[n[2]])
            except (KeyError, ZeroDivisionError):
                continue
            if user['information']['gender'] == 'm':
                y.append(1)
            else:
                y.append(0)
            x.append([x0, x1])
        bar.cursor.restore()
        bar.draw(value=finish_count)
    dump_train_valid_test(x, y, 'gender_name_simple.data')
def construct_mallet_data(profile_key):
    from pymongo import Connection
    from my_progress_bar import progress_bar
    from collections import Counter
    users=Connection().jd.weibo_users
    bar=progress_bar(users.count())
    fout=open(MATRIXES_DIR+'mallet/construced_data.mallet','w')
    data=[]
    for index,user in enumerate(users.find()):
        try:
            label=user['profile'][profile_key].index(1)
        except (KeyError, ValueError):
            continue
        reviews=[]
        for behavior in user['behaviors']:
            #reviews.append('Pro'+str(behavior['item']))
            reviews+=behavior['parsed_review']['review_general']
        reviews=Counter(reviews)
        reviews=' '.join(map(lambda word:'%s:%d'%(word,reviews[word]),reviews.keys()))
        line='%s %d %s\n'%(user['_id'],label,reviews)
        data.append((label,line))
    balanced_data=balance(data,target_index=0)
    for label,line in balanced_data:
        fout.write(line.encode('utf8'))
        bar.draw(index)
Example #33
def output_user_product_graph():
    fout = open(RAW_DATA_DIR + 'graph.data', 'w')
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        for pid in user['products']:
            fout.write('%s %s\n' % (uid, pid))
        bar.draw(index + 1)

    collection = Connection().jd.test_users
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        for pid in user['products']:
            fout.write('%s %s\n' % (uid, pid))
        bar.draw(index + 1)
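# graph.data is a plain 'uid pid' edge list; the LINE and deepwalk vectors used
# elsewhere on this page are presumably trained from it. For inspection it also
# loads directly as a bipartite graph, e.g. with networkx:
def load_user_product_graph():
    import networkx as nx
    return nx.read_edgelist(RAW_DATA_DIR + 'graph.data')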
Example #34
def get_train_user_products():
    collection=Connection().jd.train_users
    bar=progress_bar(collection.count())
    user_products=dict()
    for index,user in enumerate(collection.find()):
        user_products[user['_id']]=dict(Counter(user['products']))
        #user_products[user['_id']]=user['mentions']
        bar.draw(index)
    return user_products
def construct_test_set(attribute):
    all_features = get_features(feature_file=feature_file_name)
    all_features_1 = get_features(feature_file=base_dir +
                                  '/features/mention_1.feature',
                                  existent_features=all_features)
    review_features = get_features(feature_file=base_dir +
                                   '/features/review.feature',
                                   existent_features=all_features_1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test_uids.data' % attribute,
        'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except (KeyError, ValueError):
            continue
        if random.random() > balance_params[label]:
            continue

        features = combine_features(user['mentions_0'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))

        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))

        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            sorted_feature.append((review_features[f], v))

        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
Example #36
def statistics(attribute,
               threshold=-1,
               feature_file_name=base_dir + '/features/mention.feature',
               show=False):
    import random
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    distribute = dict([f, [0., 0.]] for f in all_features)
    labels_distribute = [0., 0.]
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except (KeyError, ValueError):
            continue
        #if random.random()>balance_params[label]:
        #    continue
        features = dict(user['mentions'])
        products = Counter(user['products'])
        for p in products:
            features[p] = products[p]
        if len(features) < 10:
            continue
        for f in features:
            if f in distribute:
                distribute[f][label] += 1  #features[f]
        labels_distribute[label] += 1
        bar.draw(index)
    for f in distribute.keys():
        if sum(distribute[f]) < threshold:
            distribute.pop(f)
    print labels_distribute
    for f in distribute:
        distribute[f][0] /= labels_distribute[0]
        distribute[f][1] /= labels_distribute[1]
    for f in distribute:
        s = sum(distribute[f])
        distribute[f][0] /= s
        distribute[f][1] /= s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute.items())
    distribute = sorted(distribute.items(),
                        key=lambda d: abs(1 - 2 * (d[1][0] + 0.1) /
                                          (sum(d[1]) + 0.1)),
                        reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f' % (
            d[0].encode('utf8'),
            (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
            1 - (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
        )
Example #37
def output_mention_and_count(attribute):
    from pymongo import Connection
    from my_progress_bar import progress_bar
    from collections import Counter

    mentions=get_mentions()
    mentions=dict(zip(mentions,[[0,0,0] for mention in mentions]))
    users=Connection().jd.weibo_users
    bar=progress_bar(users.count())
    values=[]
    for index,user in enumerate(users.find()):
        bar.draw(index)
        try:
            f=user['profile'][attribute].index(1)
        except (KeyError, ValueError):
            continue
        values.append(f)
    values=Counter(values)
    min_value=min(values.values())
    for key in values:
        values[key]=min_value*1.0/values[key]
    print values

    bar=progress_bar(users.count())
    for index,user in enumerate(users.find()):
        try:
            f=user['profile'][attribute].index(1)
        except (KeyError, ValueError):
            continue
        for behavior in user['behaviors']:
            for w in behavior['parsed_review']['review_general']:
                if w in mentions:
                    mentions[w][f]+=1
        bar.draw(index)
    mentions=sorted(mentions.items(),key=lambda d:sum(d[1]),reverse=True)
    print ''
    for m in mentions:
        if sum(m[1])<1000:
            break
        for i in range(len(m[1])):
            m[1][i]='%0.1f'%(m[1][i]*values[i])
        print m[0].encode('utf8'),m[1]
def output_text_matrix_from_bag_of_words():
    from pymongo import Connection
    words = {}
    f = open('./word.feature').readlines()
    for i in range(0, len(f)):
        words[f[i].decode('utf8')[0:-1]] = i
    all_data_x = []
    all_data_y = []
    index = 0
    # progress-bar setup
    users = Connection().user_profilling.users
    total_count = users.count()
    bar = get_progressive_bar(total_count)
    finish_count = 0
    #for line in open('./users.data'):
    uids = []
    for user in users.find():
        #user=parse_user(line)
        correct_status = 0
        for status in user['statuses']:
            if not is_not_good_status(status):
                correct_status += 1
        if correct_status < 50:
            continue
        text = numpy.zeros((len(words)))
        for status in user['statuses']:
            if is_not_good_status(status):
                continue
            for word in status['text']:
                if word not in words:
                    continue
                text[words[word]] += 1.0
        if not text.any():
            continue
        text_vector = text
        if user['information']['gender'] == 'm':
            all_data_y.append(1)
        else:
            all_data_y.append(0)
        all_data_x.append(text_vector)
        uids.append(user['information']['uid'])
        index += 1
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    all_data_x = numpy.array(all_data_x)
    all_data_y = numpy.array(all_data_y)
    #dump_train_valid_test(all_data_x,all_data_y,'gender_text_bag_of_words.data')
    dump_user_vector(all_data_x, all_data_y, uids, 'user_text_bag_words.data')
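# The manual bag-of-words loop above mirrors what CountVectorizer does in
# output_description_matrix; pinning vocabulary to the existing word->index
# map keeps the same column order (texts are whitespace-joined token strings):
def bag_of_words(texts, words):
    from sklearn.feature_extraction.text import CountVectorizer
    # token_pattern keeps single-character tokens, which the default drops
    vectorizer = CountVectorizer(vocabulary=words, token_pattern=r'(?u)\S+')
    return vectorizer.fit_transform(texts)  # sparse (n_texts, len(words)) counts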
def output_all_shopping_items():
    users=Connection().jd.weibo_users
    all_items=[]
    bar=progress_bar(users.count())
    for index,user in enumerate(users.find()):
        for behavior in user['behaviors']:
            all_items.append(behavior['item'])
        bar.draw(index+1)
    all_items=Counter(all_items)
    all_items=sorted(all_items.items(), key=lambda d:d[1], reverse=True)
    fout=open('./review.feature','w')
    for word in all_items:
        fout.write('%d %d\n'%(word[0],word[1]))
Example #41
def construct_test_set(attribute):
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(feature_file_name=base_dir +
                                  '/features/mention_1.feature',
                                  start_index=max(all_features.values()) + 1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'multi_clf/%s_test.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except (KeyError, ValueError):
            continue
        if random.random() > balance_params[label]:
            continue
        features = {}
        #features=user['mentions_0']
        #features=Counter(user['products'])
        features = combine_features(user['mentions_0'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        user['mentions_1_1'] = {}  # emptied here, so the mentions_1 loop below never runs
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))
        if 'user_product_vector_from_deepwalk' in user:
            #if False:
            start_index = max(all_features_1.values()) + 1
            for i, v in enumerate(user['user_product_vector_from_deepwalk']):
                v = abs(v)
                sorted_feature.append((i + start_index, v))

        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
Example #42
def output_features(fname, key):
    fout = open(fname, 'w')
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    features = []
    for index, user in enumerate(collection.find()):
        features += user[key]
        bar.draw(index + 1)
    features = sorted(Counter(features).items(),
                      key=lambda d: d[1],
                      reverse=True)
    for f in features:
        fout.write('%s %d\n' % (f[0].encode('utf8'), f[1]))
Example #43
def output_review_matrix(order,feature_length=1000):
    from pymongo import Connection
    feature_map1={}
    feature_map2={}
    index1=0
    index2=0
    for line in open('./features/review.feature'):
        line=line.decode('utf8')
        line=line.replace('\n','').replace('\r','').split(':')[1].split(',')
        for feature in line:
            feature_map1[feature]=index1
            feature_map2[feature]=index2
            index2+=1
        index1+=1
    print index1
    print index2
    all_x=[]
    index=0
    # progress-bar setup
    users=Connection().jd.weibo_users
    total_count=users.count()
    bar=progress_bar(total_count)
    finish_count=0
    uids=[]
    for user in users.find():
        features=[]
        for behavior in user['behaviors']:
            review=behavior['review']
            review=review['review_title']+review['review_general']
            for feature in feature_map1:
                if feature in review:
                    features.append(feature)
        if not features:
            continue
        if order==1:
            vector=get_one_hot_vector(features, feature_map1)
        else:
            vector=get_one_hot_vector(features, feature_map2)
        #if sum(vector)<10:
        #    continue
        #if not vector.any():
        #    continue
        #y=get_location_class(user['location'],key_map)
        all_x.append(vector)
        uids.append(user['_id'])
        index+=1
        finish_count+=1
        bar.draw(value=finish_count)
    all_x=numpy.array(all_x)
    dump_user_vector(all_x,uids,'jd_review%d'%order)
Example #44
class CombineTest(unittest2.TestCase):
    def setUp(self):
        self.col0 = Connection("127.0.0.1", 27017)["algolab-test"]["rg0"]
        self.col1 = Connection("127.0.0.1", 27017)["algolab-test"]["rg1"]
        self.col0.drop()
        self.col1.drop()

    def create_rg_for(self, datasets, col=None):
        for n in datasets:
            create_rg(npoints[n], col if col else self.col0, distance_function=edist)

    def test_simple_combine(self):
        self.create_rg_for([12, 13])
        anglecombine(self.col0, 20)
        self.assertEqual(self.col0.count(), 7)
        self.assertDictContainsSubset({
            0: [1, 1],
            1: [2, 1],
            2: [3.0, 1.15],
            3: [4.0, 1.15],
            4: [5.0, 1.15],
            5: [6, 1],
            6: [7, 1]}, {n["_id"]: n["loc"] for n in self.col0.find()})

    def test_combine_2switches(self):
        self.create_rg_for([12, 13, 15])
        anglecombine(self.col0, 20)
        self.assertEqual(self.col0.count(), 8)
        self.assertDictContainsSubset({
            0: [1, 1],
            1: [2, 1],
            2: [3.0, 1.15],
            3: [4.0, 1.15],
            4: [5.0, 1.15],
            5: [6, 1],
            12: [4, 4],
            6: [7, 1]}, {n["_id"]: n["loc"] for n in self.col0.find()})
Example #45
def statistics_after_train(attribute,method,threshold=-1,feature_file_name=base_dir+'/features/mention.feature',show=False):
    import random
    labels=get_labels_after_train(attribute,method)
    print len(labels)
    collection=Connection().jd.train_users
    label_distribute=Counter(labels.values())
    balance_params=dict()
    for label in label_distribute:
        balance_params[label]=1.0*min(label_distribute.values())/label_distribute[label]
    all_features=get_features(feature_file_name)
    bar=progress_bar(collection.count())
    distribute=dict([f,[0.,0.]] for f in all_features)
    for index,user in enumerate(collection.find()):
        try:
            label=labels[user['_id']]
        except KeyError:
            continue
        #if random.random()>balance_params[label]:
        #    continue
        features=dict(user['mentions'])
        products=Counter(user['products'])
        for p in products:
            features[p]=products[p]
        for f in features:
            if f in distribute:
                distribute[f][label]+=1
        bar.draw(index)
    for f in distribute.keys():
        if sum(distribute[f])<threshold:
            distribute.pop(f)
    print label_distribute
    for f in distribute:
        distribute[f][0]/=label_distribute[0]
        distribute[f][1]/=label_distribute[1]
    for f in distribute.keys():
        s=sum(distribute[f])
        if s==0:
            distribute.pop(f)
            continue
        distribute[f][0]/=s
        distribute[f][1]/=s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute)
    distribute=sorted(distribute.items(),key=lambda d:max(d[1])/sum(d[1]), reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f'%(d[0].encode('utf8'), (d[1][0]+0.1)/(sum(d[1])+0.1),1-(d[1][0]+0.1)/(sum(d[1])+0.1),)
Example #46
def construct_test_set(attribute):
    all_features=get_features(feature_file_name=feature_file_name)
    all_features_1=get_features(feature_file_name=base_dir+'/features/mention_1.feature',start_index=max(all_features.values())+1)
    collection=Connection().jd.test_users
    balance_params=get_balance_params(attribute,collection)
    print balance_params
    bar=progress_bar(collection.count())
    fout=open(RAW_DATA_DIR+'multi_clf/%s_test.data'%attribute,'w')
    for index,user in enumerate(collection.find()):
        try:
            label=user['profile'][attribute].index(1)
        except Exception as e:
            continue
        if random.random()>balance_params[label]:
            continue
        features={}
        #features=user['mentions_0']
        #features=Counter(user['products'])
        features=combine_features(user['mentions_0'],Counter(user['products']))
        sorted_feature=[]
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f],features[f]))
        user['mentions_1_1']={}
        for f,v in user['mentions_1_1'].items():
            f=f+'_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f],v))
        if 'user_product_vector_from_deepwalk' in user:
        #if False:
            start_index=max(all_features_1.values())+1
            for i,v in enumerate(user['user_product_vector_from_deepwalk']):
                v=abs(v)
                sorted_feature.append((i+start_index,v))

        if len(sorted_feature)==0:
            continue
        fout.write('%d'%label)
        keys=map(lambda d:d[0], sorted_feature)
        if not len(keys)==len(set(keys)):
            print Counter(keys).values()
        sorted_feature=sorted(sorted_feature,key=lambda d:d[0])
        for f in sorted_feature:
            fout.write(' %s:%f'%f)
        fout.write('\n')
        bar.draw(index+1)
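For reference, each line written above follows the libsvm/SVMlight convention: the integer label, then space-separated index:value pairs in ascending feature index. A hypothetical output line (feature ids and values invented):

# 1 3:2.000000 17:0.500000 251:0.031400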
def construct_test_set(attribute):
    all_features=get_features(feature_file=feature_file_name)
    all_features_1=get_features(feature_file=base_dir+'/features/mention_1.feature',existent_features=all_features)
    review_features=get_features(feature_file=base_dir+'/features/review.feature',existent_features=all_features_1)
    collection=Connection().jd.test_users
    balance_params=get_balance_params(attribute,collection)
    print balance_params
    bar=progress_bar(collection.count())
    fout=open(RAW_DATA_DIR+'iterate_label2trainset/%s_test.data'%attribute,'w')
    uid_output=open(RAW_DATA_DIR+'iterate_label2trainset/%s_test_uids.data'%attribute,'w')
    for index,user in enumerate(collection.find()):
        try:
            label=user['profile'][attribute].index(1)
        except (KeyError,ValueError):
            # profile missing, or the attribute is not set for this user
            continue
        if random.random()>balance_params[label]:
            continue

        features=combine_features(user['mentions_0'],Counter(user['products']))
        sorted_feature=[]
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f],features[f]))

        for f,v in user['mentions_1_1'].items():
            f=f+'_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f],v))

        for f,v in Counter(user['review']).items():
            if f not in review_features:
                continue
            sorted_feature.append((review_features[f],v))

        if len(sorted_feature)==0:
            continue
        fout.write('%d'%label)
        uid_output.write('%s\n'%user['_id'])
        keys=map(lambda d:d[0], sorted_feature)
        if not len(keys)==len(set(keys)):
            print Counter(keys).values()
        sorted_feature=sorted(sorted_feature,key=lambda d:d[0])
        for f in sorted_feature:
            fout.write(' %s:%f'%f)
        fout.write('\n')
        bar.draw(index+1)
def output_all_item_classes(order):
    users=Connection().jd.weibo_users
    all_items=[]
    bar=progress_bar(users.count())
    for index,user in enumerate(users.find()):
        for behavior in user['behaviors']:
            try:
                all_items.append(behavior['item_class'][order])
            except (KeyError,IndexError):
                # behavior lacks item_class, or it is shallower than `order`
                continue
        bar.draw(index+1)
    all_items=Counter(all_items)
    all_items=sorted(all_items.items(), key=lambda d:d[1], reverse=True)
    fout=open('./item_class_order_%d.feature'%order,'w')
    for word in all_items:
        fout.write('%s %d\n'%(word[0].encode('utf8'),word[1]))
Example #49
def statistics(attribute,threshold=-1,feature_file_name=base_dir+'/features/mention.feature',show=False):
    import random
    collection=Connection().jd.test_users
    balance_params=get_balance_params(attribute,collection)
    all_features=get_features(feature_file_name)
    bar=progress_bar(collection.count())
    distribute=dict([f,[0.,0.]] for f in all_features)
    labels_distribute=[0.,0.]
    for index,user in enumerate(collection.find()):
        try:
            label=user['profile'][attribute].index(1)
        except (KeyError,ValueError):
            continue
        #if random.random()>balance_params[label]:
        #    continue
        features=dict(user['mentions'])
        products=Counter(user['products'])
        for p in products:
            features[p]=products[p]
        if len(features)<10:
            continue
        for f in features:
            if f in distribute:
                distribute[f][label]+=1#features[f]
        labels_distribute[label]+=1
        bar.draw(index)
    for f in distribute.keys():
        if sum(distribute[f])<threshold:
            distribute.pop(f)
    print labels_distribute
    for f in distribute:
        distribute[f][0]/=labels_distribute[0]
        distribute[f][1]/=labels_distribute[1]
    for f in distribute.keys():
        s=sum(distribute[f])
        if s==0:
            # a zero row would divide by zero below
            distribute.pop(f)
            continue
        distribute[f][0]/=s
        distribute[f][1]/=s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute.items())
    distribute=sorted(distribute.items(),key=lambda d:abs(1-2*(d[1][0]+0.1)/(sum(d[1])+0.1)), reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f'%(d[0].encode('utf8'), (d[1][0]+0.1)/(sum(d[1])+0.1),1-(d[1][0]+0.1)/(sum(d[1])+0.1),)
Example #50
def test(attribute):
    from pymongo import Connection
    collection=Connection().jd.test_users
    bar=progress_bar(collection.count())
    labels=dict()
    for index,user in enumerate(collection.find()):
        try:
            label=user['profile'][attribute].index(1)
        except (KeyError,ValueError):
            continue
        labels[user['_id']]=label
        bar.draw(index+1)
        if index>100000:
            break
    score,feature_distribute=statistics(labels,feature_file_name=base_dir+'/features/mention.feature',threshold=20)
    for f,v in sorted(score.items(),key=lambda d:d[1],reverse=True)[:50]:
        print f,'0:%0.2f 1:%0.2f'%tuple(feature_distribute[f])
    print feature_distribute[u'同学']
def output_name_matrix():
    from sklearn.feature_extraction.text import CountVectorizer
    import numpy
    lastnames = [
        name.replace('\n', '').decode('utf8') for name in open('./lastname')
    ]
    vectorizer = CountVectorizer(analyzer='char_wb',
                                 ngram_range=(1, 3),
                                 min_df=1)
    from pymongo import Connection
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    corpus = []
    finish_count = 0
    uids = []
    y = []
    for user in users.find():
        #if finish_count>1000:
        #    break
        name = user['screen_name']
        normal_name = []
        for n in name:
            if n[0] in lastnames:
                normal_name.append(n[1:])
            else:
                continue
                #normal_name.append(n)
        corpus.append(' '.join(normal_name))
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
        if user['information']['gender'] == 'm':
            y.append(1)
        else:
            y.append(0)
        uids.append(user['information']['uid'])
    x = vectorizer.fit_transform(corpus)
    fe = vectorizer.get_feature_names()
    for f in fe:
        print f.encode('utf8')
    all_data_x = x.toarray()
    all_data_y = numpy.array(y)
    #dump_train_valid_test(all_data_x,all_data_y,'gender_name.data')
    dump_user_vector(all_data_x, all_data_y, uids, 'user_name_vector.data')
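If the char_wb analyzer is unfamiliar, a minimal self-contained sketch (toy corpus, not the real name data) shows what it extracts:

from sklearn.feature_extraction.text import CountVectorizer
toy = CountVectorizer(analyzer='char_wb', ngram_range=(1, 3), min_df=1)
m = toy.fit_transform([u'ab cd'])
print toy.get_feature_names()  # space-padded character n-grams, e.g. u' a', u'a', u'ab', u'ab '
print m.toarray()  # one row per document, one column per n-gram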
def plot():
    from matplotlib import pyplot as plt
    x_m = []
    y_m = []
    x_f = []
    y_f = []
    from helper import get_progressive_bar
    from pymongo import Connection
    import pickle
    users = Connection().user_profilling.users
    lastnames = [
        name.replace('\n', '').decode('utf8') for name in open('./lastname')
    ]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = pickle.load(open('./tf.data'))
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        if finish_count > 5000:
            break
        for n in name:
            # keep only three-character names: surname + two given-name characters
            if n[0] not in lastnames or len(n) != 3:
                continue
            try:
                x = 1.0 * tf[n[1]][0] / sum(tf[n[1]])
                y = 1.0 * tf[n[2]][0] / sum(tf[n[2]])
            except (KeyError, ZeroDivisionError):
                continue
            if user['information']['gender'] == 'm':
                x_m.append(x)
                y_m.append(y)
            else:
                x_f.append(x)
                y_f.append(y)
        bar.cursor.restore()
        bar.draw(value=finish_count)
    plt.scatter(x_m, y_m, c='red', label='Male', alpha=0.3)
    plt.scatter(x_f, y_f, c='green', label='Female', alpha=0.3)
    plt.legend()
    plt.grid(True)
    plt.show()
Example #53
def test(attribute):
    from pymongo import Connection
    collection = Connection().jd.test_users
    bar = progress_bar(collection.count())
    labels = dict()
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except (KeyError, ValueError):
            continue
        labels[user['_id']] = label
        bar.draw(index + 1)
        if index > 100000:
            break
    score, feature_distribute = statistics(labels,
                                           feature_file_name=base_dir +
                                           '/features/mention.feature',
                                           threshold=20)
    for f, v in sorted(score.items(), key=lambda d: d[1], reverse=True)[:50]:
        print f, '0:%0.2f 1:%0.2f' % tuple(feature_distribute[f])
    print feature_distribute[u'同学']
Example #54
def analyze_feature_count(attribute):
    from small_utils.progress_bar import progress_bar
    from pymongo import Connection
    from collections import Counter
    collection=Connection().jd.test_users
    bar=progress_bar(collection.count())
    x=[]
    y=[]
    labels=[]
    for index,user in enumerate(collection.find()):
        try:
            label=user['profile'][attribute].index(1)
        except (KeyError,ValueError):
            continue
        labels.append(label)
        x.append(len(user['products']))
        y.append(len(user['mentions']))
        bar.draw(index)
    f=open('./tmp.data','w')
    for i in xrange(len(labels)):
        f.write('%d %d %d\n'%(labels[i],x[i],y[i]))
    print Counter(labels)
Example #55
def construct_train_set(labeled_features, training_count):
    '''
    The format of labeled_feature_file is the same as mallet's
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(feature_file_name=base_dir +
                                  '/features/mention_1.feature',
                                  start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users

    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        label_distributed = [1, 1]
        for f, value in combine_features(user['mentions'],
                                         Counter(user['products'])).items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        s = 1.0 * sum(label_distributed)
        if not s == 0:
            label_distributed[0] /= s
            label_distributed[1] /= s
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            label = -1
        #features=user['mentions_0']
        #features=Counter(user['products'])
        features = combine_features(user['mentions_0'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        user['mentions_1_1'] = {}  # wiped on purpose: disables the *_1 features below
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))

        if 'user_product_vector_from_deepwalk' in user:
            #if False:
            start_index = max(all_features_1.values()) + 1
            for i, v in enumerate(user['user_product_vector_from_deepwalk']):
                v = abs(v)
                sorted_feature.append((i + start_index, v))

        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature))
        confidence.append((
            user['_id'],
            label,
            abs(label_distributed[0] - label_distributed[1]),
            str_features,
            sum(user['mentions'].values()),
        ))
        bar.draw(index + 1)

    confidence0 = filter(lambda d: d[1] == 0, confidence)
    confidence0 = sorted(confidence0, key=lambda d: d[2], reverse=True)
    confidence1 = filter(lambda d: d[1] == 1, confidence)
    confidence1 = sorted(confidence1, key=lambda d: d[2], reverse=True)
    confidence2 = filter(lambda d: d[1] == -1, confidence)
    confidence2 = sorted(confidence2, key=lambda d: d[4], reverse=True)

    dimension = min(len(confidence0), len(confidence1), training_count / 2)
    confidence0 = confidence0[:dimension]
    confidence1 = confidence1[:dimension]
    confidence2 = confidence2[:dimension]

    print len(confidence0), len(confidence1)

    if len(confidence0) == 0 or len(confidence1) == 0:
        return False
    labeled_train_data = open(RAW_DATA_DIR + 'multi_clf/labeled_train.data',
                              'w')
    for d in confidence0 + confidence1:
        labeled_train_data.write('%d %s\n' % (d[1], d[3]))

    unlabeled_train_data = open(RAW_DATA_DIR + 'multi_clf/unlabeled_train.data',
                                'w')
    # the unlabeled pool is the undecided users (label == -1)
    for d in confidence2:
        unlabeled_train_data.write('%d %s\n' % (d[1], d[3]))
    return True
    return True
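The pseudo-labeling loop above is in effect a two-class naive-Bayes-style vote over the constrained features; the same arithmetic on invented weights:

labeled_features = {u'word_a': [0.8, 0.2], u'word_b': [0.3, 0.7]}  # invented label weights
user_features = {u'word_a': 2.0, u'word_b': 1.0}                   # invented feature values
dist = [1.0, 1.0]
for f, value in user_features.items():
    if f in labeled_features:
        dist[0] *= labeled_features[f][0] * value
        dist[1] *= labeled_features[f][1] * value
s = sum(dist)
print [d / s for d in dist]  # -> [0.63, 0.37]; label 0 wins, the margin is the confidence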
Example #56
def construct_train_set(attribute, training_count):
    '''
    The format of labeled_feature_file is the same as mallet's
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(feature_file_name=base_dir +
                                  '/features/mention_1.feature',
                                  start_index=max(all_features.values()) + 1)
    review_features = get_features(
        feature_file_name=base_dir + '/features/review.feature',
        start_index=max(all_features_1.values()) + 1)
    labeled_feature_file = open('%s/review_constraint_%s.constraints' %
                                (labeled_feature_file_dir, attribute))
    labeled_features = dict()
    for line in labeled_feature_file:
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])
    collection = Connection().jd.train_users

    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        label_distributed = [1, 1]
        for f, value in combine_features(user['mentions'],
                                         Counter(user['products'])).items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        s = 1.0 * sum(label_distributed)
        if not s == 0:
            label_distributed[0] /= s
            label_distributed[1] /= s
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            label = -1

        features = combine_features(user['mentions_0'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))

        user['mentions_1_1'] = {}  # wiped on purpose: disables the *_1 features below
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))

        for f, v in Counter(user['review']).items():
            if f not in review_features:
                continue
            sorted_feature.append((review_features[f], v))

        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature))
        confidence.append((
            user['_id'],
            label,
            abs(label_distributed[0] - label_distributed[1]),
            str_features,
            sum(user['mentions'].values()),
        ))
        bar.draw(index + 1)

    confidence0 = filter(lambda d: d[1] == 0, confidence)
    confidence0 = sorted(confidence0, key=lambda d: d[2], reverse=True)
    confidence1 = filter(lambda d: d[1] == 1, confidence)
    confidence1 = sorted(confidence1, key=lambda d: d[2], reverse=True)
    confidence2 = filter(lambda d: d[1] == -1, confidence)
    confidence2 = sorted(confidence2, key=lambda d: d[4], reverse=True)

    dimension = min(len(confidence0), len(confidence1), training_count / 2)
    confidence0 = confidence0[:dimension]
    confidence1 = confidence1[:dimension]
    confidence2 = confidence2[:dimension]

    fout = open(RAW_DATA_DIR + 'mylabel2trainset/%s_train.data' % attribute,
                'w')
    uid_output = open(
        RAW_DATA_DIR + 'mylabel2trainset/%s_train_uids.data' % attribute, 'w')
    for d in confidence0 + confidence1:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])

    fout = open(
        RAW_DATA_DIR + 'mylabel2trainset/%s_train_unlabel.data' % attribute,
        'w')
    uid_output = open(
        RAW_DATA_DIR +
        'mylabel2trainset/%s_train_unlabel_uids.data' % attribute, 'w')
    for d in confidence2:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
Example #57
def construct_train_set(attribute, training_count):
    '''
    The format of labeled_feature_file is the same as mallet's
    '''
    all_features = get_features(feature_file_name=base_dir +
                                '/features/mention.feature')
    labeled_feature_file = open('%s/review_constraint_%s.constraints' %
                                (labeled_feature_file_dir, attribute))
    labeled_features = dict()
    for line in labeled_feature_file:
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])

    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        features = dict(Counter(user['products']))
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        label_distributed = [1, 1]
        for f, value in user['mentions'].items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        s = 1.0 * sum(label_distributed)
        if not s == 0:
            label_distributed[0] /= s
            label_distributed[1] /= s
        #print label_distributed
        #if abs(label_distributed[0]-label_distributed[1])<0.5:
        #    continue
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            label = -1
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%d' % f, sorted_feature))
        confidence.append(
            (user['_id'], label,
             abs(label_distributed[0] - label_distributed[1]), str_features))
        bar.draw(index + 1)

    confidence = sorted(confidence, key=lambda d: d[2], reverse=True)
    confidence0 = filter(lambda d: d[1] == 0, confidence)[:training_count / 2]
    confidence1 = filter(lambda d: d[1] == 1, confidence)[:training_count / 2]
    confidence_unlabel = []
    confidence_unlabel += filter(lambda d: d[1] == -1, confidence)
    #confidence_unlabel+=filter(lambda d:d[1]==0,confidence)[training_count/2:training_count*5]
    #confidence_unlabel+=filter(lambda d:d[1]==1,confidence)[training_count/2:training_count*5]
    confidence_unlabel = confidence_unlabel[:5 * training_count]
    print len(confidence0), len(confidence1)
    fout = open(self_training_file_dir + 'labeled_train_%s.data' % attribute,
                'w')
    for d in set(confidence0 + confidence1):
        fout.write('%d %s\n' % (d[1], d[3]))
    fout_unlabel = open(
        self_training_file_dir + 'unlabeled_train_%s.data' % attribute, 'w')
    for d in confidence_unlabel:
        fout_unlabel.write('%d %s\n' % (d[1], d[3]))
Example #58
def statistics_after_train(attribute,
                           method,
                           threshold=-1,
                           feature_file_name=base_dir +
                           '/features/mention.feature',
                           show=False):
    import random
    labels = get_labels_after_train(attribute, method)
    print len(labels)
    collection = Connection().jd.train_users
    label_distribute = Counter(labels.values())
    balance_params = dict()
    for label in label_distribute:
        balance_params[label] = 1.0 * min(
            label_distribute.values()) / label_distribute[label]
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    distribute = dict([f, [0., 0.]] for f in all_features)
    for index, user in enumerate(collection.find()):
        try:
            label = labels[user['_id']]
        except KeyError:
            continue
        #if random.random()>balance_params[label]:
        #    continue
        features = dict(user['mentions'])
        products = Counter(user['products'])
        for p in products:
            features[p] = products[p]
        for f in features:
            if f in distribute:
                distribute[f][label] += 1
        bar.draw(index)
    for f in distribute.keys():
        if sum(distribute[f]) < threshold:
            distribute.pop(f)
    print label_distribute
    for f in distribute:
        distribute[f][0] /= label_distribute[0]
        distribute[f][1] /= label_distribute[1]
    for f in distribute.keys():
        s = sum(distribute[f])
        if s == 0:
            distribute.pop(f)
            continue
        distribute[f][0] /= s
        distribute[f][1] /= s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute)
    distribute = sorted(distribute.items(),
                        key=lambda d: max(d[1]) / sum(d[1]),
                        reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f' % (
            d[0].encode('utf8'),
            (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
            1 - (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
        )