def output_graph_matrix():
    """Export graph-embedding vectors with gender labels and user ids.

    Iterates over the ``users`` collection (only documents that have an
    ``int_id``), fetches the ``graph_embedding`` document whose ``_id`` equals
    that ``int_id`` and, when one exists, collects its ``embedding``, a label
    (0 when the gender is ``'f'``, 1 otherwise) and the user's ``uid``.  The
    three parallel lists are passed to ``dump_user_vector`` under the name
    ``'user_graph_vector.data'``.
    """
    from pymongo import Connection
    user_collection = Connection().user_profilling.users
    embedding_collection = Connection().user_profilling.graph_embedding
    print(embedding_collection.count())
    # The bar object is created but never drawn in this function; progress
    # is reported by printing the running counter instead.
    bar = get_progressive_bar(user_collection.count())
    features = []
    labels = []
    uids = []
    query = {'int_id': {'$exists': True}}
    projection = {'information': 1, 'int_id': 1}
    for seen, user in enumerate(user_collection.find(query, projection), 1):
        print(seen)
        record = embedding_collection.find_one({'_id': user['int_id']})
        if record is None:
            # No embedding stored for this user: print the miss and move on.
            print(record)
            continue
        info = user['information']
        labels.append(0 if info['gender'] == 'f' else 1)
        features.append(record['embedding'])
        uids.append(info['uid'])
    dump_user_vector(features, labels, uids, 'user_graph_vector.data')
def output_graph_matrix():
    """Export graph-embedding vectors with gender labels and user ids.

    Iterates over the ``users`` collection (only documents that have an
    ``int_id``), fetches the ``graph_embedding`` document whose ``_id`` equals
    that ``int_id`` and, when one exists, collects its ``embedding``, a label
    (0 when the gender is ``'f'``, 1 otherwise) and the user's ``uid``.  The
    three parallel lists are passed to ``dump_user_vector`` under the name
    ``'user_graph_vector.data'``.
    """
    from pymongo import Connection
    user_collection = Connection().user_profilling.users
    embedding_collection = Connection().user_profilling.graph_embedding
    print(embedding_collection.count())
    # The bar object is created but never drawn in this function; progress
    # is reported by printing the running counter instead.
    bar = get_progressive_bar(user_collection.count())
    features = []
    labels = []
    uids = []
    query = {'int_id': {'$exists': True}}
    projection = {'information': 1, 'int_id': 1}
    for seen, user in enumerate(user_collection.find(query, projection), 1):
        print(seen)
        record = embedding_collection.find_one({'_id': user['int_id']})
        if record is None:
            # No embedding stored for this user: print the miss and move on.
            print(record)
            continue
        info = user['information']
        labels.append(0 if info['gender'] == 'f' else 1)
        features.append(record['embedding'])
        uids.append(info['uid'])
    dump_user_vector(features, labels, uids, 'user_graph_vector.data')
def output_name_matrix_of_two_words():
    """Dump a two-feature gender dataset built from three-character names.

    For each of the first 5000 users, every entry of ``screen_name`` that is
    exactly three characters long and starts with a known last name (one per
    line in ``./lastname``) yields one sample ``[x0, x1]``, where each value
    is ``tf[c][0] / sum(tf[c])`` for the second and third character ``c``.
    ``tf`` is unpickled from ``./tf.data`` (presumably the table built by
    ``get_tf`` -- confirm).  The label is 1 when the user's gender is ``'m'``
    and 0 otherwise.  Samples and labels are passed to
    ``dump_train_valid_test`` under the name ``'gender_name_simple.data'``.
    """
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    # A set gives O(1) membership tests, and ``with`` closes the file.
    with open('./lastname') as lastname_file:
        lastnames = set(
            line.replace('\n', '').decode('utf8') for line in lastname_file)
    bar = get_progressive_bar(users.count())
    # NOTE(review): pickle executes arbitrary code on load; only use a
    # trusted ./tf.data file.
    with open('./tf.data') as tf_file:
        tf = pickle.load(tf_file)
    x = []
    y = []
    finish_count = 0
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        if finish_count > 5000:
            break
        for n in name:
            # Fixed: the original test ``len(n) > 3 and len(n) < 3`` could
            # never be true, so names longer than three characters slipped
            # through.  Checking the length first also keeps ``n[0]`` from
            # raising on an empty entry.
            if len(n) != 3 or n[0] not in lastnames:
                continue
            try:
                x0 = 1.0 * tf[n[1]][0] / sum(tf[n[1]])
                x1 = 1.0 * tf[n[2]][0] / sum(tf[n[2]])
            except (KeyError, IndexError, ZeroDivisionError):
                # Character missing from the table, or an empty/all-zero
                # entry: no usable ratio, so drop the sample.
                continue
            y.append(1 if user['information']['gender'] == 'm' else 0)
            x.append([x0, x1])
        bar.cursor.restore()
        bar.draw(value=finish_count)
    dump_train_valid_test(x, y, 'gender_name_simple.data')
def output_description_matrix():
    """Build a word-count matrix from user descriptions and dump it.

    Users whose ``information`` has no ``descriptions`` entry are skipped.
    Every other description is converted with ``get_str_description`` and
    labelled 1 when the user's gender is ``'m'``, otherwise 0.  The corpus is
    vectorised with ``CountVectorizer(min_df=1)``; the dense matrix and the
    label array go to ``dump_train_valid_test`` under the name
    ``'gender_description.data'``.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from pymongo import Connection
    vectorizer = CountVectorizer(min_df=1)
    user_collection = Connection().user_profilling.users
    bar = get_progressive_bar(user_collection.count())
    documents = []
    labels = []
    processed = 0
    for user in user_collection.find():
        info = user['information']
        if 'descriptions' not in info:
            continue
        documents.append(get_str_description(info['descriptions']))
        # Only users that contributed a document advance the progress bar.
        processed += 1
        bar.cursor.restore()
        bar.draw(value=processed)
        labels.append(1 if info['gender'] == 'm' else 0)
    counts = vectorizer.fit_transform(documents)
    dump_train_valid_test(counts.toarray(), numpy.array(labels),
                          'gender_description.data')
Пример #5
0
def get_tf():
    """Count, per name character, how often it occurs for each gender.

    Every entry of a user's ``screen_name`` that is two or three characters
    long and starts with a known last name (one per line in ``./lastname``)
    contributes all characters after the first one.

    Returns:
        dict mapping a character to a two-item list of counts; index 1 is
        incremented when the user's gender is ``'m'``, index 0 otherwise.
    """
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    # A set gives O(1) membership tests, and ``with`` closes the file.
    with open('./lastname') as lastname_file:
        lastnames = set(
            line.replace('\n', '').decode('utf8') for line in lastname_file)
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = dict()
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        for n in name:
            # Fixed: the original test ``len(n) > 3 and len(n) < 2`` could
            # never be true, so the length filter was dead.  Checking the
            # length first also keeps ``n[0]`` from raising on an empty entry.
            if not (2 <= len(n) <= 3) or n[0] not in lastnames:
                continue
            gender = 1 if user['information']['gender'] == 'm' else 0
            for w in n[1:]:
                tf.setdefault(w, [0, 0])[gender] += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    return tf
def output_description_matrix():
    """Build a word-count matrix from user descriptions and dump it.

    Users whose ``information`` has no ``descriptions`` entry are skipped.
    Every other description is converted with ``get_str_description`` and
    labelled 1 when the user's gender is ``'m'``, otherwise 0.  The corpus is
    vectorised with ``CountVectorizer(min_df=1)``; the dense matrix and the
    label array go to ``dump_train_valid_test`` under the name
    ``'gender_description.data'``.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from pymongo import Connection
    vectorizer = CountVectorizer(min_df=1)
    user_collection = Connection().user_profilling.users
    bar = get_progressive_bar(user_collection.count())
    documents = []
    labels = []
    processed = 0
    for user in user_collection.find():
        info = user['information']
        if 'descriptions' not in info:
            continue
        documents.append(get_str_description(info['descriptions']))
        # Only users that contributed a document advance the progress bar.
        processed += 1
        bar.cursor.restore()
        bar.draw(value=processed)
        labels.append(1 if info['gender'] == 'm' else 0)
    counts = vectorizer.fit_transform(documents)
    dump_train_valid_test(counts.toarray(), numpy.array(labels),
                          'gender_description.data')
def output_name_matrix_of_two_words():
    """Dump a two-feature gender dataset built from three-character names.

    For each of the first 5000 users, every entry of ``screen_name`` that is
    exactly three characters long and starts with a known last name (one per
    line in ``./lastname``) yields one sample ``[x0, x1]``, where each value
    is ``tf[c][0] / sum(tf[c])`` for the second and third character ``c``.
    ``tf`` is unpickled from ``./tf.data`` (presumably the table built by
    ``get_tf`` -- confirm).  The label is 1 when the user's gender is ``'m'``
    and 0 otherwise.  Samples and labels are passed to
    ``dump_train_valid_test`` under the name ``'gender_name_simple.data'``.
    """
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    # A set gives O(1) membership tests, and ``with`` closes the file.
    with open('./lastname') as lastname_file:
        lastnames = set(
            line.replace('\n', '').decode('utf8') for line in lastname_file)
    bar = get_progressive_bar(users.count())
    # NOTE(review): pickle executes arbitrary code on load; only use a
    # trusted ./tf.data file.
    with open('./tf.data') as tf_file:
        tf = pickle.load(tf_file)
    x = []
    y = []
    finish_count = 0
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        if finish_count > 5000:
            break
        for n in name:
            # Fixed: the original test ``len(n) > 3 and len(n) < 3`` could
            # never be true, so names longer than three characters slipped
            # through.  Checking the length first also keeps ``n[0]`` from
            # raising on an empty entry.
            if len(n) != 3 or n[0] not in lastnames:
                continue
            try:
                x0 = 1.0 * tf[n[1]][0] / sum(tf[n[1]])
                x1 = 1.0 * tf[n[2]][0] / sum(tf[n[2]])
            except (KeyError, IndexError, ZeroDivisionError):
                # Character missing from the table, or an empty/all-zero
                # entry: no usable ratio, so drop the sample.
                continue
            y.append(1 if user['information']['gender'] == 'm' else 0)
            x.append([x0, x1])
        bar.cursor.restore()
        bar.draw(value=finish_count)
    dump_train_valid_test(x, y, 'gender_name_simple.data')
Пример #8
0
def get_tf():
    """Count, per name character, how often it occurs for each gender.

    Every entry of a user's ``screen_name`` that is two or three characters
    long and starts with a known last name (one per line in ``./lastname``)
    contributes all characters after the first one.

    Returns:
        dict mapping a character to a two-item list of counts; index 1 is
        incremented when the user's gender is ``'m'``, index 0 otherwise.
    """
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    # A set gives O(1) membership tests, and ``with`` closes the file.
    with open('./lastname') as lastname_file:
        lastnames = set(
            line.replace('\n', '').decode('utf8') for line in lastname_file)
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = dict()
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        for n in name:
            # Fixed: the original test ``len(n) > 3 and len(n) < 2`` could
            # never be true, so the length filter was dead.  Checking the
            # length first also keeps ``n[0]`` from raising on an empty entry.
            if not (2 <= len(n) <= 3) or n[0] not in lastnames:
                continue
            gender = 1 if user['information']['gender'] == 'm' else 0
            for w in n[1:]:
                tf.setdefault(w, [0, 0])[gender] += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    return tf
def output_text_matrix_from_bag_of_words():
    """Dump per-user bag-of-words count vectors with gender labels and uids.

    The vocabulary comes from ``./word.feature`` (one word per line; the line
    index becomes the vector position).  A user is kept only if at least 50
    of their statuses pass the ``is_not_good_status`` filter and at least one
    vocabulary word occurs in those statuses.  Labels are 1 for gender
    ``'m'`` and 0 otherwise.  The results are passed to ``dump_user_vector``
    under the name ``'user_text_bag_words.data'``.

    Assumes ``status['text']`` iterates over words (e.g. a token list) and
    that ``is_not_good_status`` is a pure predicate -- confirm.
    """
    from pymongo import Connection
    with open('./word.feature') as feature_file:
        # Fixed: strip only the newline.  Slicing off the last character
        # damaged the final word when the file lacks a trailing newline.
        words = dict(
            (line.decode('utf8').rstrip('\n'), i)
            for i, line in enumerate(feature_file))
    all_data_x = []
    all_data_y = []
    uids = []
    # Progress-bar bookkeeping.
    users = Connection().user_profilling.users
    total_count = users.count()
    bar = get_progressive_bar(total_count)
    finish_count = 0
    for user in users.find():
        # Run the filter once per status and reuse the result for both the
        # minimum-activity check and the counting pass.
        good_statuses = [
            status for status in user['statuses']
            if not is_not_good_status(status)
        ]
        if len(good_statuses) < 50:
            continue
        text = numpy.zeros((len(words)))
        for status in good_statuses:
            for word in status['text']:
                if word in words:
                    text[words[word]] += 1.0
        if not text.any():
            # None of the user's words are in the vocabulary.
            continue
        all_data_y.append(1 if user['information']['gender'] == 'm' else 0)
        all_data_x.append(text)
        uids.append(user['information']['uid'])
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    all_data_x = numpy.array(all_data_x)
    all_data_y = numpy.array(all_data_y)
    dump_user_vector(all_data_x, all_data_y, uids, 'user_text_bag_words.data')
def output_text_matrix_from_bag_of_words():
    """Dump per-user bag-of-words count vectors with gender labels and uids.

    The vocabulary comes from ``./word.feature`` (one word per line; the line
    index becomes the vector position).  A user is kept only if at least 50
    of their statuses pass the ``is_not_good_status`` filter and at least one
    vocabulary word occurs in those statuses.  Labels are 1 for gender
    ``'m'`` and 0 otherwise.  The results are passed to ``dump_user_vector``
    under the name ``'user_text_bag_words.data'``.

    Assumes ``status['text']`` iterates over words (e.g. a token list) and
    that ``is_not_good_status`` is a pure predicate -- confirm.
    """
    from pymongo import Connection
    with open('./word.feature') as feature_file:
        # Fixed: strip only the newline.  Slicing off the last character
        # damaged the final word when the file lacks a trailing newline.
        words = dict(
            (line.decode('utf8').rstrip('\n'), i)
            for i, line in enumerate(feature_file))
    all_data_x = []
    all_data_y = []
    uids = []
    # Progress-bar bookkeeping.
    users = Connection().user_profilling.users
    total_count = users.count()
    bar = get_progressive_bar(total_count)
    finish_count = 0
    for user in users.find():
        # Run the filter once per status and reuse the result for both the
        # minimum-activity check and the counting pass.
        good_statuses = [
            status for status in user['statuses']
            if not is_not_good_status(status)
        ]
        if len(good_statuses) < 50:
            continue
        text = numpy.zeros((len(words)))
        for status in good_statuses:
            for word in status['text']:
                if word in words:
                    text[words[word]] += 1.0
        if not text.any():
            # None of the user's words are in the vocabulary.
            continue
        all_data_y.append(1 if user['information']['gender'] == 'm' else 0)
        all_data_x.append(text)
        uids.append(user['information']['uid'])
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    all_data_x = numpy.array(all_data_x)
    all_data_y = numpy.array(all_data_y)
    dump_user_vector(all_data_x, all_data_y, uids, 'user_text_bag_words.data')
def output_text_matrix_from_vector():
    """Dump per-user text vectors built from word embeddings.

    Word vectors are loaded with ``get_vectors`` from
    ``/mnt/data1/adoni/word_vectors.bin``.  A user is kept only if at least
    50 of their statuses pass the ``is_not_good_status`` filter and
    ``get_text_vector_for_nn`` returns a vector for the collected word
    vectors.  Labels are 1 for gender ``'m'`` and 0 otherwise.  The results
    are passed to ``dump_user_vector`` under the name
    ``'user_text_vectors.data'``.

    Assumes ``status['text']`` iterates over words (e.g. a token list) and
    that ``is_not_good_status`` is a pure predicate -- confirm.
    """
    from pymongo import Connection
    users = Connection().user_profilling.users
    word_vectors = get_vectors('/mnt/data1/adoni/word_vectors.bin')
    all_data_x = []
    all_data_y = []
    uids = []
    # Progress-bar bookkeeping; the total is a fixed estimate, not a count.
    total_count = 20000
    bar = get_progressive_bar(total_count)
    finish_count = 0
    for user in users.find():
        # Run the filter once per status and reuse the result for both the
        # minimum-activity check and the collection pass.
        good_statuses = [
            status for status in user['statuses']
            if not is_not_good_status(status)
        ]
        if len(good_statuses) < 50:
            continue
        text = []
        for status in good_statuses:
            for word in status['text']:
                try:
                    text.append(word_vectors[word])
                except Exception:
                    # Best effort: words without a vector are skipped.  The
                    # lookup's error type is not visible here, so the broad
                    # catch is kept.
                    continue
        text_vector = get_text_vector_for_nn(text, window_size=2)
        if text_vector is None:
            continue
        all_data_y.append(1 if user['information']['gender'] == 'm' else 0)
        all_data_x.append(text_vector)
        uids.append(user['information']['uid'])
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    all_data_x = numpy.array(all_data_x)
    all_data_y = numpy.array(all_data_y)
    dump_user_vector(all_data_x, all_data_y, uids, 'user_text_vectors.data')
def output_text_matrix_from_vector():
    """Dump per-user text vectors built from word embeddings.

    Word vectors are loaded with ``get_vectors`` from
    ``/mnt/data1/adoni/word_vectors.bin``.  A user is kept only if at least
    50 of their statuses pass the ``is_not_good_status`` filter and
    ``get_text_vector_for_nn`` returns a vector for the collected word
    vectors.  Labels are 1 for gender ``'m'`` and 0 otherwise.  The results
    are passed to ``dump_user_vector`` under the name
    ``'user_text_vectors.data'``.

    Assumes ``status['text']`` iterates over words (e.g. a token list) and
    that ``is_not_good_status`` is a pure predicate -- confirm.
    """
    from pymongo import Connection
    users = Connection().user_profilling.users
    word_vectors = get_vectors('/mnt/data1/adoni/word_vectors.bin')
    all_data_x = []
    all_data_y = []
    uids = []
    # Progress-bar bookkeeping; the total is a fixed estimate, not a count.
    total_count = 20000
    bar = get_progressive_bar(total_count)
    finish_count = 0
    for user in users.find():
        # Run the filter once per status and reuse the result for both the
        # minimum-activity check and the collection pass.
        good_statuses = [
            status for status in user['statuses']
            if not is_not_good_status(status)
        ]
        if len(good_statuses) < 50:
            continue
        text = []
        for status in good_statuses:
            for word in status['text']:
                try:
                    text.append(word_vectors[word])
                except Exception:
                    # Best effort: words without a vector are skipped.  The
                    # lookup's error type is not visible here, so the broad
                    # catch is kept.
                    continue
        text_vector = get_text_vector_for_nn(text, window_size=2)
        if text_vector is None:
            continue
        all_data_y.append(1 if user['information']['gender'] == 'm' else 0)
        all_data_x.append(text_vector)
        uids.append(user['information']['uid'])
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    all_data_x = numpy.array(all_data_x)
    all_data_y = numpy.array(all_data_y)
    dump_user_vector(all_data_x, all_data_y, uids, 'user_text_vectors.data')
def output_name_matrix():
    """Dump character n-gram count vectors of names with gender labels.

    For every user, each ``screen_name`` entry that starts with a known last
    name (one per line in ``./lastname``) has its first character removed;
    the remainders are joined with spaces into one document per user.  The
    documents are vectorised with a ``char_wb`` ``CountVectorizer`` over
    1- to 3-grams, the learned feature names are printed UTF-8 encoded, and
    the dense matrix, labels (1 for gender ``'m'``, 0 otherwise) and uids are
    passed to ``dump_user_vector`` under the name ``'user_name_vector.data'``.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from pymongo import Connection
    # A set gives O(1) membership tests, and ``with`` closes the file.
    with open('./lastname') as lastname_file:
        lastnames = set(
            line.replace('\n', '').decode('utf8') for line in lastname_file)
    vectorizer = CountVectorizer(analyzer='char_wb',
                                 ngram_range=(1, 3),
                                 min_df=1)
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    corpus = []
    uids = []
    y = []
    finish_count = 0
    for user in users.find():
        # ``n and ...`` guards against empty entries, on which ``n[0]`` would
        # raise IndexError.
        normal_name = [
            n[1:] for n in user['screen_name'] if n and n[0] in lastnames
        ]
        corpus.append(' '.join(normal_name))
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
        y.append(1 if user['information']['gender'] == 'm' else 0)
        uids.append(user['information']['uid'])
    x = vectorizer.fit_transform(corpus)
    for feature in vectorizer.get_feature_names():
        print(feature.encode('utf8'))
    all_data_x = x.toarray()
    all_data_y = numpy.array(y)
    dump_user_vector(all_data_x, all_data_y, uids, 'user_name_vector.data')
Пример #14
0
def plot():
    """Scatter-plot the two name-character ratios, coloured by gender.

    For each of the first 5000 users, every entry of ``screen_name`` that is
    exactly three characters long and starts with a known last name (one per
    line in ``./lastname``) yields one point: ``tf[c][0] / sum(tf[c])`` for
    the second character on the x axis and for the third character on the
    y axis.  ``tf`` is unpickled from ``./tf.data`` (presumably the table
    built by ``get_tf`` -- confirm).  Users with gender ``'m'`` are drawn in
    red, all others in green.
    """
    from matplotlib import pyplot as plt
    from helper import get_progressive_bar
    from pymongo import Connection
    x_m = []
    y_m = []
    x_f = []
    y_f = []
    users = Connection().user_profilling.users
    # A set gives O(1) membership tests, and ``with`` closes the file.
    with open('./lastname') as lastname_file:
        lastnames = set(
            line.replace('\n', '').decode('utf8') for line in lastname_file)
    bar = get_progressive_bar(users.count())
    finish_count = 0
    # NOTE(review): pickle executes arbitrary code on load; only use a
    # trusted ./tf.data file.
    with open('./tf.data') as tf_file:
        tf = pickle.load(tf_file)
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        if finish_count > 5000:
            break
        for n in name:
            # Fixed: the original test ``len(n) > 3 and len(n) < 3`` could
            # never be true, so names longer than three characters slipped
            # through.  Checking the length first also keeps ``n[0]`` from
            # raising on an empty entry.
            if len(n) != 3 or n[0] not in lastnames:
                continue
            try:
                x = 1.0 * tf[n[1]][0] / sum(tf[n[1]])
                y = 1.0 * tf[n[2]][0] / sum(tf[n[2]])
            except (KeyError, IndexError, ZeroDivisionError):
                # Character missing from the table, or an empty/all-zero
                # entry: no usable ratio, so drop the point.
                continue
            if user['information']['gender'] == 'm':
                x_m.append(x)
                y_m.append(y)
            else:
                x_f.append(x)
                y_f.append(y)
        bar.cursor.restore()
        bar.draw(value=finish_count)
    plt.scatter(x_m, y_m, c='red', label='Male', alpha=0.3)
    plt.scatter(x_f, y_f, c='green', label='Female', alpha=0.3)
    plt.legend()
    plt.grid(True)
    plt.show()
def output_name_matrix():
    """Dump character n-gram count vectors of names with gender labels.

    For every user, each ``screen_name`` entry that starts with a known last
    name (one per line in ``./lastname``) has its first character removed;
    the remainders are joined with spaces into one document per user.  The
    documents are vectorised with a ``char_wb`` ``CountVectorizer`` over
    1- to 3-grams, the learned feature names are printed UTF-8 encoded, and
    the dense matrix, labels (1 for gender ``'m'``, 0 otherwise) and uids are
    passed to ``dump_user_vector`` under the name ``'user_name_vector.data'``.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from pymongo import Connection
    # A set gives O(1) membership tests, and ``with`` closes the file.
    with open('./lastname') as lastname_file:
        lastnames = set(
            line.replace('\n', '').decode('utf8') for line in lastname_file)
    vectorizer = CountVectorizer(analyzer='char_wb',
                                 ngram_range=(1, 3),
                                 min_df=1)
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    corpus = []
    uids = []
    y = []
    finish_count = 0
    for user in users.find():
        # ``n and ...`` guards against empty entries, on which ``n[0]`` would
        # raise IndexError.
        normal_name = [
            n[1:] for n in user['screen_name'] if n and n[0] in lastnames
        ]
        corpus.append(' '.join(normal_name))
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
        y.append(1 if user['information']['gender'] == 'm' else 0)
        uids.append(user['information']['uid'])
    x = vectorizer.fit_transform(corpus)
    for feature in vectorizer.get_feature_names():
        print(feature.encode('utf8'))
    all_data_x = x.toarray()
    all_data_y = numpy.array(y)
    dump_user_vector(all_data_x, all_data_y, uids, 'user_name_vector.data')
Пример #16
0
def plot():
    """Scatter-plot the two name-character ratios, coloured by gender.

    For each of the first 5000 users, every entry of ``screen_name`` that is
    exactly three characters long and starts with a known last name (one per
    line in ``./lastname``) yields one point: ``tf[c][0] / sum(tf[c])`` for
    the second character on the x axis and for the third character on the
    y axis.  ``tf`` is unpickled from ``./tf.data`` (presumably the table
    built by ``get_tf`` -- confirm).  Users with gender ``'m'`` are drawn in
    red, all others in green.
    """
    from matplotlib import pyplot as plt
    from helper import get_progressive_bar
    from pymongo import Connection
    x_m = []
    y_m = []
    x_f = []
    y_f = []
    users = Connection().user_profilling.users
    # A set gives O(1) membership tests, and ``with`` closes the file.
    with open('./lastname') as lastname_file:
        lastnames = set(
            line.replace('\n', '').decode('utf8') for line in lastname_file)
    bar = get_progressive_bar(users.count())
    finish_count = 0
    # NOTE(review): pickle executes arbitrary code on load; only use a
    # trusted ./tf.data file.
    with open('./tf.data') as tf_file:
        tf = pickle.load(tf_file)
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        if finish_count > 5000:
            break
        for n in name:
            # Fixed: the original test ``len(n) > 3 and len(n) < 3`` could
            # never be true, so names longer than three characters slipped
            # through.  Checking the length first also keeps ``n[0]`` from
            # raising on an empty entry.
            if len(n) != 3 or n[0] not in lastnames:
                continue
            try:
                x = 1.0 * tf[n[1]][0] / sum(tf[n[1]])
                y = 1.0 * tf[n[2]][0] / sum(tf[n[2]])
            except (KeyError, IndexError, ZeroDivisionError):
                # Character missing from the table, or an empty/all-zero
                # entry: no usable ratio, so drop the point.
                continue
            if user['information']['gender'] == 'm':
                x_m.append(x)
                y_m.append(y)
            else:
                x_f.append(x)
                y_f.append(y)
        bar.cursor.restore()
        bar.draw(value=finish_count)
    plt.scatter(x_m, y_m, c='red', label='Male', alpha=0.3)
    plt.scatter(x_f, y_f, c='green', label='Female', alpha=0.3)
    plt.legend()
    plt.grid(True)
    plt.show()