예제 #1
0
def run_group():
    """Print a confusion matrix for name-based age-group prediction.

    Reads the hard-coded CSV file, whose rows are indexed as
    (screen name, full name, age).  For every row the first
    space-separated token of the full name is passed to ``babyname`` to
    get the per-group probabilities and the most likely group, and the
    result is tallied in a 4x4 matrix indexed as
    ``conf_matrix[actual_group][predicted_group]``.

    Rows whose predicted group is -1 are printed but not counted.
    Everything is written to stdout; nothing is returned.
    """
    file_dir = "/home/pyongjoo/workspace/tweetsprocess/data/name-feature/"
    infile = file_dir + "screename-May10-AlmostVerified.csv"

    print(infile)

    # conf_matrix[actual][predicted]; each row is an independent list.
    conf_matrix = [[0, 0, 0, 0] for _ in range(4)]

    # 'rb' is the mode the Python 2 csv module expects.  The context
    # manager guarantees the handle is closed even if a row fails.
    with open(infile, 'rb') as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # csv.reader yields [] for blank lines; skip them instead
                # of failing with IndexError below.
                continue

            fullname = row[1]
            # NOTE(review): age is still a string here -- confirm that
            # babyname.ageToAgeGroup accepts the raw CSV value.
            age = row[2]

            firstname = fullname.split(' ')[0]
            age_group = babyname.ageToAgeGroup(age)
            group_prob = babyname.probHashInGroupForName(firstname)
            predicted_group = babyname.maxLikelyGroupForName(firstname)

            # The doubled parentheses keep the original Python 2 output
            # (the repr of a tuple) while remaining valid Python 3 syntax.
            print((age_group, predicted_group,
                   group_prob[0], group_prob[1], group_prob[2], group_prob[3]))

            if predicted_group != -1:
                conf_matrix[age_group][predicted_group] += 1

    print("Confusion Matrix:")
    for counts in conf_matrix:
        # Each cell is followed by a single space, then the line ends.
        sys.stdout.write("".join(str(count) + " " for count in counts) + "\n")
예제 #2
0
def run_year():
    """Print a confusion matrix for year-based age-group prediction.

    Reads the hard-coded CSV file, whose rows are indexed as
    (screen name, full name, age).  For every row the first
    space-separated token of the full name is mapped to its most likely
    birth year via ``babyname.maxLikelyYearForName``; the predicted age
    is ``2012 - predicted_year`` and is converted to an age group with
    ``babyname.ageToAgeGroup``.  Results are tallied in a 4x4 matrix
    indexed as ``conf_matrix[actual_group][predicted_group]``.

    Rows for which the predicted age cannot be converted to a group keep
    the sentinel -1 and are printed but not counted.  Everything is
    written to stdout; nothing is returned.
    """
    file_dir = "/home/pyongjoo/workspace/tweetsprocess/data/name-feature/"
    infile = file_dir + "screename-May10-AlmostVerified.csv"

    print(infile)

    # conf_matrix[actual][predicted]; each row is an independent list.
    conf_matrix = [[0, 0, 0, 0] for _ in range(4)]

    # 'rb' is the mode the Python 2 csv module expects.  The context
    # manager guarantees the handle is closed even if a row fails.
    with open(infile, 'rb') as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # csv.reader yields [] for blank lines; skip them instead
                # of failing with IndexError below.
                continue

            fullname = row[1]
            # NOTE(review): age is still a string here -- confirm that
            # babyname.ageToAgeGroup accepts the raw CSV value.
            age = row[2]

            firstname = fullname.split(' ')[0]
            predicted_year = babyname.maxLikelyYearForName(firstname)
            predicted_age = 2012 - predicted_year

            age_group = babyname.ageToAgeGroup(age)

            # Deliberately best-effort: if the predicted age cannot be
            # mapped to a group, the row keeps the -1 sentinel and is
            # left out of the matrix.  The concrete exception type raised
            # by babyname is not visible from here, hence the broad catch.
            try:
                predicted_group = babyname.ageToAgeGroup(predicted_age)
            except Exception:
                predicted_group = -1

            # Same text as the Python 2 statement
            # "print predicted_year, predicted_group".
            sys.stdout.write("%s %s\n" % (predicted_year, predicted_group))

            if predicted_group != -1:
                conf_matrix[age_group][predicted_group] += 1

    print("Confusion Matrix:")
    for counts in conf_matrix:
        # Each cell is followed by a single space, then the line ends.
        sys.stdout.write("".join(str(count) + " " for count in counts) + "\n")
예제 #3
0
def run_group():
    """Predict age groups from tweet authors' names and dump libsvm features.

    Loads the hard-coded JSON file (a sequence of tweet documents that
    each carry ``['user']['name']`` and ``['user']['age']``).  For every
    document the first space-separated token of the name, reduced to
    ASCII, is passed to ``babyname.probHashInGroupForName``.  The
    predicted group is the position of the largest probability when the
    groups are taken in sorted key order, or -1 when all probabilities
    sum to zero.

    For every document with a prediction:
      * one line ``<actual_group> <group>:<prob> ...`` is appended to
        ``prob-Jun19.libsvm`` with the features in ascending key order;
      * ``conf_matrix[actual_group][predicted_group]`` is incremented.

    The 4x4 confusion matrix is printed to stdout at the end; nothing is
    returned.
    """
    file_dir = "/home/pyongjoo/workspace/twitter-research/data/"
    infile = file_dir + "ageEmbededTweets-Jun19-sampled2.0.json"
    print(infile)

    # Close the input as soon as it has been parsed.
    with open(infile) as jsonfile:
        document = json.loads(jsonfile.read())

    # conf_matrix[actual][predicted]; each row is an independent list.
    conf_matrix = [[0, 0, 0, 0] for _ in range(4)]

    libsvmoutfile = file_dir + "prob-Jun19.libsvm"

    # The context manager flushes and closes the output even if a
    # document fails half-way through.
    with open(libsvmoutfile, 'w') as libsvmout:
        for tweetDoc in document:
            fullname = tweetDoc['user']['name']
            age = tweetDoc['user']['age']

            firstname = fullname.split(' ')[0]
            firstname = firstname.encode('ascii', 'ignore')
            age_group = babyname.ageToAgeGroup(age)
            prob_hash = babyname.probHashInGroupForName(firstname)

            # Fix one ordering (ascending key) and use it both for the
            # prediction and for the libsvm line, so the two always agree.
            features = sorted(prob_hash.items())
            prob_array = [prob for _, prob in features]

            if sum(prob_hash.values()) == 0:
                # No probability mass for this name: no prediction.
                continue
            predicted_group = prob_array.index(max(prob_array))

            # libsvm expects "<label> <index>:<value> ..." with ascending
            # indices; plain dict iteration order does not guarantee that.
            feature_text = "".join(
                str(group) + ':' + str(prob) + ' ' for group, prob in features)
            libsvmout.write(str(age_group) + ' ' + feature_text + '\n')

            # add to confusion matrix
            conf_matrix[age_group][predicted_group] += 1

    print("Confusion Matrix:")
    for counts in conf_matrix:
        # Each cell is followed by a single space, then the line ends.
        sys.stdout.write("".join(str(count) + " " for count in counts) + "\n")