示例#1
0
def run_group():
    '''
    Evaluate first-name based age-group prediction against a labelled CSV.

    Reads the CSV at the hard-coded path below. Each row is read as
    (screen name, full name, age); the first space-separated token of the
    full name is taken as the first name.

    For every row this prints a tuple of the real age group, the predicted
    group and the first four entries of the per-group probabilities returned
    by babyname.probHashInGroupForName. Rows whose predicted group is -1 are
    printed but not counted. Finally the confusion matrix is printed, indexed
    as [real age group][predicted group].

    NOTE(review): the matrix is fixed at 4x4, so this assumes the babyname
    helpers only ever return group indices 0..3 (or -1 for the prediction)
    -- confirm against the babyname module.
    '''
    file_dir = "/home/pyongjoo/workspace/tweetsprocess/data/name-feature/"
    infile = file_dir + "screename-May10-AlmostVerified.csv"

    # Single-argument parenthesised print emits the same bytes as the
    # Python 2 print statement.
    print(infile)

    conf_matrix = [[0, 0, 0, 0] for _ in range(4)]

    # Open the file in a with-block so the handle is closed even if a row
    # fails to process (it used to be opened inline and never closed).
    with open(infile, 'rb') as csvfile:
        for row in csv.reader(csvfile):
            # csv.reader yields [] for blank lines; there is nothing to
            # evaluate on them and indexing would raise IndexError.
            if not row:
                continue

            fullname = row[1]
            age = row[2]

            firstname = fullname.split(' ')[0]
            age_group = babyname.ageToAgeGroup(age)

            group_prob = babyname.probHashInGroupForName(firstname)

            predicted_group = babyname.maxLikelyGroupForName(firstname)

            print((age_group, predicted_group,
                   group_prob[0], group_prob[1], group_prob[2], group_prob[3]))

            # -1 means no prediction was produced for this first name.
            if predicted_group != -1:
                conf_matrix[age_group][predicted_group] += 1

    print("Confusion Matrix:")
    for matrix_row in conf_matrix:
        for count in matrix_row:
            sys.stdout.write(str(count) + " ")
        # Terminate the matrix row with a newline.
        print("")
def procedure1():
    '''
    TESTING THE FIRST NAME ON THE CENTRAL NODES (procedure1)

    1. Retrieve central nodes by selecting edges.this and removing duplicates.

    2. Get the age and name info of those nodes by joining with the users
    table and selecting the appropriate columns.

    3. Run the first name system to get the prediction, and match the
    predicted group with the real age group.

    Prints two accuracy figures followed by the 4x4 confusion matrix, indexed
    as [real age group][predicted group]:

    - "Accuracy" only counts nodes for which a group was predicted.
    - "Real accuracy" also counts the nodes without a prediction, crediting
      each of them 0.25 (presumably the chance level of guessing one of the
      four groups).

    When there is nothing to divide by (no predictable name, or no central
    node at all) the affected figure is reported as nan instead of raising
    ZeroDivisionError.

    tested on 9pm Jun 26. working fine. maybe I can add an English filter.
    '''
    # Store pairs of name and age group for central nodes.
    # Most of the complicated operations are handled in db, and we only use
    # the name and the corresponding age info from the result set.
    centralNodes = []

    # NOTE(review): the credentials are hard-coded in source; consider moving
    # them to configuration.
    con = mdb.connect('localhost', 'yongjoo', 'Fgla4Zp0', 'yongjoo')

    # NOTE(review): what leaving this with-block does to the connection
    # (commit/rollback vs. close) depends on the driver version -- confirm
    # that the connection is actually released.
    with con:
        cur = con.cursor()
        cur.execute('''SELECT DISTINCT e.this, u.age, u.name
            FROM edges e
            INNER JOIN users u
            ON e.this = u.user_id
            ''')
        # fetchall() returns every remaining row and does not rely on
        # cur.rowcount, which a DB-API cursor is allowed to report as -1.
        for row in cur.fetchall():
            name = row[2]
            age = row[1]
            centralNodes.append((name, ageToAgeGroup(age)))

    # Validate

    # the number of cases where the db does not hold the first name.
    non_predictable_count = 0

    # confusion matrix
    confusion_mat = [[0, 0, 0, 0] for _ in range(4)]

    for name, ageGroup in centralNodes:
        firstname = name.split(' ')[0]
        predictGroup = maxLikelyGroupForName(firstname)

        if predictGroup == -1:
            non_predictable_count += 1
        else:
            confusion_mat[ageGroup][predictGroup] += 1

    # Report the result

    # report the accuracy
    nu = sum(confusion_mat[i][i] for i in range(4))
    denom = sum(sum(matrix_row) for matrix_row in confusion_mat)
    total = denom + non_predictable_count
    undefined = float('nan')

    accuracy = float(nu) / float(denom) if denom else undefined
    real_accuracy = (float(nu + non_predictable_count * 0.25) / float(total)
                     if total else undefined)

    # Single-argument parenthesised print emits the same bytes as the
    # Python 2 print statement.
    print("Accuracy: " + str(accuracy))
    print("Real accuracy: " + str(real_accuracy))

    # report the confusion matrix
    print("Confusion Matrix:")
    for matrix_row in confusion_mat:
        for count in matrix_row:
            sys.stdout.write(str(count) + ' ')
        # Terminate the matrix row with a newline.
        print("")