def run_group(): file_dir = "/home/pyongjoo/workspace/tweetsprocess/data/name-feature/" infile = file_dir + "screename-May10-AlmostVerified.csv" print infile csvreader = csv.reader(open(infile, 'rb')) conf_matrix = [] for i in range(4): conf_matrix.append([0,0,0,0]) for row in csvreader: screen_name = row[0] fullname = row[1] age = row[2] firstname = (fullname.split(' '))[0] age_group = babyname.ageToAgeGroup(age) group_prob = babyname.probHashInGroupForName(firstname) predicted_group = babyname.maxLikelyGroupForName(firstname) print (age_group, predicted_group, group_prob[0], group_prob[1], group_prob[2], group_prob[3]) if predicted_group != -1: conf_matrix[age_group][predicted_group] += 1 print "Confusion Matrix:" for i in range(len(conf_matrix)): for j in range(len(conf_matrix[0])): sys.stdout.write(str(conf_matrix[i][j]) + " ") print
def run_group(): file_dir = "/home/pyongjoo/workspace/twitter-research/data/" infile = file_dir + "ageEmbededTweets-Jun19-sampled2.0.json" print infile document = json.loads(open(infile).read()) conf_matrix = [] for i in range(4): conf_matrix.append([0,0,0,0]) libsvmoutfile = file_dir + "prob-Jun19.libsvm" libsvmout = open(libsvmoutfile, 'w') for tweetDoc in document: fullname = tweetDoc['user']['name'] age = tweetDoc['user']['age'] firstname = (fullname.split(' '))[0] firstname = firstname.encode('ascii', 'ignore') age_group = babyname.ageToAgeGroup(age) prob_hash = babyname.probHashInGroupForName(firstname) prob_array = [prob_hash[k] for k in sorted(prob_hash.keys())] predicted_group = (-1 if sum(prob_hash.values()) == 0 else prob_array.index(max(prob_array))) if predicted_group != -1: libsvmout.write(str(age_group) + ' ') for group, prob in prob_hash.iteritems(): libsvmout.write(str(group) + ':' + str(prob) + ' ') libsvmout.write('\n') # add to confusion matrix if predicted_group != -1: conf_matrix[age_group][predicted_group] += 1 libsvmout.close() print "Confusion Matrix:" for i in range(len(conf_matrix)): for j in range(len(conf_matrix[0])): sys.stdout.write(str(conf_matrix[i][j]) + " ") print
def getProbArrayFor(self, screen_name):
    try:
        firstname = self.screenNameToFirstName[screen_name]
        probHash = babyname.probHashInGroupForName(firstname)

        probArray = []
        for i in range(4):
            probArray.append(probHash[i])

        #if (probArray[0] == probArray[1] and
        #        probArray[1] == probArray[2] and
        #        probArray[2] == probArray[3]):
        #    probArray.append(1.0)

        return probArray
    except KeyError:
        return [0, 0, 0, 0]
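# Side note on the commented-out block above: it appears to detect the case
# where all four group probabilities are identical (a name that carries no age
# signal) and append an extra indicator value. A minimal standalone version of
# that equality check (an assumption, not part of the original class) could be:
def is_flat_distribution(prob_array):
    # True for e.g. [0.25, 0.25, 0.25, 0.25] or the all-zero fallback array
    return all(p == prob_array[0] for p in prob_array)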
def procedure2():
    '''
    TESTING THE FIRST NAME BY COMBINING NEIGHBOR NODES (procedure2)

    1. Retrieve central nodes by selecting edges.this and removing duplicates.
    2. Get the age and name info of those users by joining with the users
       table and selecting the appropriate columns.
    3. Get name info of neighbors by joining with the users table and
       selecting on the `this` column for each central node. After this we
       can randomly select up to 20 neighbors to reduce the computation cost.
    4. Run the first name system on both central nodes and their neighbors,
       and apply several custom aggregate functions:
       a. simple average of the sums over the probabilities of every group.
       b. weighted average.
       c. majority vote.
    '''
    # Store pairs of name and age group for central nodes.
    # Most of the complicated operations are handled in the db, and we only
    # use the name and the corresponding age info from the result set.
    centralNodes = []

    # Store the names of neighbors for every central node.
    # Key: user_id (of central node), Value: names of neighbors, up to 20
    neighborNames = {}

    con = mdb.connect('localhost', 'yongjoo', 'Fgla4Zp0', 'yongjoo')

    with con:
        cur = con.cursor()

        # First thing to do is retrieving central nodes
        cur.execute('''SELECT DISTINCT e.this, u.age, u.name
                       FROM edges e INNER JOIN users u ON e.this = u.user_id
                    ''')
        numrows = int(cur.rowcount)

        for i in range(numrows):
            row = cur.fetchone()
            name = row[2]
            age = row[1]
            node_id = row[0]
            ageGroup = ageToAgeGroup(age)
            centralNodes.append([node_id, name, ageGroup])

        print "Finished collecting central nodes."

        def getNeighborsNames(node_id):
            '''Convenience method to retrieve an array of names of neighbors
            for the specific node passed as a parameter.
            TODO: may add an English filter later.'''
            # TODO: may use random sampling instead of limiting. Limiting only
            # selects the first set of id's, which are the smallest id's in
            # the set.
            cur.execute('''SELECT e.this, e.that, u.name, u.age
                           FROM edges e INNER JOIN users u
                               ON e.that = u.user_id
                           WHERE this = %s LIMIT 80
                        ''', (node_id,))
            rowcount = int(cur.rowcount)
            rows = cur.fetchall()
            name_arr = [rows[i][2] for i in range(rowcount)]
            return name_arr
        # end of getNeighborsNames()

        # Secondly, we retrieve the neighbors for every central node and save
        # them into the dict.
        #
        # In order to overcome the slow speed of generating the prob_array
        # for all the neighbors, we can choose a mode to operate in. The
        # normal mode is 'gen', but once the prob_array_dict is generated, we
        # can instead load its contents from a file later.
        prob_array_mode = 'gen'

        # we skip this step if we will load the probability array from a file
        if prob_array_mode == 'gen':
            for triplet in centralNodes:
                node = triplet[0]
                neighbor_names = getNeighborsNames(node)
                neighborNames[node] = neighbor_names

        print "Finished collecting neighbor nodes."
    # end of using mysql connection

    # For the sake of prediction and testing, we gather a probability array
    # for every central node. The data structure looks like the following:
    #
    # { node0 : [prob_array, prob_array, ...],
    #   node1 : [prob_array, prob_array, ...],
    #   ... }
    #
    # The `prob_array` is an array of doubles of length 4; an example is
    # [0.2, 0.2, 0.3, 0.3]. The first prob_array in every list of prob_arrays
    # holds the probabilities obtained from the central node itself, while the
    # following prob_arrays are obtained from neighbors. We may weight the
    # prob_array from a central node differently from the prob_arrays from
    # neighbors, but we do not differentiate between prob_arrays from
    # neighbors, i.e., they are all assigned equal weights when aggregating.
    prob_array_dict = {}

    if prob_array_mode == 'gen':
        for [node_id, name, ageGroup] in centralNodes:
            # Init the slot
            prob_array_dict[node_id] = []

            # We create a list of names where the first element is my name
            # and the following are neighbors' names.
            all_names = neighborNames[node_id]
            all_names.insert(0, name)

            for name in all_names:
                firstname = name.split(' ')[0]
                prob_hash = probHashInGroupForName(firstname)
                prob_array = [prob_hash[i] for i in range(4)]
                prob_array_dict[node_id].append(prob_array)

        print "Finished getting prob_arrays."

        probfile = 'prob_array_dict.json'
        probout = open(probfile, 'w')
        probout.write(json.dumps(prob_array_dict))
        probout.close()
        print "Wrote the prob_array_dict info to the file " + probfile
    else:
        probfile = 'prob_array_dict.json'
        probin = open(probfile)
        prob_array_dict_str = json.loads(probin.read())
        for key, value in prob_array_dict_str.iteritems():
            prob_array_dict[int(key)] = value
        probin.close()
        print "Read the prob_array_dict info from the file " + probfile

    # Probably the final step is to aggregate the prob_arrays collected for
    # each of the central nodes. We use the map built-in function to aggregate
    # the probabilities, and we need aggregation functions that work on a list
    # of prob_arrays and return a predicted age group.

    # define more aggregator functions here (a small standalone sketch of
    # these aggregations appears after this function).
    def average_aggregator(prob_array_list):
        def array_sum(x, y):
            return [x[i] + y[i] for i in range(len(x))]
        summed_array = reduce(array_sum, prob_array_list)
        return (summed_array.index(max(summed_array))
                if sum(summed_array) != 0 else -1)

    def local_aggregator(prob_array_list):
        local = prob_array_list[0]
        return (local.index(max(local)) if sum(local) != 0 else -1)

    def majority_vote_aggregator(prob_array_list):
        predicted = []
        for prob_array in prob_array_list:
            if sum(prob_array) != 0:
                predicted.append(prob_array.index(max(prob_array)))
        return (max(set(predicted), key=predicted.count)
                if len(predicted) != 0 else -1)

    def weighted_average_aggregator(prob_array_list):
        weight = 2.0
        def array_sum(x, y):
            return [x[i] + y[i] for i in range(len(x))]
        def weighted_array_sum(x, y, weight):
            return [x[i] * weight + y[i] for i in range(len(x))]
        summed_array = reduce(array_sum, prob_array_list)
        summed_array = weighted_array_sum(prob_array_list[0], summed_array,
                                          weight)
        return (summed_array.index(max(summed_array))
                if sum(summed_array) != 0 else -1)

    # using the factory pattern to return a function that uses a specific
    # aggregator
    def reducer_factory(aggregator):
        def reducer(t):
            '''Key of t is node_id, and the value is the list of prob_arrays'''
            key = t[0]      # must be a node_id
            value = t[1]    # must be a list of prob_arrays
            aggregated = aggregator(value)
            return (key, aggregated)
        return reducer

    # predicted_dict has the following structure:
    # { node0: predicted age group,
    #   node1: predicted age group,
    #   ... }
    predicted_dict = dict(map(reducer_factory(average_aggregator),
                              prob_array_dict.iteritems()))
    print "Finished generating predicted age groups."

    # Now validate the result against the true values contained in the
    # variable `centralNodes`.

    # the number of cases where the db does not hold the first name
    non_predictable_count = 0

    # confusion matrix
    confusion_mat = []
    for i in range(4):
        confusion_mat.append([0, 0, 0, 0])

    for [node_id, name, ageGroup] in centralNodes:
        firstname = name.split(' ')[0]
        predictGroup = predicted_dict[node_id]

        if predictGroup == -1:
            non_predictable_count += 1
        else:
            confusion_mat[ageGroup][predictGroup] += 1

    # Report the result

    # report the accuracy
    nu = sum([confusion_mat[i][i] for i in range(4)])
    denom = sum([confusion_mat[i][j] for i in range(4) for j in range(4)])
    accuracy = float(nu) / float(denom)
    real_accuracy = (float(nu + non_predictable_count * 0.25)
                     / float(denom + non_predictable_count))

    print "Total examples: " + str(len(centralNodes))
    print "Accuracy: " + str(accuracy)
    print "Real accuracy: " + str(real_accuracy)

    # report the confusion matrix
    print "Confusion Matrix:"
    for i in range(4):
        for j in range(4):
            sys.stdout.write(str(confusion_mat[i][j]) + ' ')
        print
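# Illustrative, self-contained sketch (made-up numbers, not real data) of how
# the aggregators defined inside procedure2() combine probability arrays. The
# first array plays the role of the central node, the rest of neighbors; the
# argmax logic mirrors average_aggregator and majority_vote_aggregator above.
def _demo_aggregation():
    prob_array_list = [
        [0.1, 0.6, 0.2, 0.1],   # central node
        [0.4, 0.3, 0.2, 0.1],   # neighbor 1
        [0.0, 0.5, 0.3, 0.2],   # neighbor 2
    ]

    # simple average: element-wise sum, then argmax (group 1 here)
    summed = [sum(col) for col in zip(*prob_array_list)]
    print "average aggregation -> group", summed.index(max(summed))

    # majority vote: per-array argmax, then the most common vote (group 1 here)
    votes = [arr.index(max(arr)) for arr in prob_array_list if sum(arr) != 0]
    print "majority vote -> group", max(set(votes), key=votes.count)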
def run():
    # Set the output location, and open the data file.
    datadir = "/home/pyongjoo/workspace/twitter-research/data/"
    datafile = datadir + "features-Jun19-5n-small.nf"
    out = open(datafile, 'w')

    con = mdb.connect('localhost', 'yongjoo', 'Fgla4Zp0', 'yongjoo')

    with con:
        cur = con.cursor()

        # Retrieve the central nodes by connecting to MySQL.
        # In order to apply the English filter, we perform a join with the
        # users table. The way to apply the English filter is described in
        # `engFlag.py`.
        centralNodes = []

        cur.execute('''SELECT DISTINCT e.this
                       FROM edges e INNER JOIN users u ON e.this = u.user_id
                       WHERE u.eng = 1''')
        numrows = int(cur.rowcount)

        for i in range(numrows):
            row = cur.fetchone()
            centralNodes.append(row[0])

        # perform random sampling to limit the size of the dataset
        climit = 500
        if len(centralNodes) > climit:
            centralNodes = random.sample(centralNodes, climit)

        print "Finished collecting central nodes."

        # Based on the central nodes retrieved, we build a dictionary
        # representation of the star-shaped networks.
        # We (randomly) sample at most `nlimit` neighbors for each central
        # user. The collected dictionary looks like:
        # {
        #   user_id: [...],
        #   user_id: [...],
        # }
        #
        # As above, we also perform a join to apply the English filter.
        neighborDic = {}
        nlimit = 5

        for node in centralNodes:
            neighbors = []

            cur.execute('''SELECT e.this, e.that
                           FROM edges e INNER JOIN users u
                               ON e.that = u.user_id
                           WHERE e.this = %s AND u.eng = 1
                           LIMIT %s''', (node, nlimit))
            numrows = int(cur.rowcount)

            for i in range(numrows):
                row = cur.fetchone()
                friend = row[1]
                neighbors.append(friend)

            neighborDic[node] = neighbors

        print "Finished collecting neighbor nodes."

        # We should do two things here.
        #
        # First is to convert the text (tweets) stored in mysql into a
        # feature array and print it into the data file. Both the observed
        # nodes and their friends (effectively hidden nodes) should be
        # written to the file.
        #
        # Second, as we are writing the feature representation into the file,
        # we also record the user_id to get the index (or order) in which the
        # node is written. This is to construct the edges section of the data
        # file later. In the edges section, we need the converted line order
        # instead of the user_id, the original identifier in `neighborDic`.
        featureManager = FeatureManager()

        convertedNeighborDic = {}
        datalineNum = 0
        nodeIdToLineNum = {}    # nodeIdToLineNum[node_id] = line number
        nodeIdWritten = []      # record written node_id in order

        # first write central nodes
        out.write("#observed " + str(len(centralNodes)) + "\n")

        for node in centralNodes:
            # register for indexing to locate this node later
            convertedNeighborDic[datalineNum] = []
            nodeIdToLineNum[node] = datalineNum

            # get the text from mysql
            [text, age] = getTextAndAgeWithUserId(node)
            age_group = ageToAgeGroup(age)

            # output class
            out.write(str(age_group))

            # output labels and values
            farr = featureManager.convertTextIntoFeatureArray(text)
            for pair in farr:
                flabel = pair[0]
                fvalue = pair[1]
                out.write(" " + str(flabel) + ":" + str(fvalue))
            out.write('\n')

            datalineNum += 1
            nodeIdWritten.append(node)

        print "Finished writing central nodes."

        # write friend nodes after counting the total number of friends
        numFriends = 0
        for node in centralNodes:
            numFriends += len(neighborDic[node])

        out.write("#hidden " + str(numFriends) + "\n")

        for node in centralNodes:
            for friend in neighborDic[node]:
                # get the text from mysql
                [text, age] = getTextAndAgeWithUserId(friend)

                # output a dummy class
                out.write(str(0))

                # output labels and values
                farr = featureManager.convertTextIntoFeatureArray(text)
                for pair in farr:
                    flabel = pair[0]
                    fvalue = pair[1]
                    out.write(" " + str(flabel) + ":" + str(fvalue))
                out.write('\n')

                # register in the converted neighbor dictionary
                convertedNeighborDic[nodeIdToLineNum[node]].append(datalineNum)

                datalineNum += 1
                nodeIdWritten.append(friend)

        print "Finished writing friend nodes."

        # The next section writes the edges collected, based on the line
        # numbers. This is simply printing out the contents of
        # `convertedNeighborDic` in the following fashion:
        #
        # [this node] [that node]
        # ...
        #
        # Different from the dictionary representation, we write the contents
        # in a relational format so that the C++ code can process them easily.
        out.write("#edges " + str(numFriends) + "\n")

        for node, friend_arr in convertedNeighborDic.iteritems():
            for friend in friend_arr:
                out.write(str(node) + " " + str(friend) + "\n")

        print "Finished writing edges."

        # From the first name (almost guessed, but with high probability) we
        # can restore the prior probability to some extent. We first retrieve
        # the first name from the database, and use the baby name statistics
        # to get the probability.
        out.write("#prior " + str(len(nodeIdWritten)) + "\n")

        for node in nodeIdWritten:
            cur.execute("SELECT name FROM users WHERE user_id = %s", (node,))
            row = cur.fetchall()[0]
            firstname = row[0].split(' ')[0]

            prob_hash = probHashInGroupForName(firstname)
            for i in range(4):
                out.write(str(prob_hash[i]) + " ")
            out.write("\n")

        print "Finished writing name probability."
    ## end of `with con:` (no longer use the database)

    # Close the data file to safely store the result.
    out.close()
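# For reference (the counts and feature values below are made-up, shown only
# to document the layout), the data file emitted by run() has four sections
# in this order:
#
#   #observed <number of central nodes>
#   <age_group> <feature>:<value> <feature>:<value> ...   (one line per node)
#   #hidden <number of friend nodes>
#   0 <feature>:<value> ...                               (dummy class 0)
#   #edges <number of friend nodes>
#   <central line number> <friend line number>
#   #prior <number of all written nodes>
#   <p0> <p1> <p2> <p3>
#
# Edge lines reference the order in which nodes were written (via
# nodeIdToLineNum), not the original user_ids, and each prior line holds the
# four name-based age group probabilities for the node written on the
# corresponding line.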