def test_english_recognize(self):
    """Verify pattern.isEnglish() on real collected tweets.

    Reads a hard-coded .jsonarr file (one tweet JSON object per line),
    aggregates tweet texts for two known users, and asserts that the
    non-English user's aggregated text fails the language check while
    the English user's text passes it.
    """
    non_eng_user = '******'
    eng_user = '******'
    # a set gives O(1) membership tests (original used a list)
    test_user_set = set([non_eng_user, eng_user])
    dataroot = "/Users/yongjoo/workspace/tweets_process/data/"
    inputfile = dataroot + "tweetsRetrieved-May10-AlmostVerified.jsonarr"

    # collect tweets here; key is screen_name, value is a list of tweet
    # texts — joined once at the end to avoid quadratic += concatenation
    collectedTweets = {}
    # 'with' guarantees the handle is closed (original leaked it)
    with open(inputfile) as fin:
        for line in fin:
            j = json.loads(line)
            screen_name = j['user']['screen_name']
            text = j['text']
            if screen_name in test_user_set:
                collectedTweets.setdefault(screen_name, []).append(text)

    # collection finished; now start test — joining with a single space
    # reproduces the original "first text, then ' ' + text" accumulation
    self.assertFalse(
        pattern.isEnglish(" ".join(collectedTweets[non_eng_user])),
        "non eng user fail")
    self.assertTrue(
        pattern.isEnglish(" ".join(collectedTweets[eng_user])),
        "eng user fail")
def run(): print "current directory: " + os.getcwd() dataroot = "../../../data/" # Collect tweets for each screen_name inputfile = dataroot + "retrievedTweets-Jun19-2.0.jsonarr" collectedTweets = collectTweets(inputfile) print "Completed collecting tweets: " + str(len(collectedTweets)) # Collect ages for each screen_name inputfile = dataroot + "ageEmbededTweets-Jun19-sampled2.0.json" collectedAges = collectAges(inputfile) print "Completed collecting age data: " + str(len(collectedAges)) # main data file to be written outputfile = dataroot + "age-tweets-Jun19-2.0.libsvm" out = open(outputfile, 'w') # related prior probabilities are written here prob_outputfile = dataroot + "age-tweets-prob-Jun19-2.0.csv" prob_out = open(prob_outputfile, 'w') # print out screen names finally used screen_name_file = dataroot + "screenname-matched-Jun19-2.0.txt" screenname_out = open(screen_name_file, 'w') # # Convert tweets into feature arrays # pruner = feature.Pruner() manager = feature.FeatureManager() non_english_count = 0 # object to get prob array from screen_name screenNameToProbArray = ScreenNameToProbArray() for screen_name, tweets in collectedTweets.iteritems(): if not pattern.isEnglish(tweets): non_english_count += 1 continue probArray = screenNameToProbArray.getProbArrayFor(screen_name) farr_text = manager.convertTextIntoFeatureArray(tweets) # farr_name = manager.convertFirstNameProbIntoFeatureArray(probArray) # farr_text.appendFeatureArray(farr_name) pruner.recordFeatureArray(screen_name, farr_text) pruner.recordProbArray(screen_name, probArray) print str(non_english_count) + " number of non-English tweets have been dropped." # instead of feature pruning based on number of counts, we use feature # selection based on information gain supported by mallet. 
# pruner.prune(5) print "Completed converting text into feature arrays" # print out a set of indexed words and their labels indexoutfile = dataroot + "age-tweets-indexedWords-Jun19-2.0.txt" indexout = open(indexoutfile, 'w') indexout.write(str(manager.indexer)) indexout.close() # Record the features in libsvm format # utilize the fact that pruner is made iterable age_group_count = {0:0, 1:0, 2:0, 3:0} for screen_name, farr in pruner: try: age = collectedAges[screen_name] except KeyError: print "Anyway retrieved key: " + screen_name continue # write feature file age_group = feature.ageToAgeGroup(age) age_group_count[age_group] += 1 out.write(str(age_group) + " ") for label, value in farr: out.write(str(label) + ":" + str(value) + " ") out.write("\n") # write name prob file prob_array = pruner.getProbArray(screen_name) for i in range(len(prob_array)): prob_out.write(str(prob_array[i])) if i != len(prob_array) - 1: prob_out.write(",") prob_out.write("\n") # write screen_name screenname_out.write(screen_name + '\n') print "Completed writing the output file to: " + outputfile print "Collected Age Groups:" for group, count in age_group_count.iteritems(): print str(group) + ": " + str(count) # Done out.close() prob_out.close() screenname_out.close() print "Done"
def run(): dataroot = "/Users/yongjoo/workspace/tweets_process/data/" # Collect tweets for each screen_name inputfile = dataroot + "tweetsRetrieved-May10-AlmostVerified.jsonarr" collectedTweets = collectTweets(inputfile) print "Completed collecting tweets: " + str(len(collectedTweets)) # Collect ages for each screen_name inputfile = dataroot + "ageEmbededTweets-May10-AlmostVerified.json" collectedAges = collectAges(inputfile) print "Completed collecting age data: " + str(len(collectedAges)) outputfile = dataroot + "age-tweets.libsvm" out = open(outputfile, 'w') # Convert tweets into feature arrays # key: screen_name # value: FeatureArray instance pruner = feature.Pruner() manager = feature.FeatureManager() for screen_name, tweets in collectedTweets.iteritems(): if not pattern.isEnglish(tweets): continue farr = manager.convertTextIntoFeatureArray(tweets) pruner.recordFeatureArray(screen_name, farr) pruner.prune(5) print "Completed converting text into feature arrays" # Record the features in libsvm format # use that pruner is made iterable for screen_name, farr in pruner: try: age = collectedAges[screen_name] except KeyError: print "Anyway retrieved key: " + screen_name continue age_group = feature.ageToAgeGroup(age) out.write(str(age_group) +" ") for label, value in farr: out.write(str(label) + ":" + str(value) + " ") out.write("\n") print "Completed writing the output file to: " + outputfile # Done out.close() print "Done"