def combine(filename1, filename2):
    """Merge two edge-list CSV files into combinedEdges.csv.

    Both inputs are expected in ``User1,User2,ComFreq`` format with a
    header row (which is discarded). Edges appearing in both files have
    their frequencies summed.

    Fixes vs. original: removed a leftover debug ``print``; replaced the
    two zero-init passes plus two accumulation passes with a single
    ``dict.get`` accumulation; writes the output file in one ``with``
    block instead of opening it twice ('w' then 'a').
    """
    list1 = FileFunc.read_file_into_list(filename1)
    list2 = FileFunc.read_file_into_list(filename2)

    combined = {}
    # Skip each file's header row and sum frequencies per (User1, User2) key.
    for line in list1[1:] + list2[1:]:
        user1, user2, freq = line.split(',')
        key = (user1, user2)
        combined[key] = combined.get(key, 0) + int(freq)

    with open('combinedEdges.csv', 'w') as f:
        f.write('User1,User2,ComFreq\n')
        for (user1, user2), freq in combined.items():
            f.write(user1 + ',' + user2 + ',' + str(freq) + '\n')
def __init__(self):
    """Load the slang dictionary and word-list resources used for filtering.

    NOTE(review): ``os.chdir("data")`` followed by ``"../data/..."`` paths
    looks redundant — presumably it resolves to the same directory; confirm
    against the caller's working directory.
    """
    os.chdir("data")
    # slang data from https://floatcode.wordpress.com/tag/dataset/
    with open("../data/slang.txt", 'rt') as slang_file:
        slang_rows = csv.reader(slang_file, delimiter='`', quoting=csv.QUOTE_NONE)
        self.slang = [entry[0] for entry in slang_rows]
    # Each remaining resource is a plain one-term-per-line word list.
    load = FileFunc.read_file_into_list
    self.intensifiers = load("../data/intensifier.txt")
    self.exclamations = load("../data/exclamatoryWord.txt")
    self.modalVerbs = load("../data/modalVerb.txt")
    self.pronouns = load("../data/pronoun.txt")
    self.emoticons = load("../data/emoticon.txt")
def split(filename):
    """Read a tweet CSV and return [{'user': ..., 'tweet': tokens}, ...].

    Each line is ``user,tweet-text``; the text is tokenized with the
    project's ``simpleTokenize``.

    Fixes vs. original: the ``filename`` parameter was ignored and
    ``'tweets.csv'`` was hardcoded — it now reads the requested file
    (backward-compatible: the only visible caller passes 'tweets.csv').
    Lines are also split on the *first* comma only, so tweet text
    containing commas is no longer truncated.
    """
    tweets = FileFunc.read_file_into_list_unicode(filename)
    data = []
    for line in tweets:
        user, _, text = line.partition(',')
        data.append({'user': user, 'tweet': simpleTokenize(text)})
    return data
def parse():
    """Build a hashtag co-occurrence edge list and write it to edge.csv.

    Two users share an edge whose weight (ComFreq) is the number of
    hashtags both have used.

    Fixes vs. original: the nested ``user1``/``user2`` loops visited each
    unordered pair twice ((a, b) and (b, a)), doubling every ComFreq —
    pairs are now enumerated once. Also removed the unused ``users`` list,
    the redundant separate sorting pass, and the double open of edge.csv.
    """
    hashtags = FileFunc.read_file_into_list('allhashtags.csv')  # gather hashtags
    tweets = split('tweets.csv')  # parse the tweet data
    tweets.pop(0)  # remove header

    # Map each hashtag to the unique users whose tweets contain it.
    tagdict = {hashtag: [] for hashtag in hashtags}
    for tweet in tweets:
        for hashtag in hashtags:
            if hashtag in tweet['tweet'] and tweet['user'] not in tagdict[hashtag]:
                tagdict[hashtag].append(tweet['user'])

    # For every hashtag, count each unordered user pair exactly once.
    edgemap = {}
    for tag_users in tagdict.values():
        ordered = sorted(tag_users)
        for i, user1 in enumerate(ordered):
            for user2 in ordered[i + 1:]:
                key = (user1, user2)
                edgemap[key] = edgemap.get(key, 0) + 1

    with open('edge.csv', 'w') as f:
        f.write('User1,User2,ComFreq\n')
        for (user1, user2), freq in edgemap.items():
            f.write(user1 + ',' + user2 + ',' + str(freq) + '\n')
# NOTE(review): this chunk begins mid-method — the enclosing `def` header is
# outside this view. From the call site below (`personals.filter(term, rules)`)
# it is presumably `PersonalTweetClassifier.filter(self, term, rules)`; these
# are its final rule checks. A term is "personal" if any enabled rule matches.
if rules['repeatedCharacters'] and self.repeated_character_rule(term):
    return True
# Slang check skips single characters; membership list loaded in __init__.
if rules['slang'] and len(term) > 1 and term in self.slang:
    return True
# ALL-CAPS check only for terms longer than 3 chars (avoids acronyms like "USA"
# being too noisy — TODO confirm intent).
if rules['caps'] and len(term) > 3 and term.isupper():
    return True
return False

if __name__ == '__main__':
    personals = PersonalTweetClassifier()
    # __init__ chdir'd into "data"; step back out so relative paths resolve.
    os.chdir("..")
    # Enable every classification rule.
    rules = {'intensifiers': True, 'exclamations': True, 'modalVerbs': True,
             'emoticons': True, 'repeatedCharacters': True, 'slang': True,
             'caps': True}
    count = 0
    # A tweet counts as "personal" if any single term passes the filter;
    # `break` stops scanning the rest of that tweet's terms.
    for tweet in FileFunc.read_file_into_list("../test_data.txt"):
        for term in tweet.split():
            if personals.filter(term, rules):
                print(tweet)
                count += 1
                break
    print(count, "'personal' tweets found")

# latent factor model http://ijcai.org/papers15/Papers/IJCAI15-322.pdf
# http://dl.acm.org/citation.cfm?id=1858724
# VADER http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf