Example #1
def combine(filename1, filename2):
    list1 = FileFunc.read_file_into_list(filename1)

    list2 = FileFunc.read_file_into_list(filename2)

    # print(list1)

    combinedMap = {}

    # drop the header rows, then give every (User1, User2) pair from either
    # file an entry before the counts are summed
    list1.pop(0)
    for e in list1:
        tokens = e.split(',')
        combinedMap[(tokens[0], tokens[1])] = 0

    list2.pop(0)
    for e in list2:
        tokens = e.split(',')
        combinedMap[(tokens[0], tokens[1])] = 0


    # second pass: accumulate the frequency column (tokens[2]) from both files
    for e in list1:
        tokens = e.split(',')
        combinedMap[(tokens[0], tokens[1])] += int(tokens[2])

    for e in list2:
        tokens = e.split(',')
        combinedMap[(tokens[0], tokens[1])] += int(tokens[2])

    # write the merged edge list
    with open('combinedEdges.csv', 'w') as f:
        f.write('User1,User2,ComFreq\n')
        for (user1, user2), freq in combinedMap.items():
            f.write(str(user1) + ',' + str(user2) + ',' + str(freq) + '\n')
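
Every example in this file leans on a FileFunc helper module that is not shown. A minimal sketch of the two readers, inferred from how they are called here (the implementations are assumptions, not the original module):

import codecs

def read_file_into_list(filename):
    # assumed contract: return the file as a list of line strings
    with open(filename, 'r') as f:
        return [line.rstrip('\n') for line in f]

def read_file_into_list_unicode(filename):
    # assumed contract: same, but decode the file explicitly as UTF-8
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]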
Example #2
    def __init__(self):
        os.chdir("data")

        # slang data from https://floatcode.wordpress.com/tag/dataset/
        with open("../data/slang.txt", 'rt') as f:
            reader = csv.reader(f, delimiter='`', quoting=csv.QUOTE_NONE)
            self.slang = [row[0] for row in reader]  # keep only the slang term (first `-separated field)

        self.intensifiers = FileFunc.read_file_into_list("../data/intensifier.txt")
        self.exclamations = FileFunc.read_file_into_list("../data/exclamatoryWord.txt")
        self.modalVerbs = FileFunc.read_file_into_list("../data/modalVerb.txt")
        self.pronouns = FileFunc.read_file_into_list("../data/pronoun.txt")
        self.emoticons = FileFunc.read_file_into_list("../data/emoticon.txt")
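
The backtick delimiter in Example #2 implies each line of slang.txt holds a slang term and its expansion separated by a backquote, with only the term kept. A hypothetical two-line excerpt consistent with that parsing (not the actual file contents):

lol`laughing out loud
brb`be right back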
Example #3
def split(filename):
    tweets = FileFunc.read_file_into_list_unicode(filename)

    data = []
    for tweet in tweets:
        tmp = tweet.split(',', 1)  # split on the first comma only, so commas inside the tweet text survive
        data.append({'user': tmp[0], 'tweet': simpleTokenize(tmp[1])})

    return data
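
simpleTokenize is used here but never defined. Because parse() below checks whether a hashtag appears among the tokens of tweet['tweet'], the tokenizer must keep '#'-prefixed terms intact; a minimal stand-in under that assumption:

def simpleTokenize(text):
    # hypothetical tokenizer: split on whitespace, keeping #hashtags and
    # @mentions as single tokens; the real implementation is not shown
    return text.split()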
Example #4
def parse():
    hashtags = FileFunc.read_file_into_list('allhashtags.csv') #gather hashtags

    tweets = split('tweets.csv') #parse the tweet data
    tweets.pop(0) #remove header


    users = [] #collect all unique users
    for tweet in tweets:
        if tweet['user'] not in users:
            users.append(tweet['user'])
    # print(users)
    # print(tweets)

    tagdict = {}
    for hashtag in hashtags: #initialize lists
        tagdict[hashtag] = []

    for tweet in tweets: #check all tweets, hashtags and map users to hashtags
        for hashtag in hashtags:
            if hashtag in tweet['tweet']:
                if tweet['user'] not in tagdict[hashtag]:
                    tagdict[hashtag].append(tweet['user'])
                # print(tweet) #add user to hashtag dictionary

    edgemap = {}

    # sort each hashtag's user list so that pair keys come out in canonical order
    for hashtag in tagdict:
        if len(tagdict[hashtag]) > 1:
            tagdict[hashtag].sort()

    # count how many hashtags each pair of users shares; iterating with i < j
    # visits each unordered pair exactly once per hashtag
    for hashtag in tagdict:
        taggers = tagdict[hashtag]
        if len(taggers) > 1:
            for i, user1 in enumerate(taggers):
                for user2 in taggers[i + 1:]:
                    key = (user1, user2)  # already ordered because taggers is sorted
                    if key in edgemap:
                        edgemap[key] += 1
                    else:
                        edgemap[key] = 1

    # print(edgemap)
    # print(tagdict)
    with open('edge.csv', 'w') as f:
        f.write('User1,User2,ComFreq\n')
        for (user1, user2), freq in edgemap.items():
            f.write(str(user1) + ',' + str(user2) + ',' + str(freq) + '\n')
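
Examples #1, #3, and #4 form a small pipeline: parse() builds edge.csv from the raw tweets, and combine() can then merge it with a second edge file in the same format. A hypothetical run (the second filename is made up for illustration):

parse()                                  # writes edge.csv
combine('edge.csv', 'retweetEdges.csv')  # writes combinedEdges.csv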
Example #5
    def filter(self, term, rules):
        # signature inferred from the personals.filter(term, rules) call below;
        # the first four checks are reconstructed to mirror the visible ones
        if rules['intensifiers'] and term in self.intensifiers:
            return True
        if rules['exclamations'] and term in self.exclamations:
            return True
        if rules['modalVerbs'] and term in self.modalVerbs:
            return True
        if rules['emoticons'] and term in self.emoticons:
            return True

        if rules['repeatedCharacters'] and self.repeated_character_rule(term):
            return True

        if rules['slang'] and len(term) > 1 and term in self.slang:
            return True

        if rules['caps'] and len(term) > 3 and term.isupper():
            return True

        return False
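
The repeated_character_rule method is referenced in filter but not shown. A plausible stand-in, assuming re is imported at module level and that three or more repeats of a character signal elongation (both are assumptions, not the author's code):

    def repeated_character_rule(self, term):
        # hypothetical: flag "soooo"-style elongated words, a common marker of
        # informal, personal tweets; the three-repeat threshold is an assumption
        return re.search(r'(.)\1{2,}', term) is not None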

if __name__ == '__main__':
    personals = PersonalTweetClassifier()
    os.chdir("..")

    rules = {'intensifiers': True, 'exclamations': True, 'modalVerbs': True,
             'emoticons': True, 'repeatedCharacters': True, 'slang': True, 'caps': True}
    count = 0

    for tweet in FileFunc.read_file_into_list("../test_data.txt"):
        for term in tweet.split():
            if personals.filter(term, rules):
                print(tweet)
                count += 1
                break
                
    print(count, "'personal' tweets found")

# latent factor model http://ijcai.org/papers15/Papers/IJCAI15-322.pdf
# http://dl.acm.org/citation.cfm?id=1858724
# VADER http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf