예제 #1
0
def readToks(phrasemodel=False):
    """Load the tweet corpus and the token vocabulary from disk.

    Tweets are read from ``FILE``, one JSON document per line.  The token
    table is parsed from ``TOKENSPHRASE`` when ``phrasemodel`` is truthy and
    from ``TOKENS`` otherwise.

    Args:
        phrasemodel: When truthy, read the phrase-model token file instead
            of the plain token file.

    Returns:
        A 3-tuple ``(tokens, tweets, tweets)``: the token strings that
        precede the first table entry whose count is 1, followed by the
        list of decoded tweets (the same list object, twice).
    """
    # Use a context manager so the tweet file handle is closed as soon as
    # reading finishes instead of lingering until garbage collection.
    # Whitespace-only lines are skipped because json.loads() rejects them.
    with open(FILE, 'r', errors='ignore') as tweet_file:
        tweets = [json.loads(line) for line in tweet_file if line.strip()]

    # NOTE(review): a per-topic index (topic -> positions of tweets whose
    # lower-cased text contains one of KEYWORDS[topic]) used to be built here
    # but is disabled.  Presumably the second element of the returned tuple
    # held that index; confirm with callers before changing the tuple shape.

    tokens_pb = Tokens()
    # Only the selected constant is looked up, exactly as in a two-branch if.
    token_path = TOKENSPHRASE if phrasemodel else TOKENS
    with open(token_path, "rb") as f:
        tokens_pb.ParseFromString(f.read())

    tokens = []
    for token_pb in tokens_pb.tokens:
        # Stop at the first token that occurs only once; later entries are
        # not inspected.  Assumes the table is ordered by descending count
        # -- TODO confirm.
        if token_pb.count == 1:
            break
        tokens.append(token_pb.token)

    print("Reading counts for ", len(tokens), "tokens")
    return tokens, tweets, tweets
예제 #2
0
def readToks2(dimension, usephrasemodel=False):
    """Return the leading ``dimension`` tokens of the stored token table.

    The table is parsed from ``TOKENSPHRASE`` when ``usephrasemodel`` is
    truthy and from ``TOKENS`` otherwise.

    Args:
        dimension: Upper bound used to slice the token list
            (``tokens[:dimension]``).
        usephrasemodel: When truthy, read the phrase-model token file
            instead of the plain token file.

    Returns:
        A list with at most ``dimension`` token strings, taken from the
        tokens that precede the first table entry whose count is 1.
    """
    tokens_pb = Tokens()
    # Only the selected constant is looked up, exactly as in a two-branch if.
    token_path = TOKENSPHRASE if usephrasemodel else TOKENS
    with open(token_path, "rb") as f:
        tokens_pb.ParseFromString(f.read())

    tokens = []
    for token_pb in tokens_pb.tokens:
        # Stop at the first token that occurs only once; later entries are
        # not inspected.  Assumes the table is ordered by descending count
        # -- TODO confirm.
        if token_pb.count == 1:
            break
        tokens.append(token_pb.token)

    print("Reading counts for ", len(tokens),
          "tokens, taking most frequent ", dimension)
    return tokens[:dimension]
# Locations of the tokenised tweet dump and of the serialized token table.
TWEETS = 'C:/Users/Damilola/Documents/MSC UI/THESIS/STANCE DETECTION/IMPLEMENTATION WITH SPYDER/tokenised/all.tweets'
TOKENS = 'C:/Users/Damilola/Documents/MSC UI/THESIS/STANCE DETECTION/IMPLEMENTATION WITH SPYDER/tokenised/tokensFinal'

# Search terms associated with each topic.
keywords = {
    'clinton': ['hillary', 'clinton'],
    'obama': ['barack', 'obama'],
    'climate': ['climate'],
    'feminism': ['feminism', 'feminist'],
    'abortion': ['abortion', 'aborting'],
    'atheism': ['atheism', 'atheist'],
}

topics = keywords.keys()

# Parse the binary token table (presumably a protobuf message -- Tokens is
# defined outside this section).
tokens_pb = Tokens()
with open(TOKENS, "rb") as f:
    tokens_pb.ParseFromString(f.read())

# Collect token strings until the first entry that occurs exactly once.
tokens = []
for token_pb in tokens_pb.tokens:
    if token_pb.count != 1:
        tokens.append(token_pb.token)
    else:
        break

print(len(tokens))

# NOTE(review): unconditional exit -- the statements that follow this call
# in the file never run.  Looks like a debugging leftover; confirm.
sys.exit()

tweets_on_topic = defaultdict(list)
for topic in topics:
    for index, tweet in enumerate(tweets):
        for keyword in keywords[topic]: