import json
import sys
from collections import defaultdict

# `Tokens` is the protobuf message type generated from the project's .proto
# definition; the exact module name is an assumption here.
from tokens_pb2 import Tokens


def readToks(phrasemodel=False):
    """Read tweets (one JSON object per line) and the serialised token vocabulary."""
    tweets = []
    for line in open(FILE, 'r', errors='ignore'):
        tweets.append(json.loads(line))

    #tweets_on_topic = defaultdict(list)
    #for topic in TOPICS:
    #    for index, tweet in enumerate(tweets):
    #        for keyword in KEYWORDS[topic]:
    #            if keyword in tweet['text'].lower():
    #                tweets_on_topic[topic].append(index)
    #                break

    tokens_pb = Tokens()
    if not phrasemodel:
        with open(TOKENS, "rb") as f:
            tokens_pb.ParseFromString(f.read())
    else:
        with open(TOKENSPHRASE, "rb") as f:
            tokens_pb.ParseFromString(f.read())

    tokens = []
    for token_pb in tokens_pb.tokens:
        # Tokens are stored in descending frequency order; stop at the first
        # token that occurs only once to drop all singletons.
        if token_pb.count == 1:
            break
        tokens.append(token_pb.token)

    print("Reading counts for", len(tokens), "tokens")
    return tokens, tweets, tweets
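# A minimal usage sketch for readToks. It assumes FILE, TOKENS and
# TOKENSPHRASE are configured elsewhere in this module; the variable names
# below are illustrative only:
#
#   tokens, tweets, _ = readToks(phrasemodel=False)
#   print(len(tokens), "vocabulary tokens;", len(tweets), "tweets loaded")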
def readToks2(dimension, usephrasemodel=False):
    """Read the token vocabulary and keep only the `dimension` most frequent tokens."""
    tokens_pb = Tokens()
    if not usephrasemodel:
        with open(TOKENS, "rb") as f:
            tokens_pb.ParseFromString(f.read())
    else:
        with open(TOKENSPHRASE, "rb") as f:
            tokens_pb.ParseFromString(f.read())

    tokens = []
    for token_pb in tokens_pb.tokens:
        # As in readToks, stop at the first singleton token.
        if token_pb.count == 1:
            break
        tokens.append(token_pb.token)

    print("Reading counts for", len(tokens), "tokens, taking most frequent", dimension)
    return tokens[:dimension]
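# Sketch of how the truncated vocabulary from readToks2 could feed a simple
# bag-of-words feature vector. The dimension of 5000, the use of numpy, and
# tokenising with str.split() are illustrative assumptions, not part of the
# original pipeline:
#
#   import numpy as np
#   vocab = readToks2(5000)
#   index = {tok: i for i, tok in enumerate(vocab)}
#
#   def bow(text):
#       vec = np.zeros(len(vocab))        # one slot per vocabulary token
#       for w in text.lower().split():    # naive whitespace tokenisation
#           if w in index:
#               vec[index[w]] += 1
#       return vec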
TWEETS = 'C:/Users/Damilola/Documents/MSC UI/THESIS/STANCE DETECTION/IMPLEMENTATION WITH SPYDER/tokenised/all.tweets'
TOKENS = 'C:/Users/Damilola/Documents/MSC UI/THESIS/STANCE DETECTION/IMPLEMENTATION WITH SPYDER/tokenised/tokensFinal'

keywords = {'clinton': ['hillary', 'clinton'],
            'obama': ['barack', 'obama'],
            'climate': ['climate'],
            'feminism': ['feminism', 'feminist'],
            'abortion': ['abortion', 'aborting'],
            'atheism': ['atheism', 'atheist']}
topics = keywords.keys()

tokens_pb = Tokens()
with open(TOKENS, "rb") as f:
    tokens_pb.ParseFromString(f.read())

tokens = []
for token_pb in tokens_pb.tokens:
    if token_pb.count == 1:
        break
    tokens.append(token_pb.token)

print(len(tokens))
#sys.exit()  # debugging stop; commented out so the topic index below is built

# Load the tweets so they can be indexed by topic (one JSON object per line,
# mirroring readToks above).
tweets = []
for line in open(TWEETS, 'r', errors='ignore'):
    tweets.append(json.loads(line))

tweets_on_topic = defaultdict(list)
for topic in topics:
    for index, tweet in enumerate(tweets):
        for keyword in keywords[topic]:
            if keyword in tweet['text'].lower():
                tweets_on_topic[topic].append(index)
                break
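# Quick sanity check on the topic index built above (illustrative only):
#
#   for topic in topics:
#       print(topic, "->", len(tweets_on_topic[topic]), "matching tweets")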