def process_text(self):
    """Normalise this tweet in place: clean the text, resolve the source
    link, optionally translate, then filter."""
    cleaned = utils.clean(self.get_tweet_text())
    self.set_tweet_text(cleaned)
    raw_source = self.get_tweet_source()
    self.set_tweet_source(utils.parse_alink(raw_source))
    # Translation is opt-in; filtering always runs afterwards.
    if self.translation:
        self.detect_language_or_translate()
    self.filter_text()
def _build_keywords(result):
    """Return the tweet's tokens lowercased, sorted, space-joined and
    UTF-8 encoded.

    The original inline code called ``list(result["tokens"]).sort()``,
    which sorts a throwaway copy and discards it (``list.sort`` is
    in-place and returns None) — the tokens were never actually sorted.
    ``sorted()`` applies the intended ordering.
    """
    parts = [token.lower() for token in sorted(result["tokens"])]
    return " ".join(parts).rstrip().encode("UTF-8")


def output_tweets_in_file(self, result):
    """Emit up to four CSV-ish rows for one tweet, routed by the writer
    flags on ``self``:

    - geo row    (``self.geowrite``):   "<uid>,\"lat long\",\"keywords\""
    - user row   (``self.userwrite``):  "<uid>,<userfields...>"
    - tweet row  (``self.tweetwrite``): "<uid>,<tweetfields...>"
    - words row  (``self.wordswrite``): "\"keywords\""

    ``result`` is assumed to be a decoded Twitter-API tweet dict — TODO
    confirm against the caller.  The input dict is not mutated.
    """
    keywords = ""
    points = ""
    words = True
    # Initialised unconditionally: the write guards near the end read
    # `user`/`tweet` even when userwrite/tweetwrite are off, which
    # previously raised NameError in that configuration.
    user = ""
    tweet = ""
    uid = result['user']['id']

    if self.geo:
        if self.geowrite:
            if not type(result["geo"]).__name__ == 'NoneType':
                lat = 0.00
                lon = 0.00  # renamed from `long`, which shadows a builtin
                # Work on a copy: the original popped result["geo"]
                # ["coordinates"] in place, destroying the caller's data.
                coords = list(result["geo"]["coordinates"])
                while len(coords) != 0:
                    if len(coords) == 2:
                        lat = str(coords.pop(0))
                    else:
                        lon = str(coords.pop(0))
                points = "%s %s" % (lat, lon)

    if self.userwrite:
        if self.userfields:
            i = 0
            ttl = len(self.userfields)
            for field in self.userfields:
                if field == 'screenname' or field == 'name':
                    user += "\"%s\"" % result["user"][field].encode("UTF-8")
                elif field == 'description' or field == 'time_zone':
                    # Free-text fields: strip non-ASCII, then clean.
                    user += "\"%s\"" % utils.clean(
                        result["user"][field].encode("ASCII", "ignore"))
                elif field == 'created_at':
                    created = result["user"][field].encode("UTF-8")
                    date = parse(created)
                    created = date.strftime("%Y-%m-%d %H:%M:%S")
                    user += "\"%s\"" % created
                elif not field == 'id':
                    # Numeric-ish fields: empty string becomes "0".
                    if result["user"][field] != "":
                        user += "%s" % result["user"][field]
                    else:
                        user += "0"
                # Increment BEFORE the comparison so no comma trails the
                # last field (the original checked first, always leaving
                # a dangling comma; the tweetfields loop below shows the
                # intended pattern).
                i += 1
                if i < ttl:
                    user += ","

    if self.tweetwrite:
        if self.tweetfields:
            i = 0
            ttl = len(self.tweetfields)
            for field in self.tweetfields:
                if field == 'source':
                    source = utils.parse_alink(result[field])
                    tweet += "\"%s\"" % source.encode("UTF-8")
                elif field == 'created_at':
                    created = result[field].encode("UTF-8")
                    date = parse(created)
                    created = date.strftime("%Y-%m-%d %H:%M:%S")
                    tweet += "\"%s\"" % created
                elif field == 'tokens':
                    keywords = _build_keywords(result)
                    if keywords == "":
                        # No tokens at all: suppress the words row later.
                        words = False
                    tweet += "\"%s\"" % keywords
                elif field == 'text':
                    text = utils.clean(result[field])
                    tweet += "\"%s\"" % text.encode("UTF-8")
                elif field == 'retweet_count':
                    # "100+"-style counts lose the "+".
                    tweet += "\"%s\"" % str(
                        result[field]).encode("UTF-8").replace("+", "")
                i += 1
                if i < ttl:
                    tweet += ","

    if points != "":
        if keywords == "":
            keywords = _build_keywords(result)
        geo_data = "%s,\"%s\",\"%s\"" % (uid, points, keywords)
        self.output_data_file(self.out_geo_file, geo_data)
    if user != "":
        user_data = "%s,%s" % (uid, user)
        self.output_data_file(self.out_user_file, user_data)
    if tweet != "":
        tweets_data = "%s,%s" % (uid, tweet)
        self.output_data_file(self.out_tweets_file, tweets_data)
    if self.wordswrite:
        if words:
            if keywords == "":
                keywords = _build_keywords(result)
            words_data = "\"%s\"" % keywords
            self.output_data_file(self.out_words_file, words_data)
def output_tweets_in_file(self, result):
    """Emit up to four CSV-ish rows for one tweet (geo / user / tweet /
    words), each routed by the corresponding ``self.*write`` flag.

    NOTE(review): this is a byte-for-byte re-definition of the same
    method earlier in the file; at runtime the later definition shadows
    the earlier one.  The duplicate should be deleted — kept here only
    because removing a block is out of scope for this edit.

    ``result`` is assumed to be a decoded Twitter-API tweet dict — TODO
    confirm against the caller.  The input dict is not mutated.
    """

    def build_keywords():
        # sorted() replaces the original list(...).sort(), which sorted
        # a throwaway copy and discarded it (a no-op), so tokens were
        # never actually emitted in sorted order.
        lowered = [t.lower() for t in sorted(result["tokens"])]
        return " ".join(lowered).rstrip().encode("UTF-8")

    keywords = ""
    points = ""
    words = True
    # Bound unconditionally: the write guards below read `user`/`tweet`
    # even when userwrite/tweetwrite are off, which previously raised
    # NameError in that configuration.
    user = ""
    tweet = ""
    uid = result['user']['id']

    if self.geo:
        if self.geowrite:
            if not type(result["geo"]).__name__ == 'NoneType':
                lat = 0.00
                lon = 0.00  # renamed from `long`, which shadows a builtin
                # Pop from a copy; the original mutated the caller's
                # result["geo"]["coordinates"] list.
                coords = list(result["geo"]["coordinates"])
                while len(coords) != 0:
                    if len(coords) == 2:
                        lat = str(coords.pop(0))
                    else:
                        lon = str(coords.pop(0))
                points = "%s %s" % (lat, lon)

    if self.userwrite:
        if self.userfields:
            i = 0
            ttl = len(self.userfields)
            for field in self.userfields:
                if field == 'screenname' or field == 'name':
                    user += "\"%s\"" % result["user"][field].encode("UTF-8")
                elif field == 'description' or field == 'time_zone':
                    user += "\"%s\"" % utils.clean(
                        result["user"][field].encode("ASCII", "ignore"))
                elif field == 'created_at':
                    created = result["user"][field].encode("UTF-8")
                    date = parse(created)
                    created = date.strftime("%Y-%m-%d %H:%M:%S")
                    user += "\"%s\"" % created
                elif not field == 'id':
                    # Numeric-ish fields: empty string becomes "0".
                    if result["user"][field] != "":
                        user += "%s" % result["user"][field]
                    else:
                        user += "0"
                # Increment before comparing so the last field gets no
                # trailing comma (the original compared first, always
                # leaving a dangling comma; the tweetfields loop shows
                # the intended pattern).
                i += 1
                if i < ttl:
                    user += ","

    if self.tweetwrite:
        if self.tweetfields:
            i = 0
            ttl = len(self.tweetfields)
            for field in self.tweetfields:
                if field == 'source':
                    source = utils.parse_alink(result[field])
                    tweet += "\"%s\"" % source.encode("UTF-8")
                elif field == 'created_at':
                    created = result[field].encode("UTF-8")
                    date = parse(created)
                    created = date.strftime("%Y-%m-%d %H:%M:%S")
                    tweet += "\"%s\"" % created
                elif field == 'tokens':
                    keywords = build_keywords()
                    if keywords == "":
                        # No tokens: suppress the words row later.
                        words = False
                    tweet += "\"%s\"" % keywords
                elif field == 'text':
                    text = utils.clean(result[field])
                    tweet += "\"%s\"" % text.encode("UTF-8")
                elif field == 'retweet_count':
                    # "100+"-style counts lose the "+".
                    tweet += "\"%s\"" % str(
                        result[field]).encode("UTF-8").replace("+", "")
                i += 1
                if i < ttl:
                    tweet += ","

    if points != "":
        if keywords == "":
            keywords = build_keywords()
        geo_data = "%s,\"%s\",\"%s\"" % (uid, points, keywords)
        self.output_data_file(self.out_geo_file, geo_data)
    if user != "":
        user_data = "%s,%s" % (uid, user)
        self.output_data_file(self.out_user_file, user_data)
    if tweet != "":
        tweets_data = "%s,%s" % (uid, tweet)
        self.output_data_file(self.out_tweets_file, tweets_data)
    if self.wordswrite:
        if words:
            if keywords == "":
                keywords = build_keywords()
            words_data = "\"%s\"" % keywords
            self.output_data_file(self.out_words_file, words_data)