def gen_tags(tweet, users, geo=None):
    if not geo:
        geo = geocode.Geo()
    tags = []
    # geocode the tweet and tag with city/country when available
    city, country = geo.geo_normalize(tweet, users)
    ctag = None
    if city:
        tags += ["city:" + city]
    if country:
        ctag = "country:" + country
        tags += [ctag]
    # fuzzy-match normalized tweet words against the flu keyword list
    text = utils.normalize_str(tweet['interaction_content']).split(" ")
    for word in text:
        for f in flu_words:
            if f == "tos" and word == f:
                tags += [f]
            elif Levenshtein.ratio(word, f) >= .75:
                tags += [f]
    tags = list(set(tags))
    # pair every non-country tag with the country tag as well
    if ctag:
        tags += [tag + " " + ctag
                 for tag in tags if not tag.startswith("country:")]
    return tags
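
# Usage sketch (illustrative only; the tweet payload and empty user list below
# are hypothetical, and the real fields expected by geocode.Geo().geo_normalize()
# may differ):
#   >>> tweet = {'interaction_content': u'tengo fiebre y mucha tos hoy'}
#   >>> gen_tags(tweet, users=[])
#   # returns matched flu-word tags, plus 'city:...'/'country:...' tags and
#   # '<word> country:...' combinations when geocoding succeeds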
def resolve_conflict(mena_geo, la_geo):
    if (not isempty(la_geo['city']) and
            normalize_str(la_geo['city']) == normalize_str(mena_geo['country'])):
        return mena_geo
    la_policy_num = LA_POLICY.index(la_geo['source'].split(",")[0])
    mena_policy_num = MENA_POLICY.index(mena_geo['source'].split(",")[0])
    la_policy_weight = round(float(la_policy_num) / float(len(LA_POLICY)), 1)
    mena_policy_weight = round(float(mena_policy_num) / float(len(MENA_POLICY)), 1)
    if la_policy_weight == mena_policy_weight:
        # On a tie, the coarser-level warning (the one without a city) is
        # usually best, or the warning with the higher population.
        if isempty(la_geo['city']) and not isempty(mena_geo['city']):
            return la_geo
        elif isempty(mena_geo['city']) and not isempty(la_geo['city']):
            return mena_geo
        else:
            return la_geo
    else:
        if la_policy_weight < mena_policy_weight:
            return la_geo
        else:
            return mena_geo
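
# Usage sketch (illustrative only; the 'source' strings and dict fields below
# are assumptions -- the real values must be the first entries of the actual
# LA_POLICY / MENA_POLICY lists):
#   >>> la = {'city': '', 'country': 'egypt', 'source': 'geocoder,meta'}
#   >>> mena = {'city': 'cairo', 'country': 'egypt', 'source': 'geocoder,meta'}
#   >>> resolve_conflict(mena, la)
#   # on a tie in policy weight, the coarser (no-city) warning is returned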
def processTweet(self, jsonTweet):
    try:
        text = jsonTweet["interaction"]["content"]
        text = re.sub("[^A-Za-z_@#0-9]", " ", normalize_str(text))
        searchResult = self.rePattern.findall(text)
        candidateList = []
        if searchResult:
            self.tweetList.append(jsonTweet["embersId"])
            for term in searchResult:
                for candidate in self.aliasJson[term]:
                    candidateList.append(candidate)
            log.debug("matched aliases---> %s" % candidateList)
        return set(candidateList)
    except Exception, e:
        raise e
def execute(arg):
    logs.init(arg)
    fromDate = datetime.strptime(arg.fromDate, "%d %b %Y")
    toDate = datetime.strptime(arg.toDate, "%d %b %Y")
    tweetFolder = arg.tweetFolder
    country = arg.country

    hashTagCounts = {}
    uids = {}

    # loading twitter handles from a file
    with open(arg.seedFile, 'r') as _file:
        for line in _file:
            handle, candidate = line.strip().split(',')
            if candidate not in uids:
                uids[candidate] = []
                hashTagCounts[candidate] = {}
                uids[candidate].append(handle.lower())
            else:
                uids[candidate].append(handle.lower())

    # for geolocation
    geo = Geo()

    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], '%Y-%m-%d')
        if (fileDate >= fromDate and fileDate < toDate):
            log.info("processing file %s" % (_file))
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet['interaction']['created_at'][5:16]
                            tweetDate = datetime.strptime(dateStr, '%d %b %Y')
                            geoList = geo.geo_normalize(jsonTweet)
                            city, ctry, state = geoList[:3]
                            if (ctry and (ctry.lower() == country) and
                                    (tweetDate >= fromDate) and (tweetDate <= toDate)):
                                userId, realName = None, None
                                if 'twitter' in jsonTweet:
                                    if 'user' in jsonTweet['twitter']:
                                        if 'screen_name' in jsonTweet['twitter']['user']:
                                            userId = jsonTweet['twitter']['user']['screen_name'].lower()
                                        if 'name' in jsonTweet['twitter']['user']:
                                            realName = jsonTweet['twitter']['user']['name'].lower()
                                if userId is None and realName is None:
                                    continue
                                log.debug('userId or realName is not None')
                                candidate = getCandidate(userId, realName, uids)
                                if candidate is not None:
                                    log.debug('found candidate--> ' + candidate)
                                    # preprocess the tweet
                                    text = jsonTweet["interaction"]["content"]
                                    text = re.sub(URL_REGEX, ' ', text)  # remove urls
                                    text = re.sub('[^A-Za-z_@#0-9]', ' ',
                                                  normalize_str(text, lower=True))  # allow only alphanumerics and twitter tags
                                    text = re.sub(' +', ' ', text)  # remove multiple spaces

                                    hashTags = extract_hash_tags(text)
                                    hashTags = [hashTag for hashTag in hashTags if len(hashTag) > 3]
                                    for hashTag in hashTags:
                                        if hashTag.startswith('#'):
                                            hashTag = hashTag[1:]
                                        if hashTag in hashTagCounts[candidate]:
                                            hashTagCounts[candidate][hashTag] += 1
                                        else:
                                            hashTagCounts[candidate][hashTag] = 1
                        except Exception, e:
                            log.exception('error processing tweet %s' % e)
            except Exception, f:
                log.exception('error processing file %s' % f)
        else:
            log.debug('skipping file %s' % _file)
def preProcess(tweetFolder, outputFolder, keywordList, fromDate, toDate, country):
    log.info("inside preProcess")
    log.debug("fromDate-->" + fromDate.strftime("%d %b %Y"))
    log.debug("toDate-->" + toDate.strftime("%d %b %Y"))
    tweets = {}

    # output files
    tweetedFile = open(outputFolder + '/tweeted.csv', 'w')
    mentionFile = open(outputFolder + '/mentioned.csv', 'w')
    # retweetFile = open(outputFolder + '/retweet.csv', 'w')
    wordsFile = open(outputFolder + '/containsWord.csv', 'w')
    sentimentFile = open(outputFolder + '/sentiment.csv', 'w')
    tweetsFile = open(outputFolder + '/tweets.json', 'w')

    # build stop word list
    # englishStopWords = [normalize_str(w).lower() for w in stopwords.words('english')]
    # spanishStopWords = [normalize_str(w).lower() for w in stopwords.words('spanish')]
    # stopWordList = []
    # stopWordList.extend(englishStopWords)
    # stopWordList.extend(spanishStopWords)

    log.info("# of keywords: " + str(len(keywordList)))
    log.info("tracking--> " + str(keywordList))

    # build regular expression for keywords
    keywordRegex = re.compile(r'\b%s\b' % '\\b|\\b'.join(keywordList),
                              flags=re.IGNORECASE)

    # for geocoding tweets
    geo = Geo()

    tweetCount, tweetErrorCount = 0, 0

    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], '%Y-%m-%d')
        if (fileDate >= fromDate and fileDate < toDate):
            log.info("processing file %s" % (_file))
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet['interaction']['created_at'][5:16]
                            tweetDate = datetime.strptime(dateStr, '%d %b %Y')
                            geoList = geo.geo_normalize(jsonTweet)
                            city, ctry, state = geoList[:3]
                            if (ctry and (ctry.lower() == country) and
                                    (tweetDate >= fromDate) and (tweetDate <= toDate)):
                                # preprocess the tweet
                                text = jsonTweet["interaction"]["content"]
                                text = re.sub(URL_REGEX, ' ', text)  # remove urls
                                text = re.sub('[^A-Za-z_@#0-9]', ' ',
                                              normalize_str(text, lower=True))  # allow only alphanumerics and twitter tags
                                text = re.sub(' +', ' ', text)  # remove multiple spaces

                                keywordsPresent = re.findall(keywordRegex, text)
                                keywordsPresent = list(set(keywordsPresent))
                                if len(keywordsPresent) > 0:
                                    tweetId = jsonTweet["twitter"]["id"]
                                    tweeterId = str(jsonTweet["interaction"]["author"]["id"])
                                    mentions = getInteractions(jsonTweet)
                                    sentiment = getSentiment(jsonTweet)

                                    hashTags = extract_hash_tags(text)
                                    hashTags = [hashTag for hashTag in hashTags if len(hashTag) > 3]
                                    # hashTags.extend(keywordsPresent)
                                    if len(hashTags) == 0:
                                        continue
                                    hashTags = list(set(hashTags).union(set(keywordsPresent)))

                                    tweetedFile.write(tweeterId + ',' + tweetId + '\n')
                                    sentimentFile.write(tweetId + ',' + str(sentiment) + '\n')
                                    for userId in mentions:
                                        mentionFile.write(tweetId + ',' + userId + '\n')
                                    # for userId in retweets:
                                    #     retweetFile.write(tweetId + ',' + userId + '\n')
                                    for hashTag in hashTags:
                                        if hashTag.startswith('#'):
                                            hashTag = hashTag[1:]
                                        wordsFile.write(tweetId + ',' + hashTag + '\n')

                                    # tracking the tweets for checks
                                    if tweeterId in tweets:
                                        tweets[tweeterId][tweetId] = jsonTweet["interaction"]["content"]
                                    else:
                                        tweets[tweeterId] = {}
                                        tweets[tweeterId][tweetId] = jsonTweet["interaction"]["content"]
                                    tweetCount += 1
                        except Exception, f:
                            log.debug("error processing tweet %s", f)
                            tweetErrorCount += 1
            except Exception, e:
                log.exception("error processing file %s", e)
        else:
            log.debug("skipping file %s" % (_file))
def trackTweets(tweetFolder, vocab, fromDate, toDate, country, threshold):
    counts = {}
    regex = {}
    totalWords = 0

    # building regex for each group
    for group in vocab:
        counts[group] = {}
        sorted_tuples = sorted(vocab[group].iteritems(),
                               key=operator.itemgetter(1), reverse=True)
        words = []
        # keep the top `threshold` percent of words (or all of them for small
        # vocabularies); use a local cutoff so the percentage parameter is not
        # overwritten between groups
        if len(sorted_tuples) <= 20:
            cutoff = len(sorted_tuples)
        else:
            cutoff = int(len(sorted_tuples) * threshold // 100)
        for (word, weight) in sorted_tuples[:cutoff]:
            words.append(word)
            totalWords += 1
        regex[group] = re.compile(r"\b%s\b" % "\\b|\\b".join(words),
                                  flags=re.IGNORECASE)
    log.info("tracking total of %d words" % totalWords)

    # for geocoding tweets
    geo = Geo()

    tweetCount, tweetErrorCount = 0, 0

    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], "%Y-%m-%d")
        if fileDate >= fromDate and fileDate < toDate:
            log.info("processing file %s" % (_file))
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet["interaction"]["created_at"][5:16]
                            tweetDate = datetime.strptime(dateStr, "%d %b %Y")
                            geoList = geo.geo_normalize(jsonTweet)
                            city, ctry, state = geoList[:3]
                            if (ctry and (ctry.lower() == country) and
                                    (tweetDate >= fromDate) and (tweetDate <= toDate)):
                                # preprocess the tweet
                                text = jsonTweet["interaction"]["content"]
                                text = re.sub(URL_REGEX, " ", text)  # remove urls
                                text = re.sub("[^A-Za-z_@#0-9]", " ",
                                              normalize_str(text, lower=True))  # allow only alphanumerics and twitter tags
                                text = re.sub(" +", " ", text)  # remove multiple spaces

                                for group in regex:
                                    keywordsPresent = re.findall(regex[group], text)
                                    if len(keywordsPresent) > 0:
                                        keywordsPresent = list(set(keywordsPresent))
                                        hashTags = extract_hash_tags(text)
                                        hashTags = [hashTag for hashTag in hashTags if len(hashTag) > 3]
                                        hashTags.extend(keywordsPresent)
                                        for hashTag in hashTags:
                                            if hashTag.startswith("#"):
                                                hashTag = hashTag[1:]
                                            if hashTag in counts[group]:
                                                counts[group][hashTag] += 1
                                            else:
                                                counts[group][hashTag] = 1
                                        tweetCount += 1
                        except Exception, f:
                            log.debug("error processing tweet %s", f)
                            tweetErrorCount += 1
            except Exception, e:
                log.exception("error processing file %s", e)
        else:
            log.debug("skipping file %s" % (_file))
def preProcess(tweetFolder, outputFolder, keywordList, fromDate, toDate,
               country, filesProcessed):
    log.info("inside preProcess")
    log.debug("fromDate-->" + fromDate.strftime("%d %b %Y"))
    log.debug("toDate-->" + toDate.strftime("%d %b %Y"))
    tweetCount, tweetErrorCount = 0, 0
    tweets = {}

    # output files
    tweetedFile = open(outputFolder + '/tweeted.csv', 'w')
    mentionFile = open(outputFolder + '/mentioned.csv', 'w')
    retweetFile = open(outputFolder + '/retweet.csv', 'w')
    wordsFile = open(outputFolder + '/containsWord.csv', 'w')
    sentimentFile = open(outputFolder + '/sentiment.csv', 'w')
    tweetsFile = open(outputFolder + '/tweets.json', 'w')

    # build stop word list
    englishStopWords = [normalize_str(w).lower() for w in stopwords.words('english')]
    spanishStopWords = [normalize_str(w).lower() for w in stopwords.words('spanish')]
    stopWordList = []
    stopWordList.extend(englishStopWords)
    stopWordList.extend(spanishStopWords)

    log.info("# of keywords: " + str(len(keywordList)))
    log.info("tracking--> " + str(keywordList))

    # build regular expression for keywords
    keywordRegex = re.compile(r'\b%s\b' % '\\b|\\b'.join(keywordList),
                              flags=re.IGNORECASE)

    # for geocoding tweets
    geo = Geo()

    log.info("filesProcessed-->" + str(filesProcessed))

    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], '%Y-%m-%d')
        if (_file not in filesProcessed and fileDate >= fromDate and fileDate < toDate):
            log.info("processing file %s" % (_file))
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    tweetCount, tweetErrorCount = 0, 0
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet['interaction']['created_at'][5:16]
                            tweetDate = datetime.strptime(dateStr, '%d %b %Y')

                            sentiment = getSentiment(jsonTweet)
                            if sentiment == 0:
                                continue

                            geoList = geo.geo_normalize(jsonTweet)
                            ctry, a1, a2, a3 = geoList[1:5]
                            if (ctry and (ctry.lower() == country) and
                                    (tweetDate >= fromDate) and (tweetDate <= toDate)):
                                # preprocess the tweet
                                text = jsonTweet["interaction"]["content"]
                                text = re.sub(URL_REGEX, ' ', text)  # remove urls
                                text = re.sub('[^A-Za-z_@#0-9]', ' ',
                                              normalize_str(text, lower=True))  # allow only alphanumerics and twitter tags
                                text = re.sub(' +', ' ', text)  # remove multiple spaces

                                keywordsPresent = keywordRegex.search(text)
                                if keywordsPresent is not None:
                                    words = text.split(" ")
                                    words = [w for w in words
                                             if len(w) > 2 and w not in stopWordList]
                                    words2 = []
                                    for word in words:
                                        # skip words containing digits or '@' unless they are tracked keywords
                                        for w in word:
                                            if (word not in keywordList) and (w.isdigit() or w == '@'):
                                                break
                                        else:
                                            if word[0] == '#':
                                                word = word[1:]
                                            words2.append(word)

                                    tweetId = jsonTweet["twitter"]["id"]
                                    tweeterId = str(jsonTweet["interaction"]["author"]["id"])
                                    mentions, retweets = getInteractions(jsonTweet)

                                    tweetedFile.write(tweeterId + ',' + tweetId + '\n')
                                    sentimentFile.write(tweetId + ',' + str(sentiment) + '\n')
                                    for userId in mentions:
                                        mentionFile.write(tweetId + ',' + userId + '\n')
                                    for userId in retweets:
                                        retweetFile.write(tweetId + ',' + userId + '\n')
                                    for word in words2:
                                        wordsFile.write(tweetId + ',' + word + '\n')

                                    # tracking the tweets for checks
                                    if tweeterId in tweets:
                                        tweets[tweeterId][tweetId] = jsonTweet["interaction"]["content"]
                                    else:
                                        tweets[tweeterId] = {}
                                        tweets[tweeterId][tweetId] = jsonTweet["interaction"]["content"]
                                    tweetCount += 1
                        except Exception, f:
                            log.exception("error processing tweet %s", f)
                            tweetErrorCount += 1
            except Exception, e:
                log.exception("error processing file %s", e)
            log.info("tweets used: %s" % str(tweetCount))
            log.debug("tweetErrorCount : %s" % str(tweetErrorCount))
            filesProcessed.append(_file)
            break
        else:
            log.debug("skipping file %s" % (_file))
"""%prog [options] Python source code - @todo """ __author__ = 'Patrick Butler' __email__ = '*****@*****.**' import Levenshtein from embers import geocode from embers import utils flu_words = utils.normalize_str( ("gripe, influenza, antivirales, fiebre, síntomas, tos, fatiga, " "dolor de garganta, estornudos, dolor de cabeza, antibióticos, " "Oseltamivir, Tamiflu, Tazamir, neumonía, " "intensa falta de respiración, náusea, infección, enfermedades, " "vómitos, escalofríos, medicina, medicamento, vacuna, enfermedad, " "enfermos, médico, clínica, hospital").decode("utf8")).split(", ") flu_words += utils.normalize_str( ("flu, influenza, antiviral, fever, symptoms, cough, fatigue, " "sore, throat, sneezing, headache, antibiotics, Tamiflu, Tazamir," "pneumonia, severe shortness of breath, nausea, infection, " "diseases, vomiting, chills, medicine, medicine, vaccine, disease, " "sick, doctor, clinic, hospital").decode("utf8")).split(", ") flu_words = list(set(flu_words)) # make list unique print flu_words #possible_tags = flu_words + geocode.cnames possible_tags = ["%s country:%s" % (word, country)
def final_tuples(self):
    id_list = self.return_unique_ids()
    for artl in self.artl_json:
        if "embersId" in artl:
            artl_id = artl["embersId"]
        else:
            artl_id = artl["embers_id"]
        if artl_id in id_list:
            id_list.remove(artl_id)
        else:
            continue
        if artl[u'location'][u'country'] not in self.country_list:
            continue
        latitude_num = float(artl[u'location'][u'lat'])
        try:
            longitude_num = float(artl[u'location'][u'lng'])
        except Exception:
            longitude_num = float(artl[u'location'][u'lon'])
        artl_dt = dtparse(artl['date']).date()
        if artl_dt < self.timestamp[0] or artl_dt >= self.timestamp[-1]:
            continue
        try:
            finalURL = (urlopen(artl['link'])).geturl()
            article_source = urlparse.urlparse(finalURL).netloc
            location = list(locationcls.lookup_city(latitude_num, longitude_num, 360.)[0])
            articleprovince = location[2]
            articlecountry = location[1]
        except Exception:
            continue
        # collect candidate tokens for this article
        tokens = []
        if 'BasisEnrichment' not in artl:
            try:
                content_web = webarticle2text.extractFromURL(finalURL)
            except Exception:
                content_web = ""
            content_descr = artl['descr']
            tokens = nltk.word_tokenize(content_descr)
            try:
                tokens_1 = nltk.word_tokenize(content_web)
                for word in tokens_1:
                    tokens.append(word)
            except Exception:
                tokens_1 = []
        else:
            POS_list = ["DIG", "PUNCT", "SYM", "SENT", "CM"]
            if artl['BasisEnrichment']['tokens']:
                tokenlist = artl['BasisEnrichment']['tokens']
                for element in tokenlist:
                    if element['POS'] not in POS_list:
                        tokens.append(element['value'])
        token_filtered = []
        token_normalized = []
        for a in xrange(len(self.timestamp) - 1):
            if self.timestamp[a] <= artl_dt < self.timestamp[a + 1]:
                timestampindex = self.start_ind + a
                break
        for word in tokens:
            word_split = re.split(r'(\W+)', word)
            if len(word_split) == 1:
                if len(word_split[0]) > 2 and len(word_split[0]) < 15:
                    token_filtered.append(word)
                else:
                    continue
            elif (len(word_split) == 3 and word_split[2] == '' and
                  len(word_split[0]) > 2 and len(word_split[0]) < 15):
                token_filtered.append(word_split[0])
        for word in token_filtered:
            try:
                if not self.contains_digits(word) and word not in self.stop_words:
                    token_normalized.append(utils.normalize_str(word))
            except Exception:
                continue
        token_unique = list(set(token_normalized))
        for word in token_unique:
            self.final_dict[(word, (articleprovince, articlecountry),
                             article_source, timestampindex)] += token_normalized.count(word)
    return