Example #1
def gen_tags(tweet, users, geo=None):
    if not geo:
        geo = geocode.Geo()
    tags = []
    city, country = geo.geo_normalize(tweet, users)
    ctag = None
    if city:
        tags += ["city:" + city]
    if country:
        ctag = "country:" + country
        tags += [ctag]

    text = utils.normalize_str(tweet['interaction_content']).split(" ")
    for word in text:
        for f in flu_words:
            if f == "tos" and word == f:
                tags += [f]
            elif Levenshtein.ratio(word, f) >= .75:
                tags += [f]

    tags = list(set(tags))
    if ctag:
        tags += [tag + " " + ctag for tag in tags
                         if not tag.startswith("country:")]
    return tags
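
For orientation, a minimal invocation sketch, assuming the embers package (and its geocode.Geo class, as used above) is importable; the tweet payload and users argument below are made-up placeholders, not values taken from the project:

# Hypothetical usage of gen_tags above; the payload key mirrors the access
# pattern in the function (tweet['interaction_content']) and users is passed
# straight through to geo.geo_normalize -- both values are invented here.
from embers import geocode

tweet = {"interaction_content": "tengo fiebre y tos desde ayer"}
users = {}
geo = geocode.Geo()  # reuse one Geo instance instead of building one per call
print gen_tags(tweet, users, geo=geo)
# e.g. ['fiebre', 'tos'], plus 'city:...'/'country:...' tags if the geocoder
# resolves a location for this payload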
Example #2
def resolve_conflict(mena_geo, la_geo):
    if (not isempty(la_geo['city']) and
            normalize_str(la_geo['city']) == normalize_str(mena_geo['country'])):
        return mena_geo
    la_policy_num = LA_POLICY.index(la_geo['source'].split(",")[0])
    mena_policy_num = MENA_POLICY.index(mena_geo['source'].split(",")[0])
    la_policy_weight = round(float(la_policy_num) / float(len(LA_POLICY)), 1)
    mena_policy_weight = round(float(mena_policy_num) / float(len(MENA_POLICY)), 1)

    if la_policy_weight == mena_policy_weight:
        # Usually the coarser-level warning is best, or the one with the higher population
        if isempty(la_geo['city']) and not isempty(mena_geo['city']):
            return la_geo
        elif isempty(mena_geo['city']) and not isempty(la_geo['city']):
            return mena_geo
        else:
            return la_geo
    else:
        if la_policy_weight < mena_policy_weight:
            return la_geo
        else:
            return mena_geo
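
To make the tie-breaking arithmetic concrete, here is a small sketch with assumed LA_POLICY/MENA_POLICY orderings (the real lists are module constants of the project): a source that appears earlier in its policy list gets a smaller weight, and the record with the smaller weight wins above.

# Assumed, illustrative source orderings -- not the project's real constants.
LA_POLICY = ["geocode", "user_profile", "text_match"]
MENA_POLICY = ["geocode", "gazetteer", "text_match"]

la_policy_weight = round(float(LA_POLICY.index("text_match")) / float(len(LA_POLICY)), 1)     # 0.7
mena_policy_weight = round(float(MENA_POLICY.index("geocode")) / float(len(MENA_POLICY)), 1)  # 0.0
print la_policy_weight, mena_policy_weight
# 0.7 0.0 -> the weights differ and the MENA weight is smaller, so
# resolve_conflict would keep mena_geo in this case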
Example #3
def resolve_conflict(mena_geo, la_geo):
    if (not isempty(la_geo['city']) and normalize_str(la_geo['city'])
            == normalize_str(mena_geo['country'])):
        return mena_geo
    la_policy_num = LA_POLICY.index(la_geo['source'].split(",")[0])
    mena_policy_num = MENA_POLICY.index(mena_geo['source'].split(",")[0])
    la_policy_weight = round(float(la_policy_num) / float(len(LA_POLICY)), 1)
    mena_policy_weight = round(
        float(mena_policy_num) / float(len(MENA_POLICY)), 1)

    if la_policy_weight == mena_policy_weight:
        # Usually the coarser-level warning is best, or the one with the higher population
        if isempty(la_geo['city']) and not isempty(mena_geo['city']):
            return la_geo
        elif isempty(mena_geo['city']) and not isempty(la_geo['city']):
            return mena_geo
        else:
            return la_geo
    else:
        if la_policy_weight < mena_policy_weight:
            return la_geo
        else:
            return mena_geo
Example #4
def processTweet(self, jsonTweet):
    try:
        text = jsonTweet["interaction"]["content"]
        text = re.sub("[^A-Za-z_@#0-9]", " ", normalize_str(text))
        searchResult = self.rePattern.findall(text)
        candidateList = []
        if searchResult:
            self.tweetList.append(jsonTweet["embersId"])
            for term in searchResult:
                for candidate in self.aliasJson[term]:
                    candidateList.append(candidate)
            log.debug("matched aliases---> %s" % candidateList)
        return set(candidateList)
    except Exception, e:
        raise e
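
processTweet relies on self.rePattern, self.aliasJson and self.tweetList, which are initialised elsewhere in the class; a minimal sketch of that setup, assuming a JSON alias file mapping each alias term to its candidate names (the class name and file name are hypothetical):

# Hypothetical setup sketch for the attributes used by processTweet above.
import json
import re

class AliasTagger(object):                      # class name assumed for illustration
    def __init__(self, aliasFile="aliases.json"):
        with open(aliasFile) as f:
            # e.g. {"lula": ["luiz inacio lula da silva"], ...} -- structure
            # inferred from the aliasJson[term] lookups above
            self.aliasJson = json.load(f)
        terms = sorted(self.aliasJson.keys(), key=len, reverse=True)
        # match any alias as a whole word; findall then yields the alias terms
        self.rePattern = re.compile(r"\b(?:%s)\b" % "|".join(re.escape(t) for t in terms))
        self.tweetList = []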
Example #5
def execute(arg):
    logs.init(arg)

    fromDate = datetime.strptime(arg.fromDate, "%d %b %Y")
    toDate = datetime.strptime(arg.toDate, "%d %b %Y")
    tweetFolder = arg.tweetFolder
    country = arg.country

    hashTagCounts = {}
    uids = {}

    # loading twitter handles from a file
    with open(arg.seedFile, 'r') as _file:
        for line in _file:
            handle, candidate = line.strip().split(',')
            if candidate not in uids:
                uids[candidate] = []
                hashTagCounts[candidate] = {}
                uids[candidate].append(handle.lower())
            else:
                uids[candidate].append(handle.lower())

    # for geolocation
    geo = Geo()

    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], '%Y-%m-%d')
        if (fileDate >= fromDate and fileDate < toDate):
            log.info("processing file %s" % (_file))
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet['interaction']['created_at'][5:16]
                            tweetDate = datetime.strptime(dateStr, '%d %b %Y')
                            geoList = geo.geo_normalize(jsonTweet)
                            city, ctry, state = geoList[:3]
                            if ctry and (ctry.lower() == country) and (tweetDate >= fromDate) and (tweetDate <= toDate):
                                userId, realName = None, None
                                if 'twitter' in jsonTweet:
                                    if 'user' in jsonTweet['twitter']:
                                        if 'screen_name' in jsonTweet['twitter']['user']:
                                            userId = jsonTweet['twitter']['user']['screen_name'].lower()
                                        if 'name' in jsonTweet['twitter']['user']:
                                            realName = jsonTweet['twitter']['user']['name'].lower()
                                if userId is None and realName is None:
                                    continue
                                log.debug('userId or realName is not None')
                                candidate = getCandidate(userId, realName, uids)
                                if candidate is not None:
                                    log.debug('found candidate--> ' + candidate)
                                    # pre-process the tweet
                                    text = jsonTweet["interaction"]["content"]
                                    text = re.sub(URL_REGEX, ' ', text)  # remove urls
                                    text = re.sub('[^A-Za-z_@#0-9]', ' ', normalize_str(text, lower=True))  # allow only alphaNumerics and twitter tags
                                    text = re.sub(' +', ' ', text)  # remove multiple spaces
                                    hashTags = extract_hash_tags(text)
                                    hashTags = [hashTag for hashTag in hashTags if len(hashTag) > 3]
                                    for hashTag in hashTags:
                                        if hashTag.startswith('#'):
                                            hashTag = hashTag[1:]
                                        if hashTag in hashTagCounts[candidate]:
                                            hashTagCounts[candidate][hashTag] += 1
                                        else:
                                            hashTagCounts[candidate][hashTag] = 1
                        except Exception, e:
                            log.exception('error processing tweet %s' % e)
            except Exception, f:
                log.exception('error processing file %s' % f)
        else:
            log.debug('skipping file %s ' % _file)
Example #6
def preProcess(tweetFolder, outputFolder, keywordList, fromDate, toDate, country):
    log.info("inside preProcess")
    log.debug("fromDate-->" + fromDate.strftime("%d %b %Y"))
    log.debug("toDate-->" + toDate.strftime("%d %b %Y"))

    tweets = {}

    # output files
    tweetedFile = open(outputFolder + '/tweeted.csv', 'w')
    mentionFile = open(outputFolder + '/mentioned.csv', 'w')
    # retweetFile = open(outputFolder + '/retweet.csv', 'w')
    wordsFile = open(outputFolder + '/containsWord.csv', 'w')
    sentimentFile = open(outputFolder + '/sentiment.csv', 'w')
    tweetsFile = open(outputFolder + '/tweets.json', 'w')

    # build stop word list
    # englishStopWords = [normalize_str(w).lower() for w in stopwords.words('english')]
    # spanishStopWords = [normalize_str(w).lower() for w in stopwords.words('spanish')]
    # stopWordList = []
    # stopWordList.extend(englishStopWords)
    # stopWordList.extend(spanishStopWords)

    log.info("# of keywords: " + str(len(keywordList)))
    log.info("tracking--> " + str(keywordList))
    # build regular expression for keyword
    keywordRegex = re.compile(r'\b%s\b' % '\\b|\\b'.join(keywordList),
                              flags=re.IGNORECASE)
    # for geocoding tweets
    geo = Geo()

    tweetCount, tweetErrorCount = 0, 0

    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], '%Y-%m-%d')
        if (fileDate >= fromDate and fileDate < toDate):
            log.info("processing file %s" % (_file))
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet['interaction']['created_at'][5:16]
                            tweetDate = datetime.strptime(dateStr, '%d %b %Y')
                            geoList = geo.geo_normalize(jsonTweet)
                            city, ctry, state = geoList[:3]
                            if ctry and (ctry.lower() == country) and (tweetDate >= fromDate) and (tweetDate <= toDate):
                                # pre-process the tweet
                                text = jsonTweet["interaction"]["content"]
                                text = re.sub(URL_REGEX, ' ', text)  # remove urls
                                text = re.sub('[^A-Za-z_@#0-9]', ' ', normalize_str(text, lower=True))  # allow only alphaNumerics and twitter tags
                                text = re.sub(' +', ' ', text)  # remove multiple spaces

                                keywordsPresent = re.findall(keywordRegex, text)
                                keywordsPresent = list(set(keywordsPresent))
                                if len(keywordsPresent) > 0:
                                    tweetId = jsonTweet["twitter"]["id"]
                                    tweeterId = str(jsonTweet["interaction"]["author"]["id"])
                                    mentions = getInteractions(jsonTweet)
                                    sentiment = getSentiment(jsonTweet)

                                    hashTags = extract_hash_tags(text)
                                    hashTags = [hashTag for hashTag in hashTags if len(hashTag) > 3]
                                    #hashTags.extend(keywordsPresent)
                                    if len(hashTags) == 0:
                                        continue
                                    hashTags = list(set(hashTags).union(set(keywordsPresent)))

                                    tweetedFile.write(tweeterId + ',' + tweetId + '\n')
                                    sentimentFile.write(tweetId + ',' + str(sentiment) + '\n')
                                    for userId in mentions:
                                        mentionFile.write(tweetId + ',' + userId + '\n')
                                        # for userId in retweets:
                                        #     retweetFile.write(tweetId + ',' + userId + '\n')
                                    for hashTag in hashTags:
                                        if hashTag.startswith('#'):
                                            hashTag = hashTag[1:]
                                        wordsFile.write(tweetId + ',' + hashTag + '\n')
                                        # tracking the tweets for checks.
                                    if tweeterId in tweets:
                                        tweets[tweeterId][tweetId] = jsonTweet["interaction"]["content"]
                                    else:
                                        tweets[tweeterId] = {}
                                        tweets[tweeterId][tweetId] = jsonTweet["interaction"]["content"]

                                    tweetCount += 1
                        except Exception, f:
                            log.debug("error processing tweet %s", f)
                            tweetErrorCount += 1
            except Exception, e:
                log.exception("error processfing file %s", e)
        else:
            log.debug("skipping file %s" % (_file))
Example #7
def trackTweets(tweetFolder, vocab, fromDate, toDate, country, threshold):
    counts = {}
    regex = {}
    totalWords = 0
    # building regex for each group
    for group in vocab:
        counts[group] = {}
        sorted_tuples = sorted(vocab[group].iteritems(), key=operator.itemgetter(1), reverse=True)
        words = []
        if len(sorted_tuples) <= 20:
            threshold = len(sorted_tuples)
        else:
            threshold = int(len(sorted_tuples) * threshold // 100)
        for (word, weight) in sorted_tuples[:threshold]:
            words.append(word)
            totalWords += 1
        regex[group] = re.compile(r"\b%s\b" % "\\b|\\b".join(words), flags=re.IGNORECASE)

    log.info("tracking total of %d words" % totalWords)
    # for geoCoding tweets
    geo = Geo()

    tweetCount, tweetErrorCount = 0, 0
    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], "%Y-%m-%d")
        if fileDate >= fromDate and fileDate < toDate:
            log.info("processing file %s" % (_file))
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet["interaction"]["created_at"][5:16]
                            tweetDate = datetime.strptime(dateStr, "%d %b %Y")
                            geoList = geo.geo_normalize(jsonTweet)
                            city, ctry, state = geoList[:3]
                            if ctry and (ctry.lower() == country) and (tweetDate >= fromDate) and (tweetDate <= toDate):
                                # pre-process the tweet
                                text = jsonTweet["interaction"]["content"]
                                text = re.sub(URL_REGEX, " ", text)  # remove urls
                                text = re.sub(
                                    "[^A-Za-z_@#0-9]", " ", normalize_str(text, lower=True)
                                )  # allow only alphaNumerics and twitter tags
                                text = re.sub(" +", " ", text)  # remove multiple spaces
                                for group in regex:
                                    keywordsPresent = re.findall(regex[group], text)
                                    if len(keywordsPresent) > 0:
                                        keywordsPresent = list(set(keywordsPresent))
                                        hashTags = extract_hash_tags(text)
                                        hashTags = [hashTag for hashTag in hashTags if len(hashTag) > 3]
                                        hashTags.extend(keywordsPresent)
                                        for hashTag in hashTags:
                                            if hashTag.startswith("#"):
                                                hashTag = hashTag[1:]
                                            if hashTag in counts[group]:
                                                counts[group][hashTag] += 1
                                            else:
                                                counts[group][hashTag] = 1
                                        tweetCount += 1
                        except Exception, f:
                            log.debug("error processing tweet %s", f)
                            tweetErrorCount += 1
            except Exception, e:
                log.exception("error processfing file %s", e)
        else:
            log.debug("skipping file %s" % (_file))
Example #8
def preProcess(tweetFolder, outputFolder, keywordList, fromDate, toDate, country, filesProcessed):
    log.info("inside preProcess")
    log.debug("fromDate-->" + fromDate.strftime("%d %b %Y"))
    log.debug("toDate-->" + toDate.strftime("%d %b %Y"))

    tweetCount, tweetErrorCount = 0, 0
    tweets = {}

    # output files
    tweetedFile = open(outputFolder + '/tweeted.csv', 'w')
    mentionFile = open(outputFolder + '/mentioned.csv', 'w')
    retweetFile = open(outputFolder + '/retweet.csv', 'w')
    wordsFile = open(outputFolder + '/containsWord.csv', 'w')
    sentimentFile = open(outputFolder + '/sentiment.csv', 'w')
    tweetsFile = open(outputFolder + '/tweets.json', 'w')

    # build stop word list
    englishStopWords = [normalize_str(w).lower() for w in stopwords.words('english')]
    spanishStopWords = [normalize_str(w).lower() for w in stopwords.words('spanish')]
    stopWordList = []
    stopWordList.extend(englishStopWords)
    stopWordList.extend(spanishStopWords)

    log.info("# of keywords: " + str(len(keywordList)))
    log.info("tracking--> " + str(keywordList))
    # build regular expression for keyword
    keywordRegex = re.compile(r'\b%s\b' % '\\b|\\b'.join(keywordList),
                              flags=re.IGNORECASE)

    # for geocoding tweets
    geo = Geo()

    log.info("filesProcessed-->" + str(filesProcessed))
    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], '%Y-%m-%d')

        if (_file not in filesProcessed and fileDate >= fromDate and fileDate < toDate):
            log.info("processing file %s" % (_file))
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    tweetCount, tweetErrorCount = 0, 0
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet['interaction']['created_at'][5:16]
                            tweetDate = datetime.strptime(dateStr, '%d %b %Y')
                            sentiment = getSentiment(jsonTweet)
                            if sentiment == 0:
                                continue
                            geoList = geo.geo_normalize(jsonTweet)
                            ctry, a1, a2, a3 = geoList[1:5]
                            if ctry and (ctry.lower() == country) and (tweetDate >= fromDate) and (tweetDate <= toDate):
                                text = jsonTweet["interaction"]["content"]
                                text = re.sub(URL_REGEX, ' ', text)  # remove urls
                                text = re.sub('[^A-Za-z_@#0-9]', ' ', normalize_str(text, lower=True))  # allow only alphaNumerics and twitter tags
                                text = re.sub(' +', ' ', text)  # remove multiple spaces

                                keywordsPresent = keywordRegex.search(text)
                                if keywordsPresent is not None:
                                    words = text.split(" ")
                                    words = [w for w in words if len(w) > 2 and w not in stopWordList]
                                    words2 = []
                                    for word in words:
                                        for w in word:
                                            if (word not in keywordList) and (w.isdigit() or w == '@'):
                                                break
                                        else:
                                            if word[0] == '#':
                                                word = word[1:]
                                            words2.append(word)

                                    tweetId = jsonTweet["twitter"]["id"]
                                    tweeterId = str(jsonTweet["interaction"]["author"]["id"])
                                    mentions, retweets = getInteractions(jsonTweet)

                                    tweetedFile.write(tweeterId + ',' + tweetId + '\n')
                                    sentimentFile.write(tweetId + ',' + str(sentiment) + '\n')
                                    for userId in mentions:
                                        mentionFile.write(tweetId + ',' + userId + '\n')
                                    for userId in retweets:
                                        retweetFile.write(tweetId + ',' + userId + '\n')
                                    for word in words2:
                                        wordsFile.write(tweetId + ',' + word + '\n')
                                    # tracking the tweets for checks.
                                    if tweeterId in tweets:
                                        tweets[tweeterId][tweetId] = jsonTweet["interaction"]["content"]
                                    else:
                                        tweets[tweeterId] = {}
                                        tweets[tweeterId][tweetId] = jsonTweet["interaction"]["content"]

                                    tweetCount += 1
                        except Exception, f:
                            log.exception("error processing tweet %s", f)
                            tweetErrorCount += 1
            except Exception, e:
                log.exception("error processfing file %s", e)
            log.info("tweets used: %s" % str(tweetCount))
            log.debug("tweetErrorCount : %s" % str(tweetErrorCount))
            filesProcessed.append(_file)
            break
        else:
            log.debug("skipping file %s" % (_file))
Example #9
"""%prog [options]
Python source code - @todo
"""

__author__ = 'Patrick Butler'
__email__ = '*****@*****.**'

import Levenshtein

from embers import geocode
from embers import utils

flu_words = utils.normalize_str(
          ("gripe, influenza, antivirales, fiebre, síntomas, tos, fatiga, "
           "dolor de garganta, estornudos, dolor de cabeza, antibióticos, "
           "Oseltamivir, Tamiflu, Tazamir, neumonía, "
           "intensa falta de respiración, náusea, infección, enfermedades, "
           "vómitos, escalofríos, medicina, medicamento, vacuna, enfermedad, "
           "enfermos, médico, clínica, hospital").decode("utf8")).split(", ")

flu_words += utils.normalize_str(
        ("flu, influenza, antiviral, fever, symptoms, cough, fatigue, "
         "sore, throat, sneezing, headache, antibiotics, Tamiflu, Tazamir,"
         "pneumonia, severe shortness of breath, nausea, infection, "
         "diseases, vomiting, chills, medicine, medicine, vaccine, disease, "
         "sick, doctor, clinic, hospital").decode("utf8")).split(", ")

flu_words = list(set(flu_words))  # make list unique
print flu_words
#possible_tags = flu_words + geocode.cnames
possible_tags = ["%s country:%s" % (word, country)
                 for word in flu_words for country in geocode.cnames]
Example #10
    def final_tuples(self):

        id_list = self.return_unique_ids()
        for artl in self.artl_json:
            if "embersId" in artl:
                artl_id = artl["embersId"]
            else:
                artl_id = artl["embers_id"]
            if artl_id in id_list:
                id_list.remove(artl_id)
            else:
                continue

            if artl[u'location'][u'country'] not in self.country_list:
                continue
            latitude_num = float(artl[u'location'][u'lat'])
            try:
                longitude_num = float(artl[u'location'][u'lng'])
            except Exception:
                longitude_num = float(artl[u'location'][u'lon'])
            artl_dt = dtparse(artl['date']).date()
            if artl_dt < self.timestamp[0] or artl_dt >= self.timestamp[len(self.timestamp) - 1]:
                continue
            try:
                finalURL = (urlopen(artl['link'])).geturl()
                article_source = urlparse.urlparse(finalURL).netloc
                articleprovince = list(locationcls.lookup_city(latitude_num, longitude_num, 360.)[0])[2]
                articlecountry = list(locationcls.lookup_city(latitude_num, longitude_num, 360.)[0])[1]
            except Exception:
                continue
            if 'BasisEnrichment' not in artl:
                try:
                    content_web = webarticle2text.extractFromURL(finalURL)
                except Exception:
                    content_web = ""
                content_descr = artl['descr']
                tokens = nltk.word_tokenize(content_descr)
                try:
                    tokens_1 = nltk.word_tokenize(content_web)
                    for word in tokens_1:
                        tokens.append(word)
                except Exception:
                    tokens_1 = []
            else:
                POS_list = ["DIG", "PUNCT", "SYM", "SENT", "CM"]
                tokens = []      # token values collected for this article only
                tokenlist = []
                if artl['BasisEnrichment']['tokens']:
                    tokenlist = artl['BasisEnrichment']['tokens']
                for element in tokenlist:
                    if element['POS'] not in POS_list:
                        tokens.append(element['value'])
            token_filtered = []
            token_normalized = []
            for a in xrange(len(self.timestamp) - 1):
                if self.timestamp[a] <= artl_dt < self.timestamp[a + 1]:
                    timestampindex = self.start_ind + a
                    break
            for word in tokens:
                word_split = re.split('(\W+)', word)
                if len(word_split) == 1:
                    if len(word_split[0]) > 2 and len(word_split[0]) < 15:
                        token_filtered.append(word)
                    else:
                        continue
                elif (len(word_split) == 3 and word_split[2] == '' and len(word_split[0]) > 2 and len(word_split[0]) < 15):
                    token_filtered.append(word_split[0])
            for word in token_filtered:
                try:
                    if not self.contains_digits(word) and word not in self.stop_words:
                        token_normalized.append(utils.normalize_str(word))
                except Exception:
                    continue
            token_unique = list(set(token_normalized))
            for word in token_unique:
                self.final_dict[(word, (articleprovince, articlecountry), article_source, timestampindex)] += token_normalized.count(word)
        return