def liveFeedsByLocation(self, api=None, locationArea="New York City, NY"):
        """Stream live tweets from a location and print their emotion tags.

        Opens the ``statuses/filter`` stream restricted to the area returned
        by ``GeoLocationModule.getGeoArea`` and, for every message carrying a
        ``text`` field, prints its language and text, followed by the result
        of the emotion tagger on the cleaned, translated text. The stream is
        re-opened after temporary interruptions, so this method only exits by
        raising.

        Args:
            api: API client exposing ``request(...)``; defaults to
                ``self.getAppObject()``.
            locationArea: Area name handed to ``GeoLocationModule.getGeoArea``.

        Raises:
            Exception: On a ``disconnect`` message with code 2, 5, 6 or 7.
            TwitterRequestError: Re-raised when its status code is below 500.
        """
        if api is None:
            api = self.getAppObject()

        queryParam = {}
        queryParam['locations'] = GeoLocationModule.getGeoArea(
            area=locationArea)
        queryParam['rpp'] = 100

        # 3600 / 100 + 4 = 40 seconds between (re)connects. The original
        # author derived this from a limit of 100 API calls per hour, source:
        # https://blog.twitter.com/2008/what-does-rate-limit-exceeded-mean-updated
        pauseSeconds = int(3600 / 100) + 4

        while True:
            try:
                time.sleep(pauseSeconds)
                iterator = api.request('statuses/filter',
                                       queryParam).get_iterator()
                for item in iterator:
                    if 'text' in item:
                        # Encode once and reuse for both the console line and
                        # the cleaning pipeline.
                        tweetText = item[u'text'].encode('utf-8').strip()

                        # Use str.format rather than "+": concatenating the
                        # unicode language code with UTF-8 bytes makes
                        # Python 2 decode the bytes as ASCII, which raises
                        # UnicodeDecodeError on any non-ASCII tweet.
                        print('\n\n\n{}:\t{}'.format(item[u'lang'],
                                                     tweetText))

                        # Replace the '#' marker with a space; the tag word
                        # itself stays in the text.
                        withoutHashMarks = tweetText.replace("#", " ")
                        # Drop everything from a URL to the end of its line.
                        withoutUrls = re.sub(
                            r'https?:\/\/.*[\r\n]*',
                            '',
                            withoutHashMarks,
                            flags=re.MULTILINE)
                        rawEnText = TranslationModule.getEnglish(withoutUrls)
                        fineEnText = rawEnText.replace(",",
                                                       " ").replace(";", " ")
                        print(self.getEmoTaggerObject().consolodateResult(
                            fineEnText))
                    elif 'disconnect' in item:
                        event = item['disconnect']
                        if event['code'] in [2, 5, 6, 7]:
                            # something needs to be fixed before re-connecting
                            raise Exception(event['reason'])
                        else:
                            # temporary interruption, re-try request
                            break

            except TwitterRequestError as e:
                if e.status_code < 500:
                    # something needs to be fixed before re-connecting
                    raise
                # otherwise: temporary interruption, re-try request

            except TwitterConnectionError:
                # temporary interruption, re-try request
                pass
    def liveFeedsByLocation(self, api=None, locationArea="New York City, NY"):
        """Stream live tweets from a location and print their emotion tags.

        Connects to ``statuses/filter`` with the ``locations`` value returned
        by ``GeoLocationModule.getGeoArea``. For each message that has a
        ``text`` field it prints the language and text, then the emotion
        tagger's result for the cleaned, translated text. Temporary
        interruptions lead to a reconnect after a 40 second pause; the method
        never returns normally.

        Args:
            api: API client exposing ``request(...)``; defaults to
                ``self.getAppObject()``.
            locationArea: Area name handed to ``GeoLocationModule.getGeoArea``.

        Raises:
            Exception: On a ``disconnect`` message with code 2, 5, 6 or 7.
            TwitterRequestError: Re-raised when its status code is below 500.
        """
        if api is None:
            api = self.getAppObject()

        queryParam = {}
        queryParam['locations'] = GeoLocationModule.getGeoArea(area=locationArea)
        queryParam['rpp'] = 100

        # 40 seconds between (re)connects; the original author derived it
        # from a limit of 100 API calls per hour, source:
        # https://blog.twitter.com/2008/what-does-rate-limit-exceeded-mean-updated
        pauseSeconds = int(3600 / 100) + 4

        while True:
            try:
                time.sleep(pauseSeconds)
                iterator = api.request('statuses/filter', queryParam).get_iterator()
                for item in iterator:
                    if 'text' in item:
                        # Encode once; reused for printing and for cleaning.
                        tweetText = item[u'text'].encode('utf-8').strip()

                        # str.format avoids the implicit ASCII decode that
                        # "unicode + utf-8 bytes" triggers in Python 2, which
                        # raised UnicodeDecodeError on non-ASCII tweets.
                        print('\n\n\n{}:\t{}'.format(item[u'lang'], tweetText))

                        # '#' is replaced by a space; the tag word is kept.
                        withoutHashMarks = tweetText.replace("#", " ")
                        # Removes a URL together with the rest of its line.
                        withoutUrls = re.sub(r'https?:\/\/.*[\r\n]*', '', withoutHashMarks, flags=re.MULTILINE)
                        rawEnText = TranslationModule.getEnglish(withoutUrls)
                        fineEnText = rawEnText.replace(",", " ").replace(";", " ")
                        print(self.getEmoTaggerObject().consolodateResult(fineEnText))
                    elif 'disconnect' in item:
                        event = item['disconnect']
                        if event['code'] in [2, 5, 6, 7]:
                            # something needs to be fixed before re-connecting
                            raise Exception(event['reason'])
                        else:
                            # temporary interruption, re-try request
                            break

            except TwitterRequestError as e:
                if e.status_code < 500:
                    # something needs to be fixed before re-connecting
                    raise
                # otherwise: temporary interruption, re-try request

            except TwitterConnectionError:
                # temporary interruption, re-try request
                pass
示例#3
0
    def getFeedsByText(self,
                       api=None,
                       f1=None,
                       isLive=True,
                       annotation=None,
                       queryText=u'a',
                       textLang=None,
                       isTrain=False,
                       locationArea=None):
        """Collect up to 20 tweets for a query and tag each with emotions.

        Each accepted tweet is cleaned, translated via
        ``TranslationModule.getEnglish``, scored by the emotion tagger and
        then either appended to ``f1`` as a CSV row (``isTrain``) or passed
        to ``Supervised.getPrediction`` for the 'NBC' and 'SVC' models.
        Tweets whose score vector has no positive entry, or five or more
        positive entries, are reported on the console and skipped.

        The method returns when 20 tweets were recorded, when the retry
        budget (10 disconnects / request errors / connection errors) is used
        up, after 10 successful requests, or as soon as the stream yields a
        message that is neither an acceptable new tweet nor a disconnect
        notice.

        Args:
            api: API client exposing ``request(...)``; defaults to
                ``self.getAppObject()``.
            f1: Open file object receiving the training rows; only used when
                ``isTrain`` is true.
            isLive: When true the ``statuses/filter`` stream is used,
                otherwise ``search/tweets``.
            annotation: Label printed with each tweet and written as the
                emotion label of training rows.
            queryText: Text to track / search for. When ``None`` and a
                ``locationArea`` is given, the call is delegated to
                ``self.liveFeedsByLocation``.
            textLang: Optional language code, appended to the query as
                ``' lang:<code>'``.
            isTrain: Write rows to ``f1`` instead of running predictions.
            locationArea: Optional area name handed to
                ``GeoLocationModule.getGeoArea``.

        Raises:
            SystemExit: When both ``queryText`` and ``locationArea`` are
                ``None``.
            Exception: On a ``disconnect`` message with code 2, 5, 6 or 7.
        """
        if api is None:
            api = self.getAppObject()

        iteratorRunCount = 0
        seenTweetIds = set()  # ids already recorded; set gives O(1) lookups
        tweetsRecorded = 0
        reTryCount = 0
        MAX_TWEET = 20
        MAX_TRIES = 10
        queryParam = {}

        # Append the language operator exactly once. Doing this inside the
        # retry loop made the query grow by another ' lang:..' on every
        # iteration.
        if (textLang is not None) and (queryText is not None):
            queryText = queryText + ' lang:' + textLang

        while True:
            try:

                # TODO: think of better ways to handle this
                if iteratorRunCount >= 10:
                    # hack, this limits the number of requests issued
                    print(
                        "\n ASSUMPTION: there are no tweets as of now. Let's go back! \n\n"
                    )
                    print(u"\n\n\n")
                    return

                # Apply the retry budget to every kind of failure; before,
                # it was only checked after a tweet had been recorded, so a
                # persistent request error looped forever.
                if reTryCount >= MAX_TRIES:
                    print("\n ReTry Count: " + str(reTryCount) + "\n\n")
                    print(u"\n\n\n")
                    return

                # Let's take 60 seconds pause before each API call;
                # 2017 twitter rate limit is 15 API calls per 15 mins in total per account;
                time.sleep(60)

                # NOTE(review): 'rpp' and, for search/tweets, 'locations' may
                # not be parameters the endpoint honours - confirm against
                # the API documentation.
                queryParam['rpp'] = 100
                if (isLive == True) or (queryText is None):
                    if queryText is None:
                        if locationArea is None:
                            print(
                                "ERROR: locationArea and queryText cannot be None together"
                            )
                            # exit() is injected by the site module and is
                            # not guaranteed to exist; raise directly.
                            raise SystemExit(-1)
                        # Location-only streaming is handled entirely by
                        # liveFeedsByLocation; returning here keeps control
                        # away from the loop below, where `iterator` would
                        # be unbound.
                        return self.liveFeedsByLocation(
                            api=api, locationArea=locationArea)

                    # live tweets, optionally with a location filter
                    queryParam['track'] = queryText
                    if locationArea is not None:
                        queryParam['locations'] = GeoLocationModule.getGeoArea(
                            area=locationArea)
                    iterator = api.request('statuses/filter',
                                           queryParam).get_iterator()

                else:  # isLive==False
                    # search tweets, optionally with a location filter
                    queryParam['q'] = queryText
                    if locationArea is not None:
                        queryParam['locations'] = GeoLocationModule.getGeoArea(
                            area=locationArea)
                    iterator = api.request('search/tweets',
                                           queryParam).get_iterator()

                iteratorRunCount += 1

                for item in iterator:
                    if (('text' in item)
                            and (item[u'id'] not in seenTweetIds)
                            and (item[u'retweeted'] == False)):

                        rawTextClean1 = item[u'text'].encode('utf-8')
                        rawTextClean2 = rawTextClean1.strip()
                        # '#' is replaced by a space; the tag word is kept
                        rawTextClean3 = rawTextClean2.replace("#", " ")
                        # removes a URL together with the rest of its line
                        rawTextClean4 = re.sub(
                            r'https?:\/\/.*[\r\n]*',
                            '',
                            rawTextClean3,
                            flags=re.MULTILINE)

                        # take tweets with sufficient text
                        if (25 < len(rawTextClean4)) and (len(item[u'text']) <
                                                          140):
                            seenTweetIds.add(item[u'id'])
                            tweetsRecorded += 1

                            rawEnText = TranslationModule.getEnglish(
                                rawTextClean4)
                            fineEnText = rawEnText.replace(",", " ").replace(
                                ";", " ")
                            # unicode(annotation) instead of annotation.lower():
                            # the default annotation=None must not crash here.
                            print(
                                str(tweetsRecorded) + ":\t" + item[u'lang'] +
                                ",\t\t" + unicode(annotation).lower() +
                                "\t\t:" + queryText + "\t\t:" +
                                str(len(fineEnText)))
                            print(fineEnText)

                            emoVector = self.getEmoTaggerObject(
                            ).consolodateResult(fineEnText)
                            keyRes = sorted(emoVector)
                            listRes = [emoVector[key] for key in keyRes]
                            print(listRes, keyRes)

                            # Keep only the entries of the printed list that
                            # Utility.RepresentsNum accepts.
                            listStr1 = str(listRes).replace(",", " ")
                            listStr2 = listStr1[1:-1]
                            listStr3 = listStr2.split()
                            listVector = [
                                float(i) for i in listStr3
                                if Utility.RepresentsNum(i)
                            ]

                            emoLabel = annotation
                            if len(listVector) != 0:
                                # emo-vector length should be 8;
                                assert (len(listVector) == 8)
                                emoTypesCount = sum(
                                    1 for score in listVector if score > 0.0)

                                # Tweets without any, or with too many,
                                # emotions are neither written nor classified.
                                if emoTypesCount == 0:
                                    print(">> No Emotion \n\n\n")
                                    continue
                                elif emoTypesCount >= 5:
                                    print(">> Mixed Emotion \n\n\n")
                                    continue

                            if isTrain == True:
                                f1.write(
                                    unicode(item[u'id_str']) + "," +
                                    unicode(item[u'created_at']) + "," +
                                    unicode(item[u'lang']) + "," +
                                    unicode(emoLabel).lower() +
                                    "," + unicode(fineEnText).replace(
                                        "\n", " ").replace("\r", " ") + "," +
                                    "\n")
                                f1.flush()
                                os.fsync(f1.fileno())
                            else:
                                Supervised.getPrediction(npVector=numpy.array(
                                    [listRes]),
                                                         model='NBC')
                                Supervised.getPrediction(npVector=numpy.array(
                                    [listRes]),
                                                         model='SVC')

                            if (tweetsRecorded >= MAX_TWEET) or (reTryCount >=
                                                                 MAX_TRIES):
                                print("\n ReTry Count: " + str(reTryCount) +
                                      "\n\n")
                                print(u"\n\n\n")
                                return

                            print(u"\n\n\n")

                    elif 'disconnect' in item:
                        event = item['disconnect']
                        reTryCount += 1

                        if event['code'] in [2, 5, 6, 7]:
                            # something may or may NOT need to be fixed before re-connecting
                            raise Exception(event['reason'])
                        else:
                            # temporary interruption, re-try request
                            break

                    elif (iteratorRunCount > 0) and (tweetsRecorded <
                                                     MAX_TWEET):
                        # Condition when no more unique tweets are found, go back
                        # TODO: think of better ways to handle this
                        if queryText.startswith('#'):
                            # Hashtag queries return silently. (A retry
                            # without the leading '#' used to follow this
                            # return but was unreachable.)
                            return
                        print("\n No more tweets as of now \n\n")
                        print(u"\n\n\n")
                        return

            except TwitterRequestError as e:
                reTryCount += 1
                if e.status_code < 500:
                    print("\n\n" + "MJAGLAN EXCEPTION:\n" + str(e) + "\n\n")
                # otherwise: temporary interruption, re-try request

            except TwitterConnectionError:
                # temporary interruption, re-try request
                reTryCount += 1
示例#4
0
    def liveFeedsByLocation(self, api=None, locationArea="New York City, NY"):
        """Print language, text and emotion scores of live tweets in an area.

        Streams ``statuses/filter`` for the area resolved by
        ``GeoLocationModule.getGeoArea``. Every message with a ``text`` field
        is cleaned, translated, scored by the emotion tagger and printed.
        After any request or connection error the stream is re-opened
        following a 60 second pause, so the method only ends by raising.

        Args:
            api: API client exposing ``request(...)``; defaults to
                ``self.getAppObject()``.
            locationArea: Area name handed to ``GeoLocationModule.getGeoArea``.

        Raises:
            Exception: On a ``disconnect`` message with code 2, 5, 6 or 7.
        """
        client = self.getAppObject() if api is None else api

        streamQuery = {
            'locations': GeoLocationModule.getGeoArea(area=locationArea),
            'rpp': 100,
        }

        while True:
            try:
                # Wait 60 seconds before every API call; the original author
                # cites a 2017 limit of 15 calls per 15 minutes per account.
                time.sleep(60)

                stream = client.request('statuses/filter',
                                        streamQuery).get_iterator()
                for message in stream:
                    if 'text' not in message:
                        if 'disconnect' not in message:
                            # neither a tweet nor a disconnect notice
                            continue
                        notice = message['disconnect']
                        if notice['code'] in [2, 5, 6, 7]:
                            # something needs to be fixed before re-connecting
                            raise Exception(notice['reason'])
                        # temporary interruption, re-try request
                        break

                    strippedText = message[u'text'].encode('utf-8').strip()
                    # '#' becomes a space; the tag word itself is kept
                    hashFreeText = strippedText.replace("#", " ")
                    # a URL is removed together with the rest of its line
                    urlFreeText = re.sub(r'https?:\/\/.*[\r\n]*',
                                         '',
                                         hashFreeText,
                                         flags=re.MULTILINE)
                    englishText = TranslationModule.getEnglish(urlFreeText)
                    taggerInput = englishText.replace(",", " ")
                    taggerInput = taggerInput.replace(";", " ")
                    scoresByEmotion = self.getEmoTaggerObject(
                    ).consolodateResult(taggerInput)
                    emotions = sorted(scoresByEmotion)
                    scores = [scoresByEmotion[name] for name in emotions]

                    ##### CONSOLE #
                    print('{}: {}'.format(message[u'lang'], strippedText))
                    print(zip(emotions, scores))
                    print('\n\n\n')

            except TwitterRequestError as e:
                # Below 500 something needs to be fixed before re-connecting:
                # report it. Either way the loop re-tries the request.
                if e.status_code < 500:
                    print("\n\nSomething needs to be fixed before re-connecting:\n" + str(e) + "\n\n")

            except TwitterConnectionError:
                # temporary interruption, re-try request
                pass
    def getFeedsByText(self, api=None, f1=None, isLive=True, annotation=None,
                       queryText=u'a', textLang=None, isTrain=False, locationArea=None):
        """Collect up to 20 tweets for a query and tag each with emotions.

        Each accepted tweet is cleaned, translated via
        ``TranslationModule.getEnglish``, scored by the emotion tagger and
        then either appended to ``f1`` as a CSV row (``isTrain``) or passed
        to ``Supervised.getPrediction`` for the 'NBC' and 'SVC' models.
        Tweets whose score vector has no positive entry, or five or more
        positive entries, are reported on the console and skipped.

        The method returns when 20 tweets were recorded, when the retry
        budget (10 disconnects / request errors / connection errors) is used
        up, after 10 successful requests, or as soon as the stream yields a
        message that is neither an acceptable new tweet nor a disconnect
        notice.

        Args:
            api: API client exposing ``request(...)``; defaults to
                ``self.getAppObject()``.
            f1: Open file object receiving the training rows; only used when
                ``isTrain`` is true.
            isLive: When true the ``statuses/filter`` stream is used,
                otherwise ``search/tweets``.
            annotation: Label printed with each tweet and written as the
                emotion label of training rows.
            queryText: Text to track / search for. When ``None`` and a
                ``locationArea`` is given, the call is delegated to
                ``self.liveFeedsByLocation``.
            textLang: Optional language code, appended to the query as
                ``' lang:<code>'``.
            isTrain: Write rows to ``f1`` instead of running predictions.
            locationArea: Optional area name handed to
                ``GeoLocationModule.getGeoArea``.

        Raises:
            SystemExit: When both ``queryText`` and ``locationArea`` are
                ``None``.
            Exception: On a ``disconnect`` message with code 2, 5, 6 or 7.
        """
        if api is None:
            api = self.getAppObject()

        iteratorRunCount = 0
        seenTweetIds = set()  # ids already recorded; set gives O(1) lookups
        tweetsRecorded = 0
        reTryCount = 0
        MAX_TWEET = 20
        MAX_TRIES = 10
        queryParam = {}

        # 40 seconds between API calls; the original author derived it from
        # a limit of 100 API calls per hour, source:
        # https://blog.twitter.com/2008/what-does-rate-limit-exceeded-mean-updated
        pauseSeconds = int(3600 / 100) + 4

        # Append the language operator exactly once. Doing this inside the
        # retry loop made the query grow by another ' lang:..' on every
        # iteration.
        if (textLang is not None) and (queryText is not None):
            queryText = queryText + ' lang:' + textLang

        while True:
            try:

                # TODO: think of better ways to handle this
                if iteratorRunCount >= 10:  # hack, this limits the number of requests issued
                    print("\n ASSUMPTION: there are no tweets as of now. Let's go back! \n\n")
                    print(u"\n\n\n")
                    return

                # Apply the retry budget to every kind of failure; before,
                # it was only checked after a tweet had been recorded, so a
                # persistent request error looped forever.
                if reTryCount >= MAX_TRIES:
                    print("\n ReTry Count: " + str(reTryCount) + "\n\n")
                    print(u"\n\n\n")
                    return

                time.sleep(pauseSeconds)

                # NOTE(review): 'rpp' and, for search/tweets, 'locations' may
                # not be parameters the endpoint honours - confirm against
                # the API documentation.
                queryParam['rpp'] = 100
                if (isLive == True) or (queryText is None):
                    if queryText is None:
                        if locationArea is None:
                            print("ERROR: locationArea and queryText cannot be None together")
                            # exit() is injected by the site module and is
                            # not guaranteed to exist; raise directly.
                            raise SystemExit(-1)
                        # Location-only streaming is handled entirely by
                        # liveFeedsByLocation; returning here keeps control
                        # away from the loop below, where `iterator` would
                        # be unbound.
                        return self.liveFeedsByLocation(api=api, locationArea=locationArea)

                    # live tweets, optionally with a location filter
                    queryParam['track'] = queryText
                    if locationArea is not None:
                        queryParam['locations'] = GeoLocationModule.getGeoArea(area=locationArea)
                    iterator = api.request('statuses/filter', queryParam).get_iterator()

                else:  # isLive==False
                    # search tweets, optionally with a location filter
                    queryParam['q'] = queryText
                    if locationArea is not None:
                        queryParam['locations'] = GeoLocationModule.getGeoArea(area=locationArea)
                    iterator = api.request('search/tweets', queryParam).get_iterator()

                iteratorRunCount += 1

                for item in iterator:
                    if (('text' in item) and (item[u'id'] not in seenTweetIds) and (item[u'retweeted'] == False)):

                        rawTextClean1 = item[u'text'].encode('utf-8')
                        rawTextClean2 = rawTextClean1.strip()
                        rawTextClean3 = rawTextClean2.replace("#", " ")  # '#' becomes a space; the tag word is kept
                        rawTextClean4 = re.sub(r'https?:\/\/.*[\r\n]*', '', rawTextClean3, flags=re.MULTILINE)  # removes a URL and the rest of its line

                        if (25 < len(rawTextClean4)) and (len(item[u'text']) < 140):  # take tweets with sufficient text
                            seenTweetIds.add(item[u'id'])
                            tweetsRecorded += 1

                            rawEnText = TranslationModule.getEnglish(rawTextClean4)
                            fineEnText = rawEnText.replace(",", " ").replace(";", " ")
                            # unicode(annotation) instead of annotation.lower():
                            # the default annotation=None must not crash here.
                            print(str(tweetsRecorded) + ":\t" + item[u'lang'] + ",\t\t" + unicode(annotation).lower() + "\t\t:" + queryText + "\t\t:" + str(len(fineEnText)) + "\n\t:" + fineEnText)

                            emoVector = self.getEmoTaggerObject().consolodateResult(fineEnText)
                            keyRes = sorted(emoVector)
                            listRes = [emoVector[key] for key in keyRes]
                            print(listRes, keyRes)

                            # Keep only the entries of the printed list that
                            # Utility.RepresentsNum accepts.
                            listStr1 = str(listRes).replace(",", " ")
                            listStr2 = listStr1[1:-1]
                            listStr3 = listStr2.split()
                            listVector = [float(i) for i in listStr3 if Utility.RepresentsNum(i)]

                            emoLabel = annotation
                            if len(listVector) != 0:
                                assert (len(listVector) == 8)  # emo-vector length should be 8;
                                emoTypesCount = sum(1 for score in listVector if score > 0.0)

                                # Tweets without any, or with too many,
                                # emotions are neither written nor classified.
                                if emoTypesCount == 0:
                                    print(">> No Emotion \n\n\n")
                                    continue
                                elif emoTypesCount >= 5:
                                    print(">> Mixed Emotion \n\n\n")
                                    continue

                            if isTrain == True:
                                f1.write(unicode(item[u'id_str']) + "," + unicode(item[u'created_at']) + "," + unicode(item[u'lang']) + "," + unicode(emoLabel).lower() + "," + unicode(fineEnText).replace("\n", " ").replace("\r", " ") + "," + "\n")
                                f1.flush()
                                os.fsync(f1.fileno())
                            else:
                                Supervised.getPrediction(npVector=numpy.array([listRes]), model='NBC')
                                Supervised.getPrediction(npVector=numpy.array([listRes]), model='SVC')

                            if (tweetsRecorded >= MAX_TWEET) or (reTryCount >= MAX_TRIES):
                                print("\n ReTry Count: " + str(reTryCount) + "\n\n")
                                print(u"\n\n\n")
                                return

                            print(u"\n\n\n")

                    elif 'disconnect' in item:
                        event = item['disconnect']
                        reTryCount += 1

                        if event['code'] in [2, 5, 6, 7]:
                            # something may or may NOT need to be fixed before re-connecting
                            raise Exception(event['reason'])
                        else:
                            # temporary interruption, re-try request
                            break

                    elif (iteratorRunCount > 0) and (tweetsRecorded < MAX_TWEET):  # Condition when no more unique tweets are found, go back
                        # TODO: think of better ways to handle this
                        if queryText.startswith('#'):
                            # Hashtag queries return silently. (A retry
                            # without the leading '#' used to follow this
                            # return but was unreachable.)
                            return
                        print("\n No more tweets as of now \n\n")
                        print(u"\n\n\n")
                        return

            except TwitterRequestError as e:
                reTryCount += 1
                if e.status_code < 500:
                    print("\n\n" + "MJAGLAN EXCEPTION:\n" + str(e) + "\n\n")
                # otherwise: temporary interruption, re-try request

            except TwitterConnectionError:
                # temporary interruption, re-try request
                reTryCount += 1
    def liveFeedsByLocation(self, api=None, locationArea="New York City, NY", filePath=None):
        """Stream live tweets of an area and aggregate emotions per language.

        Every message with a ``text`` field is cleaned, translated, scored by
        the emotion tagger and printed. Per language, the number of tweets
        and the element-wise sum of their score vectors are accumulated.
        Whenever the stream fails with a request or connection error, the
        current totals are appended to ``filePath`` and the stream is
        re-opened after a 60 second pause. The method only ends by raising;
        the output file is closed when that happens.

        Args:
            api: API client exposing ``request(...)``; defaults to
                ``self.getAppObject()``.
            locationArea: Area name handed to ``GeoLocationModule.getGeoArea``.
            filePath: File the per-language summary is appended to. With
                ``None`` nothing is written and the totals are only kept in
                memory (previously ``None`` crashed in ``open``).

        Raises:
            SystemExit: When ``filePath`` equals
                ``self.globalCSVDataStorePath``.
            Exception: On a ``disconnect`` message with code 2, 5, 6 or 7.
        """
        if api is None:
            api = self.getAppObject()

        if filePath == self.globalCSVDataStorePath:
            print("WARNING: Attempt to write on Train Data File!")
            # exit() is injected by the site module and is not guaranteed to
            # exist; raise directly.
            raise SystemExit(-1)

        f2 = open(filePath, 'a+') if filePath is not None else None

        langCount = {}  # language code -> number of tweets seen
        langEmo = {}    # language code -> element-wise sum of score vectors
        queryParam = {}
        queryParam['locations'] = GeoLocationModule.getGeoArea(area=locationArea)
        queryParam['rpp'] = 100
        recordCount = 0

        def writeSummary():
            # Append the current per-language totals to the output file and
            # force them to disk.
            if f2 is None:
                return
            f2.write(u'LOCATION' + "," + u"LANGUAGE" + "," + u"LANGUAGE_COUNT" + "," + u'EMO VECTOR' + "," + "\n")
            for aKey in langCount:
                f2.write(unicode(locationArea) + "," + unicode(aKey) + "," + unicode(langCount[aKey]) + "," + unicode(langEmo[aKey]) + "," + "\n")
            f2.write("\n\n\n")
            f2.flush()
            os.fsync(f2.fileno())

        try:
            while True:
                try:
                    # Let's take 60 seconds pause before next API call;
                    # 2017 twitter rate limit is 15 API calls per 15 mins in total per account;
                    time.sleep(60)

                    iterator = api.request('statuses/filter', queryParam).get_iterator()
                    for item in iterator:
                        if 'text' in item:
                            lang = item[u'lang']
                            langCount[lang] = langCount.get(lang, 0) + 1

                            rawTextClean1 = item[u'text'].encode('utf-8')
                            rawTextClean2 = rawTextClean1.strip()
                            rawTextClean3 = rawTextClean2.replace("#", " ")  # '#' becomes a space; the tag word is kept
                            rawTextClean4 = re.sub(r'https?:\/\/.*[\r\n]*', '', rawTextClean3, flags=re.MULTILINE)  # removes a URL and the rest of its line
                            rawEnText = TranslationModule.getEnglish(rawTextClean4)
                            fineEnText = rawEnText.replace(",", " ").replace(";", " ")
                            emoVector = self.getEmoTaggerObject().consolodateResult(fineEnText)
                            keyRes = sorted(emoVector)
                            listRes = [emoVector[key] for key in keyRes]

                            ##### CONSOLE #
                            print('{}: {}'.format(lang, rawTextClean2))
                            print(zip(keyRes, listRes))
                            print('\n\n\n')

                            # Keep only the entries of the printed list that
                            # Utility.RepresentsNum accepts.
                            listStr1 = str(listRes).replace(",", " ")
                            listStr2 = listStr1[1:-1]
                            listStr3 = listStr2.split()
                            listVector = [float(i) for i in listStr3 if Utility.RepresentsNum(i)]
                            if len(listVector) == 0:
                                # No numeric scores: count the tweet with an
                                # all-zero vector. Take the length from any
                                # vector already stored, else from the tagger
                                # result; indexing langEmo.keys()[0] raised
                                # IndexError while langEmo was still empty.
                                vectorLength = next(
                                    (len(v) for v in langEmo.values() if v),
                                    len(listRes))
                                listVector = [0.0] * vectorLength

                            if langEmo.get(lang):
                                langEmo[lang] = [sum(x) for x in zip(langEmo[lang], listVector)]
                            else:
                                # first vector for this language, or it
                                # replaces an empty placeholder
                                langEmo[lang] = listVector

                            assert len(langCount) == len(langEmo)  ################################## ASSERT #
                            recordCount += 1

                        elif 'disconnect' in item:
                            event = item['disconnect']
                            if event['code'] in [2, 5, 6, 7]:
                                # something needs to be fixed before re-connecting
                                raise Exception(event['reason'])
                            else:
                                # temporary interruption, re-try request
                                break

                except TwitterRequestError as e:
                    print("record count: " + str(recordCount))
                    writeSummary()

                    if e.status_code < 500:
                        # something needs to be fixed before re-connecting
                        print("\n\nSomething needs to be fixed before re-connecting:\n" + str(e) + "\n\n")
                    # either way the loop re-tries the request

                except TwitterConnectionError:
                    writeSummary()
                    # temporary interruption, re-try request
        finally:
            # The loop only ends through an exception; do not leak the file.
            if f2 is not None:
                f2.close()