Example #1
def visualize_word(db,timeWindow,given_word,**kwargs):

  print "COLLECTING TWEETS...."
  TS = TweetSnap(db=db,timeWindow = timeWindow,Placename2Geocode=False)
  print "COLLECTION OVER...."

  TIME_START = kwargs.get("TIME_START",time.gmtime(0))
  TIME_END   = kwargs.get("TIME_END",time.gmtime(time.time()))

  if isinstance(TIME_START,str):
    TIME_START  = time.gmtime(time.mktime(time.strptime(TIME_START,"%d %b %H:%M %Z %Y")))
  if isinstance(TIME_END,str):
    TIME_END    = time.gmtime(time.mktime(time.strptime(TIME_END,"%d %b %H:%M %Z %Y")))

  TIME_DIFF   = time.mktime(TIME_START)  - time.mktime(TS.time_start)

  if TIME_DIFF>0:
    TS.move_on(TIME_DIFF-timeWindow)

  Virality = []
  Volume   = []
  Locality = []
  TimeWindow=[]
  Word     = []

  while (TS.time_start<TIME_END and not TS.end):

    #Capture nextSnap and initialize time_start of next snap
    snap = TS.next()
    if len(snap['TEXT'])<100:
      continue
    #gmm  = GMM_clustering()
    for item in given_word:
      #1. Virality
      #Virality.append(PoissonRate(snap,given_word=item))
      #2. Locality
      #gmm.Snap = snap
      #gmm.build_clusters()
      #Locality.append(GeographicalEntropy(snap,gmm.labels,given_word=item))
      #3. Volume
      Volume.append(Count(snap,given_word=item))
      #4. TimeWindow
      TimeWindow.append(snap['TimeWindow'][0])
      #5. Word
      Word.append(item)

  #Prepare Dataframe (only the populated columns; the Virality and Locality scorers above
  #are commented out, so those lists stay empty)
  df             = pd.DataFrame({'Volume':Volume,'TimeWindow':TimeWindow,'Word':Word})

  return df
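
A minimal usage sketch for visualize_word, assuming db is an open tweet database handle accepted by TweetSnap and that the module's imports (pandas as pd, time) and helpers are available; the tracked word and the date strings (in the "%d %b %H:%M %Z %Y" format the function parses) are placeholders.

# Hypothetical call; db and the word list are placeholders.
df = visualize_word(db, 60 * 10, ["marathon"],
                    TIME_START="15 Apr 14:00 UTC 2014",
                    TIME_END="15 Apr 20:00 UTC 2014")
print df.head()
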
Example #2
    def __init__(self, db, timeWindow=60 * 10, **kwargs):

        print "COLLECTING TWEETS...."
        self.TS = TweetSnap(db=db, timeWindow=timeWindow, Placename2Geocode=False)
        print "COLLECTION OVER...."

        # Variables
        self.SnapStack = []
        self.Candidates = {}
        self.Volume = []

        # Constants
        self.delta = 1.5
        self.enoughSamples = 15.0
        self.SnapLim = 6
        self.StopNewsWords = ["Boston", "day", "time", "love", "today", "Boston-MA"]
        # Set TIME_FRAME
        self.SetStart(kwargs.get("TIME_START", time.gmtime(0)))

        # Storage variables for analysis
        self.Storage = []
        self.StorageDict = pd.DataFrame(
            columns=["word", "Poisson", "LocalEntropy", "GlobalEntropy", "start_time", "event"]
        )
        self.ResultDict = pd.DataFrame(columns=["word", "event_time", "location", "discovered_time", "summary"])

        # Classifier
        self.matrix_w, self.scaler, self.clf = cPickle.load(open("SVClassifier.Store"))

        # Verbosity - 1. Print all messages 2. Print less messages 3. .....
        self.VerboseLevel = kwargs.get("VerboseLevel", 3)
Example #3
  def __init__(self,db,timeWindow=60*10,**kwargs):

    print "COLLECTING TWEETS...."
    self.TS = TweetSnap(db=db,timeWindow = timeWindow,Placename2Geocode=False)
    print "COLLECTION OVER...."

    #Variables
    self.QueueStack     = []
    self.Candidates    = {}
    self.Vocabulary		= []

    #Constants
    self.delta    				 = 3  #GaussianDistortion
    self.MinWordSamples     = 15.0 #Has to be greater than 8 See SetFeatureTable method for this restriction

    self.QueueLim           = 6  #MaximumQueueLimit

    self.StopNewsWords     = ['Boston', 'day', 'time', 'love', 'today', 'Boston-MA']  #Default StopWordList

    #Set TIME_FRAME
    self.SetStart(kwargs.get("TIME_START",time.gmtime(0)))

    #Storage variables for analysis
    self.FeatureDict = pd.DataFrame(columns=['word','Poisson','LocalEntropy','GlobalEntropy','start_time','event'])
    self.ResultDict = pd.DataFrame(columns=['word','event_time','location','discovered_time','summary'])

    #Classifier
    self.matrix_w, self.scaler, self.clf = cPickle.load(open('SVClassifier.Store'))

    #Verbosity - 1. Print all messages 2. Print less messages 3. .....
    self.OnlyMessage = kwargs.get('OnlyMessage',0)
Example #4
def set_SnapIter(db,timeWindow,**kwargs):

  print "COLLECTING TWEETS...."
  TS = TweetSnap(db=db,timeWindow = timeWindow,Placename2Geocode=False)
  print "COLLECTION OVER...."

  TIME_START = kwargs.get("TIME_START",time.gmtime(0))
  TIME_END   = kwargs.get("TIME_END",time.gmtime(time.time()))

  if isinstance(TIME_START,str):
    TIME_START  = time.gmtime(time.mktime(time.strptime(TIME_START,"%d %b %H:%M %Z %Y")))
  if isinstance(TIME_END,str):
    TIME_END    = time.gmtime(time.mktime(time.strptime(TIME_END,"%d %b %H:%M %Z %Y")))

  TIME_DIFF   = time.mktime(TIME_START)  - time.mktime(TS.time_start)

  if TIME_DIFF>0:
    TS.move_on(TIME_DIFF-timeWindow)

  return TS
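
A sketch of consuming the returned iterator, assuming db is a valid database handle; the attributes used (end, next()) and the snap keys are the ones the other examples in this listing rely on.

# Hypothetical usage of the snap iterator returned above.
TS = set_SnapIter(db, 60 * 10, TIME_START="15 Apr 14:00 UTC 2014")
while not TS.end:
    snap = TS.next()
    print snap['TimeWindow'][0], len(snap['TEXT'])  # window label and tweet count
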
Example #5
def visualize_timeframe(db,timeWindow,**kwargs):

  print "COLLECTING TWEETS...."
  TS = TweetSnap(db=db,timeWindow = timeWindow,Placename2Geocode=False)
  print "COLLECTION OVER...."

  TIME_START = kwargs.get("TIME_START",time.gmtime(0))
  TIME_END   = kwargs.get("TIME_END",time.gmtime(time.time()))
  VocabSize   = kwargs.get("VocabSize",500)

  if isinstance(TIME_START,str):
    TIME_START  = time.gmtime(time.mktime(time.strptime(TIME_START,"%d %b %H:%M %Z %Y")))
  if isinstance(TIME_END,str):
    TIME_END    = time.gmtime(time.mktime(time.strptime(TIME_END,"%d %b %H:%M %Z %Y")))

  TIME_DIFF   = time.mktime(TIME_START)  - time.mktime(TS.time_start)

  if TIME_DIFF>0:
    TS.move_on(TIME_DIFF-timeWindow)

  #Create Dataframe
  df             = pd.DataFrame(columns=['Virality','Locality','Volume','Words','TimeWindow'])

  while (TS.time_start<TIME_END and not TS.end):

    #InitializeColumns
    Virality = {}
    Volume   = {}
    Locality = {}
    TimeWindow=[]

    #Capture nextSnap and initialize time_start of next snap
    snap = TS.next()
    if len(snap['TEXT'])<100:
      continue
    gmm  = GMM_clustering()
    #1. Virality
    Virality  = te(snap)
    #2. Locality
    gmm.Snap = snap
    gmm.build_clusters()
    Locality = GeographicalEntropy(snap,gmm.labels)
    #3. Volume
    Volume   = Count(snap)

    #HotWords= set(dict(Virality.most_common(HotWordSize)).keys())&set(dict(Broadcast.most_common(HotWordSize)).keys())&set(dict(Locality.most_common(HotWordSize)).keys())&set(dict(Prevalence.most_common(HotWordSize)).keys())&set(dict(Volume.most_common(HotWordSize)).keys())
    Words= list(set(dict(Virality.most_common(VocabSize)).keys())&set(dict(Locality.most_common(VocabSize)).keys())&set(dict(Volume.most_common(VocabSize)).keys()))
    if not len(Words)>0:
      continue

    Virality= [Virality[key] if key in Virality.keys() else 0 for key in Words]
    # Broadcast=[Broadcast[key] if key in Broadcast.keys() else 0 for key in HotWords]
    Locality= [Locality[key] if key in Locality.keys() else 0 for key in Words]
    # Prevalence=[Prevalence[key] if key in Prevalence.keys() else 0 for key in HotWords]
    Volume=[Volume[key] if key in Volume.keys() else 0 for key in Words]

    #4. TimeWindow
    TimeWindow= [snap['TimeWindow'][0]]*len(Words)

    #Append to Dataframe
    df = df.append({'Virality':Virality,'Locality':Locality,'Volume':Volume,'Words':Words,'TimeWindow':TimeWindow},ignore_index=True)

  return df
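
Each row of the frame returned by visualize_timeframe holds parallel lists for one time window. A hedged sketch of flattening it into one row per (word, window), assuming pandas is imported as pd as in the examples above:

# Illustrative post-processing only; column names match the frame built above.
df = visualize_timeframe(db, 60 * 10, VocabSize=200)
rows = []
for _, row in df.iterrows():
    for word, vi, lo, vo, tw in zip(row['Words'], row['Virality'],
                                    row['Locality'], row['Volume'], row['TimeWindow']):
        rows.append({'Word': word, 'Virality': vi, 'Locality': lo,
                     'Volume': vo, 'TimeWindow': tw})
flat = pd.DataFrame(rows)
print flat.head()
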
Example #6
def newsworthy_words(db,timeWindow,**kwargs):

  print "COLLECTING TWEETS...."
  TS = TweetSnap(db=db,timeWindow = timeWindow,Placename2Geocode=False)
  print "COLLECTION OVER...."

  TIME_START = kwargs.get("TIME_START",time.gmtime(0))
  TIME_END   = kwargs.get("TIME_END",time.gmtime(time.time()))
  HotWordSize = kwargs.get("HotWordSize",25)


  if isinstance(TIME_START,str):
    TIME_START  = time.gmtime(time.mktime(time.strptime(TIME_START,"%d %b %H:%M %Z %Y")))
  if isinstance(TIME_END,str):
    TIME_END    = time.gmtime(time.mktime(time.strptime(TIME_END,"%d %b %H:%M %Z %Y")))

  TIME_DIFF   = time.mktime(TIME_START)  - time.mktime(TS.time_start)

  if TIME_DIFF>0:
    TS.move_on(TIME_DIFF-timeWindow)


  Day = {}

  while (TS.time_start<TIME_END and not TS.end):

    #Capture nextSnap and initialize time_start of next snap
    snap = TS.next()
    if len(snap['TEXT'])<100:
      continue
    gmm  = GMM_clustering()
    #1. Virality
    Virality  = PoissonRate(snap)
    #2. DeltaVolume
    #Broadcast = DeltaVolume(snap0,snap)
    #3. Locality
    gmm.Snap = snap
    gmm.build_clusters()
    Locality  = GeographicalEntropy(snap,gmm.labels)
    #4. Prevalence
    #Prevalence= Ttest(snap)
    #5. Count
    Volume     = Count(snap)

    #Prepare Dataframe
    #Union
    #HotWords= list(set(dict(Virality.most_common(HotWordSize)).keys()+dict(Broadcast.most_common(HotWordSize)).keys()+dict(Locality.most_common(HotWordSize)).keys()+dict(Prevalence.most_common(HotWordSize)).keys()+dict(Volume.most_common(HotWordSize)).keys()))
    #Intersection
    #print "Simmering words"
    #print 'Virality',set(dict(Virality.most_common(HotWordSize)).keys())
    #print 'Broadcast',set(dict(Broadcast.most_common(HotWordSize)).keys())
    #print 'Locality',set(dict(Locality.most_common(HotWordSize)).keys())
    #print set(dict(Prevalence.most_common(HotWordSize)).keys())
    #print 'Volume',set(dict(Volume.most_common(HotWordSize)).keys())
    #print "*"*5

    #HotWords= set(dict(Virality.most_common(HotWordSize)).keys())&set(dict(Broadcast.most_common(HotWordSize)).keys())&set(dict(Locality.most_common(HotWordSize)).keys())&set(dict(Prevalence.most_common(HotWordSize)).keys())&set(dict(Volume.most_common(HotWordSize)).keys())
    HotWords= list(set(dict(Virality.most_common(HotWordSize)).keys())&set(dict(Locality.most_common(HotWordSize)).keys())&set(dict(Volume.most_common(HotWordSize)).keys()))
    if not len(HotWords)>0:
      continue

    Virality= [Virality[key] if key in Virality.keys() else 0 for key in HotWords]
    # Broadcast=[Broadcast[key] if key in Broadcast.keys() else 0 for key in HotWords]
    Locality= [Locality[key] if key in Locality.keys() else 0 for key in HotWords]
    # Prevalence=[Prevalence[key] if key in Prevalence.keys() else 0 for key in HotWords]
    Volume=[Volume[key] if key in Volume.keys() else 0 for key in HotWords]

    #scaler           = preprocessing.MinMaxScaler([0,100]).fit_transform
    #scaledVirality   = list(scaler(np.array([Virality]).T).flatten())
    # scaledBroadcast  = scaler(Broadcast)
    #scaledLocality   = list(scaler(np.array([Locality]).T).flatten())
    # scaledPrevalence = scaler(Prevalence)
    #scaledVolume     = list(scaler(np.array([Volume],dtype=np.float16).T).flatten())
    Score            = [vi+lo+vo for vi,lo,vo in zip(Virality,Locality,Volume)]

    df             = pd.DataFrame({'Words':HotWords,'Virality':Virality,'Locality':Locality,'Volume':Volume,'Score':Score})
    #df_scaled      = pd.DataFrame({'Words':HotWords,'Virality':scaledVirality,'Locality':scaledLocality,'Volume':scaledVolume,'Score':Score})

    Day['to'.join(snap['TimeWindow'])]=df

  return Day
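
newsworthy_words returns a dict keyed by time-window label with one score frame per window. A small ranking sketch, assuming a pandas version that provides sort_values; db and the sizes are placeholders:

# Illustrative only: print the five highest-scoring words per window.
Day = newsworthy_words(db, 60 * 10, HotWordSize=25)
for window, frame in Day.items():
    top = frame.sort_values('Score', ascending=False).head(5)
    print window
    print top[['Words', 'Score']]
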
Example #7
def print_vocabulary_report(db,scale=60*20,**kwargs):

  print "COLLECTING TWEETS...."
  TS = TweetSnap(db=db,timeWindow = scale,Placename2Geocode=False)
  print "COLLECTION OVER...."

  TIME_START = kwargs.get("TIME_START",time.gmtime(0))
  TIME_END   = kwargs.get("TIME_END",time.gmtime(time.time()))
  HotWordSize = kwargs.get("HotWordSize",8)

  if isinstance(TIME_START,str):
    TIME_START  = time.gmtime(time.mktime(time.strptime(TIME_START,"%d %b %H:%M %Z %Y")))
  if isinstance(TIME_END,str):
    TIME_END    = time.gmtime(time.mktime(time.strptime(TIME_END,"%d %b %H:%M %Z %Y")))

  TIME_DIFF   = time.mktime(TIME_START)  - time.mktime(TS.time_start)

  if TIME_DIFF>0:
    TS.move_on(TIME_DIFF-scale)

  volume = []
  HotWordsList = []
  ColorGradient = {}
  TweetCountDict    = {}
  TimeList      = []

  while (TS.time_start<TIME_END and not TS.end):

    #Capture nextSnap and initialize time_start of next snap
    snap = TS.next()
    timeWindow = gmt_to_local(TS.time_start,make_string=True,format='%a %H:%M')
    #Volume of tweets
    volume.append(len(snap['LOC']))

    #HotWords List
    Vocab_dict = dict(get_vocabulary(snap['TEXT']).most_common(HotWordSize))
    TimeList.append(timeWindow)

    ColorGradient[timeWindow] = {}

    for word in Vocab_dict.keys():
      ColorGradient[timeWindow][word] = Vocab_dict[word]/float(sum(Vocab_dict.values()))
      if word in TweetCountDict.keys():
        TweetCountDict[word] += Vocab_dict[word]
      else:
        TweetCountDict[word] = Vocab_dict[word]
    print "LOOPING2"



  SortedTweetCount = sorted(TweetCountDict.iteritems(),key=operator.itemgetter(1))
  WordList         = [item[0] for item in SortedTweetCount]
  TweetCountArray = np.array([item[1] for item in SortedTweetCount],dtype=int)
  del SortedTweetCount


  ColorMap = np.empty([len(WordList),len(TimeList)],dtype=float)

  for rw,word in enumerate(WordList):
    for cl,timeWindow in enumerate(TimeList):
      if word in ColorGradient[timeWindow].keys():
        ColorMap[rw][cl] = ColorGradient[timeWindow][word]
      else:
        ColorMap[rw][cl] = 0

  ###PRINT RESULTS
  gs      = gridspec.GridSpec(2,2,width_ratios=[1,2],height_ratios=[1,4])
  gs.update(left=0.05,right=0.48,wspace=5e-41,hspace=5e-41)

  fig1    = plt.figure(figsize=(36,90),dpi=200)


  ax0     = fig1.add_subplot(gs[0,1])
  ax1     = fig1.add_subplot(gs[1,1])
  ax2     = fig1.add_subplot(gs[1,0])
  ax3     = fig1.add_subplot(gs[0,0])

  #TweetVolume
  ax0.grid(True, 'major', color='w', linestyle='-', linewidth=0.7)
  ax0.grid(True, 'minor', color='0.92', linestyle='-', linewidth=0.35)
  ax0.set_axis_bgcolor('0.95')

  ASCII_WordList = [ word.encode('ascii','ignore') for word in WordList ]
  ax0.plot(np.arange(len(TimeList)),volume,label='NumberOfTweets',linewidth=0.75)
  ax0.legend(loc='upper left',ncol=4)
  ax0.set_xlim(0,len(TimeList)-1)
  ax0.xaxis.tick_top()
  ax0.yaxis.tick_right()
  ax0.set_xticks(np.arange(0,len(TimeList),5))
  ax0.set_xticklabels(TimeList,rotation='vertical')

  #HotWordColorMap
  ax1.imshow(ColorMap,cmap=plt.cm.binary,vmin=ColorMap.min(),vmax=ColorMap.max(),aspect='auto',origin='lower')
  ax1.yaxis.tick_right()
  ax1.set_yticks(np.arange(len(WordList)))
  ax1.set_yticklabels(WordList)
  ax1.set_xticks(np.arange(0,len(TimeList),5))
  ax1.set_xticklabels(TimeList,rotation='vertical')

  ax1.grid(True, 'major', color='w', linestyle='-', linewidth=0.7)
  ax1.grid(True, 'minor', color='0.92', linestyle='-', linewidth=0.35)

  #TweetVolumeDistributionOverHotWords
  ax2.grid(True, 'major', color='w', linestyle='-', linewidth=0.7)
  ax2.grid(True, 'minor', color='0.92', linestyle='-', linewidth=0.35)
  ax2.set_axis_bgcolor('0.95')

  ax2.invert_xaxis()
  ax2.barh(np.arange(len(WordList)),TweetCountArray,align='center')

  #add the numbers to the side of each bar
  PreviousValue = None
  for p, ch in zip(np.arange(len(WordList)), TweetCountArray):
      if ch!=PreviousValue:
        ax2.annotate(str(ch), xy=(ch + 2.5, p - 0.25), va='center')
        PreviousValue = ch
      else:
        continue


  ax2.set_yticks(np.arange(len(WordList)))
  ax2.set_yticklabels(WordList)#,rotation='horizontal')
  ax2.set_ylim(0,len(WordList)-1+0.25)

  #Plot table with assisting information
  #1. Date : Day, Date Year and TIME_START to TIME_END
  #2. TIME_START
  #3. TIME_END
  #4. TIME_WINDOW
  #5. No. of HotWords per TimeWindow
  #6. Total No. of unique HotWords Found
  #7. Max #of Tweets for HotWord & HotWord
  #8. Min #of Tweets for HotWord & HotWord
  #9. Max #of Tweets in a timeWindow & timeWindow
  #10.Mix #of Tweets in a timeWindow & timeWindow

  rowLabels = ['1. Date','2. Start time','3. End time','4. Time Window (seconds)','5. No.Of HotWords per TimeWindow','6. No. of unique hotwords','7. Max #of tweets for HotWord','8. Min #of tweets for HotWord','9. Max #of tweets in a time window','10. Min #of tweets in a time window']
  DateStart = gmt_to_local(TIME_START,make_string=True,format='%a %d %b %Y')
  DateEnd   = gmt_to_local(TIME_END,make_string=True,format='%a %d %b %Y')
  Date      = DateStart if DateStart==DateEnd else DateStart+' to '+DateEnd
  start_time= gmt_to_local(TIME_START,make_string=True,format='%d %b %H:%M')
  end_time  = gmt_to_local(TIME_END,make_string=True,format='%d %b %H:%M')
  cellText  = [Date,start_time,end_time,scale,HotWordSize,len(set(WordList)),TweetCountArray.max(),TweetCountArray.min(),str(max(volume)),str(min(volume))]
  rowLabels.reverse()
  cellText.reverse()
  colLabels = ['Value']
  for y, label, text in zip(range(len(cellText)),rowLabels,cellText):
    ax3.text(0.05,(float(y)/20)+0.05,s='%s : %s'%(label,text),size=20)
  ax3.xaxis.set_visible(False)
  ax3.yaxis.set_visible(False)

  fig1.savefig('%s_to_%s.png'%(start_time,end_time),dpi=200,bbox_inches="tight")
  plt.close(fig1)
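
print_vocabulary_report saves the whole figure to '<start>_to_<end>.png' and returns nothing. A minimal invocation sketch; db, the date strings, and the window size are placeholders:

print_vocabulary_report(db, scale=60 * 20,
                        TIME_START="15 Apr 08:00 UTC 2014",
                        TIME_END="15 Apr 23:00 UTC 2014",
                        HotWordSize=8)
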
Example #8
class NewsWorthyWords:
    def __init__(self, db, timeWindow=60 * 10, **kwargs):

        print "COLLECTING TWEETS...."
        self.TS = TweetSnap(db=db, timeWindow=timeWindow, Placename2Geocode=False)
        print "COLLECTION OVER...."

        # Variables
        self.SnapStack = []
        self.Candidates = {}
        self.Volume = []

        # Constants
        self.delta = 1.5
        self.enoughSamples = 15.0
        self.SnapLim = 6
        self.StopNewsWords = ["Boston", "day", "time", "love", "today", "Boston-MA"]
        # Set TIME_FRAME
        self.SetStart(kwargs.get("TIME_START", time.gmtime(0)))

        # Storage variables for analysis
        self.Storage = []
        self.StorageDict = pd.DataFrame(
            columns=["word", "Poisson", "LocalEntropy", "GlobalEntropy", "start_time", "event"]
        )
        self.ResultDict = pd.DataFrame(columns=["word", "event_time", "location", "discovered_time", "summary"])

        # Classifier
        self.matrix_w, self.scaler, self.clf = cPickle.load(open("SVClassifier.Store"))

        # Verbosity - 1. Print all messages 2. Print less messages 3. .....
        self.VerboseLevel = kwargs.get("VerboseLevel", 3)

    def verbose(self, text, level=1):
        if level < self.VerboseLevel:
            return
        else:
            print text

    def SetStart(self, TIME_START):
        if isinstance(TIME_START, str):
            TIME_START = time.gmtime(time.mktime(time.strptime(TIME_START, "%d %b %H:%M %Z %Y")))
        TIME_DIFF = time.mktime(TIME_START) - time.mktime(self.TS.time_start)
        if TIME_DIFF > 0:
            self.TS.move_on(TIME_DIFF)

    def run(self):

        while not self.TS.end:

            # Update SnapStack
            if len(self.SnapStack) == self.SnapLim:
                self.SnapStack = self.SnapStack[1:]
                self.Volume = self.Volume[1:]
            self.SnapStack.append(self.TS.next())
            self.Volume.append(Count(self.SnapStack[-1]))

            # Update Candidates origin snap as timeWindow has shifted right
            for key, val in self.Candidates.items():
                if val == -self.SnapLim:
                    self.Candidates.pop(key)
                    self.verbose("This %s word has been removed because it never received enough samples" % key)
                else:
                    self.Candidates[key] = val - 1

            print ("Latest timeWindow %s" % self.SnapStack[-1]["TimeWindow"], 2)
            # Algorithm
            self.verbose("Print looking for new events which happened in this timeWindow", 2)
            self.FindNewEvent()

            self.verbose("Print confirming old/new candidate events which have not been published")
            self.ConfirmEvent()

            if self.Candidates.keys() != []:
                self.verbose("EventCandidates: %s" % self.Candidates.keys(), 2)

    def TotalVolume(self, word, Volume):

        total = 0.0
        k = 0

        while k < len(Volume):
            if word in Volume[k].keys():
                total += Volume[k][word]
            k += 1

        return total if total != 0 else 1

    def FindNewEvent(self):

        for word, count in self.Volume[-1].items():

            # Is word count gaussian noise or signal ?
            wordHistory = [float(vol[word]) for vol in self.Volume[:-1] if word in vol.keys()]
            mean = np.mean(wordHistory) if len(wordHistory) > 0 else 1
            var = np.std(wordHistory) if len(wordHistory) >= 5 else 1

            std_score = (count - mean) / (2 * var)

            if std_score >= self.delta and (word not in self.StopNewsWords):

                self.verbose("This %s is not gaussian noise with standard_score = %f " % (word, std_score))
                if word not in self.Candidates.keys() or (self.Volume[self.Candidates[word]][word] < count):
                    self.Candidates[word] = -1

    def ConfirmEvent(self):

        for word, no in self.Candidates.items():

            wordHistory = [float(vol.get(word, 0.0)) for vol in self.Volume[no:]]
            self.verbose(
                "Confirming candidate Newsword : %s at time = %s with samples=%d and Snapno=%d"
                % (word, self.SnapStack[no]["TimeWindow"][0], sum(wordHistory), no),
                2,
            )
            if sum(wordHistory) >= self.enoughSamples:
                self.verbose(
                    "This %s word has enough samples from tweets to calculate scores (Poisson,LocalEntropy,StandardDeviation)"
                    % (word),
                    2,
                )
                # Poisson
                Poisson = self.FitPoissonDistribution(word, no)
                # Global and Local Entropy
                GlobalEntropy, LocalEntropy = self.FitSpatialEntropy(word, no)

                # Classifier
                # Define feature vector
                X = np.array([Poisson, LocalEntropy, GlobalEntropy], dtype=np.float64)
                # Apply Scaler
                X_sc = self.scaler.transform(X)
                # Apply Orthogonality
                X_tr = X_sc.dot(self.matrix_w)
                # Classify new transformed feature vector
                Flag = self.clf.predict(X_tr)[0]

                if Flag == 1:
                    start_time = self.SnapStack[no]["TimeWindow"][0]
                    confirmed_time = self.SnapStack[-1]["TimeWindow"][0]
                    SampleSet = self.ReportEventQueue(word, no)
                    print "Newsword (%s) at %s confirmed at %s\n" % (word, start_time, confirmed_time)
                    print "Summary : "
                    summary = []
                    for user, created_at, tweet, loc in SampleSet:
                        print "%s reported at time %s near %s: %s" % (user, created_at, loc, tweet)
                        # summary.append("%s reported at time %s near %s: %s"%(user,created_at,tweet,GetPlaceName(loc[0],loc[1]))
                        summary.append([user, created_at, tweet, loc])

                    event = {
                        "word": word,
                        "event_time": start_time,
                        "location": GetPlaceName(
                            np.mean([item[3][0] for item in summary]), np.mean([item[3][1] for item in summary])
                        ),
                        "discovered_time": confirmed_time,
                        "summary": "\n".join(
                            ["%s reported at time %s near %s: %s" % (item[0], item[1], item[3], item[2])]
                        ),
                    }
                    print event
                    self.ResultDict = self.ResultDict.append(event, ignore_index=True)
                    self.Candidates.pop(word)

                else:
                    continue

                # Store Data for post-classification
                self.StorageDict = self.StorageDict.append(
                    {
                        "word": word,
                        "Poisson": Poisson,
                        "LocalEntropy": LocalEntropy,
                        "GlobalEntropy": GlobalEntropy,
                        "start_time": start_time,
                        "event": event,
                    },
                    ignore_index=True,
                )

                # Manual Classifier
                # if flag in ['1','y','yes']:
                # 		print 'This %s word count resembles poisson distribution with lambda=%f'%(word,Lambda)
                # 		self.ReportEventQueue(word,no)
                # 		self.Candidates.pop(word)
                # else:
                # 		print 'This %s word count does not resembles poisson distribution with lambda=%s'%(word,Lambda)

    def FitSpatialEntropy(self, word, no):

        k = no
        tokenize = T_Tokenizer().tokenize
        # Store locations
        ALLLOC = []
        WORDLOC = []

        while k < 0:

            ALLLOC += self.SnapStack[k]["LOC"]
            for order, text in enumerate(self.SnapStack[k]["TEXT"]):
                if word in tokenize(text):
                    WORDLOC.append(self.SnapStack[k]["LOC"][order])

            k += 1

        # Choose Cluster of max ALLLOC, C*
        MakeCluster = GMM_clustering()
        MakeCluster.Snap = {"LOC": ALLLOC}
        MakeCluster.build_clusters()
        WORDLABELS = Counter([MakeCluster.labels[ALLLOC.index(LOC)] for LOC in WORDLOC])

        # Global entropy
        GLOBAL_COUNTER = Counter(MakeCluster.labels)
        G_D_pq = 0.0
        for cl, number in WORDLABELS.items():
            G_D_pq += -1 * (number / float(GLOBAL_COUNTER[cl])) * np.log2(number / float(GLOBAL_COUNTER[cl]))
            # G_D_pq	+= -1*((number/sum(WORDLABELS))/float(GLOBAL_COUNTER[cl]/sum(GLOBAL_COUNTER)))*np.log2(number/float(GLOBAL_COUNTER[cl]))

        C_Star = WORDLABELS.most_common(1)[0][0]
        C_Star_LOC = [ALLLOC[No] for No, label in filter(lambda (enum, x): x == C_Star, enumerate(MakeCluster.labels))]
        C_Star_WORD_LOC = [LOC for LOC in filter(lambda x: x in C_Star_LOC, WORDLOC)]

        # Find D(p||q) of word inside C*
        del MakeCluster
        MakeLocalCluster = GMM_clustering(components=range(2, 8))
        MakeLocalCluster.Snap = {"LOC": C_Star_LOC}
        MakeLocalCluster.build_clusters()

        WORD_LOCAL_COUNTER = Counter([MakeLocalCluster.labels[C_Star_LOC.index(LOC)] for LOC in C_Star_WORD_LOC])
        LOCAL_ALL_COUNTER = Counter(MakeLocalCluster.labels)
        L_D_pq = 0.0
        for cl, number in WORD_LOCAL_COUNTER.items():
            L_D_pq += -1 * (number / float(LOCAL_ALL_COUNTER[cl])) * np.log2(number / float(LOCAL_ALL_COUNTER[cl]))
            # L_D_pq	+= -1*((number/sum(WORD_LOCAL_COUNTER.values()))/float(LOCAL_ALL_COUNTER[cl]/sum(LOCAL_ALL_COUNTER.values())))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))

        return [G_D_pq, L_D_pq]

    def FitStdDev(self, word, no):

        k = no
        tokenize = T_Tokenizer().tokenize
        # Store locations
        WORDLOC = []

        while k < 0:
            for order, text in enumerate(self.SnapStack[k]["TEXT"]):
                if word in tokenize(text):
                    WORDLOC.append(self.SnapStack[k]["LOC"][order])
            k += 1

        return np.std(WORDLOC)

    def FitPoissonDistribution(self, word, no):

        tokenize = T_Tokenizer().tokenize

        k = no
        Times = []

        ApproxTimes = []

        wordHistory = [vol.get(word, 0) for vol in self.Volume[no:]]

        # Store all tweet_times with word in current snap and known history
        while k < 0:

            approx = time.mktime(time.strptime(self.SnapStack[k]["TimeWindow"][0] + "2014EDT", "%d%b%HHR%MMN%Y%Z"))
            count = self.Volume[k].get(word, 0)
            ApproxTimes += [approx] * count

            for order, text in enumerate(self.SnapStack[k]["TEXT"]):
                if word in tokenize(text):
                    Times.append(
                        time.mktime(time.strptime(self.SnapStack[k]["CREATED_AT"][order], "%d %b %H:%M:%S %Y"))
                    )
            k += 1

        # Calculate time-intervals
        TimeIntervals = [Time - min(Times) for Time in Times]
        ApproxTimeIntervals = sorted([approx - min(ApproxTimes) for approx in ApproxTimes])
        TimeIntervals.sort()
        self.verbose("Have a look at TimeIntervals(1) and ApproxTimeIntervals(2) and LogLikelihood(3)")
        self.verbose("(1) %s" % TimeIntervals)
        self.verbose("(2) %s" % ApproxTimeIntervals)

        ApproxTimeIntervals = Counter(ApproxTimeIntervals)

        # Calculate ML_Lmbda (maximum-likelihood Poisson rate)
        if sum(ApproxTimeIntervals) != 0:
            _lmbda = float(len(ApproxTimeIntervals)) / sum(ApproxTimeIntervals)
        else:
            _lmbda = float(len(TimeIntervals)) / sum(TimeIntervals)

        return _lmbda

    def ReportEventQueue(self, word, no, SampleLim=3):

        # Find clusters at start point of event
        gmm = GMM_clustering(components=range(4, 15))
        gmm.Snap = self.SnapStack[no]
        gmm.build_clusters()
        Labels = []
        tokenize = T_Tokenizer().tokenize
        for k, text in enumerate(gmm.Snap["TEXT"]):
            if word in tokenize(text):
                Labels.append(gmm.labels[k])
        Labels = Counter(Labels)
        # Find cluster where word was most common
        StarLabel = Labels.most_common(1)[0][0]

        SampleSet = []
        # Print a tweet from that cluster
        for k, text in enumerate(gmm.Snap["TEXT"]):
            if gmm.labels[k] == StarLabel and word in tokenize(text):
                SampleSet.append((gmm.Snap["SCREEN_NAME"][k], gmm.Snap["CREATED_AT"][k], text, gmm.Snap["LOC"][k]))
            if len(SampleSet) >= SampleLim:
                break

        return SampleSet
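
A sketch of driving the detector above end to end, assuming db is available and the pickled 'SVClassifier.Store' file sits in the working directory as the constructor expects; confirmed events accumulate in ResultDict. The date string is a placeholder.

detector = NewsWorthyWords(db, timeWindow=60 * 10,
                           TIME_START="15 Apr 14:00 UTC 2014",
                           VerboseLevel=2)
detector.run()
print detector.ResultDict[['word', 'event_time', 'location']]
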
Example #9
class NewsWorthyWords:

	def __init__(self,db,timeWindow=60*10,**kwargs):

		print "COLLECTING TWEETS...."
		self.TS = TweetSnap(db=db,timeWindow = timeWindow,Placename2Geocode=False)
		print "COLLECTION OVER...."

		#Variables
		self.SnapStack = []
		self.Candidates= {}
		self.Volume		= []

		#Constants
		self.delta    				 = 1.5
		self.enoughSamples     = 15.0
		self.SnapLim           = 6
		self.StopNewsWords     = ['Boston', 'day', 'time', 'love', 'today', 'Boston-MA']
		#Set TIME_FRAME
		self.SetStart(kwargs.get("TIME_START",time.gmtime(0)))

		#Storage variables for analysis
		self.Storage = []
		self.StorageDict = pd.DataFrame(columns=['word','Poisson','LocalEntropy','GlobalEntropy','start_time','event'])
		self.ResultDict = pd.DataFrame(columns=['word','event_time','location','discovered_time','summary'])

		#Classifier
		self.matrix_w, self.scaler, self.clf = cPickle.load(open('SVClassifier.Store'))

		#Verbosity - 1. Print all messages 2. Print less messages 3. .....
		self.VerboseLevel = kwargs.get('VerboseLevel',1)

	def verbose(self,text,level=1):
		if level<self.VerboseLevel:
			return
		else:
			print text

	def SetStart(self,TIME_START):
		if isinstance(TIME_START,str):
			TIME_START  = time.gmtime(time.mktime(time.strptime(TIME_START,"%d %b %H:%M %Z %Y")))
		TIME_DIFF   = time.mktime(TIME_START)  - time.mktime(self.TS.time_start)
		if TIME_DIFF>0:
			self.TS.move_on(TIME_DIFF)

	def run(self):

		while not self.TS.end:

			#Update SnapStack
			if len(self.SnapStack)==self.SnapLim:
				self.SnapStack = self.SnapStack[1:]
				self.Volume    = self.Volume[1:]
			self.SnapStack.append(self.TS.next())
			self.Volume.append(Count(self.SnapStack[-1]))

			#Update Candidates origin snap as timeWindow has shifted right
			for key,val in self.Candidates.items():
				if val==-self.SnapLim:
					self.Candidates.pop(key)
					self.verbose('This %s word has been removed because it never received enough samples'%key)
				else:
					self.Candidates[key]=val-1


			self.verbose('Latest timeWindow %s'%self.SnapStack[-1]['TimeWindow'],2)
			#Algorithm
			self.verbose('Print looking for new events which happened in this timeWindow',2)
			self.FindNewEvent()

			self.verbose('Print confirming old/new candidate events which have not been published')
			self.ConfirmEvent()


			if self.Candidates.keys() !=[]: self.verbose('EventCandidates: %s'%self.Candidates.keys(),2);



	def TotalVolume(self,word,Volume):

		total = 0.0
		k		 = 0

		while k < len(Volume):
			if word in Volume[k].keys():
				total += Volume[k][word]
			k+=1

		return total if total!=0 else 1


	def FindNewEvent(self):

		for word,count in self.Volume[-1].items():

				#Is word count gaussian noise or signal ?
				wordHistory = [float(vol[word]) for vol in self.Volume[:-1] if word in vol.keys() ]
				mean        =  np.mean(wordHistory) if len(wordHistory)>0 else 1
				var				 =  np.std(wordHistory) if len(wordHistory)>=5 else 1

				std_score = (count - mean)/(2*var)

				if std_score>=self.delta and (word not in self.StopNewsWords):

					self.verbose('This %s is not gaussian noise with standard_score = %f '%(word,std_score))
					if word not in self.Candidates.keys() or (self.Volume[self.Candidates[word]][word]<count):
						self.Candidates[word] = -1

	def ConfirmEvent(self):

		for word,no in self.Candidates.items():

			wordHistory = [float(vol.get(word,0.0)) for vol in self.Volume[no:]]
			self.verbose('Confirming candidate Newsword : %s at time = %s with samples=%d and Snapno=%d'%(word,self.SnapStack[no]['TimeWindow'][0],sum(wordHistory),no),2)
			if sum(wordHistory)>=self.enoughSamples:
				self.verbose('This %s word has enough samples from tweets to calculate scores (Poisson,LocalEntropy,StandardDeviation)'%(word),2)
				#Poisson
				Poisson = self.FitPoissonDistribution(word,no)
				#Global and Local Entropy
				GlobalEntropy,LocalEntropy = self.FitSpatialEntropy(word,no)

				#Classifier
				#Define feature vector
				X    = np.array([Poisson,LocalEntropy,GlobalEntropy],dtype=np.float64)
				#Apply Scaler
				X_sc = self.scaler.transform(X)
				#Apply Orthogonality
				X_tr = X_sc.dot(self.matrix_w)
				#Classify new transformed feature vector
				Flag = self.clf.predict(X_tr)[0]

				if Flag==1:
					start_time  = self.SnapStack[no]['TimeWindow'][0]
					confirmed_time = self.SnapStack[-1]['TimeWindow'][0]
					SampleSet   = self.ReportEventQueue(word,no)
					print       "Newsword (%s) at %s confirmed at %s\n"%(word,start_time,confirmed_time)
					print       "Summary : "
					summary     = []
					for user,created_at,tweet,loc in SampleSet:
						print "%s reported at time %s near %s: %s"%(user,created_at,GetPlaceName(loc[0],loc[1]),tweet)
						#summary.append("%s reported at time %s near %s: %s"%(user,created_at,tweet,GetPlaceName(loc[0],loc[1]))
						summary.append([user,created_at,tweet,loc])

					event =  {'word':word,'event_time':start_time,'location':GetPlaceName(np.mean([item[3][0] for item in summary]),np.mean([item[3][1] for item in summary])),'discovered_time':confirmed_time,'summary':'\n'.join([ "%s reported at time %s near %s: %s"%(item[0],item[1],GetPlaceName(item[3][0],item[3][1]),item[2]) for item in summary])}
					print event
					self.ResultDict = self.ResultDict.append(event,ignore_index=True)
					self.Candidates.pop(word)

				else:
					continue



				#Store Data for post-classification
				self.StorageDict = self.StorageDict.append({'word':word,'Poisson':Poisson,'LocalEntropy':LocalEntropy,'GlobalEntropy':GlobalEntropy,'start_time':start_time,'event':event},ignore_index=True)


				#Manual Classifier
				# if flag in ['1','y','yes']:
				# 		print 'This %s word count resembles poisson distribution with lambda=%f'%(word,Lambda)
				# 		self.ReportEventQueue(word,no)
				# 		self.Candidates.pop(word)
				# else:
				# 		print 'This %s word count does not resembles poisson distribution with lambda=%s'%(word,Lambda)

	def FitSpatialEntropy(self,word,no):

		k = no
		tokenize  = T_Tokenizer().tokenize
		#Store locations
		ALLLOC = []
		WORDLOC = []

		while k<0:

			ALLLOC += self.SnapStack[k]['LOC']
			for order,text in enumerate(self.SnapStack[k]['TEXT']):
				if word in tokenize(text):
					WORDLOC.append(self.SnapStack[k]['LOC'][order])

			k+=1

		#Choose Cluster of max ALLLOC, C*
		MakeCluster 	 	= GMM_clustering()
		MakeCluster.Snap = {'LOC':ALLLOC}
		MakeCluster.build_clusters()
		WORDLABELS       = Counter([MakeCluster.labels[ALLLOC.index(LOC)] for LOC in WORDLOC])

		#Global entropy
		GLOBAL_COUNTER = Counter(MakeCluster.labels)
		G_D_pq		   = 0.0
		for cl,number in WORDLABELS.items():
				G_D_pq	+= -1*(number/float(GLOBAL_COUNTER[cl]))*np.log2(number/float(GLOBAL_COUNTER[cl]))
				#G_D_pq	+= -1*((number/sum(WORDLABELS))/float(GLOBAL_COUNTER[cl]/sum(GLOBAL_COUNTER)))*np.log2(number/float(GLOBAL_COUNTER[cl]))


		C_Star					 = WORDLABELS.most_common(1)[0][0]
		C_Star_LOC       = [ ALLLOC[No] for No,label in filter(lambda (enum,x): x==C_Star,enumerate(MakeCluster.labels)) ]
		C_Star_WORD_LOC  = [LOC for LOC in filter(lambda x:x in C_Star_LOC,WORDLOC)]

		#Find D(p||q) of word inside C*
		del MakeCluster
		MakeLocalCluster 	 	= GMM_clustering(components=range(2,8))
		MakeLocalCluster.Snap = {'LOC':C_Star_LOC}
		MakeLocalCluster.build_clusters()

		WORD_LOCAL_COUNTER    = Counter([MakeLocalCluster.labels[C_Star_LOC.index(LOC)] for LOC in C_Star_WORD_LOC])
		LOCAL_ALL_COUNTER		 = Counter( MakeLocalCluster.labels )
		L_D_pq		   = 0.0
		for cl,number in WORD_LOCAL_COUNTER.items():
			  L_D_pq	+= -1*(number/float(LOCAL_ALL_COUNTER[cl]))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))
				#L_D_pq	+= -1*((number/sum(WORD_LOCAL_COUNTER.values()))/float(LOCAL_ALL_COUNTER[cl]/sum(LOCAL_ALL_COUNTER.values())))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))

		return [G_D_pq,L_D_pq]

	def FitStdDev(self,word,no):

		k = no
		tokenize  = T_Tokenizer().tokenize
		#Store locations
		WORDLOC= []

		while k<0:
			for order,text in enumerate(self.SnapStack[k]['TEXT']):
				if word in tokenize(text):
					WORDLOC.append(self.SnapStack[k]['LOC'][order])
			k+=1

		return np.std(WORDLOC)

	def FitPoissonDistribution(self,word,no):

		tokenize  = T_Tokenizer().tokenize

		k = no
		Times = []

		ApproxTimes = []

		wordHistory = [vol.get(word,0) for vol in self.Volume[no:]]

		#Store all tweet_times with word in current snap and known history
		while k<0:

			approx = time.mktime(time.strptime(self.SnapStack[k]['TimeWindow'][0]+'2014EDT',"%d%b%HHR%MMN%Y%Z"))
			count  = self.Volume[k].get(word,0)
			ApproxTimes+=[approx]*count

			for order,text in enumerate(self.SnapStack[k]['TEXT']):
				if word in tokenize(text):
					Times.append(\
									time.mktime(time.strptime(self.SnapStack[k]['CREATED_AT'][order],"%d %b %H:%M:%S %Y")))
			k+=1

		#Calculate time-intervals
		TimeIntervals = [Time-min(Times) for Time in Times]
		ApproxTimeIntervals = sorted([ approx-min(ApproxTimes) for approx in ApproxTimes])
		TimeIntervals.sort()
		self.verbose('Have a look at TimeIntervals(1) and ApproxTimeIntervals(2) and LogLikelihood(3)')
		self.verbose('(1) %s'%TimeIntervals)
		self.verbose('(2) %s'%ApproxTimeIntervals)

		ApproxTimeIntervals = Counter(ApproxTimeIntervals)

		#Calculate ML_Lmbda
		_lmbda      = float(len(TimeIntervals))/sum(TimeIntervals)
		# if sum(ApproxTimeIntervals)!=0:
		# 	_lmbda      = float(len(ApproxTimeIntervals))/sum(ApproxTimeIntervals)
		# else:
		# 	_lmbda      = float(len(TimeIntervals))/sum(TimeIntervals)

		#Calculate Variance for given samples
		# _R2         = 1/_lmbda**2

		#Likelihood calculation and plotting (optional)

		# MaxLogLikelihood
		# _LgLd 			= -1*sum([np.log(_lmbda*np.exp(-_lmbda*x)) for x in TimeIntervals])
		# print '(3)',_LgLd
		#
		# #Simulate a expon_RV with fitted _lmbda
		# _rv         = expon(scale=1/_lmbda)
		#
		# #Plot pdf of counts from _rv and known
		# fig = plt.figure()
		# ax  = fig.add_subplot(111)
		# ax.plot(sorted(ApproxTimeIntervals.keys()),[_rv.cdf(x+600)-_rv.cdf(x) for x in sorted(ApproxTimeIntervals.keys())],'r-',label='fitted')
		# ax.plot(sorted(ApproxTimeIntervals.keys()),[float(ApproxTimeIntervals[key])/sum(wordHistory) for key in sorted(ApproxTimeIntervals.keys()) ],'b-'\
		# 				,label='empirical estimate')
		#
		# plt.legend()
		#
		# #save figure
		# fig.savefig('%s.png'%word)
		#
		# gmm  = GMM_clustering(components=range(4,15))
		# gmm.Snap = self.SnapStack[no]
		# gmm.build_clusters()
		#
		# #flag = raw_input("Fitted curve for %s stored should flag=1 or not with lambda=%f and locality=%f"%(word,_lmbda,Locality(self.SnapStack[no],gmm.labels,word)))
		# plt.close(fig)

		return _lmbda

	def ReportEventQueue(self,word,no,SampleLim=3):

		#Find clusters at start point of event
		gmm  = GMM_clustering(components=range(4,15))
		gmm.Snap = self.SnapStack[no]
		gmm.build_clusters()
		Labels = []
		tokenize  = T_Tokenizer().tokenize
		for k,text in enumerate(gmm.Snap['TEXT']):
			if word in tokenize(text):
				Labels.append(gmm.labels[k])
		Labels = Counter(Labels)
		#Find cluster where word was most common
		StarLabel = Labels.most_common(1)[0][0]

		SampleSet = []
		#Print a tweet from that cluster
		for k,text in enumerate(gmm.Snap['TEXT']):
			if gmm.labels[k] == StarLabel and word in tokenize(text):
				SampleSet.append((gmm.Snap['SCREEN_NAME'][k],gmm.Snap['CREATED_AT'][k],text,gmm.Snap['LOC'][k]))
			if len(SampleSet)>=SampleLim:
				break

		return SampleSet
Example #10
class NewsWorthyWords:

  def __init__(self,db,timeWindow=60*10,**kwargs):

    print "COLLECTING TWEETS...."
    self.TS = TweetSnap(db=db,timeWindow = timeWindow,Placename2Geocode=False)
    print "COLLECTION OVER...."

    #Variables
    self.QueueStack     = []
    self.Candidates    = {}
    self.Vocabulary		= []

    #Constants
    self.delta    				 = 3  #GaussianDistortion
    self.MinWordSamples     = 15.0 #Has to be greater than 8 See SetFeatureTable method for this restriction

    self.QueueLim           = 6  #MaximumQueueLimit

    self.StopNewsWords     = ['Boston', 'day', 'time', 'love', 'today', 'Boston-MA']  #Default StopWordList

    #Set TIME_FRAME
    self.SetStart(kwargs.get("TIME_START",time.gmtime(0)))

    #Storage variables for analysis
    self.FeatureDict = pd.DataFrame(columns=['word','Poisson','LocalEntropy','GlobalEntropy','start_time','event'])
    self.ResultDict = pd.DataFrame(columns=['word','event_time','location','discovered_time','summary'])

    #Classifier
    self.matrix_w, self.scaler, self.clf = cPickle.load(open('SVClassifier.Store'))

    #Verbosity - 1. Print all messages 2. Print less messages 3. .....
    self.OnlyMessage = kwargs.get('OnlyMessage',0)

    #Feature-table switch read by run() and FitSpatialEntropy; assumed default of 0
    #(compute features per word on demand), since the original never initializes it
    self.TableON = kwargs.get('TableON',0)

  def message(self,text):
    if self.OnlyMessage:
      print text
    else:
      pass

  def SetStart(self,TIME_START):
    if isinstance(TIME_START,str):
      TIME_START  = time.gmtime(time.mktime(time.strptime(TIME_START,"%d %b %H:%M %Z %Y")))
    TIME_DIFF   = time.mktime(TIME_START)  - time.mktime(self.TS.time_start)
    if TIME_DIFF>0:
      self.TS.move_on(TIME_DIFF)

  def run(self):

    while not self.TS.end:

      #Update QueueStack
      if len(self.QueueStack)==self.QueueLim:
        self.QueueStack = self.QueueStack[1:]
        self.Vocabulary    = self.Vocabulary[1:]
      self.QueueStack.append(self.TS.next())
      self.Vocabulary.append(Count(self.QueueStack[-1]))

      #Update Candidates origin snap as timeWindow has shifted right
      for key,val in self.Candidates.items():
        if val==-self.QueueLim:
          self.Candidates.pop(key)
          self.message('This %s word has been removed because it never received enough samples'%key)
        else:
          self.Candidates[key]=val-1


      print('Latest timeWindow %s'%self.QueueStack[-1]['TimeWindow'])
      #Algorithm
      #1. Add to candidates list
      self.FilterWords()
      #1.1
      if self.TableON==1 and len(self.Candidates.keys())!=0:
        self.SetFeatureTable()
      #2. Find news-word in candidate list
      self.ConfirmEvent()
      #Status of candidate list
      if self.Candidates.keys() !=[]: self.message('EventCandidates: %s'%self.Candidates.keys());

  def FilterWords(self):

    for word,count in self.Vocabulary[-1].items():

        #Is word count gaussian noise or signal ?
        wordHistory = [float(vol[word]) for vol in self.Vocabulary[:-1] if word in vol.keys() ]
        mean        =  np.mean(wordHistory) if len(wordHistory)>0 else 1
        variance 	 =  np.std(wordHistory) if len(wordHistory)>=5 else 1

        Z_score = (count - mean)/variance

        if Z_score>=self.delta and (word not in self.StopNewsWords):

          self.message('This %s is not gaussian noise with standard_score = %f '%(word,Z_score))
          if word not in self.Candidates.keys() or (self.Vocabulary[self.Candidates[word]][word]<count):
            self.Candidates[word] = -1

  def ConfirmEvent(self):

    for word,no in self.Candidates.items():

      wordHistory = [float(vol.get(word,0.0)) for vol in self.Vocabulary[no:]]

      self.message('Confirming candidate Newsword : %s at time = %s with samples=%d and Queueno=%d'%(word,self.QueueStack[no]['TimeWindow'][0],sum(wordHistory),no))

      if sum(wordHistory)>=self.MinWordSamples:

        self.message('This %s word has enough samples from tweets to calculate scores (Poisson,LocalEntropy,StandardDeviation)'%(word))

        #Poisson
        Poisson = self.FitPoissonDistribution(word,no)
        #Global and Local Entropy
        GlobalEntropy,LocalEntropy = self.FitSpatialEntropy(word,no)

        #Poisson, GlobalEntropy, LocalEntropy = self.GetFeatures(word,no)

        #Classifier
        #Define feature vector
        X    = np.array([Poisson,LocalEntropy,GlobalEntropy],dtype=np.float64)
        #Apply Scaler
        X_sc = self.scaler.transform(X)
        #Apply Orthogonality
        X_tr = X_sc.dot(self.matrix_w)
        #Classify new transformed feature vector
        Flag = self.clf.predict(X_tr)[0]

        if Flag==1:
          start_time  = self.QueueStack[no]['TimeWindow'][0]
          confirmed_time = self.QueueStack[-1]['TimeWindow'][0]
          SampleSet   = self.ReportEventQueue(word,no)
          print       "Newsword (%s) at %s confirmed at %s\n"%(word,start_time,confirmed_time)
          print       "Summary : "
          summary     = []
          for user,created_at,tweet,loc in SampleSet:
            print "%s reported at time %s near %s: %s"%(user,created_at,loc,tweet)
            #summary.append("%s reported at time %s near %s: %s"%(user,created_at,tweet,GetPlaceName(loc[0],loc[1]))
            summary.append([user,created_at,tweet,loc])

          event =  {'word':word,'event_time':start_time,'location':GetPlaceName(np.mean([item[3][0] for item in summary]),np.mean([item[3][1] for item in summary])),'discovered_time':confirmed_time,'summary':'\n'.join([ "%s reported at time %s near %s: %s"%(item[0],item[1],item[3],item[2]) for item in summary])}
          print event
          self.ResultDict = self.ResultDict.append(event,ignore_index=True)
          self.Candidates.pop(word)

        else:
          continue



        #Store Data for post-classification
        self.FeatureDict = self.FeatureDict.append({'word':word,'Poisson':Poisson,'LocalEntropy':LocalEntropy,'GlobalEntropy':GlobalEntropy,'start_time':start_time,'event':event},ignore_index=True)


        #Manual Classifier
        # if flag in ['1','y','yes']:
        # 		print 'This %s word count resembles poisson distribution with lambda=%f'%(word,Lambda)
        # 		self.ReportEventQueue(word,no)
        # 		self.Candidates.pop(word)
        # else:
        # 		print 'This %s word count does not resembles poisson distribution with lambda=%s'%(word,Lambda)

  def SetFeatureTable(self):

    tokenize  = T_Tokenizer().tokenize
    self.Feature = {}
    k = -len(self.QueueStack)

    #Store locations
    ALL_LOC  = []
    WORD_LOC = {}
    C_Star_LOC = {}
    C_Star_Labels = {}

    #Get List of locations of all tweets Collected : ALL_LOC
    #Get List of locations where "word" appears in tweets posted after it was declared as an event
    #    : WORD_LOC[word]
    while k<0:
       ALL_LOC += self.QueueStack[k]['LOC']
       for order,text in enumerate(self.QueueStack[k]['TEXT']):
         for word,no in self.Candidates.items():
           if word in tokenize(text) and order>=no:
             WORD_LOC.setdefault(word,[]).append(self.QueueStack[k]['LOC'][order])

       k+=1

    #Global Clustering
    MakeCluster 	 	= GMM_clustering(components=range(3,8))
    MakeCluster.Snap = {'LOC':ALL_LOC}
    MakeCluster.build_clusters()
    #Input : ALL_LOC & Output : Global labels for locations of tweets
    GLOBAL_LABELS    = Counter(MakeCluster.labels)

    #Local Clustering for each cluster in lists
    for C_Star in GLOBAL_LABELS.keys():

      #Input : C_Star_LOC ; All tweet locations withing C_Star cluster
      C_Star_LOC[C_Star]    = [ ALL_LOC[No] for No,label in filter(lambda (enum,x): x==C_Star,enumerate(MakeCluster.labels)) ]
      if len(C_Star_LOC[C_Star])>=(self.MinWordSamples/3.0):
        MakeLocalCluster 	 	= GMM_clustering(components=range(2,min(8,int(self.MinWordSamples/3))))
        MakeLocalCluster.Snap = {'LOC':C_Star_LOC[C_Star]}
        MakeLocalCluster.build_clusters()

        #Output : C_Star_Labels ; Labels for All tweet locations withing C_Star cluster
        C_Star_Labels[C_Star] = MakeLocalCluster.labels

    #Set GlobalEntropy and LocalEntropy for each Candidate word
    for word,no in self.Candidates.items():

      #Global entropy
      #1. Initialize to 0
      G_D_pq 		   = 0.0
      #2. List of all non-zero counts for global clusters where 'word' appears in tweet
      WORD_LABELS   = Counter([MakeCluster.labels[ALL_LOC.index(LOC)] for LOC in WORD_LOC[word]])
      #3. Calculate entropy by summing up over all clusters
      for cl,number in WORD_LABELS.items():
          G_D_pq	+= -1*(number/float(GLOBAL_LABELS[cl]))*np.log2(number/float(GLOBAL_LABELS[cl]))
          #G_D_pq	+= -1*((number/sum(WORDLABELS))/float(GLOBAL_COUNTER[cl]/sum(GLOBAL_COUNTER)))*np.log2(number/float(GLOBAL_COUNTER[cl]))

      #Local entropy
      #1. Most populated cluster with 'word'
      C_Star					 = WORD_LABELS.most_common(1)[0][0]
      #2. List of all non-zero counts for global clusters where 'word' appears in tweet
      WORD_LOCAL_LABELS     = Counter([C_Star_Labels[C_Star][C_Star_LOC[C_Star].index(LOC)] for LOC in WORD_LOC[word] if LOC in C_Star_LOC[C_Star]])
      LOCAL_LABELS 		     = Counter( C_Star_Labels[C_Star] )
      #3. Calculate entropy by summing up over all local clusters
      L_D_pq		   = 0.0
      for cl,number in WORD_LOCAL_LABELS.items():
          L_D_pq	+= -1*(number/float(LOCAL_LABELS[cl]))*np.log2(number/float(LOCAL_LABELS[cl]))
          #L_D_pq	+= -1*((number/sum(WORD_LOCAL_COUNTER.values()))/float(LOCAL_ALL_COUNTER[cl]/sum(LOCAL_ALL_COUNTER.values())))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))

      self.Feature[word] = [G_D_pq,L_D_pq,self.GetPoissonRate(word,no)]

  def FitSpatialEntropy(self,word,no):

    if self.TableON:
      return [self.Feature[word][0],self.Feature[word][1]]

    k = no
    tokenize  = T_Tokenizer().tokenize
    #Store locations
    ALLLOC = []
    WORDLOC = []

    while k<0:

      ALLLOC += self.QueueStack[k]['LOC']
      for order,text in enumerate(self.QueueStack[k]['TEXT']):
        if word in tokenize(text):
          WORDLOC.append(self.QueueStack[k]['LOC'][order])

      k+=1

    #Choose Cluster of max ALLLOC, C*
    MakeCluster 	 	= GMM_clustering()
    MakeCluster.Snap = {'LOC':ALLLOC}
    MakeCluster.build_clusters()
    WORDLABELS       = Counter([MakeCluster.labels[ALLLOC.index(LOC)] for LOC in WORDLOC])

    #Global entropy
    GLOBAL_COUNTER = Counter(MakeCluster.labels)
    G_D_pq		   = 0.0
    for cl,number in WORDLABELS.items():
        G_D_pq	+= -1*(number/float(GLOBAL_COUNTER[cl]))*np.log2(number/float(GLOBAL_COUNTER[cl]))
        #G_D_pq	+= -1*((number/sum(WORDLABELS))/float(GLOBAL_COUNTER[cl]/sum(GLOBAL_COUNTER)))*np.log2(number/float(GLOBAL_COUNTER[cl]))


    C_Star					 = WORDLABELS.most_common(1)[0][0]
    C_Star_LOC       = [ ALLLOC[No] for No,label in filter(lambda (enum,x): x==C_Star,enumerate(MakeCluster.labels)) ]
    C_Star_WORD_LOC  = [LOC for LOC in filter(lambda x:x in C_Star_LOC,WORDLOC)]

    #Find D(p||q) of word inside C*
    del MakeCluster
    MakeLocalCluster 	 	= GMM_clustering(components=range(2,8))
    MakeLocalCluster.Snap = {'LOC':C_Star_LOC}
    MakeLocalCluster.build_clusters()

    WORD_LOCAL_COUNTER    = Counter([MakeLocalCluster.labels[C_Star_LOC.index(LOC)] for LOC in C_Star_WORD_LOC])
    LOCAL_ALL_COUNTER		 = Counter( MakeLocalCluster.labels )
    L_D_pq		   = 0.0
    for cl,number in WORD_LOCAL_COUNTER.items():
        L_D_pq	+= -1*(number/float(LOCAL_ALL_COUNTER[cl]))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))
        #L_D_pq	+= -1*((number/sum(WORD_LOCAL_COUNTER.values()))/float(LOCAL_ALL_COUNTER[cl]/sum(LOCAL_ALL_COUNTER.values())))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))

    return [G_D_pq,L_D_pq]


  def GetPoissonRate(self,word,no):

    tokenize  = T_Tokenizer().tokenize

    k = no
    Times = []
    ApproxTimes = []

    #Store all tweet_times with word in current snap and known history
    while k<0:

      approx = time.mktime(time.strptime(self.QueueStack[k]['TimeWindow'][0]+'2014EDT',"%d%b%HHR%MMN%Y%Z"))
      count  = self.Vocabulary[k].get(word,0)
      ApproxTimes+=[approx]*count

      for order,text in enumerate(self.QueueStack[k]['TEXT']):
        if word in tokenize(text):
          Times.append(\
                  time.mktime(time.strptime(self.QueueStack[k]['CREATED_AT'][order],"%d %b %H:%M:%S %Y")))
      k+=1

    #Calculate time-intervals
    TimeIntervals       = sorted([Time-min(Times) for Time in Times])
    ApproxTimeIntervals = sorted([ approx-min(ApproxTimes) for approx in ApproxTimes])

    #Calculate ML_Lmbda
    if sum(ApproxTimeIntervals)!=0:
      _lmbda      = float(len(ApproxTimeIntervals))/sum(ApproxTimeIntervals)
    else:
      _lmbda      = float(len(TimeIntervals))/sum(TimeIntervals)

    return _lmbda



  def ReportEventQueue(self,word,no,SampleLim=3):

    #Find clusters at start point of event
    gmm  = GMM_clustering(components=range(4,15))
    gmm.Snap = self.QueueStack[no]
    gmm.build_clusters()
    Labels = []
    tokenize  = T_Tokenizer().tokenize
    for k,text in enumerate(gmm.Snap['TEXT']):
      if word in tokenize(text):
        Labels.append(gmm.labels[k])
    Labels = Counter(Labels)
    #Find cluster where word was most common
    StarLabel = Labels.most_common(1)[0][0]

    SampleSet = []
    #Print a tweet from that cluster
    for k,text in enumerate(gmm.Snap['TEXT']):
      if gmm.labels[k] == StarLabel and word in tokenize(text):
        SampleSet.append((gmm.Snap['SCREEN_NAME'][k],gmm.Snap['CREATED_AT'][k],text,gmm.Snap['LOC'][k]))
      if len(SampleSet)>=SampleLim:
        break

    return SampleSet
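
For reference, the burst test in FilterWords reduces to a standard score of the latest count against the word's recent history, and GetPoissonRate is the maximum-likelihood rate n / sum(intervals). A self-contained numeric sketch with illustrative numbers only:

import numpy as np

history = [2.0, 3.0, 2.0, 4.0, 3.0]  # counts of one word in the previous windows
count = 12.0                         # count in the latest window
mean = np.mean(history) if len(history) > 0 else 1
std = np.std(history) if len(history) >= 5 else 1
z_score = (count - mean) / std       # FilterWords flags the word when this >= delta (3)
print "Z-score:", z_score

intervals = [0.0, 60.0, 150.0, 300.0, 540.0]     # seconds since the first matching tweet
lmbda = float(len(intervals)) / sum(intervals)   # ML Poisson rate, as in GetPoissonRate
print "lambda per second:", lmbda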