Example #1
def visualize_word(db,timeWindow,given_word,**kwargs):

  print "COLLECTING TWEETS...."
  TS = TweetSnap(db=db,timeWindow = timeWindow,Placename2Geocode=False)
  print "COLLECTION OVER...."

  TIME_START = kwargs.get("TIME_START",time.gmtime(0))
  TIME_END   = kwargs.get("TIME_END",time.gmtime(time.time()))

  if isinstance(TIME_START,str):
    TIME_START  = time.gmtime(time.mktime(time.strptime(TIME_START,"%d %b %H:%M %Z %Y")))
  if isinstance(TIME_END,str):
    TIME_END    = time.gmtime(time.mktime(time.strptime(TIME_END,"%d %b %H:%M %Z %Y")))

  TIME_DIFF   = time.mktime(TIME_START)  - time.mktime(TS.time_start)

  if TIME_DIFF>0:
    TS.move_on(TIME_DIFF-timeWindow)

  Virality = []
  Volume   = []
  Locality = []
  TimeWindow=[]
  Word     = []

  while (TS.time_start<TIME_END and not TS.end):

    #Capture nextSnap and initialize time_start of next snap
    snap = TS.next()
    if len(snap['TEXT'])<100:
      continue
    #gmm  = GMM_clustering()
    for item in given_word:
      #1. Virality
      #Virality.append(PoissonRate(snap,given_word=item))
      #2. Locality
      #gmm.Snap = snap
      #gmm.build_clusters()
      #Locality.append(GeographicalEntropy(snap,gmm.labels,given_word=item))
      #3. Volume
      Volume.append(Count(snap,given_word=item))
      #4. TimeWindow
      TimeWindow.append(snap['TimeWindow'][0])
      #5. Word
      Word.append(item)

  #Prepare DataFrame -- Virality and Locality are commented out above, so only
  #the populated columns are included (passing the empty lists alongside the
  #filled ones would raise a length-mismatch ValueError)
  df             = pd.DataFrame({'Volume':Volume,'TimeWindow':TimeWindow,'Word':Word})

  return df
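A minimal usage sketch for visualize_word, assuming a sqlite3 tweet store and that TweetSnap, Count and pandas (pd) are importable from the surrounding project; the database path, word list and time strings are hypothetical:

import sqlite3
import matplotlib.pyplot as plt

db = sqlite3.connect('tweets.db')  # hypothetical path
df = visualize_word(db, 60*10, ['marathon', 'storm'],
                    TIME_START='15 Apr 10:00 UTC 2014',
                    TIME_END='15 Apr 22:00 UTC 2014')
# One volume curve per tracked word
for word, group in df.groupby('Word'):
    plt.plot(range(len(group)), group['Volume'], label=word)
plt.legend()
plt.show()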
Example #2
def visualize_timeframe(db,timeWindow,**kwargs):

  print "COLLECTING TWEETS...."
  TS = TweetSnap(db=db,timeWindow = timeWindow,Placename2Geocode=False)
  print "COLLECTION OVER...."

  TIME_START = kwargs.get("TIME_START",time.gmtime(0))
  TIME_END   = kwargs.get("TIME_END",time.gmtime(time.time()))
  VocabSize   = kwargs.get("VocabSize",500)

  if isinstance(TIME_START,str):
    TIME_START  = time.gmtime(time.mktime(time.strptime(TIME_START,"%d %b %H:%M %Z %Y")))
  if isinstance(TIME_END,str):
    TIME_END    = time.gmtime(time.mktime(time.strptime(TIME_END,"%d %b %H:%M %Z %Y")))

  TIME_DIFF   = time.mktime(TIME_START)  - time.mktime(TS.time_start)

  if TIME_DIFF>0:
    TS.move_on(TIME_DIFF-timeWindow)

  #Create Dataframe
  df             = pd.DataFrame(columns=['Virality','Locality','Volume','Words','TimeWindow'])

  while (TS.time_start<TIME_END and not TS.end):

    #InitializeColumns
    Virality = {}
    Volume   = {}
    Locality = {}
    TimeWindow=[]

    #Capture nextSnap and initialize time_start of next snap
    snap = TS.next()
    if len(snap['TEXT'])<100:
      continue
    gmm  = GMM_clustering()
    #1. Virality
    Virality  = PoissonRate(snap)
    #2. Locality
    gmm.Snap = snap
    gmm.build_clusters()
    Locality = GeographicalEntropy(snap,gmm.labels)
    #3. Volume
    Volume   = Count(snap)

    #HotWords= set(dict(Virality.most_common(HotWordSize)).keys())&set(dict(Broadcast.most_common(HotWordSize)).keys())&set(dict(Locality.most_common(HotWordSize)).keys())&set(dict(Prevalence.most_common(HotWordSize)).keys())&set(dict(Volume.most_common(HotWordSize)).keys())
    Words= list(set(dict(Virality.most_common(VocabSize)).keys())&set(dict(Locality.most_common(VocabSize)).keys())&set(dict(Volume.most_common(VocabSize)).keys()))
    if not Words:
      continue

    Virality= [Virality.get(key,0) for key in Words]
    # Broadcast=[Broadcast.get(key,0) for key in HotWords]
    Locality= [Locality.get(key,0) for key in Words]
    # Prevalence=[Prevalence.get(key,0) for key in HotWords]
    Volume=[Volume.get(key,0) for key in Words]

    #4. TimeWindow
    TimeWindow= [snap['TimeWindow'][0]]*len(Words)

    #Append to Dataframe
    df = df.append({'Virality':Virality,'Locality':Locality,'Volume':Volume,'Words':Words,'TimeWindow':TimeWindow},ignore_index=True)

  return df
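A short driver sketch for visualize_timeframe under the same assumptions as above; note that each row of the returned frame holds per-window lists, so they need flattening into long format before plotting:

df = visualize_timeframe(db, 60*10, VocabSize=200,
                         TIME_START='15 Apr 10:00 UTC 2014')
# Explode the list-valued rows into one record per (word, window) pair.
rows = []
for _, row in df.iterrows():
    for w, vi, lo, vo, tw in zip(row['Words'], row['Virality'],
                                 row['Locality'], row['Volume'], row['TimeWindow']):
        rows.append({'Word': w, 'Virality': vi, 'Locality': lo,
                     'Volume': vo, 'TimeWindow': tw})
flat = pd.DataFrame(rows)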
Example #3
def newsworthy_words(db,timeWindow,**kwargs):

  print "COLLECTING TWEETS...."
  TS = TweetSnap(db=db,timeWindow = timeWindow,Placename2Geocode=False)
  print "COLLECTION OVER...."

  TIME_START = kwargs.get("TIME_START",time.gmtime(0))
  TIME_END   = kwargs.get("TIME_END",time.gmtime(time.time()))
  HotWordSize = kwargs.get("HotWordSize",25)


  if isinstance(TIME_START,str):
    TIME_START  = time.gmtime(time.mktime(time.strptime(TIME_START,"%d %b %H:%M %Z %Y")))
  if isinstance(TIME_END,str):
    TIME_END    = time.gmtime(time.mktime(time.strptime(TIME_END,"%d %b %H:%M %Z %Y")))

  TIME_DIFF   = time.mktime(TIME_START)  - time.mktime(TS.time_start)

  if TIME_DIFF>0:
    TS.move_on(TIME_DIFF-timeWindow)


  Day = {}

  while (TS.time_start<TIME_END and not TS.end):

    #Capture nextSnap and initialize time_start of next snap
    snap = TS.next()
    if len(snap['TEXT'])<100:
      continue
    gmm  = GMM_clustering()
    #1. Virality
    Virality  = PoissonRate(snap)
    #2. DeltaVolume
    #Broadcast = DeltaVolume(snap0,snap)
    #3. Locality
    gmm.Snap = snap
    gmm.build_clusters()
    Locality  = GeographicalEntropy(snap,gmm.labels)
    #4. Prevalence
    #Prevalence= Ttest(snap)
    #5. Count
    Volume     = Count(snap)

    #Prepare Dataframe
    #Union
    #HotWords= list(set(dict(Virality.most_common(HotWordSize)).keys()+dict(Broadcast.most_common(HotWordSize)).keys()+dict(Locality.most_common(HotWordSize)).keys()+dict(Prevalence.most_common(HotWordSize)).keys()+dict(Volume.most_common(HotWordSize)).keys()))
    #Intersection
    #print "Simmering words"
    #print 'Virality',set(dict(Virality.most_common(HotWordSize)).keys())
    #print 'Broadcast',set(dict(Broadcast.most_common(HotWordSize)).keys())
    #print 'Locality',set(dict(Locality.most_common(HotWordSize)).keys())
    #print set(dict(Prevalence.most_common(HotWordSize)).keys())
    #print 'Volume',set(dict(Volume.most_common(HotWordSize)).keys())
    #print "*"*5

    #HotWords= set(dict(Virality.most_common(HotWordSize)).keys())&set(dict(Broadcast.most_common(HotWordSize)).keys())&set(dict(Locality.most_common(HotWordSize)).keys())&set(dict(Prevalence.most_common(HotWordSize)).keys())&set(dict(Volume.most_common(HotWordSize)).keys())
    HotWords= list(set(dict(Virality.most_common(HotWordSize)).keys())&set(dict(Locality.most_common(HotWordSize)).keys())&set(dict(Volume.most_common(HotWordSize)).keys()))
    if not HotWords:
      continue

    Virality= [Virality.get(key,0) for key in HotWords]
    # Broadcast=[Broadcast.get(key,0) for key in HotWords]
    Locality= [Locality.get(key,0) for key in HotWords]
    # Prevalence=[Prevalence.get(key,0) for key in HotWords]
    Volume=[Volume.get(key,0) for key in HotWords]

    #scaler           = preprocessing.MinMaxScaler([0,100]).fit_transform
    #scaledVirality   = list(scaler(np.array([Virality]).T).flatten())
    # scaledBroadcast  = scaler(Broadcast)
    #scaledLocality   = list(scaler(np.array([Locality]).T).flatten())
    # scaledPrevalence = scaler(Prevalence)
    #scaledVolume     = list(scaler(np.array([Volume],dtype=np.float16).T).flatten())
    Score            = [vi+lo+vo for vi,lo,vo in zip(Virality,Locality,Volume)]

    df             = pd.DataFrame({'Words':HotWords,'Virality':Virality,'Locality':Locality,'Volume':Volume,'Score':Score})
    #df_scaled      = pd.DataFrame({'Words':HotWords,'Virality':scaledVirality,'Locality':scaledLocality,'Volume':scaledVolume,'Score':Score})

    Day['to'.join(snap['TimeWindow'])]=df

  return Day
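A hypothetical driver for newsworthy_words: the returned dict maps each 'start...to...end' window string to a per-window frame, so ranking by the combined Score looks like this (db handle assumed as above):

Day = newsworthy_words(db, 60*10, HotWordSize=25)
for window, frame in sorted(Day.items()):
    top = frame.sort_values('Score', ascending=False).head(5)
    print window
    print top[['Words', 'Score']]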
Example #4
def print_vocabulary_report(db,scale=60*20,**kwargs):

  print "COLLECTING TWEETS...."
  TS = TweetSnap(db=db,timeWindow = scale,Placename2Geocode=False)
  print "COLLECTION OVER...."

  TIME_START = kwargs.get("TIME_START",time.gmtime(0))
  TIME_END   = kwargs.get("TIME_END",time.gmtime(time.time()))
  HotWordSize = kwargs.get("HotWordSize",8)

  if isinstance(TIME_START,str):
    TIME_START  = time.gmtime(time.mktime(time.strptime(TIME_START,"%d %b %H:%M %Z %Y")))
  if isinstance(TIME_END,str):
    TIME_END    = time.gmtime(time.mktime(time.strptime(TIME_END,"%d %b %H:%M %Z %Y")))

  TIME_DIFF   = time.mktime(TIME_START)  - time.mktime(TS.time_start)

  if TIME_DIFF>0:
    TS.move_on(TIME_DIFF-scale)

  volume = []
  HotWordsList = []
  ColorGradient = {}
  TweetCountDict    = {}
  TimeList      = []

  while (TS.time_start<TIME_END and not TS.end):

    #Capture nextSnap and initialize time_start of next snap
    snap = TS.next()
    timeWindow = gmt_to_local(TS.time_start,make_string=True,format='%a %H:%M')
    #Volume of tweets
    volume.append(len(snap['LOC']))

    #HotWords List
    Vocab_dict = dict(get_vocabulary(snap['TEXT']).most_common(HotWordSize))
    TimeList.append(timeWindow)

    ColorGradient[timeWindow] = {}

    for word in Vocab_dict.keys():
      ColorGradient[timeWindow][word] = Vocab_dict[word]/float(sum(Vocab_dict.values()))
      if word in TweetCountDict.keys():
        TweetCountDict[word] += Vocab_dict[word]
      else:
        TweetCountDict[word] = Vocab_dict[word]
    print "LOOPING2"



  SortedTweetCount = sorted(TweetCountDict.iteritems(),key=operator.itemgetter(1))
  WordList         = [item[0] for item in SortedTweetCount]
  TweetCountArray = np.array([item[1] for item in SortedTweetCount],dtype=int)
  del SortedTweetCount


  ColorMap = np.empty([len(WordList),len(TimeList)],dtype=float)

  for rw,word in enumerate(WordList):
    for cl,timeWindow in enumerate(TimeList):
      if word in ColorGradient[timeWindow].keys():
        ColorMap[rw][cl] = ColorGradient[timeWindow][word]
      else:
        ColorMap[rw][cl] = 0

  ###PRINT RESULTS
  gs      = gridspec.GridSpec(2,2,width_ratios=[1,2],height_ratios=[1,4])
  gs.update(left=0.05,right=0.48,wspace=0.0,hspace=0.0)

  fig1    = plt.figure(figsize=(36,90),dpi=200)


  ax0     = fig1.add_subplot(gs[0,1])
  ax1     = fig1.add_subplot(gs[1,1])
  ax2     = fig1.add_subplot(gs[1,0])
  ax3     = fig1.add_subplot(gs[0,0])

  #TweetVolume
  ax0.grid(True, 'major', color='w', linestyle='-', linewidth=0.7)
  ax0.grid(True, 'minor', color='0.92', linestyle='-', linewidth=0.35)
  ax0.set_axis_bgcolor('0.95')

  ASCII_WordList = [ word.encode('ascii','ignore') for word in WordList ]
  ax0.plot(np.arange(len(TimeList)),volume,label='NumberOfTweets',linewidth=0.75)
  ax0.legend(loc='upper left',ncol=4)
  ax0.set_xlim(0,len(TimeList)-1)
  ax0.xaxis.tick_top()
  ax0.yaxis.tick_right()
  ax0.set_xticks(np.arange(0,len(TimeList),5))
  ax0.set_xticklabels(TimeList,rotation='vertical')

  #HotWordColorMap
  ax1.imshow(ColorMap,cmap=plt.cm.binary,vmin=ColorMap.min(),vmax=ColorMap.max(),aspect='auto',origin='lower')
  ax1.yaxis.tick_right()
  ax1.set_yticks(np.arange(len(WordList)))
  ax1.set_yticklabels(WordList)
  ax1.set_xticks(np.arange(0,len(TimeList),5))
  ax1.set_xticklabels(TimeList,rotation='vertical')

  ax1.grid(True, 'major', color='w', linestyle='-', linewidth=0.7)
  ax1.grid(True, 'minor', color='0.92', linestyle='-', linewidth=0.35)

  #TweetVolumeDistributionOverHotWords
  ax2.grid(True, 'major', color='w', linestyle='-', linewidth=0.7)
  ax2.grid(True, 'minor', color='0.92', linestyle='-', linewidth=0.35)
  ax2.set_axis_bgcolor('0.95')

  ax2.invert_xaxis()
  ax2.barh(np.arange(len(WordList)),TweetCountArray,align='center')

  #add the numbers to the side of each bar
  PreviousValue = None
  for p, ch in zip(np.arange(len(WordList)), TweetCountArray):
      if ch!=PreviousValue:
        ax2.annotate(str(ch), xy=(ch + 2.5, p - 0.25), va='center')
        PreviousValue = ch
      else:
        continue


  ax2.set_yticks(np.arange(len(WordList)))
  ax2.set_yticklabels(WordList)#,rotation='horizontal')
  ax2.set_ylim(0,len(WordList)-1+0.25)

  #Plot table with assisting information
  #1. Date : Day, Date Year and TIME_START to TIME_END
  #2. TIME_START
  #3. TIME_END
  #4. TIME_WINDOW
  #5. No. of HotWords per TimeWindow
  #6. Total No. of unique HotWords Found
  #7. Max #of Tweets for HotWord & HotWord
  #8. Min #of Tweets for HotWord & HotWord
  #9. Max #of Tweets in a timeWindow & timeWindow
  #10.Min #of Tweets in a timeWindow & timeWindow

  rowLabels = ['1. Date','2. Start time','3. End time','4. Time Window (seconds)','5. No.Of HotWords per TimeWindow','6. No. of unique hotwords','7. Max #of tweets for HotWord','8. Min #of tweets for HotWord','9. Max #of tweets in a time window','10. Min #of tweets in a time window']
  DateStart = gmt_to_local(TIME_START,make_string=True,format='%a %d %b %Y')
  DateEnd   = gmt_to_local(TIME_END,make_string=True,format='%a %d %b %Y')
  Date      = DateStart if DateStart==DateEnd else DateStart+' to '+DateEnd
  start_time= gmt_to_local(TIME_START,make_string=True,format='%d %b %H:%M')
  end_time  = gmt_to_local(TIME_END,make_string=True,format='%d %b %H:%M')
  cellText  = [Date,start_time,end_time,scale,HotWordSize,len(set(WordList)),TweetCountArray.max(),TweetCountArray.min(),str(max(volume)),str(min(volume))]
  rowLabels.reverse()
  cellText.reverse()
  colLabels = ['Value']
  for y, label, text in zip(range(len(cellText)),rowLabels,cellText):
    ax3.text(0.05,(float(y)/20)+0.05,s='%s : %s'%(label,text),size=20)
  ax3.xaxis.set_visible(False)
  ax3.yaxis.set_visible(False)

  fig1.savefig('%s_to_%s.png'%(start_time,end_time),dpi=200,bbox_inches="tight")
  plt.close(fig1)
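Calling the report is a one-liner; a sketch assuming the same db handle as above (the figure is saved next to the script as '<start>_to_<end>.png'):

print_vocabulary_report(db, scale=60*20, HotWordSize=8,
                        TIME_START='15 Apr 00:00 UTC 2014',
                        TIME_END='16 Apr 00:00 UTC 2014')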
Example #5
class NewsWorthyWords:
    def __init__(self, db, timeWindow=60 * 10, **kwargs):

        print "COLLECTING TWEETS...."
        self.TS = TweetSnap(db=db, timeWindow=timeWindow, Placename2Geocode=False)
        print "COLLECTION OVER...."

        # Variables
        self.SnapStack = []
        self.Candidates = {}
        self.Volume = []

        # Constants
        self.delta = 1.5
        self.enoughSamples = 15.0
        self.SnapLim = 6
        self.StopNewsWords = ["Boston", "day", "time", "love", "today", "Boston-MA"]
        # Set TIME_FRAME
        self.SetStart(kwargs.get("TIME_START", time.gmtime(0)))

        # Storage variables for analysis
        self.Storage = []
        self.StorageDict = pd.DataFrame(
            columns=["word", "Poisson", "LocalEntropy", "GlobalEntropy", "start_time", "event"]
        )
        self.ResultDict = pd.DataFrame(columns=["word", "event_time", "location", "discovered_time", "summary"])

        # Classifier
        self.matrix_w, self.scaler, self.clf = cPickle.load(open("SVClassifier.Store"))

        # Verbosity - 1. Print all messages 2. Print less messages 3. .....
        self.VerboseLevel = kwargs.get("VerboseLevel", 3)

    def verbose(self, text, level=1):
        if level < self.VerboseLevel:
            return
        else:
            print text

    def SetStart(self, TIME_START):
        if isinstance(TIME_START, str):
            TIME_START = time.gmtime(time.mktime(time.strptime(TIME_START, "%d %b %H:%M %Z %Y")))
        TIME_DIFF = time.mktime(TIME_START) - time.mktime(self.TS.time_start)
        if TIME_DIFF > 0:
            self.TS.move_on(TIME_DIFF)

    def run(self):

        while not self.TS.end:

            # Update SnapStack
            if len(self.SnapStack) == self.SnapLim:
                self.SnapStack = self.SnapStack[1:]
                self.Volume = self.Volume[1:]
            self.SnapStack.append(self.TS.next())
            self.Volume.append(Count(self.SnapStack[-1]))

            # Update Candidates origin snap as timeWindow has shifted right
            for key, val in self.Candidates.items():
                if val == -self.SnapLim:
                    self.Candidates.pop(key)
                    self.verbose("This %s word has been removed because it never received enough samples" % key)
                else:
                    self.Candidates[key] = val - 1

            print ("Latest timeWindow %s" % self.SnapStack[-1]["TimeWindow"], 2)
            # Algorithm
            self.verbose("Print looking for new events which happened in this timeWindow", 2)
            self.FindNewEvent()

            self.verbose("Print confirming old/new candidate events which have not been published")
            self.ConfirmEvent()

            if self.Candidates:
                self.verbose("EventCandidates: %s" % self.Candidates.keys(), 2)

    def TotalVolume(self, word, Volume):

        total = 0.0
        k = 0

        while k < len(Volume):
            if word in Volume[k].keys():
                total += Volume[k][word]
            k += 1

        return total if total != 0 else 1

    def FindNewEvent(self):

        for word, count in self.Volume[-1].items():

            # Is word count gaussian noise or signal ?
            wordHistory = [float(vol[word]) for vol in self.Volume[:-1] if word in vol.keys()]
            mean = np.mean(wordHistory) if len(wordHistory) > 0 else 1
            std = np.std(wordHistory) if len(wordHistory) >= 5 else 1

            std_score = (count - mean) / (2 * std)

            if std_score >= self.delta and (word not in self.StopNewsWords):

                self.verbose("This %s is not gaussian noise with standard_score = %f " % (word, std_score))
                if word not in self.Candidates.keys() or (self.Volume[self.Candidates[word]][word] < count):
                    self.Candidates[word] = -1

    def ConfirmEvent(self):

        for word, no in self.Candidates.items():

            wordHistory = [float(vol.get(word, 0.0)) for vol in self.Volume[no:]]
            self.verbose(
                "Confirming candidate Newsword : %s at time = %s with samples=%d and Snapno=%d"
                % (word, self.SnapStack[no]["TimeWindow"][0], sum(wordHistory), no),
                2,
            )
            if sum(wordHistory) >= self.enoughSamples:
                self.verbose(
                    "This %s word has enough samples from tweets to calculate scores (Poisson,LocalEntropy,StandardDeviation)"
                    % (word),
                    2,
                )
                # Poisson
                Poisson = self.FitPoissonDistribution(word, no)
                # Global and Local Entropy
                GlobalEntropy, LocalEntropy = self.FitSpatialEntropy(word, no)

                # Classifier
                # Define feature vector
                X = np.array([Poisson, LocalEntropy, GlobalEntropy], dtype=np.float64)
                # Apply Scaler
                X_sc = self.scaler.transform(X)
                # Apply Orthogonality
                X_tr = X_sc.dot(self.matrix_w)
                # Classify new transformed feature vector
                Flag = self.clf.predict(X_tr)[0]

                if Flag == 1:
                    start_time = self.SnapStack[no]["TimeWindow"][0]
                    confirmed_time = self.SnapStack[-1]["TimeWindow"][0]
                    SampleSet = self.ReportEventQueue(word, no)
                    print "Newsword (%s) at %s confirmed at %s\n" % (word, start_time, confirmed_time)
                    print "Summary : "
                    summary = []
                    for user, created_at, tweet, loc in SampleSet:
                        print "%s reported at time %s near %s: %s" % (user, created_at, loc, tweet)
                        # summary.append("%s reported at time %s near %s: %s"%(user,created_at,tweet,GetPlaceName(loc[0],loc[1]))
                        summary.append([user, created_at, tweet, loc])

                    event = {
                        "word": word,
                        "event_time": start_time,
                        "location": GetPlaceName(
                            np.mean([item[3][0] for item in summary]), np.mean([item[3][1] for item in summary])
                        ),
                        "discovered_time": confirmed_time,
                        "summary": "\n".join(
                            ["%s reported at time %s near %s: %s" % (item[0], item[1], item[3], item[2])]
                        ),
                    }
                    print event
                    self.ResultDict = self.ResultDict.append(event, ignore_index=True)
                    self.Candidates.pop(word)

                else:
                    continue

                # Store Data for post-classification
                self.StorageDict = self.StorageDict.append(
                    {
                        "word": word,
                        "Poisson": Poisson,
                        "LocalEntropy": LocalEntropy,
                        "GlobalEntropy": GlobalEntropy,
                        "start_time": start_time,
                        "event": event,
                    },
                    ignore_index=True,
                )

                # Manual Classifier
                # if flag in ['1','y','yes']:
                # 		print 'This %s word count resembles poisson distribution with lambda=%f'%(word,Lambda)
                # 		self.ReportEventQueue(word,no)
                # 		self.Candidates.pop(word)
                # else:
                # 		print 'This %s word count does not resembles poisson distribution with lambda=%s'%(word,Lambda)

    def FitSpatialEntropy(self, word, no):

        k = no
        tokenize = T_Tokenizer().tokenize
        # Store locations
        ALLLOC = []
        WORDLOC = []

        while k < 0:

            ALLLOC += self.SnapStack[k]["LOC"]
            for order, text in enumerate(self.SnapStack[k]["TEXT"]):
                if word in tokenize(text):
                    WORDLOC.append(self.SnapStack[k]["LOC"][order])

            k += 1

        # Choose Cluster of max ALLLOC, C*
        MakeCluster = GMM_clustering()
        MakeCluster.Snap = {"LOC": ALLLOC}
        MakeCluster.build_clusters()
        WORDLABELS = Counter([MakeCluster.labels[ALLLOC.index(LOC)] for LOC in WORDLOC])

        # Global entropy
        GLOBAL_COUNTER = Counter(MakeCluster.labels)
        G_D_pq = 0.0
        for cl, number in WORDLABELS.items():
            G_D_pq += -1 * (number / float(GLOBAL_COUNTER[cl])) * np.log2(number / float(GLOBAL_COUNTER[cl]))
            # G_D_pq	+= -1*((number/sum(WORDLABELS))/float(GLOBAL_COUNTER[cl]/sum(GLOBAL_COUNTER)))*np.log2(number/float(GLOBAL_COUNTER[cl]))

        C_Star = WORDLABELS.most_common(1)[0][0]
        C_Star_LOC = [ALLLOC[No] for No, label in filter(lambda (enum, x): x == C_Star, enumerate(MakeCluster.labels))]
        C_Star_WORD_LOC = [LOC for LOC in filter(lambda x: x in C_Star_LOC, WORDLOC)]

        # Find D(p||q) of word inside C*
        del MakeCluster
        MakeLocalCluster = GMM_clustering(components=range(2, 8))
        MakeLocalCluster.Snap = {"LOC": C_Star_LOC}
        MakeLocalCluster.build_clusters()

        WORD_LOCAL_COUNTER = Counter([MakeLocalCluster.labels[C_Star_LOC.index(LOC)] for LOC in C_Star_WORD_LOC])
        LOCAL_ALL_COUNTER = Counter(MakeLocalCluster.labels)
        L_D_pq = 0.0
        for cl, number in WORD_LOCAL_COUNTER.items():
            L_D_pq += -1 * (number / float(LOCAL_ALL_COUNTER[cl])) * np.log2(number / float(LOCAL_ALL_COUNTER[cl]))
            # L_D_pq	+= -1*((number/sum(WORD_LOCAL_COUNTER.values()))/float(LOCAL_ALL_COUNTER[cl]/sum(LOCAL_ALL_COUNTER.values())))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))

        return [G_D_pq, L_D_pq]

    def FitStdDev(self, word, no):

        k = no
        tokenize = T_Tokenizer().tokenize
        # Store locations
        WORDLOC = []

        while k < 0:
            for order, text in enumerate(self.SnapStack[k]["TEXT"]):
                if word in tokenize(text):
                    WORDLOC.append(self.SnapStack[k]["LOC"][order])
            k += 1

        return np.std(WORDLOC)

    def FitPoissonDistribution(self, word, no):

        tokenize = T_Tokenizer().tokenize

        k = no
        Times = []

        ApproxTimes = []

        wordHistory = [vol.get(word, 0) for vol in self.Volume[no:]]

        # Store all tweet_times with word in current snap and known history
        while k < 0:

            approx = time.mktime(time.strptime(self.SnapStack[k]["TimeWindow"][0] + "2014EDT", "%d%b%HHR%MMN%Y%Z"))
            count = self.Volume[k].get(word, 0)
            ApproxTimes += [approx] * count

            for order, text in enumerate(self.SnapStack[k]["TEXT"]):
                if word in tokenize(text):
                    Times.append(
                        time.mktime(time.strptime(self.SnapStack[k]["CREATED_AT"][order], "%d %b %H:%M:%S %Y"))
                    )
            k += 1

        # Calculate time-intervals
        TimeIntervals = [Time - min(Times) for Time in Times]
        ApproxTimeIntervals = sorted([approx - min(ApproxTimes) for approx in ApproxTimes])
        TimeIntervals.sort()
        self.verbose("Have a look at TimeIntervals(1) and ApproxTimeIntervals(2) and LogLikelihood(3)")
        self.verbose("(1) %s" % TimeIntervals)
        self.verbose("(2) %s" % ApproxTimeIntervals)

        ApproxTimeIntervals = Counter(ApproxTimeIntervals)

        # Calculate ML_Lmbda
        if sum(ApproxTimeIntervals) != 0:
            _lmbda = float(len(ApproxTimeIntervals)) / sum(ApproxTimeIntervals)
        else:
            _lmbda = float(len(TimeIntervals)) / sum(TimeIntervals)

        return _lmbda

    def ReportEventQueue(self, word, no, SampleLim=3):

        # Find clusters at start point of event
        gmm = GMM_clustering(components=range(4, 15))
        gmm.Snap = self.SnapStack[no]
        gmm.build_clusters()
        Labels = []
        tokenize = T_Tokenizer().tokenize
        for k, text in enumerate(gmm.Snap["TEXT"]):
            if word in tokenize(text):
                Labels.append(gmm.labels[k])
        Labels = Counter(Labels)
        # Find cluster where word was most common
        StarLabel = Labels.most_common(1)[0][0]

        SampleSet = []
        # Print a tweet from that cluster
        for k, text in enumerate(gmm.Snap["TEXT"]):
            if gmm.labels[k] == StarLabel and word in tokenize(text):
                SampleSet.append((gmm.Snap["SCREEN_NAME"][k], gmm.Snap["CREATED_AT"][k], text, gmm.Snap["LOC"][k]))
            if len(SampleSet) >= SampleLim:
                break

        return SampleSet
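A minimal driver sketch for the class, assuming the pickled 'SVClassifier.Store' (projection matrix, scaler, classifier) sits in the working directory and db is a handle as in the earlier examples:

detector = NewsWorthyWords(db, timeWindow=60*10,
                           TIME_START='15 Apr 10:00 UTC 2014',
                           VerboseLevel=2)
detector.run()  # consumes snapshots until the TweetSnap stream ends
print detector.ResultDict[['word', 'event_time', 'location', 'discovered_time']]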
Example #6
class NewsWorthyWords:

	def __init__(self,db,timeWindow=60*10,**kwargs):

		print "COLLECTING TWEETS...."
		self.TS = TweetSnap(db=db,timeWindow = timeWindow,Placename2Geocode=False)
		print "COLLECTION OVER...."

		#Variables
		self.SnapStack = []
		self.Candidates= {}
		self.Volume		= []

		#Constants
		self.delta    				 = 1.5
		self.enoughSamples     = 15.0
		self.SnapLim           = 6
		self.StopNewsWords     = ['Boston', 'day', 'time', 'love', 'today', 'Boston-MA']
		#Set TIME_FRAME
		self.SetStart(kwargs.get("TIME_START",time.gmtime(0)))

		#Storage variables for analysis
		self.Storage = []
		self.StorageDict = pd.DataFrame(columns=['word','Poisson','LocalEntropy','GlobalEntropy','start_time','event'])
		self.ResultDict = pd.DataFrame(columns=['word','event_time','location','discovered_time','summary'])

		#Classifier
		self.matrix_w, self.scaler, self.clf = cPickle.load(open('SVClassifier.Store'))

		#Verbosity - 1. Print all messages 2. Print less messages 3. .....
		self.VerboseLevel = kwargs.get('VerboseLevel',1)

	def verbose(self,text,level=1):
		if level<self.VerboseLevel:
			return
		else:
			print text

	def SetStart(self,TIME_START):
		if isinstance(TIME_START,str):
			TIME_START  = time.gmtime(time.mktime(time.strptime(TIME_START,"%d %b %H:%M %Z %Y")))
		TIME_DIFF   = time.mktime(TIME_START)  - time.mktime(self.TS.time_start)
		if TIME_DIFF>0:
			self.TS.move_on(TIME_DIFF)

	def run(self):

		while not self.TS.end:

			#Update SnapStack
			if len(self.SnapStack)==self.SnapLim:
				self.SnapStack = self.SnapStack[1:]
				self.Volume    = self.Volume[1:]
			self.SnapStack.append(self.TS.next())
			self.Volume.append(Count(self.SnapStack[-1]))

			#Update Candidates origin snap as timeWindow has shifted right
			for key,val in self.Candidates.items():
				if val==-self.SnapLim:
					self.Candidates.pop(key)
					self.verbose('This %s word has been removed because it never received enough samples'%key)
				else:
					self.Candidates[key]=val-1


			self.verbose('Latest timeWindow %s'%self.SnapStack[-1]['TimeWindow'],2)
			#Algorithm
			self.verbose('Looking for new events which happened in this timeWindow',2)
			self.FindNewEvent()

			self.verbose('Confirming old/new candidate events which have not been published')
			self.ConfirmEvent()


			if self.Candidates: self.verbose('EventCandidates: %s'%self.Candidates.keys(),2)



	def TotalVolume(self,word,Volume):

		total = 0.0
		k		 = 0

		while k < len(Volume):
			if word in Volume[k].keys():
				total += Volume[k][word]
			k+=1

		return total if total!=0 else 1


	def FindNewEvent(self):

		for word,count in self.Volume[-1].items():

				#Is word count gaussian noise or signal ?
				wordHistory = [float(vol[word]) for vol in self.Volume[:-1] if word in vol.keys() ]
				mean        =  np.mean(wordHistory) if len(wordHistory)>0 else 1
				std				 =  np.std(wordHistory) if len(wordHistory)>=5 else 1

				std_score = (count - mean)/(2*std)

				if std_score>=self.delta and (word not in self.StopNewsWords):

					self.verbose('This %s is not gaussian noise with standard_score = %f '%(word,std_score))
					if word not in self.Candidates.keys() or (self.Volume[self.Candidates[word]][word]<count):
						self.Candidates[word] = -1

	def ConfirmEvent(self):

		for word,no in self.Candidates.items():

			wordHistory = [float(vol.get(word,0.0)) for vol in self.Volume[no:]]
			self.verbose('Confirming candidate Newsword : %s at time = %s with samples=%d and Snapno=%d'%(word,self.SnapStack[no]['TimeWindow'][0],sum(wordHistory),no),2)
			if sum(wordHistory)>=self.enoughSamples:
				self.verbose('This %s word has enough samples from tweets to calculate scores (Poisson,LocalEntropy,StandardDeviation)'%(word),2)
				#Poisson
				Poisson = self.FitPoissonDistribution(word,no)
				#Global and Local Entropy
				GlobalEntropy,LocalEntropy = self.FitSpatialEntropy(word,no)

				#Classifier
				#Define feature vector
				X    = np.array([Poisson,LocalEntropy,GlobalEntropy],dtype=np.float64)
				#Apply Scaler
				X_sc = self.scaler.transform(X)
				#Apply Orthogonality
				X_tr = X_sc.dot(self.matrix_w)
				#Classify new transformed feature vector
				Flag = self.clf.predict(X_tr)[0]

				if Flag==1:
					start_time  = self.SnapStack[no]['TimeWindow'][0]
					confirmed_time = self.SnapStack[-1]['TimeWindow'][0]
					SampleSet   = self.ReportEventQueue(word,no)
					print       "Newsword (%s) at %s confirmed at %s\n"%(word,start_time,confirmed_time)
					print       "Summary : "
					summary     = []
					for user,created_at,tweet,loc in SampleSet:
						print "%s reported at time %s near %s: %s"%(user,created_at,GetPlaceName(loc[0],loc[1]),tweet)
						#summary.append("%s reported at time %s near %s: %s"%(user,created_at,tweet,GetPlaceName(loc[0],loc[1]))
						summary.append([user,created_at,tweet,loc])

					event =  {'word':word,'event_time':start_time,'location':GetPlaceName(np.mean([item[3][0] for item in summary]),np.mean([item[3][1] for item in summary])),'discovered_time':confirmed_time,'summary':'\n'.join([ "%s reported at time %s near %s: %s"%(item[0],item[1],GetPlaceName(item[3][0],item[3][1]),item[2]) for item in summary])}
					print event
					self.ResultDict = self.ResultDict.append(event,ignore_index=True)
					self.Candidates.pop(word)

				else:
					continue



				#Store Data for post-classification
				self.StorageDict = self.StorageDict.append({'word':word,'Poisson':Poisson,'LocalEntropy':LocalEntropy,'GlobalEntropy':GlobalEntropy,'start_time':start_time,'event':event},ignore_index=True)


				#Manual Classifier
				# if flag in ['1','y','yes']:
				# 		print 'This %s word count resembles poisson distribution with lambda=%f'%(word,Lambda)
				# 		self.ReportEventQueue(word,no)
				# 		self.Candidates.pop(word)
				# else:
				# 		print 'This %s word count does not resembles poisson distribution with lambda=%s'%(word,Lambda)

	def FitSpatialEntropy(self,word,no):

		k = no
		tokenize  = T_Tokenizer().tokenize
		#Store locations
		ALLLOC = []
		WORDLOC = []

		while k<0:

			ALLLOC += self.SnapStack[k]['LOC']
			for order,text in enumerate(self.SnapStack[k]['TEXT']):
				if word in tokenize(text):
					WORDLOC.append(self.SnapStack[k]['LOC'][order])

			k+=1

		#Choose Cluster of max ALLLOC, C*
		MakeCluster 	 	= GMM_clustering()
		MakeCluster.Snap = {'LOC':ALLLOC}
		MakeCluster.build_clusters()
		WORDLABELS       = Counter([MakeCluster.labels[ALLLOC.index(LOC)] for LOC in WORDLOC])

		#Global entropy
		GLOBAL_COUNTER = Counter(MakeCluster.labels)
		G_D_pq		   = 0.0
		for cl,number in WORDLABELS.items():
				G_D_pq	+= -1*(number/float(GLOBAL_COUNTER[cl]))*np.log2(number/float(GLOBAL_COUNTER[cl]))
				#G_D_pq	+= -1*((number/sum(WORDLABELS))/float(GLOBAL_COUNTER[cl]/sum(GLOBAL_COUNTER)))*np.log2(number/float(GLOBAL_COUNTER[cl]))


		C_Star					 = WORDLABELS.most_common(1)[0][0]
		C_Star_LOC       = [ ALLLOC[No] for No,label in filter(lambda (enum,x): x==C_Star,enumerate(MakeCluster.labels)) ]
		C_Star_WORD_LOC  = [LOC for LOC in filter(lambda x:x in C_Star_LOC,WORDLOC)]

		#Find D(p||q) of word inside C*
		del MakeCluster
		MakeLocalCluster 	 	= GMM_clustering(components=range(2,8))
		MakeLocalCluster.Snap = {'LOC':C_Star_LOC}
		MakeLocalCluster.build_clusters()

		WORD_LOCAL_COUNTER    = Counter([MakeLocalCluster.labels[C_Star_LOC.index(LOC)] for LOC in C_Star_WORD_LOC])
		LOCAL_ALL_COUNTER		 = Counter( MakeLocalCluster.labels )
		L_D_pq		   = 0.0
		for cl,number in WORD_LOCAL_COUNTER.items():
			L_D_pq	+= -1*(number/float(LOCAL_ALL_COUNTER[cl]))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))
			#L_D_pq	+= -1*((number/sum(WORD_LOCAL_COUNTER.values()))/float(LOCAL_ALL_COUNTER[cl]/sum(LOCAL_ALL_COUNTER.values())))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))

		return [G_D_pq,L_D_pq]

	def FitStdDev(self,word,no):

		k = no
		tokenize  = T_Tokenizer().tokenize
		#Store locations
		WORDLOC= []

		while k<0:
			for order,text in enumerate(self.SnapStack[k]['TEXT']):
				if word in tokenize(text):
					WORDLOC.append(self.SnapStack[k]['LOC'][order])
			k+=1

		return np.std(WORDLOC)

	def FitPoissonDistribution(self,word,no):

		tokenize  = T_Tokenizer().tokenize

		k = no
		Times = []

		ApproxTimes = []

		wordHistory = [vol.get(word,0) for vol in self.Volume[no:]]

		#Store all tweet_times with word in current snap and known history
		while k<0:

			approx = time.mktime(time.strptime(self.SnapStack[k]['TimeWindow'][0]+'2014EDT',"%d%b%HHR%MMN%Y%Z"))
			count  = self.Volume[k].get(word,0)
			ApproxTimes+=[approx]*count

			for order,text in enumerate(self.SnapStack[k]['TEXT']):
				if word in tokenize(text):
					Times.append(\
									time.mktime(time.strptime(self.SnapStack[k]['CREATED_AT'][order],"%d %b %H:%M:%S %Y")))
			k+=1

		#Calculate time-intervals
		TimeIntervals = [Time-min(Times) for Time in Times]
		ApproxTimeIntervals = sorted([ approx-min(ApproxTimes) for approx in ApproxTimes])
		TimeIntervals.sort()
		self.verbose('Have a look at TimeIntervals(1) and ApproxTimeIntervals(2) and LogLikelihood(3)')
		self.verbose('(1) %s'%TimeIntervals)
		self.verbose('(2) %s'%ApproxTimeIntervals)

		ApproxTimeIntervals = Counter(ApproxTimeIntervals)

		#Calculate ML_Lmbda
		_lmbda      = float(len(TimeIntervals))/sum(TimeIntervals)
		# if sum(ApproxTimeIntervals)!=0:
		# 	_lmbda      = float(len(ApproxTimeIntervals))/sum(ApproxTimeIntervals)
		# else:
		# 	_lmbda      = float(len(TimeIntervals))/sum(TimeIntervals)

		#Calculate Variance for given samples
		# _R2         = 1/_lmbda**2

		#Likelihood calculation and plotting (optional)

		# MaxLogLikelihood
		# _LgLd 			= -1*sum([np.log(_lmbda*np.exp(-_lmbda*x)) for x in TimeIntervals])
		# print '(3)',_LgLd
		#
		# #Simulate a expon_RV with fitted _lmbda
		# _rv         = expon(scale=1/_lmbda)
		#
		# #Plot pdf of counts from _rv and known
		# fig = plt.figure()
		# ax  = fig.add_subplot(111)
		# ax.plot(sorted(ApproxTimeIntervals.keys()),[_rv.cdf(x+600)-_rv.cdf(x) for x in sorted(ApproxTimeIntervals.keys())],'r-',label='fitted')
		# ax.plot(sorted(ApproxTimeIntervals.keys()),[float(ApproxTimeIntervals[key])/sum(wordHistory) for key in sorted(ApproxTimeIntervals.keys()) ],'b-'\
		# 				,label='empirical estimate')
		#
		# plt.legend()
		#
		# #save figure
		# fig.savefig('%s.png'%word)
		#
		# gmm  = GMM_clustering(components=range(4,15))
		# gmm.Snap = self.SnapStack[no]
		# gmm.build_clusters()
		#
		# #flag = raw_input("Fitted curve for %s stored should flag=1 or not with lambda=%f and locality=%f"%(word,_lmbda,Locality(self.SnapStack[no],gmm.labels,word)))
		# plt.close(fig)

		return _lmbda

	def ReportEventQueue(self,word,no,SampleLim=3):

		#Find clusters at start point of event
		gmm  = GMM_clustering(components=range(4,15))
		gmm.Snap = self.SnapStack[no]
		gmm.build_clusters()
		Labels = []
		tokenize  = T_Tokenizer().tokenize
		for k,text in enumerate(gmm.Snap['TEXT']):
			if word in tokenize(text):
				Labels.append(gmm.labels[k])
		Labels = Counter(Labels)
		#Find cluster where word was most common
		StarLabel = Labels.most_common(1)[0][0]

		SampleSet = []
		#Print a tweet from that cluster
		for k,text in enumerate(gmm.Snap['TEXT']):
			if gmm.labels[k] == StarLabel and word in tokenize(text):
				SampleSet.append((gmm.Snap['SCREEN_NAME'][k],gmm.Snap['CREATED_AT'][k],text,gmm.Snap['LOC'][k]))
			if len(SampleSet)>=SampleLim:
				break

		return SampleSet
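The rate returned by FitPoissonDistribution is the maximum-likelihood estimate lambda = n / sum(intervals) for exponentially distributed inter-arrival gaps; a toy check with made-up intervals (illustrative values, not real data):

intervals = [0.0, 60.0, 120.0, 300.0]           # seconds since the first tweet (illustrative)
lmbda = float(len(intervals)) / sum(intervals)  # 4/480: one tweet every 2 minutes on average
print lmbda                                     # 0.00833...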
Example #7
class NewsWorthyWords:

  def __init__(self,db,timeWindow=60*10,**kwargs):

    print "COLLECTING TWEETS...."
    self.TS = TweetSnap(db=db,timeWindow = timeWindow,Placename2Geocode=False)
    print "COLLECTION OVER...."

    #Variables
    self.QueueStack     = []
    self.Candidates    = {}
    self.Vocabulary		= []

    #Constants
    self.delta    				 = 3  #GaussianDistortion
    self.MinWordSamples     = 15.0 #Has to be greater than 8 See SetFeatureTable method for this restriction

    self.QueueLim           = 6  #MaximumQueueLimit
    self.TableON            = kwargs.get('TableON',1)  #1: batch features via SetFeatureTable, 0: per-word fits (assumed default)

    self.StopNewsWords     = ['Boston', 'day', 'time', 'love', 'today', 'Boston-MA']  #Default StopWordList

    #Set TIME_FRAME
    self.SetStart(kwargs.get("TIME_START",time.gmtime(0)))

    #Storage variables for analysis
    self.FeatureDict = pd.DataFrame(columns=['word','Poisson','LocalEntropy','GlobalEntropy','start_time','event'])
    self.ResultDict = pd.DataFrame(columns=['word','event_time','location','discovered_time','summary'])

    #Classifier
    self.matrix_w, self.scaler, self.clf = cPickle.load(open('SVClassifier.Store'))

    #Verbosity - 1. Print all messages 2. Print less messages 3. .....
    self.OnlyMessage = kwargs.get('OnlyMessage',0)

  def message(self,text):
    if self.OnlyMessage:
      print text
    else:
      pass

  def SetStart(self,TIME_START):
    if isinstance(TIME_START,str):
      TIME_START  = time.gmtime(time.mktime(time.strptime(TIME_START,"%d %b %H:%M %Z %Y")))
    TIME_DIFF   = time.mktime(TIME_START)  - time.mktime(self.TS.time_start)
    if TIME_DIFF>0:
      self.TS.move_on(TIME_DIFF)

  def run(self):

    while not self.TS.end:

      #Update QueueStack
      if len(self.QueueStack)==self.QueueLim:
        self.QueueStack = self.QueueStack[1:]
        self.Vocabulary    = self.Vocabulary[1:]
      self.QueueStack.append(self.TS.next())
      self.Vocabulary.append(Count(self.QueueStack[-1]))

      #Update Candidates origin snap as timeWindow has shifted right
      for key,val in self.Candidates.items():
        if val==-self.QueueLim:
          self.Candidates.pop(key)
          self.message('This %s word has been removed because it never received enough samples'%key)
        else:
          self.Candidates[key]=val-1


      print('Latest timeWindow %s'%self.QueueStack[-1]['TimeWindow'])
      #Algorithm
      #1. Add to candidates list
      self.FilterWords()
      #1.1
      if self.TableON==1 and len(self.Candidates.keys())!=0:
        self.SetFeatureTable()
      #2. Find news-word in candidate list
      self.ConfirmEvent()
      #Status of candidate list
      if self.Candidates: self.message('EventCandidates: %s'%self.Candidates.keys())

  def FilterWords(self):

    for word,count in self.Vocabulary[-1].items():

        #Is word count gaussian noise or signal ?
        wordHistory = [float(vol[word]) for vol in self.Vocabulary[:-1] if word in vol.keys() ]
        mean        =  np.mean(wordHistory) if len(wordHistory)>0 else 1
        std      	 =  np.std(wordHistory) if len(wordHistory)>=5 else 1

        Z_score = (count - mean)/std

        if Z_score>=self.delta and (word not in self.StopNewsWords):

          self.message('This %s is not gaussian noise with standard_score = %f '%(word,Z_score))
          if word not in self.Candidates.keys() or (self.Vocabulary[self.Candidates[word]][word]<count):
            self.Candidates[word] = -1

  def ConfirmEvent(self):

    for word,no in self.Candidates.items():

      wordHistory = [float(vol.get(word,0.0)) for vol in self.Vocabulary[no:]]

      self.message('Confirming candidate Newsword : %s at time = %s with samples=%d and Queueno=%d'%(word,self.QueueStack[no]['TimeWindow'][0],sum(wordHistory),no))

      if sum(wordHistory)>=self.MinWordSamples:

        self.message('This %s word has enough samples from tweets to calculate scores (Poisson,LocalEntropy,StandardDeviation)'%(word))

        #Poisson
        Poisson = self.GetPoissonRate(word,no)
        #Global and Local Entropy
        GlobalEntropy,LocalEntropy = self.FitSpatialEntropy(word,no)

        #Poisson, GlobalEntropy, LocalEntropy = self.GetFeatures(word,no)

        #Classifier
        #Define feature vector
        X    = np.array([Poisson,LocalEntropy,GlobalEntropy],dtype=np.float64)
        #Apply Scaler
        X_sc = self.scaler.transform(X)
        #Apply Orthogonality
        X_tr = X_sc.dot(self.matrix_w)
        #Classify new transformed feature vector
        Flag = self.clf.predict(X_tr)[0]

        if Flag==1:
          start_time  = self.QueueStack[no]['TimeWindow'][0]
          confirmed_time = self.QueueStack[-1]['TimeWindow'][0]
          SampleSet   = self.ReportEventQueue(word,no)
          print       "Newsword (%s) at %s confirmed at %s\n"%(word,start_time,confirmed_time)
          print       "Summary : "
          summary     = []
          for user,created_at,tweet,loc in SampleSet:
            print "%s reported at time %s near %s: %s"%(user,created_at,loc,tweet)
            #summary.append("%s reported at time %s near %s: %s"%(user,created_at,tweet,GetPlaceName(loc[0],loc[1]))
            summary.append([user,created_at,tweet,loc])

          event =  {'word':word,'event_time':start_time,'location':GetPlaceName(np.mean([item[3][0] for item in summary]),np.mean([item[3][1] for item in summary])),'discovered_time':confirmed_time,'summary':'\n'.join([ "%s reported at time %s near %s: %s"%(item[0],item[1],item[3],item[2]) for item in summary])}
          print event
          self.ResultDict = self.ResultDict.append(event,ignore_index=True)
          self.Candidates.pop(word)

        else:
          continue



        #Store Data for post-classification
        self.FeatureDict = self.FeatureDict.append({'word':word,'Poisson':Poisson,'LocalEntropy':LocalEntropy,'GlobalEntropy':GlobalEntropy,'start_time':start_time,'event':event},ignore_index=True)


        #Manual Classifier
        # if flag in ['1','y','yes']:
        # 		print 'This %s word count resembles poisson distribution with lambda=%f'%(word,Lambda)
        # 		self.ReportEventQueue(word,no)
        # 		self.Candidates.pop(word)
        # else:
        # 		print 'This %s word count does not resembles poisson distribution with lambda=%s'%(word,Lambda)

  def SetFeatureTable(self):

    tokenize  = T_Tokenizer().tokenize
    self.Feature = {}
    k = -len(self.QueueStack)

    #Store locations
    ALL_LOC  = []
    WORD_LOC = {}
    C_Star_LOC = {}
    C_Star_Labels = {}

    #Get List of locations of all tweets Collected : ALL_LOC
    #Get List of locations where "word" appears in tweets posted after it was declared as an event
    #    : WORD_LOC[word]
    while k<0:
       ALL_LOC += self.QueueStack[k]['LOC']
       for order,text in enumerate(self.QueueStack[k]['TEXT']):
         for word,no in self.Candidates.items():
           if word in tokenize(text) and order>=no:
             WORD_LOC.setdefault(word,[]).append(self.QueueStack[k]['LOC'][order])

       k+=1

    #Global Clustering
    MakeCluster 	 	= GMM_clustering(components=range(3,8))
    MakeCluster.Snap = {'LOC':ALL_LOC}
    MakeCluster.build_clusters()
    #Input : ALL_LOC & Output : Global labels for locations of tweets
    GLOBAL_LABELS    = Counter(MakeCluster.labels)

    #Local Clustering for each cluster in lists
    for C_Star in GLOBAL_LABELS.keys():

      #Input : C_Star_LOC ; All tweet locations withing C_Star cluster
      C_Star_LOC[C_Star]    = [ ALL_LOC[No] for No,label in filter(lambda (enum,x): x==C_Star,enumerate(MakeCluster.labels)) ]
      if len(C_Star_LOC[C_Star])>=(self.MinWordSamples/3.0):
        MakeLocalCluster 	 	= GMM_clustering(components=range(2,min(8,int(self.MinWordSamples/3))))
        MakeLocalCluster.Snap = {'LOC':C_Star_LOC[C_Star]}
        MakeLocalCluster.build_clusters()

        #Output : C_Star_Labels ; Labels for All tweet locations withing C_Star cluster
        C_Star_Labels[C_Star] = MakeLocalCluster.labels

    #Set GlobalEntropy and LocalEntropy for each Candidate word
    for word,no in self.Candidates.items():

      #Global entropy
      #1. Initialize to 0
      G_D_pq 		   = 0.0
      #2. List of all non-zero counts for global clusters where 'word' appears in tweet
      WORD_LABELS   = Counter([MakeCluster.labels[ALL_LOC.index(LOC)] for LOC in WORD_LOC[word]])
      #3. Calculate entropy by summing up over all clusters
      for cl,number in WORD_LABELS.items():
          G_D_pq	+= -1*(number/float(GLOBAL_LABELS[cl]))*np.log2(number/float(GLOBAL_LABELS[cl]))
          #G_D_pq	+= -1*((number/sum(WORDLABELS))/float(GLOBAL_COUNTER[cl]/sum(GLOBAL_COUNTER)))*np.log2(number/float(GLOBAL_COUNTER[cl]))

      #Local entropy
      #1. Most populated cluster with 'word'
      C_Star					 = WORD_LABELS.most_common(1)[0][0]
      #2. List of all non-zero counts for global clusters where 'word' appears in tweet
      WORD_LOCAL_LABELS     = Counter([C_Star_Labels[C_Star][C_Star_LOC[C_Star].index(LOC)] for LOC in WORD_LOC[word] if LOC in C_Star_LOC[C_Star]])
      LOCAL_LABELS 		     = Counter( C_Star_Labels[C_Star] )
      #3. Calculate entropy by summing up over all local clusters
      L_D_pq		   = 0.0
      for cl,number in WORD_LOCAL_LABELS.items():
          L_D_pq	+= -1*(number/float(LOCAL_LABELS[cl]))*np.log2(number/float(LOCAL_LABELS[cl]))
          #L_D_pq	+= -1*((number/sum(WORD_LOCAL_COUNTER.values()))/float(LOCAL_ALL_COUNTER[cl]/sum(LOCAL_ALL_COUNTER.values())))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))

      self.Feature[word] = [G_D_pq,L_D_pq,self.GetPoissonRate(word,no)]

  def FitSpatialEntropy(self,word,no):

    if self.TableON:
      return [self.Feature[word][0],self.Feature[word][1]]

    k = no
    tokenize  = T_Tokenizer().tokenize
    #Store locations
    ALLLOC = []
    WORDLOC = []

    while k<0:

      ALLLOC += self.QueueStack[k]['LOC']
      for order,text in enumerate(self.QueueStack[k]['TEXT']):
        if word in tokenize(text):
          WORDLOC.append(self.QueueStack[k]['LOC'][order])

      k+=1

    #Choose Cluster of max ALLLOC, C*
    MakeCluster 	 	= GMM_clustering()
    MakeCluster.Snap = {'LOC':ALLLOC}
    MakeCluster.build_clusters()
    WORDLABELS       = Counter([MakeCluster.labels[ALLLOC.index(LOC)] for LOC in WORDLOC])

    #Global entropy
    GLOBAL_COUNTER = Counter(MakeCluster.labels)
    G_D_pq		   = 0.0
    for cl,number in WORDLABELS.items():
        G_D_pq	+= -1*(number/float(GLOBAL_COUNTER[cl]))*np.log2(number/float(GLOBAL_COUNTER[cl]))
        #G_D_pq	+= -1*((number/sum(WORDLABELS))/float(GLOBAL_COUNTER[cl]/sum(GLOBAL_COUNTER)))*np.log2(number/float(GLOBAL_COUNTER[cl]))


    C_Star					 = WORDLABELS.most_common(1)[0][0]
    C_Star_LOC       = [ ALLLOC[No] for No,label in filter(lambda (enum,x): x==C_Star,enumerate(MakeCluster.labels)) ]
    C_Star_WORD_LOC  = [LOC for LOC in filter(lambda x:x in C_Star_LOC,WORDLOC)]

    #Find D(p||q) of word inside C*
    del MakeCluster
    MakeLocalCluster 	 	= GMM_clustering(components=range(2,8))
    MakeLocalCluster.Snap = {'LOC':C_Star_LOC}
    MakeLocalCluster.build_clusters()

    WORD_LOCAL_COUNTER    = Counter([MakeLocalCluster.labels[C_Star_LOC.index(LOC)] for LOC in C_Star_WORD_LOC])
    LOCAL_ALL_COUNTER		 = Counter( MakeLocalCluster.labels )
    L_D_pq		   = 0.0
    for cl,number in WORD_LOCAL_COUNTER.items():
        L_D_pq	+= -1*(number/float(LOCAL_ALL_COUNTER[cl]))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))
        #L_D_pq	+= -1*((number/sum(WORD_LOCAL_COUNTER.values()))/float(LOCAL_ALL_COUNTER[cl]/sum(LOCAL_ALL_COUNTER.values())))*np.log2(number/float(LOCAL_ALL_COUNTER[cl]))

    return [G_D_pq,L_D_pq]


  def GetPoissonRate(self,word,no):

    tokenize  = T_Tokenizer().tokenize

    k = no
    Times = []
    ApproxTimes = []

    #Store all tweet_times with word in current snap and known history
    while k<0:

      approx = time.mktime(time.strptime(self.QueueStack[k]['TimeWindow'][0]+'2014EDT',"%d%b%HHR%MMN%Y%Z"))
      count  = self.Vocabulary[k].get(word,0)
      ApproxTimes+=[approx]*count

      for order,text in enumerate(self.QueueStack[k]['TEXT']):
        if word in tokenize(text):
          Times.append(\
                  time.mktime(time.strptime(self.QueueStack[k]['CREATED_AT'][order],"%d %b %H:%M:%S %Y")))
      k+=1

    #Calculate time-intervals
    TimeIntervals       = sorted([Time-min(Times) for Time in Times])
    ApproxTimeIntervals = sorted([ approx-min(ApproxTimes) for approx in ApproxTimes])

    #Calculate ML_Lmbda
    if sum(ApproxTimeIntervals)!=0:
      _lmbda      = float(len(ApproxTimeIntervals))/sum(ApproxTimeIntervals)
    else:
      _lmbda      = float(len(TimeIntervals))/sum(TimeIntervals)

    return _lmbda



  def ReportEventQueue(self,word,no,SampleLim=3):

    #Find clusters at start point of event
    gmm  = GMM_clustering(components=range(4,15))
    gmm.Snap = self.QueueStack[no]
    gmm.build_clusters()
    Labels = []
    tokenize  = T_Tokenizer().tokenize
    for k,text in enumerate(gmm.Snap['TEXT']):
      if word in tokenize(text):
        Labels.append(gmm.labels[k])
    Labels = Counter(Labels)
    #Find cluster where word was most common
    StarLabel = Labels.most_common(1)[0][0]

    SampleSet = []
    #Print a tweet from that cluster
    for k,text in enumerate(gmm.Snap['TEXT']):
      if gmm.labels[k] == StarLabel and word in tokenize(text):
        SampleSet.append((gmm.Snap['SCREEN_NAME'][k],gmm.Snap['CREATED_AT'][k],text,gmm.Snap['LOC'][k]))
      if len(SampleSet)>=SampleLim:
        break

    return SampleSet
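This variant adds the TableON switch, which batches the entropy and Poisson features once per loop in SetFeatureTable instead of refitting them per candidate; a hypothetical driver, with the same db handle and pickled classifier assumptions as before:

nw = NewsWorthyWords(db, timeWindow=60*10,
                     TIME_START='15 Apr 10:00 UTC 2014',
                     OnlyMessage=1, TableON=1)
nw.run()
print nw.ResultDict.head()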