Python get_vocabulary示例，SearchUtils.get_vocabulary Python示例

示例#1

0

显示文件

文件： algo.py 项目： shkr/tweet-event

    def evolving_list(self):
        """Scan time snapshots for events and collect them in ``self.ResultDict``.

        Snapshots with fewer than ``self.rate_threshold`` entries under
        ``"LOC"`` are skipped first.  Then, for as long as
        ``self.Clustering.SnapIter.end`` is false, the current snapshot is
        clustered, a vocabulary is built from its ``"TEXT"`` entries and every
        event yielded by ``self.buzz`` is appended to ``self.ResultDict`` and
        printed.
        """
        # Fast-forward past snapshots that are too sparse to analyse.
        while len(self.Clustering.Snap["LOC"]) < self.rate_threshold:
            sparse_snap = self.Clustering.Snap
            print("Found only %d tweets skipping  %s"
                  % (len(sparse_snap["LOC"]), sparse_snap["TimeWindow"]))
            self.Clustering.next()

        # An interactive "look at next time snap?" prompt used to gate the
        # loop below; it is disabled, so processing is unconditional.
        # NOTE(review): nothing in this loop body visibly advances the
        # snapshot -- presumably build_clusters() or buzz() does; confirm.
        while not self.Clustering.SnapIter.end:
            # Cluster the tweets of the current snapshot.
            self.Clustering.build_clusters()

            # Group tweet indices by the cluster label they were assigned.
            members_by_label = {}
            for index, label in enumerate(self.Clustering.labels):
                members_by_label.setdefault(label, []).append(index)

            # Vocabulary over the snapshot's tweet texts, using our tokenizer.
            snapshot_texts = list(self.Clustering.Snap["TEXT"])
            vocabulary = get_vocabulary(snapshot_texts, self.tokenize)

            # Record and report every event detected in this snapshot.
            for found_event in self.buzz(members_by_label, vocabulary):
                self.ResultDict = self.ResultDict.append(found_event, ignore_index=True)
                print(found_event)

示例#2

0

显示文件

文件： algo.py 项目： shkr/tweet-event

    def folium_map(self):
        """Generate a leaflet map with eventful tweets on the map.

        Snapshots with fewer than ``self.rate_threshold`` entries under
        ``"LOC"`` are skipped.  The user is then asked whether to look at the
        current time snapshot; on an affirmative answer ("yes", "y", "1" or
        "go", case-insensitive, surrounding whitespace ignored) the snapshot
        is clustered, every event yielded by ``self.buzz`` is printed and
        added as a marker, and the map is written to
        ``folium_map_<TimeWindow>.html``.  Any other answer returns without
        building a map.
        """
        # Skip snapshots with fewer tweets than the threshold.
        while len(self.Clustering.Snap["LOC"]) < self.rate_threshold:
            print("Found only %d tweet(s) skipping for timeWindow %s" % (
                len(self.Clustering.Snap["LOC"]),
                self.Clustering.Snap["TimeWindow"],
            ))
            self.Clustering.next()

        # The map centre is fixed; a per-place lookup
        # (location[self.place]['latitude'/'longitude']) is disabled.
        lat, lon = (42.3606249, -71.0591155)

        go = raw_input("Look at next time snap %s ?" % self.Clustering.Snap["TimeWindow"])

        # raw_input() always returns a string, so every accepted answer must
        # be a string too (an integer 1 in this list could never match).
        if go.strip().lower() not in ("yes", "y", "1", "go"):
            return

        # Build clusters from the tweet snapshot.
        labels = {}
        self.Clustering.build_clusters()

        # Collect the tweet indices of each cluster, keyed by cluster label.
        for k, l in enumerate(self.Clustering.labels):
            labels.setdefault(l, []).append(k)

        # Make vocabulary of text from tokenized tweets.
        vocabulary = get_vocabulary(self.Clustering.Snap["TEXT"], self.tokenize)

        map_1 = folium.Map(location=[lat, lon], zoom_start=8, tiles="Stamen Terrain")

        # Search for events in the snapshot and drop one marker per event.
        for event in self.buzz(labels, vocabulary):
            # Popup text is reduced to ASCII; other characters are dropped.
            popup = event.summary().encode("ascii", "ignore")
            print("Event :" + popup)
            map_1.simple_marker(location=[event.location[0], event.location[1]], popup=popup)

        map_1.create_map(path="folium_map_%s.html" % self.Clustering.Snap["TimeWindow"])

示例#3

0

显示文件

文件： visualization.py 项目： shkr/tweet-event

def print_vocabulary_report(db, scale=60 * 20, **kwargs):
    """Plot a hot-word report for the tweets in ``db`` and save it as a PNG.

    Tweets are read window by window through ``TweetSnap``.  For every window
    the ``HotWordSize`` most common vocabulary entries are kept, and the
    resulting figure has four panels: tweet volume per window, a heat map of
    each hot word's share per window, a horizontal bar chart of the total
    count per hot word, and a text summary of the run.  The figure is written
    to ``<start>_to_<end>.png`` (relative path) and then closed.

    Args:
        db: Passed straight to ``TweetSnap(db=...)``.
        scale: Passed as ``timeWindow`` to ``TweetSnap``; the summary panel
            reports it as the time window in seconds.
        **kwargs: Optional settings:
            TIME_START: ``time.struct_time`` or a string in the format
                ``"%d %b %H:%M %Z %Y"``.  Defaults to ``time.gmtime(0)``.
            TIME_END: Same types as ``TIME_START``.  Defaults to the
                current time.
            HotWordSize: Number of most common words kept per window
                (default 8).

    Raises:
        ValueError: If no hot word was collected, i.e. there is nothing to
            plot.
    """
    print("COLLECTING TWEETS....")
    TS = TweetSnap(db=db, timeWindow=scale, Placename2Geocode=False)
    print("COLLECTION OVER....")

    TIME_START = kwargs.get("TIME_START", time.gmtime(0))
    TIME_END = kwargs.get("TIME_END", time.gmtime(time.time()))
    HotWordSize = kwargs.get("HotWordSize", 8)

    # String bounds are parsed and converted to struct_time.
    time_format = "%d %b %H:%M %Z %Y"
    if isinstance(TIME_START, str):
        TIME_START = time.gmtime(time.mktime(time.strptime(TIME_START, time_format)))
    if isinstance(TIME_END, str):
        TIME_END = time.gmtime(time.mktime(time.strptime(TIME_END, time_format)))

    # If the requested start lies after the first snapshot, skip ahead.
    TIME_DIFF = time.mktime(TIME_START) - time.mktime(TS.time_start)
    if TIME_DIFF > 0:
        TS.move_on(TIME_DIFF - scale)

    volume = []          # number of tweets per window
    ColorGradient = {}   # window label -> {word: share of that window's hot-word count}
    TweetCountDict = {}  # word -> count summed over all windows
    TimeList = []        # window labels, in chronological order

    while TS.time_start < TIME_END and not TS.end:
        # Capture the next snap; TS.time_start is read afterwards for the label.
        snap = TS.next()
        timeWindow = gmt_to_local(TS.time_start, make_string=True, format='%a %H:%M')

        # Volume of tweets
        volume.append(len(snap['LOC']))

        # Hot words of this window
        Vocab_dict = dict(get_vocabulary(snap['TEXT']).most_common(HotWordSize))
        TimeList.append(timeWindow)

        # The window total is invariant across the inner loop, so compute it once.
        window_total = float(sum(Vocab_dict.values()))
        shares = {}
        for word, count in Vocab_dict.items():
            shares[word] = count / window_total
            TweetCountDict[word] = TweetCountDict.get(word, 0) + count
        ColorGradient[timeWindow] = shares
        print("LOOPING2")

    # Words ordered by ascending total count.
    SortedTweetCount = sorted(TweetCountDict.items(), key=operator.itemgetter(1))
    WordList = [word for word, _ in SortedTweetCount]
    TweetCountArray = np.array([count for _, count in SortedTweetCount], dtype=int)
    del SortedTweetCount

    if not WordList:
        # The min()/max() reductions below would fail on empty data anyway;
        # fail early, before a figure is allocated.
        raise ValueError("no hot words collected; nothing to plot")

    # Rows are words, columns are time windows; absent words stay at 0.
    ColorMap = np.zeros([len(WordList), len(TimeList)], dtype=float)
    for cl, timeWindow in enumerate(TimeList):
        shares = ColorGradient[timeWindow]
        for rw, word in enumerate(WordList):
            ColorMap[rw, cl] = shares.get(word, 0)

    # One tick every 5th window, labelled with that same window.
    tick_positions = np.arange(0, len(TimeList), 5)
    tick_labels = TimeList[::5]
    word_positions = np.arange(len(WordList))

    ### PRINT RESULTS
    gs = gridspec.GridSpec(2, 2, width_ratios=[1, 2], height_ratios=[1, 4])
    gs.update(left=0.05, right=0.48, wspace=0.00000000000000000000000000000000000000005, hspace=0.00000000000000000000000000000000000000005)

    fig1 = plt.figure(figsize=(36, 90), dpi=200)
    try:
        ax0 = fig1.add_subplot(gs[0, 1])
        ax1 = fig1.add_subplot(gs[1, 1])
        ax2 = fig1.add_subplot(gs[1, 0])
        ax3 = fig1.add_subplot(gs[0, 0])

        # TweetVolume
        ax0.grid(True, 'major', color='w', linestyle='-', linewidth=0.7)
        ax0.grid(True, 'minor', color='0.92', linestyle='-', linewidth=0.35)
        ax0.set_axis_bgcolor('0.95')

        ax0.plot(np.arange(len(TimeList)), volume, label='NumberOfTweets', linewidth=0.75)
        ax0.legend(loc='upper left', ncol=4)
        ax0.set_xlim(0, len(TimeList) - 1)
        ax0.xaxis.tick_top()
        ax0.yaxis.tick_right()
        ax0.set_xticks(tick_positions)
        ax0.set_xticklabels(tick_labels, rotation='vertical')

        # HotWordColorMap
        ax1.imshow(ColorMap, cmap=plt.cm.binary, vmin=ColorMap.min(), vmax=ColorMap.max(), aspect='auto', origin='lower')
        ax1.yaxis.tick_right()
        ax1.set_yticks(word_positions)
        ax1.set_yticklabels(WordList)
        ax1.set_xticks(tick_positions)
        ax1.set_xticklabels(tick_labels, rotation='vertical')

        ax1.grid(True, 'major', color='w', linestyle='-', linewidth=0.7)
        ax1.grid(True, 'minor', color='0.92', linestyle='-', linewidth=0.35)

        # TweetVolumeDistributionOverHotWords
        ax2.grid(True, 'major', color='w', linestyle='-', linewidth=0.7)
        ax2.grid(True, 'minor', color='0.92', linestyle='-', linewidth=0.35)
        ax2.set_axis_bgcolor('0.95')

        ax2.invert_xaxis()
        ax2.barh(word_positions, TweetCountArray, align='center')

        # Add the number to the side of each bar; a run of equal counts is
        # annotated only once, on its first bar.
        PreviousValue = None
        for p, ch in enumerate(TweetCountArray):
            if ch != PreviousValue:
                ax2.annotate(str(ch), xy=(ch + 2.5, p - 0.25), va='center')
                PreviousValue = ch

        ax2.set_yticks(word_positions)
        ax2.set_yticklabels(WordList)
        ax2.set_ylim(0, len(WordList) - 1 + 0.25)

        # Plot table with assisting information
        # 1. Date : Day, Date Year and TIME_START to TIME_END
        # 2. TIME_START
        # 3. TIME_END
        # 4. TIME_WINDOW
        # 5. No. of HotWords per TimeWindow
        # 6. Total No. of unique HotWords Found
        # 7. Max #of Tweets for HotWord & HotWord
        # 8. Min #of Tweets for HotWord & HotWord
        # 9. Max #of Tweets in a timeWindow & timeWindow
        # 10. Min #of Tweets in a timeWindow & timeWindow
        rowLabels = ['1. Date','2. Start time','3. End time','4. Time Window (seconds)','5. No.Of HotWords per TimeWindow','6. No. of unique hotwords','7. Max #of tweets for HotWord','8. Min #of tweets for HotWord','9. Max #of tweets in a time window','10. Min #of tweets in a time window']
        DateStart = gmt_to_local(TIME_START, make_string=True, format='%a %d %b %Y')
        DateEnd = gmt_to_local(TIME_END, make_string=True, format='%a %d %b %Y')
        Date = DateStart if DateStart == DateEnd else DateStart + ' to ' + DateEnd
        start_time = gmt_to_local(TIME_START, make_string=True, format='%d %b %H:%M')
        end_time = gmt_to_local(TIME_END, make_string=True, format='%d %b %H:%M')
        cellText = [Date, start_time, end_time, scale, HotWordSize, len(set(WordList)), TweetCountArray.max(), TweetCountArray.min(), str(max(volume)), str(min(volume))]

        # Text is stacked bottom-up, so walk the rows in reverse to make
        # row 1 end up at the top.
        rowLabels.reverse()
        cellText.reverse()
        for y, (label, text) in enumerate(zip(rowLabels, cellText)):
            ax3.text(0.05, (float(y) / 20) + 0.05, s='%s : %s' % (label, text), size=20)
        ax3.xaxis.set_visible(False)
        ax3.yaxis.set_visible(False)

        fig1.savefig('%s_to_%s.png' % (start_time, end_time), dpi=200, bbox_inches="tight")
    finally:
        # Release the (very large) figure even when plotting or saving fails.
        plt.close(fig1)