import datetime, os
from crawlers.CrawlerFactory import CrawlerFactory
from database.model.tweets import *
from database.warehouse import WarehouseServer

ws = WarehouseServer()

# index_path and the Index class are defined earlier in the original script
# and are not shown in this snippet.
if not os.path.exists(index_path):
    try:
        os.makedirs(index_path)
    except os.error:
        raise Exception(index_path + " could not be created.")

#Save the tweets in the db
f = CrawlerFactory()
t = f.get_crawler("topsy")

search_hashtags = ("#25jan OR #jan25 OR #egypt OR #tahrir OR #fuckmubarak OR #mubarak "
                   "OR #suez OR #DownWithMubarak OR #NOSCAF OR #SCAF OR #cairo")
t.search_for(search_hashtags)
from_date = datetime.datetime(2011, 1, 27, 23, 55, 0)
to_date = datetime.datetime(2011, 1, 29, 0, 0, 0)
t.search_between(from_date=from_date, 
                 to_date=to_date, 
                 granularity_days=0, 
                 granularity_hours=0, 
                 granularity_mins=5)
t.retrieve_items_of_type(EgyptTweet)
t.crawl(only_english=True)

#Index all the documents
docs = ws.get_documents_by_date(from_date, to_date, type=EgyptTweet)
index = Index(index_path)
print 'Started indexing'
index.add_documents(docs)
index.finalize()
print 'Finished indexing'
for term in index.get_top_terms(limit=100):
    print term
# Example #2
from crawlers.CrawlerFactory import CrawlerFactory
from database.model.tweets import *
from database.model.agents import *
from mongoengine import *
import datetime
import tools.utils
from urlparse import urlparse
from database.warehouse import WarehouseServer

f = CrawlerFactory()
twitter = f.get_crawler("twitter")
#twitter.login()
ws = WarehouseServer()

from_date = datetime.datetime(2011, 1, 25, 0, 0, 0)
to_date = datetime.datetime(2011, 1, 26, 0, 0, 0)
items = ws.get_documents_by_date(from_date, to_date, limit=100)
screen_names = set(tweet.author_screen_name for tweet in items)
print len(screen_names)
# A terrible hack to save the screen_names of users who are mentioned in tweets
# but are not yet in the database. They'll be processed after all authors have
# been stored.
mentions_of_not_stored_users = []

for author_name in screen_names:
    author = Author.objects(screen_name=author_name)
    if len(author) == 0:  #If not in db yet
        tweets = EgyptTweet.objects(author_screen_name=author_name)
        author = Author()
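        # The original example is cut off at this point. A minimal sketch of how the
        # loop body might continue, under assumptions not confirmed by the source:
        # Author is a mongoengine Document with a screen_name field and the standard
        # save() method, and each tweet exposes mentioned screen names through a
        # hypothetical `mentions` attribute.
        author.screen_name = author_name
        author.save()
        for tweet in tweets:
            for mention in getattr(tweet, "mentions", []):
                if len(Author.objects(screen_name=mention)) == 0:
                    # Remember mentioned users that are not stored yet; they are
                    # handled in a second pass once all authors are saved.
                    mentions_of_not_stored_users.append(mention)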
# Example #3
# Same setup as the first example: the imports, ws, index_path and the Index
# class are defined earlier in the original script.
if not os.path.exists(index_path):
    try:
        os.makedirs(index_path)
    except os.error:
        raise Exception(index_path + " could not be created.")

#Save the tweets in the db
f = CrawlerFactory()
t = f.get_crawler("topsy")

search_hashtags = ("#25jan OR #jan25 OR #egypt OR #tahrir OR #fuckmubarak OR #mubarak "
                   "OR #suez OR #DownWithMubarak OR #NOSCAF OR #SCAF OR #cairo")

t.search_for(search_hashtags)
##Last update ended at 2011-01-27 09:00:00
from_date = datetime.datetime(2011, 1, 24, 0, 0, 0)
to_date = datetime.datetime(2011, 1, 25, 0, 0, 0)
t.search_between(from_date=from_date,
                 to_date=to_date,
                 granularity_days=0,
                 granularity_hours=0,
                 granularity_mins=5)
t.retrieve_items_of_type(EgyptTweet)
t.crawl(only_english=True)

#Index all the documents
docs = ws.get_documents_by_date(from_date, to_date, type=EgyptTweet)
index = Index(index_path)
print 'Started indexing'
index.add_documents(docs)
index.finalize()
print 'Finished indexing'
for term in index.get_top_terms(limit=100):
    print term
'''
@author: george
'''

import datetime, unittest 
from database.warehouse import WarehouseServer
from analysis.clustering.kmeans import OrangeKmeansClusterer
from tools.utils import aggregate_data
from matplotlib.dates import num2date  # @UnresolvedImport
from visualizations.graphs import D3Timeline


ws = WarehouseServer()
from_date = datetime.datetime(2011, 1, 26, 0, 0, 0)
to_date = datetime.datetime(2011, 1, 27, 0, 0, 0) 
items = ws.get_documents_by_date(from_date, to_date, limit=3000)

oc = OrangeKmeansClusterer(k=100, ngram=1)
oc.add_documents(items)
oc.run("orange_clustering_test", pca=False)

# Rank clusters by how quickly their documents arrived: the growth rate is the
# number of documents divided by the time span (in seconds) they cover.
top_clusters = []
for cluster in oc.clusters:
    documents = cluster.get_documents().values()
    if len(documents) == 0:
        continue
    dates = [doc.date for doc in documents]
    delta = max(dates) - min(dates)
    delta_seconds = delta.total_seconds()
    if delta_seconds == 0:
        continue
    rate_growth = float(len(dates)) / delta_seconds
    top_clusters.append((rate_growth, max(dates), cluster))
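# A possible continuation, not shown in the original snippet: sort the
# (rate_growth, last_date, cluster) tuples so the fastest-growing clusters
# come first, then inspect the top few.
top_clusters.sort(key=lambda entry: entry[0], reverse=True)
for rate_growth, last_date, cluster in top_clusters[:10]:
    print rate_growth, last_date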
'''
Created on 22 Mar 2012

@author: george

This script allows us to annotate known events with their labels.
'''
import datetime
from database.warehouse import WarehouseServer
from mongoengine import connect
connect("pythia_db")
from evaluation.evaluators import AbstractEvaluator
ws = WarehouseServer()

from_date = datetime.datetime(2011, 1, 25, 12, 0, 0)
to_date = datetime.datetime(2011, 1, 25, 12, 5, 0)
tweet_list = ws.get_documents_by_date(from_date, to_date)
ce = AbstractEvaluator(tweet_list)
ce.annotate_dataset()