Example #1
def get_authors(self):
    '''
    Returns the authors of the documents that appear in this cluster.
    '''
    ws = WarehouseServer()
    authors = set(ws.get_document_authors(self.document_dict.keys()))
    return list(authors)
Example #2
def get_authors(self):
    '''
    Returns the authors of the documents that appear in this cluster.
    '''
    ws = WarehouseServer()
    authors = set(ws.get_document_authors(self.document_dict.keys()))
    return list(authors)
Example #3
    def test_author_classification_egypt_dataset(self):
        TestAuthor.drop_collection()    
        ws = WarehouseServer()      
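        # Copy every author with more than 200 tweets into a temporary TestAuthor collection.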
        for author in ws.get_authors(type=Author):
            if len(author.tweets) > 200:
                t = TestAuthor()
                t.screen_name = author.screen_name
                t.tweets = author.tweets
                t.save()
            
        
        authors = ws.get_authors(type=TestAuthor)
        for author in authors:
            print '-----------------------'
            print author.screen_name
            vector = author.update_feature_vector()
            print vector
        
        classifier = TreeClassifier()
        attributes = ["retweets", "links", "retweeted", "replies", "mentions", "ff-ratio", "class"]
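        # Build the training matrix from the pre-labelled TrainingAuthor collection,
        # train the decision tree, then classify each test author below.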
        train_set = numpy.array([author.get_feature_vector_with_type() for author in TrainingAuthor.objects])

        classifier.train(train_set, attributes)
        
        for author in authors:
            prediction = "No prediction"
            if len(author.feature_vector) > 0:
                prediction = classifier.classify(author.get_feature_vector_with_type())
            print author.screen_name
            print prediction
            print '----------------------'
            
        TestAuthor.drop_collection()   
Example #4
def output_clusters_to_file(clusters, rownames, filename):
    '''
    DEPRECATED
    This method takes as input a set of clusters and generates 
    a very simplistic representation of these clusters in text form
    in a file. 
    '''
    
    ws = WarehouseServer()
    out = file(filename, 'w')
    out.write("Clustering results")
    out.write('\n')
    i = 0 
    for cluster in clusters:
        out.write('\n')
        out.write('***********************************************************')
        out.write('\n')
        out.write("Cluster"+str(i))
        out.write('\n')
        for document in cluster:
            out.write( ws.get_document_by_id(rownames[document]).content)
            out.write('\n')
        i += 1
Example #5
def output_clusters_to_file(clusters, rownames, filename):
    '''
    DEPRECATED
    This method takes as input a set of clusters and generates 
    a very simplistic representation of these clusters in text form
    in a file. 
    '''

    ws = WarehouseServer()
    out = file(filename, 'w')
    out.write("Clustering results")
    out.write('\n')
    i = 0
    for cluster in clusters:
        out.write('\n')
        out.write(
            '***********************************************************')
        out.write('\n')
        out.write("Cluster" + str(i))
        out.write('\n')
        for document in cluster:
            out.write(ws.get_document_by_id(rownames[document]).content)
            out.write('\n')
        i += 1
Example #6
# -*- coding: utf-8 -*-
'''
Created on 23 Jan 2012

@author: george

My playground!
'''
import unittest, os
from analysis.index import Index
from database.warehouse import WarehouseServer
from database.model.tweets import TwoGroupsTweet

BASE_PATH = os.path.expanduser("~/virtualenvfyp/pythia/data/")
index_path = BASE_PATH + "test_index"
ws = WarehouseServer()
sample_docs = ws.get_n_documents(100, type=TwoGroupsTweet)

index = Index(index_path)
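# Add each sample document to the index and finalize it before the search test below runs.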
for doc in sample_docs:
    index.add_document(doc)
index.finalize()

class TestPlayground(unittest.TestCase):
  
    def test_searching(self):        
        results = index.search_by_term("sales")
        
        calculated = []
        for doc in results:
            calculated.append(doc.get('id'))
Example #7
'''
Created on 26 Jan 2012

@author: george
'''
import unittest, datetime
from analysis.clustering.dbscan import DBSCANClusterer
from database.warehouse import WarehouseServer
from collections import OrderedDict

###########################################
# GLOBALS                                 #
###########################################
ws = WarehouseServer()
epsilon = 2.0
min_pts = 2.0
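# Sample 2-D points: roughly two dense groups, a duplicated pair, and an isolated outlier,
# for exercising DBSCAN.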
points = []
points.append([1,1])
points.append([1.5,1])
points.append([1.8,1.5])
points.append([2.1,1])
points.append([3.1,2])
points.append([4.1,2])
points.append([5.1,2])
points.append([10,10])
points.append([11,10.5])
points.append([9.5,11])
points.append([9.9,11.4])
points.append([15.0, 17.0])
points.append([15.0, 17.0])
points.append([7.5, -5.0])
Example #8
'''
Created on 22 Jan 2012

@author: george
'''
import datetime, os
from crawlers.CrawlerFactory import CrawlerFactory
from database.model.tweets import EgyptTweet
from analysis.index import Index
from database.warehouse import WarehouseServer

BASE_PATH = os.path.expanduser("~/virtualenvfyp/pythia/data/")
ws = WarehouseServer()
index_path = os.path.join(BASE_PATH,"egypt_index")
if not os.path.exists(index_path):
    try:
        os.makedirs(index_path)
    except os.error:
        raise Exception(index_path + " could not be created.")  
    
#Save the tweets in the db
f = CrawlerFactory()
t = f.get_crawler("topsy")

search_hashtags = "#25jan OR #jan25 OR #egypt OR #tahrir OR #fuckmubarak OR #mubarak \
                   OR #suez OR #DownWithMubarak OR #NOSCAF OR #SCAF OR #cairo"
t.search_for(search_hashtags)
from_date=datetime.datetime(2011, 01, 27, 23, 55, 0)
to_date=datetime.datetime(2011, 01, 29, 0, 0, 0)
t.search_between(from_date=from_date, 
                 to_date=to_date, 
Example #9
'''
Created on 21 Mar 2012

@author: george
'''

from database.warehouse import WarehouseServer
from database.model.tweets import EvaluationTweet
from analysis.clustering.kmeans import OrangeKmeansClusterer
from evaluation.evaluators import ClusteringEvaluator

ws = WarehouseServer()
documents = ws.get_all_documents(type=EvaluationTweet)

oc = OrangeKmeansClusterer(k=35, ngram=1)
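# Evaluate the 35-cluster k-means run using BCubed precision, recall, and F-score.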
ebe = ClusteringEvaluator(documents)
bcubed_precision, bcubed_recall, bcubed_f = ebe.evaluate(clusterer=oc)
print bcubed_precision, bcubed_recall, bcubed_f
Example #10
@author: george
'''
import datetime
from crawlers.CrawlerFactory import CrawlerFactory
from database.model.tweets import *
from database.model.agents import *
from mongoengine import *
import tools.utils
from urlparse import urlparse
from database.warehouse import WarehouseServer

f = CrawlerFactory()
twitter = f.get_crawler("twitter")
#twitter.login()
ws = WarehouseServer()

from_date = datetime.datetime(2011, 1, 25, 0, 0, 0)
to_date = datetime.datetime(2011, 1, 26, 0, 00, 0)
items = ws.get_documents_by_date(from_date, to_date, limit=100)
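# Collect the distinct screen names of the tweet authors in this window.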
screen_names = []
for tweet in items:
    screen_names.append(tweet.author_screen_name)
screen_names = set(screen_names)
print len(screen_names)
# A terrible hack to save the screen_names of users who are mentioned in tweets
# but are not yet in the database. They will be considered after all authors
# have been stored.
mentions_of_not_stored_users = []

for author_name in screen_names:
Example #11
'''
Created on 24 Mar 2012

@author: george
'''

import datetime, unittest 
from database.warehouse import WarehouseServer
from analysis.clustering.kmeans import OrangeKmeansClusterer
from tools.utils import aggregate_data
from matplotlib.dates import num2date  # @UnresolvedImport
from visualizations.graphs import D3Timeline


ws = WarehouseServer()
from_date = datetime.datetime(2011, 1, 26, 0, 0, 0)
to_date = datetime.datetime(2011, 1, 27, 0, 0, 0) 
items = ws.get_documents_by_date(from_date, to_date, limit=3000)

oc = OrangeKmeansClusterer(k=100, ngram=1)
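# Add the documents and run the clustering (no PCA step).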
oc.add_documents(items)
oc.run("orange_clustering_test", pca=False)

top_clusters = []
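# Skip empty clusters and clusters whose documents all share the same timestamp.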
for cluster in oc.clusters:
    documents = cluster.get_documents().values()
    if len(documents) == 0 : continue
    dates = [doc.date for doc in documents]
    delta = max(dates) - min(dates)
    delta_seconds = delta.total_seconds()
    if delta_seconds == 0: continue
Example #12
# -*- coding: utf-8 -*-
'''
Created on 23 Jan 2012

@author: george

My playground!
'''
import unittest, os
from analysis.index import Index
from database.warehouse import WarehouseServer
from database.model.tweets import TwoGroupsTweet

BASE_PATH = os.path.expanduser("~/virtualenvfyp/pythia/data/")
index_path = BASE_PATH + "test_index"
ws = WarehouseServer()
sample_docs = ws.get_n_documents(100, type=TwoGroupsTweet)

index = Index(index_path)
for doc in sample_docs:
    index.add_document(doc)
index.finalize()


class TestPlayground(unittest.TestCase):
    def test_searching(self):
        results = index.search_by_term("sales")

        calculated = []
        for doc in results:
            calculated.append(doc.get('id'))
Example #13
'''
Created on 22 Jan 2012

@author: george
'''
import datetime, os
from crawlers.CrawlerFactory import CrawlerFactory
from database.model.tweets import EgyptTweet
from analysis.index import Index
from mongoengine import *
from database.warehouse import WarehouseServer

BASE_PATH = os.path.expanduser("~/virtualenvfyp/pythia/data/")
ws = WarehouseServer()
index_path = os.path.join(BASE_PATH, "egypt_index")
if not os.path.exists(index_path):
    try:
        os.makedirs(index_path)
    except os.error:
        raise Exception(index_path + " could not be created.")

#Save the tweets in the db
f = CrawlerFactory()
t = f.get_crawler("topsy")

search_hashtags = "#25jan OR #jan25 OR #egypt OR #tahrir OR #fuckmubarak OR #mubarak \
                   OR #suez OR #DownWithMubarak OR #NOSCAF OR #SCAF OR #cairo"

t.search_for(search_hashtags)
##Last update ended at 2011-01-27 09:00:00
from_date = datetime.datetime(2011, 01, 24, 0, 0, 0)
Example #14
'''
Created on 22 Mar 2012

@author: george

This script allows us to annotate known events with their labels.
'''
import datetime
from database.warehouse import WarehouseServer
from mongoengine import connect
connect("pythia_db")
from evaluation.evaluators import AbstractEvaluator
ws = WarehouseServer()

from_date=datetime.datetime(2011, 01, 25, 12, 0, 0)
to_date=datetime.datetime(2011, 01, 25, 12, 5, 0)
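# Fetch the tweets from this five-minute window and hand them to the evaluator for annotation.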
tweet_list = ws.get_documents_by_date(from_date, to_date)
ce = AbstractEvaluator(tweet_list)
ce.annotate_dataset()
Example #15
'''
Created on 21 Mar 2012

@author: george
'''
import numpy
from database.warehouse import WarehouseServer
from analysis.classification.tree import TreeClassifier
from database.model.agents import TrainingAuthor
from evaluation.evaluators import ClassificationEvaluator

ws = WarehouseServer()
authors = ws.get_all_documents(type=TrainingAuthor)
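# Evaluate a decision-tree classifier over the five author classes,
# presumably with 10-fold cross-validation (K=10).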
ce = ClassificationEvaluator(authors, ["Celebrity", "Media", "Journalists", "Activists", "Commoner"])
metrics = ce.evaluate(classifier=TreeClassifier(), K=10)
print metrics