Пример #1
0
    def __init__(self, hashtags, session, engine):
        """
        Prepares the listener before it starts receiving statuses.

        hashtags -- hashtags to keep; stored after going through
                    format_hashtags
        session  -- kept on the instance (bridge to the db)
        engine   -- kept on the instance
        """
        StreamListener.__init__(self)

        # Objects handed over by the caller are stored untouched.
        self.engine = engine
        self.session = session  # bridge to the db

        self.cpt = 0   # FIXME: test if useful

        # The encoding helper is created before format_hashtags runs.
        # NOTE(review): presumably format_hashtags relies on self.eu - confirm.
        self.eu = EncodingUtils()
        self.hashtags = self.format_hashtags(hashtags)
Пример #2
0
    def __init__(self, author, created, inserted, source, text):
        """
        Fills in a new tweet from its raw fields.

        author, created, source and text are stored after going through
        EncodingUtils.to_unicode; inserted is stored untouched.
        The tweet starts as neither crawled nor invalid, with an empty
        main hashtag.
        """
        converter = EncodingUtils()  # used to switch to unicode
        self.eu = converter

        # Values stored exactly as given / fixed defaults.
        self.inserted = inserted
        self.crawled = False
        self.invalid = False  # cannot be invalid by default

        # Values stored in unicode form.
        self.author = converter.to_unicode(author)
        self.created = converter.to_unicode(created)
        self.source = converter.to_unicode(source)
        self.hashtag = converter.to_unicode('')
        self.text = converter.to_unicode(text)

        # Must come after self.text has been assigned.
        self.hashtags = self.extract_hashtags()
Пример #3
0
#! /usr/bin/env python
# ! coding=utf-8
# ! author scq000

from pyrailgun import RailGun
import json
import sys
from encodingUtils import EncodingUtils

# Python 2 idiom: re-import sys so setdefaultencoding is reachable, then make
# UTF-8 the interpreter-wide default encoding for implicit str/unicode
# conversions.
reload(sys)
sys.setdefaultencoding('utf8')

encodingUtils = EncodingUtils()
railgun = RailGun(encodingUtils)

# The task description stays open until the crawl has run and is then
# closed, instead of leaking the handle.
with open("sites.json") as task_file:
    railgun.setTask(task_file)
    railgun.fire()
nodes = railgun.getShells('default')

# Fields written one per line ("\r\n" terminated), in this order, for every
# node. 'updateTime' is handled separately because it carries the separator.
LINE_FIELDS = ('name', 'src', 'magnet', 'thunder', 'size')

# A dedicated name is used for the output handle so the builtin is not
# shadowed, and the with-block guarantees the results are flushed and closed.
with open("result.txt", "w+") as result_file:
    for item in nodes:
        node = nodes[item]
        # print node
        # Each field is looked up with a one-element default so a missing
        # key produces an empty line rather than an error.
        for field in LINE_FIELDS:
            result_file.write(node.get(field, [""])[0] + "\r\n")
        result_file.write(
            node.get('updateTime', [""])[0] +
            "\r\n====================================\n")
Пример #4
0
class StreamSaverListener(StreamListener):
    """
    Stream that will save each tweet it receives into a database
    to be reused later
    """
    def __init__(self, hashtags, session, engine):
        """
        hashtags -- tracked hashtags; stored in unicode form
        session  -- bridge to the db, used to add/query/commit
        engine   -- stored on the instance; not used by the methods
                    of this class
        """
        StreamListener.__init__(self)
        # Count of member rows created/updated since the last commit;
        # on_status commits as soon as it is >= 1.
        self.cpt = 0   # FIXME: test if useful
        self.eu = EncodingUtils()

        self.hashtags = self.format_hashtags(hashtags)
        self.session = session  # bridge to the db
        self.engine = engine

    def on_status(self, status):
        """
        Each time a tweet is received: builds a Tweet from the status,
        adds it to the session, updates the member table and commits
        if a member was created or updated.
        """
        tweet = Tweet(status.author.screen_name,
                      status.created_at,
                      datetime.datetime.now(),
                      status.source,
                      status.text)

        tweet.get_main_tag(self.hashtags)  # FIXME: should be part of the init, shouldn t it ?

        # adds current tweet to the tweet table for logging.
        self.session.add(tweet)
        # here i should update members now.
        self.update_members(tweet)

        # cpt is only incremented by create_member/update_member.
        if self.cpt >= 1:
            self.session.commit()  # force saving changes
            self.cpt = 0

    def on_error(self, status_code):
        """
        Reports the error status code and asks to keep streaming.
        """
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('An error has occured! Status code = %s' % status_code)
        return True  # keeps stream alive

    def on_timeout(self):
        """
        Called when the stream times out; only logs.
        """
        print('Snoozing Zzzzzz')

    def on_delete(self, status_id=None, user_id=None):
        """
        Called when a delete notice arrives; always returns False.

        tweepy invokes this hook as on_delete(status_id, user_id), so the
        parameters are accepted (and ignored). They default to None so a
        call without arguments keeps working.
        """
        return False

    def format_hashtags(self, hashs):
        """
        Returns the same list of hashtags in unicode format
        """
        return [self.eu.to_unicode(tag) for tag in hashs]

    def update_members(self, tweet):
        """
        Updates the member table using the last tweet received.
        If Member already exists and has already used the hashtag, its counter will be incremented.
        If member doesnt exist yet for the hashtag, it will be created.
        If several members match, an error is printed and nothing is changed.
        """
        m_query = (self.session.query(Member)
                   .filter(Member.author == tweet.author)
                   .filter(Member.hashtag == tweet.hashtag))

        # Fetch once and reuse the rows instead of querying a second time.
        members = m_query.all()
        if len(members) > 1:
            print("Error: Duplicate members found.")
        elif not members:
            print("No member found, creating")
            self.create_member(tweet)
        else:  # exactly one match
            print("Member found, updating")
            self.update_member(members[0])

    def create_member(self, tweet):
        """
        Creates a new Member using data from the given Tweet
        Called when no Member is found for the current
        author/hashtag couple.

        If the tweet has no author or no hashtag, a message is printed and
        nothing is created (no exception is raised here).
        """
        if tweet.has_author() and tweet.has_hashtag():
            member = Member(tweet.author, tweet.hashtag, 1)
            self.session.add(member)

            self.cpt += 1
        else:
            # FIXME : Take care - unlike update_member this does not raise
            # ElementException.
            print("ElementException :  Cannot create Member, Tweet is not valid !")

    def update_member(self, member):
        """
        Updates member values through member.update() and re-adds the
        member to the session.
        NOTE(review): member.update() is expected to increment the counter
        and refresh the updated field; Member is defined elsewhere - confirm.

        Raises ElementException if the member has no author or no hashtag.
        """
        if member.has_author() and member.has_hashtag():
            member.update()
            self.session.add(member)

            self.cpt += 1
        else:
            print("ElementException :  Cannot update Member, Member is not valid !")
            raise ElementException  # FIXME : Take care
Пример #5
0
class Tweet(Base):
    """
    Class that fully represents a tweet as it is stored in the database.
    It is different from the structure that can be found in tweepy
    """
    __tablename__ = "tweet"
    id = Column(Integer, primary_key=True)
    hashtag = Column(String(200))  # Hashtag that is tracked
    text = Column(String(200))  # Content of the tweet
    author = Column(String(200))  # name of the tweeter
    created = Column(String(200))  # FIXME: Change to date. Date at which message was tweeted
    inserted = Column(DateTime)  # Date at which tweet was saved in db
    crawled = Column(Boolean)  # Boolean whether or not tweet is in statistics already
    source = Column(String(200))  # Where tweet comes from

    # Boolean that is set to True if Tweet cannot be processed correctly
    invalid = Column(Boolean)

    def __init__(self, author, created, inserted, source, text):
        """
        author, created, source and text are converted with
        EncodingUtils.to_unicode before being stored; inserted is stored
        as given. The main hashtag starts empty until get_main_tag is
        called.
        """
        self.eu = EncodingUtils()  # used to switch to unicode

        self.author = self.eu.to_unicode(author)
        self.created = self.eu.to_unicode(created)
        self.crawled = False
        self.inserted = inserted
        self.source = self.eu.to_unicode(source)
        self.hashtag = self.eu.to_unicode('')
        self.text = self.eu.to_unicode(text)

        # Needs self.text, so it has to come after the assignment above.
        self.hashtags = self.extract_hashtags()

        self.invalid = False  # cannot be invalid by default

    def extract_hashtags(self):
        """
        Extracts all the hashtags that are present in the tweet.
        Returns the set of whitespace-separated words of the text that
        start with '#' (the '#' is kept).
        FIXME: Problem here is that we lose lots of tags because they end/start
        with special characters!
        """
        return set(part for part in self.text.split() if part.startswith('#'))

    def get_main_tag(self, trendy):
        """
        Given a list of tracked hashtag, defines the most important one.

        Comparison is case-insensitive. The first hashtag of this tweet
        (in iteration order of self.hashtags, which is a set) that is also
        tracked is stored, lowercased, in self.hashtag. If none matches,
        self.hashtag is left unchanged.
        """
        # A set makes each membership test constant-time.
        tracked = set(tag.lower() for tag in trendy)
        for tag in self.hashtags:
            lowered = tag.lower()
            if lowered in tracked:
                self.hashtag = self.eu.to_unicode(lowered)
                break

    def has_author(self):
        """
        Returns True if author is not empty or null
        """
        # The None check must come first: len(None) would raise TypeError.
        return self.author is not None and len(self.author) != 0

    def has_hashtag(self):
        """
        Returns True if hashtag is not empty or null
        """
        # The None check must come first: len(None) would raise TypeError.
        return self.hashtag is not None and len(self.hashtag) != 0

    def __repr__(self):
        """
        Returns a UTF-8 encoded summary built from author, created,
        hashtag and text, or a fixed fallback string if encoding fails
        with UnicodeDecodeError.
        """
        try:
            return "<%s('%s','%s', '%s')>" % (
                self.author.encode('utf-8'),
                self.created.encode('utf-8'),
                self.hashtag.encode('utf-8'),
                self.text.encode('utf-8'))
        except UnicodeDecodeError:
            return "Contains Unicode!!"