Пример #1
0
class TweetDumper(object):
    """
    Retrieve the tweets of a single user, one page at a time.

    Every decoded page is handed to the collector method named by
    ``METHOD``. If you need to update a preexisting database just run with
    page=0 and interrupt the script as soon as you see a "Skipping" warning.
    """
    ARGS = ('user', 'page')
    DESC = "Retrieve tweets of <user> starting from <page>"

    # Name of the collector method that receives each decoded page.
    METHOD = 'save_tweet'

    # Timeline endpoint template; ``{:d}`` is filled with the page number.
    # The user selector (user_id / screen_name) is appended in ``dump``.
    URL = "http://api.twitter.com/1/statuses/user_timeline.json?" \
          "&count=200&page={:d}"

    def __init__(self):
        """Set up the endpoint template, the collector and the requester."""
        self.url = self.URL
        self.collector = Collector()
        # NOTE(review): 'proxylist' is presumably the name of a persisted
        # proxy list (the same name is passed to ``save`` in ``dump``) --
        # confirm against Requester.
        self.invoker = Requester('proxylist')

    def dump(self, politician, page=1):
        """Fetch every page of tweets for ``politician`` and store them.

        :param politician: numeric user id (``int`` or digit-only string) or
            a screen name. Digit-only values are sent as ``user_id``,
            anything else as ``screen_name``.
        :param page: first page to request; anything ``int()`` accepts.

        Pages are requested until the service returns an empty collection.
        Whatever happens (including an exception or an interrupt), the
        collector and the requester are saved before returning.
        """
        try:
            page = int(page)
            # Accept ids passed as integers too: ``isdigit`` and the ``{:s}``
            # format below both require a string.
            politician = str(politician)

            if politician.isdigit():
                url = self.url + '&user_id={:s}'
            else:
                url = self.url + '&screen_name={:s}'

            # Loop-invariant lookup of the storage callback.
            store_page = getattr(self.collector, self.METHOD)

            while True:
                print("Retrieving tweets at page {:d}".format(page))

                _, content = self.invoker.request(
                    url.format(page, politician)
                )

                collection = json.loads(content)

                # An empty page means there is nothing left to download.
                if not collection:
                    break

                store_page(collection)

                page += 1
        finally:
            print("Committing changes to the database")
            self.collector.save()
            self.invoker.save('proxylist')
Пример #2
0
    def __init__(self, userlist):
        """Load the pending user ids and open the output dump.

        :param userlist: path of a text file holding one integer user id
            per line. The same path, with ``.json.gz`` appended, is opened
            in append mode as the gzip dump file.
        :raises ValueError: if a non-blank line is not an integer.
        """
        self.filename = userlist

        # Read through a context manager so the handle is closed right away;
        # blank lines are skipped instead of crashing ``int()``.
        with open(userlist) as handle:
            lines = handle.read().splitlines()
        self.userlist = [int(line) for line in lines if line.strip()]

        # Kept as a float so progress ratios use true division.
        self.total = float(len(self.userlist))
        self.current = 0

        self.dumpfile = GzipFile(userlist + ".json.gz", "a")

        self.invoker = Requester()
Пример #3
0
class TweetDumper(object):
    """Dump the tweets and retweets of a list of users into a gzip file.

    The user list is a text file with one integer user id per line. Raw
    JSON responses are appended, one per line, to ``<userlist>.json.gz``.
    The user list file is rewritten with the ids still pending after each
    user, so the file always reflects the work left to do.
    """

    # Endpoint templates; ``{:d}`` is filled with the page number. The user
    # selector (user_id / screen_name) is appended in ``dump``.
    URLS = ("http://api.twitter.com/1/statuses/user_timeline.json?"
            "&count=200&page={:d}",
            "http://api.twitter.com/1/statuses/retweeted_by_user.json"
            "?count=100&page={:d}")

    def __init__(self, userlist):
        """Load the pending user ids and open the output dump.

        :param userlist: path of the text file with one integer id per line.
        :raises ValueError: if a non-blank line is not an integer.
        """
        self.filename = userlist

        # Close the handle deterministically and tolerate blank lines.
        with open(userlist) as handle:
            lines = handle.read().splitlines()
        self.userlist = [int(line) for line in lines if line.strip()]

        # Kept as a float so the progress ratio uses true division.
        self.total = float(len(self.userlist))
        self.current = 0

        self.dumpfile = GzipFile(userlist + ".json.gz", "a")

        self.invoker = Requester()

    def run(self):
        """Dump every pending user, persisting progress along the way."""
        try:
            while self.userlist:
                self.current += 1
                # The id is removed only after its dump completed, so an
                # interrupted user stays at the head of the saved list.
                self.dump(self.userlist[0], 1)
                self.userlist.pop(0)
                self.save_progress()
        finally:
            self.save_progress()
            self.close_dump()

    def close_dump(self):
        """Close the gzip dump file."""
        self.dumpfile.close()

    def save_progress(self):
        """Rewrite the user list file with the ids still to be processed."""
        with open(self.filename, "w") as handle:
            handle.writelines(str(uid) + "\n" for uid in self.userlist)

    def dump(self, politician, page=1):
        """Append every page of tweets and retweets of one user to the dump.

        :param politician: numeric user id or screen name. Digit-only values
            are sent as ``user_id``, anything else as ``screen_name``.
        :param page: first page to request for both endpoints.

        For each endpoint in ``URLS`` pages are requested until an empty
        collection or a 401 status is returned. A response that is not valid
        JSON is reported and the same page is requested again.
        """
        first_page = int(page)
        politician = str(politician)

        msgs = (
            'Retrieving tweets at page {:d} for user {:s}',
            'Retrieving retweets at page {:d} for user {:s}'
        )

        # Report a real percentage (0-100) and do not divide by zero when
        # the user list was empty.
        if self.total:
            progress = 100.0 * self.current / self.total
        else:
            progress = 0.0
        print("Percentage: %.2f" % progress)

        if politician.isdigit():
            selector = '&user_id={:s}'
        else:
            selector = '&screen_name={:s}'

        for base_url, msg in zip(self.URLS, msgs):
            url = base_url + selector
            # Every endpoint starts again from the requested page.
            page = first_page

            while True:
                print(msg.format(page, politician))

                response, content = self.invoker.request(
                    url.format(page, politician)
                )

                # NOTE(review): 401 presumably means the timeline is not
                # accessible (e.g. protected account) -- confirm. Either way
                # this endpoint is abandoned for the user.
                if response['status'] == '401':
                    break

                try:
                    collection = json.loads(content)
                except ValueError:
                    # Only decoding failures trigger a retry of the same
                    # page; interrupts and programming errors propagate.
                    # NOTE(review): the retry is unbounded, as before.
                    print("Error decoding json {}".format(response['status']))
                    continue

                # An empty page means there is nothing left to download.
                if not collection:
                    break

                # The gzip file is binary: always hand it bytes.
                if not isinstance(content, bytes):
                    content = content.encode("utf-8")
                self.dumpfile.write(content + b"\n")

                page += 1
Пример #4
0
 def __init__(self):
     """Prepare the endpoint template and the two collaborators.

     Reads the class-level ``URL`` and instantiates ``Collector`` and
     ``Requester`` in that order.
     """
     # Instance-level copy of the class URL template.
     self.url = self.URL

     # Both right-hand sides are evaluated left to right, so the collector
     # is still created before the requester.
     # NOTE(review): 'proxylist' is presumably the name of a stored proxy
     # list -- confirm against Requester.
     self.collector, self.invoker = Collector(), Requester('proxylist')