Exemplo n.º 1
0
    def handle(self, *args, **kwargs):
        """Attach resolved URLs to the tweets at offsets ``[low, high)``.

        Expects two positional arguments, ``low`` and ``high``, both
        convertible to ``int``.  They are used as slice offsets into
        ``Tweet.objects.all()``.  If either is missing or is not an integer,
        ``self.args`` is printed and the method returns without doing any
        work.

        Tweets are processed in chunks of 1000.  For every ``link_regex``
        match in a tweet's text, the short URL is looked up in ``ShortUrl``;
        on a miss it is expanded with ``get_long_url`` and recorded as
        ``CanonicalUrl`` and ``ShortUrl`` rows.  The resulting URL object is
        then added to ``tweet.urls``.
        """
        try:
            low = int(args[0])
            high = int(args[1])
        except (ValueError, IndexError):
            print(self.args)
            return

        chunk = 1000
        flags = re.I | re.DOTALL

        for start in range(low, high, chunk):
            # Clamp the final chunk: without this the slice would run up to
            # ``chunk - 1`` rows past ``high``.
            stop = min(start + chunk, high)
            percent_done = (float(start - low) / (high - low)) * 100
            print("%s / %s %s" % (start, high, percent_done))

            # NOTE(review): offset slicing is only stable if the queryset has
            # a deterministic ordering -- confirm Tweet defines one.
            for tweet in Tweet.objects.all()[start:stop]:
                for match in re.finditer(link_regex, tweet.text, flags):
                    # Keep only the part of the captured URL before any "&".
                    short_url = match.group(2).split("&")[0]
                    try:
                        url = ShortUrl.objects.get(short=short_url).url
                    except ShortUrl.DoesNotExist:
                        # Unknown short URL: expand it and store the mapping.
                        url, _created = CanonicalUrl.objects.get_or_create(
                            url=get_long_url(short_url))
                        ShortUrl.objects.get_or_create(
                            short=short_url,
                            url=url,
                        )
                    tweet.urls.add(url)
Exemplo n.º 2
0
def parse_tweet(tweet_id):
    """Print ``tweet_id`` and return; the link-resolution body is disabled.

    NOTE(review): the ``return`` directly after the ``print`` makes
    everything below it unreachable.  It looks like a debugging stub, and it
    is kept because removing it would start network and database work that
    is currently switched off -- confirm before deleting it.

    When enabled, the body loads the tweet with primary key ``tweet_id``,
    finds every ``link_regex`` match in its text, resolves each short URL
    (from ``ShortUrl``, or via ``get_long_url`` on a miss) and adds the
    resulting URL object to ``tweet.urls``.
    """
    print(tweet_id)
    return
    tweet = Tweet.objects.get(pk=tweet_id)
    for match in re.finditer(link_regex, tweet.text, re.I | re.DOTALL):
        # Keep only the part of the captured URL before any "&".
        short_url = match.group(2).split("&")[0]
        # NOTE(review): the original comment here read "Retry transaction",
        # but no retry is implemented.
        try:
            url = ShortUrl.objects.get(short=short_url).url
        except ShortUrl.DoesNotExist:
            # This is blocking and long (waiting for network).
            long_url = get_long_url(short_url)
            url, _created = CanonicalUrl.objects.get_or_create(url=long_url)
            # Use get_or_create because another thread might have created it
            # while we were getting the long url.
            ShortUrl.objects.get_or_create(short=short_url, url=url)
        # Deliberately outside the try statement.  In a ``finally`` clause
        # this ran even when resolution failed, using an unbound ``url``
        # (NameError hiding the real error) or the previous match's URL.
        tweet.urls.add(url)
Exemplo n.º 3
0
def parse_tweet(tweet_id):
    """Print ``tweet_id`` and return; the link-resolution body is disabled.

    NOTE(review): the ``return`` that follows the ``print`` leaves the rest
    of the function unreachable.  It appears to be a debugging stub and is
    kept as-is, since dropping it would turn on network and database work
    that does not run today -- confirm before removing it.

    When enabled, the body fetches the tweet whose primary key is
    ``tweet_id``, scans its text with ``link_regex``, maps each short URL to
    a URL object (via ``ShortUrl``, falling back to ``get_long_url``) and
    adds that object to ``tweet.urls``.
    """
    print(tweet_id)
    return
    tweet = Tweet.objects.get(pk=tweet_id)
    results = re.finditer(link_regex, tweet.text, re.I | re.DOTALL)
    for match in results:
        # Keep only the part of the captured URL before any "&".
        short_url = match.group(2).split("&")[0]
        # NOTE(review): the original comment here read "Retry transaction",
        # but no retry is implemented.
        try:
            url = ShortUrl.objects.get(short=short_url).url
        except ShortUrl.DoesNotExist:
            # This is blocking and long (waiting for network).
            long_url = get_long_url(short_url)
            url, _created = CanonicalUrl.objects.get_or_create(
                url=long_url
            )
            # Use get_or_create because another thread might have created it
            # while we were getting the long url.
            ShortUrl.objects.get_or_create(
                short=short_url,
                url=url
            )
        # Only reached when ``url`` was resolved.  This used to live in a
        # ``finally`` clause, which also ran on failure and then either
        # raised NameError or attached the previous match's URL.
        tweet.urls.add(url)
Exemplo n.º 4
0
    def handle(self, *args, **kwargs):
        """Resolve links for the tweets at offsets ``[low, high)``.

        ``args[0]`` and ``args[1]`` are parsed as the integers ``low`` and
        ``high`` and used as slice offsets into ``Tweet.objects.all()``.
        When either is absent or not an integer, ``self.args`` is printed
        and the method returns early.

        Work proceeds in chunks of 1000 tweets, printing a progress line per
        chunk.  Each ``link_regex`` match in a tweet's text yields a short
        URL, which is resolved through ``ShortUrl`` or, if unknown, expanded
        with ``get_long_url`` and saved as ``CanonicalUrl`` / ``ShortUrl``
        rows.  The resolved URL object is added to ``tweet.urls``.
        """
        try:
            low = int(args[0])
            high = int(args[1])
        except (ValueError, IndexError):
            print(self.args)
            return

        chunk = 1000
        span = high - low
        regex_flags = re.I | re.DOTALL

        for offset in range(low, high, chunk):
            print("%s / %s %s" % (offset, high, (float(offset - low) / span) * 100))

            # Never slice past ``high``: the unclamped ``offset + chunk``
            # pulled in rows outside the requested range on the last chunk.
            upper = min(offset + chunk, high)

            # NOTE(review): offset slicing assumes the queryset ordering is
            # deterministic between queries -- confirm Tweet defines one.
            for tweet in Tweet.objects.all()[offset:upper]:
                for match in re.finditer(link_regex, tweet.text, regex_flags):
                    # Keep only the part of the captured URL before any "&".
                    short_url = match.group(2).split("&")[0]
                    try:
                        url = ShortUrl.objects.get(short=short_url).url
                    except ShortUrl.DoesNotExist:
                        # Unknown short URL: expand it and store the mapping.
                        long_url = get_long_url(short_url)
                        url, _created = CanonicalUrl.objects.get_or_create(url=long_url)
                        ShortUrl.objects.get_or_create(short=short_url, url=url)
                    tweet.urls.add(url)