Exemplo n.º 1
0
def load_watchlist():
    """Re-read the watchlist file and refresh the module-level state.

    Every non-blank line of ``args.watchlist`` is stripped of trailing
    whitespace and lower-cased.  A leading ``;`` marks the entry as disabled
    (it is recorded in ``disabled_users``); an entry starting with ``#`` is
    treated as a keyword rather than an account, and an enabled keyword
    sets ``has_keywords``.  Account entries that have no user JSON file yet
    are appended to ``new_accounts`` and get their user directory created.

    When the MD5 of the concatenated entries differs from ``wl_hash``, the
    globals ``watchlist`` and ``wl_hash`` are replaced and ``json_loads()``
    is called.  Finally, when keywords are in use, ``load_user_json`` is
    called for every directory found under ``users``.
    """
    global watchlist, wl_hash, has_keywords
    wl = []
    # Use a context manager so the file handle is closed deterministically.
    with open(args.watchlist, 'r') as handle:
        for line in handle:
            line = line.rstrip().lower()
            disabled = line.startswith(';')
            username = line[1:] if disabled else line
            # Blank lines (or a lone ';') carry no entry; indexing
            # username[0] on them used to raise IndexError.
            if not username:
                continue

            is_keyword = username[0] == '#'
            if disabled:
                disabled_users[username] = True
            elif is_keyword and not has_keywords:
                has_keywords = True

            if not is_keyword and not os.path.exists(
                    paths.get_user_json(username)):
                new_accounts.append(username)
                if not os.path.exists(paths.get_user(username)):
                    retry_makedirs(paths.get_user(username))
            wl.append(username)

    newhash = hashlib.md5(''.join(wl)).hexdigest()
    if newhash != wl_hash:
        print('reloading watchlist')
        wl_hash = newhash
        watchlist = wl
        json_loads()

    # Keyword searches can yield twats from users outside the watchlist, so
    # load whatever user directories exist on disk.
    if has_keywords and os.path.exists('users'):
        for entry in os.listdir('users'):
            if os.path.isdir(os.path.join('users', entry)):
                load_user_json(entry)
Exemplo n.º 2
0
def load_watchlist():
	"""Re-read the watchlist file and refresh the module-level watchlist.

	Every non-blank line of ``args.watchlist`` is stripped of trailing
	whitespace.  A leading ``;`` marks the user as disabled (recorded in
	``disabled_users``).  Users that have no user JSON file yet are appended
	to ``new_accounts`` and get their user directory created.

	When the MD5 of the concatenated user names differs from ``wl_hash``,
	the globals ``watchlist`` and ``wl_hash`` are replaced and
	``json_loads()`` is called.
	"""
	global watchlist, wl_hash
	wl = []
	# Use a context manager so the file handle is closed deterministically.
	with open(args.watchlist, 'r') as handle:
		for line in handle:
			line = line.rstrip()
			if line.startswith(';'):
				username = line[1:]
			else:
				username = line
			# Blank lines (or a lone ';') would otherwise be registered as an
			# account with an empty name and get a directory created.
			if not username:
				continue
			if line.startswith(';'):
				disabled_users[username] = True

			if not os.path.exists(paths.get_user_json(username)):
				new_accounts.append(username)
				if not os.path.exists(paths.get_user(username)):
					retry_makedirs(paths.get_user(username))
			wl.append(username)

	newhash = hashlib.md5(''.join(wl)).hexdigest()
	if newhash != wl_hash:
		print('reloading watchlist')
		wl_hash = newhash
		watchlist = wl
		json_loads()
Exemplo n.º 3
0
def scrape(user, http, host):
	"""Fetch twats for ``user`` and store the ones not already known.

	For a user listed in ``new_accounts`` up to ``args.count`` twats are
	requested without a check callback and the user is removed from that
	list; otherwise ``count`` is -1 and ``fetch_more_tweets_callback`` is
	passed as ``checkfn``.

	Each twat not yet in the twat list is optionally passed through
	``unshorten_urls``, inserted in fetch order, has profile pictures
	fetched for its ``user`` and quoted user, and is optionally mirrored.
	Progress is written to stdout.  The user's twats are written out only
	when at least one new twat was added.

	Returns the ``(http, host)`` pair handed back by ``get_twats``.
	"""
	global nitters

	if user in new_accounts:
		count = args.count
		checkfn = None
		new_accounts.remove(user)
	else:
		checkfn = fetch_more_tweets_callback
		count = -1

	started = time.time()
	insert_pos = 0
	sys.stdout.write('\r[%s] scraping %s... ' % (get_timestamp("%Y-%m-%d %H:%M:%S", started), user))
	sys.stdout.flush()

	twats, nitters, host, http = get_twats(user, proxies=args.proxy, count=count, http=http, checkfn=checkfn, nitters=nitters, host=host)

	new = False
	for t in twats:
		if in_twatlist(user, t):
			continue

		new = True
		if args.unshorten: t = unshorten_urls(t, proxies=args.proxy, shorteners=shorteners)
		add_twatlist(user, t, insert_pos)
		insert_pos += 1

		if 'quote_tweet' in t:
			# The key must be the string 'quote_tweet'; the bare name
			# quote_tweet used previously is undefined and raised NameError.
			quoted_user = t['quote_tweet']['user']
			if not os.path.isdir(paths.get_user(quoted_user)): retry_makedirs(paths.get_user(quoted_user))
			fetch_profile_picture(quoted_user, args.proxy, twhttp=nitter_rshttp, nitters=nitters)
		if 'user' in t:
			if not os.path.isdir(paths.get_user(t['user'])): retry_makedirs(paths.get_user(t['user']))
			fetch_profile_picture(t['user'], args.proxy, twhttp=nitter_rshttp, nitters=nitters)

		if args.mirror: mirror_twat(t, args=args)
		sys.stdout.write('\r[%s] scraping %s... +%d ' % (get_timestamp("%Y-%m-%d %H:%M:%S", started), user, insert_pos))
		sys.stdout.flush()

	if new: write_user_tweets(user)
	elapsed_time = (time.time() - started)
	sys.stdout.write('done (%s)\n' % get_timestamp("%H:%M:%S", elapsed_time))
	sys.stdout.flush()
	return http, host
Exemplo n.º 4
0
def htmlize_twat(twat, variables, quoted=False):
	"""Render one twat dict as an HTML fragment and return it as a string.

	Parameters:
	  twat      -- dict describing the twat.  Keys read here: 'owner',
	               'user', 'id', 'time', 'text', and optionally 'rid'
	               (presence marks a retweet), 'curl', 'images', 'video'
	               and 'quote'.
	  variables -- only forwarded to build_iconbar().
	  quoted    -- forwarded to build_iconbar(); set to True for the
	               recursive call that renders a quoted twat.

	Side effect: twat['text'] is overwritten with the result of
	replace_url_in_twat() (and strip_tags() when args.nohtml is set).
	"""
	tw = '<div class="twat-container">'
	tweet_pic = None
	retweet_pic = None

	if not 'rid' in twat:
		retweet_str = ""
		if paths.has_profile_pic(twat['owner']): tweet_pic = paths.get_profile_pic(twat['owner'])

	else:
		## retweet: main picture is the original author, small one the owner
		if paths.has_profile_pic(twat['user']): tweet_pic = paths.get_profile_pic(twat['user'])
		else: tweet_pic = ""

		if paths.has_profile_pic(twat['owner']): retweet_pic = paths.get_profile_pic(twat['owner'])

		retweet_str = " (RT %s<a target='_blank' href='https://twitter.com/%s/status/%s'>%s</a>)" % \
		(user_at_link(twat['user']), twat['user'], twat['id'], twat['user'])

	if tweet_pic: tw += '<div class="profile_picture"><img width="100%%" height="100%%" src="%s"></div>' % tweet_pic
	if retweet_pic: tw += '<div class="profile_picture_retweet"><img width="100%%" height="100%%" src="%s"></div>' % retweet_pic

	user_str =  user_at_link(twat["owner"])
	user_str += "<a target='_blank' href='https://twitter.com/%s/status/%s'>%s</a>%s" % \
	(twat["owner"], get_effective_twat_id(twat), twat["owner"], retweet_str)


	tw += '\n<div class="twat-title">'

	## add icon bar
	if args.iconbar: tw += build_iconbar(twat, variables, quoted)

	## a time of 0 is what the quoted pseudo-twat built below carries
	time_str = 'unknown' if twat["time"] == 0 else format_time(twat["time"])
	tw += '%s&nbsp;-&nbsp;%s' % (user_str, time_str)

	tw += '\n</div>\n'

	## replace urls in twats
	twat['text'] = replace_url_in_twat(twat, args=args)
	## strip html ?
	if args.nohtml: twat['text']= strip_tags(twat['text'])

	tw += '<p class="twat-text">%s</p>\n' % (replace_twat_text(twat['text']))

	if 'curl' in twat and args.iframe > 0:
		user = twat['user'].lower()
		ifu = paths.get_user(user) + '/%s-%s' % (twat['id'], "card.html")
		## fall back to the remote card unless cards are mirrored and the
		## local copy exists; the extra truthiness test keeps an unset
		## (None) args.mirror from raising TypeError on the `in` test
		if (not args.mirror) or (not 'c' in args.mirror) or (not file_exists(ifu)):
			ifu = twat['curl']
		tw += '<span class="twat-iframe"><iframe src="%s"></iframe></span>\n'%ifu

	if 'images' in twat:
		tw += '<p class="twat-image">'
		## share the row between images; floor division keeps the value
		## an int regardless of the interpreter's `/` semantics
		if len(twat['images']) > 1: wdth = (100 // len(twat['images'])) - 1
		else: wdth = 100

		for i in twat['images']:
			if args.images <= 0:
				tw += '<a href="%s">%s</a>'%(i, i)
			else:
				img_path = paths.get_user(twat['user']) + "/%s-%s" % (twat['id'], i.split('/')[-1])
				if not file_exists(img_path): img_path = i
				span_or_div = "span"
				img_class = "img"
				div_class = ""
				if args.upstream_img:
					href = i
					title = "view remote image"
				elif 'video' in twat or 'ext_tw_video_thumb' in i:
					mp4_path = paths.get_user(twat['user']) + '/%s.mp4' % str(twat['id'])
					if os.path.exists(mp4_path):
						href = mp4_path
						title = "view local video"
					else:
						## str() as for mp4_path above: plain concatenation
						## raised TypeError when the id is not a string
						href = "https://twitter.com/i/status/" + str(twat['id'])
						title = "view remote video"
					img_class = ""
					div_class = "video-thumbnail"
					span_or_div = "div"
				else:
					href = img_path
					title = "view local image"
				tw += '<a href="%s" title="%s"><%s class="%s"><img class="%s" src="%s" width="%d%%"></%s></a>' % (href, title, span_or_div, div_class, img_class, img_path, wdth, span_or_div)

		tw += '</p>\n'

	if 'quote' in twat:
		## render the quoted twat through the same code path
		pseudo_twat = {
			'user' : twat['quote']['user'],
			'owner' : twat['quote']['user'],
			'id' : twat['quote']['id'],
			'text' : twat['quote']['text'],
			'time' : 0
		}
		tw += htmlize_twat(pseudo_twat, variables, quoted=True)

	tw += '</div>\n'

	return tw
Exemplo n.º 5
0
def scrape(item, http, host, search, user_agent):
    """Fetch twats or toots for ``item`` and store the ones not yet known.

    ``item`` is lower-cased first.  An item starting with ``#`` is a
    keyword: the owning user is then taken from each result's ``'user'``
    field.  Items containing fewer than two ``@`` are fetched with
    ``get_twats`` (platform 'twitter'); all others with ``get_toots``
    (platform 'mastodon'), whose connection is cached in
    ``mastodon_rshttp[host]``.

    Each result not yet in the twat list is optionally passed through
    ``unshorten_urls``, inserted at a per-user position, has profile
    pictures fetched for its user and quoted user, and is optionally
    mirrored.  Progress is written to stdout.  User twats are written out
    only when something new was added.

    Returns ``(http, host)``.
    """
    global nitters
    global mastodon_rshttp
    item = item.lower()
    is_keyword = item[0] == '#'

    if item in new_accounts:
        count = args.count
        checkfn = None
        new_accounts.remove(item)
    else:
        checkfn = fetch_more_tweets_callback
        count = args.count if is_keyword else -1

    if item.count('@') < 2:
        platform = 'twitter'
        # NOTE(review): this branch passes bare blacklist/whitelist names
        # while the mastodon branch passes args.blacklist/args.whitelist.
        twats, nitters, host, http, page = get_twats(item,
                                                     proxies=args.proxy,
                                                     count=count,
                                                     http=http,
                                                     checkfn=checkfn,
                                                     nitters=nitters,
                                                     host=host,
                                                     search=search,
                                                     user_agent=user_agent,
                                                     blacklist=blacklist,
                                                     whitelist=whitelist)
    else:
        platform = 'mastodon'
        twats, http = get_toots(item,
                                proxies=args.proxy,
                                count=count,
                                http=http,
                                checkfn=checkfn,
                                user_agent=user_agent,
                                blacklist=args.blacklist,
                                whitelist=args.whitelist)
        mastodon_rshttp[host] = http
        # get_toots() returns no page count, yet the progress line below
        # formats one; leaving `page` unbound raised NameError here.
        # NOTE(review): 1 is a placeholder value - confirm what to report.
        page = 1

    insert_pos = dict()
    new = False
    user = None if is_keyword else item
    insert_pos_total = 0
    elapsed_time = time.time()
    for t in twats:
        if is_keyword: user = t['user'].lower()
        if not user in insert_pos: insert_pos[user] = 0

        if in_twatlist(user, t):
            continue

        new = True
        if args.unshorten:
            t = unshorten_urls(t,
                               proxies=args.proxy,
                               shorteners=shorteners)
        add_twatlist(user, t, insert_pos[user])
        insert_pos[user] += 1
        insert_pos_total += 1

        if 'quote_tweet' in t:
            # The key must be the string 'quote_tweet'; the bare name
            # quote_tweet used previously is undefined (NameError).
            quoted_user = t['quote_tweet']['user']
            if '@' in quoted_user:
                # Rebinds `http`, which is also the value returned below.
                _, _name, instance = quoted_user.split('@')
                http = None if not instance in mastodon_rshttp else mastodon_rshttp[
                    instance]

            if not os.path.isdir(paths.get_user(quoted_user)):
                retry_makedirs(paths.get_user(quoted_user))
            fetch_profile_picture(quoted_user,
                                  args.proxy,
                                  twhttp=http,
                                  nitters=nitters,
                                  platform=platform)
        if 'user' in t:
            if '@' in t['user']:
                # Rebinds `http`, which is also the value returned below.
                _, _name, instance = t['user'].split('@')
                http = None if not instance in mastodon_rshttp else mastodon_rshttp[
                    instance]

            if not os.path.isdir(paths.get_user(t['user'])):
                retry_makedirs(paths.get_user(t['user']))
            fetch_profile_picture(t['user'],
                                  args.proxy,
                                  twhttp=http,
                                  nitters=nitters,
                                  platform=platform)
        if args.mirror: mirror_twat(t, args=args)
        sys.stdout.write(
            '\r[%s] %s: extracting from %d page(s): +%d twat(s)' %
            (misc.get_timestamp("%Y-%m-%d %H:%M:%S", elapsed_time), item,
             page, insert_pos_total))
        sys.stdout.flush()

    if new:
        if is_keyword:
            # A keyword search may have added twats for several users.
            for user in insert_pos.keys():
                write_user_tweets(user)
        else:
            write_user_tweets(item)
    elapsed_time = (time.time() - elapsed_time)
    sys.stdout.write('done (%s)\n' %
                     misc.get_timestamp("%H:%M:%S", elapsed_time))
    sys.stdout.flush()
    return http, host
Exemplo n.º 6
0
def _mirror_file(url_components,
                 user,
                 tid,
                 args=None,
                 content_type=None,
                 force=False):
    """Download one URL into ``data/`` and symlink it into the user's folder.

    Parameters:
      url_components -- dict with the keys 'host', 'ssl', 'port', 'uri' and
                        'filename' describing the remote file.
      user, tid      -- used to build the link name
                        ``<user dir>/<tid>-<filename>``.
      args           -- options object; ``proxy`` is always read, ``ext``
                        and ``mirror_size`` only when ``content_type`` is set.
      content_type   -- when truthy, a HEAD request is made first and the
                        download is skipped if the size or type is rejected.
      force          -- when true, re-download even if the link name exists.

    The body is stored as ``data/<hash>.<ext>`` (hash from ``_hash``) and
    the link name is (re)created as a relative symlink to it.  Returns None
    in every case.
    """

    def open_connection():
        # Each request below gets its own RsHttp object.
        return RsHttp(url_components['host'],
                      ssl=url_components['ssl'],
                      port=url_components['port'],
                      keep_alive=True,
                      follow_redirects=True,
                      auto_set_cookies=True,
                      proxies=args.proxy,
                      user_agent="curl/7.60.0")

    def header_values(raw_headers, prefix):
        # Second ':'-separated field of every header line starting with
        # `prefix` (matched case-insensitively).
        return [
            str(line.split(':')[1]).strip()
            for line in raw_headers.split('\n')
            if line.lower().startswith(prefix)
        ]

    outname = paths.get_user(user) + '/%s-%s' % (tid,
                                                 url_components['filename'])
    if not force and os.path.exists(outname):
        return

    http = open_connection()

    ## do nothing if we cannot connect
    if not http.connect():
        return None

    ext = url_components['filename'].split('.')[-1]

    if content_type:
        wanted = str(args.ext).split(',') if args.ext else []

        hdr = http.head(url_components['uri'])

        ## max mirror size: skip when unknown or above the limit
        if args.mirror_size:
            lengths = header_values(hdr, 'content-length:')
            if not lengths or int(lengths[0]) > args.mirror_size:
                return

        ## server does not provide Content-Type info
        types = header_values(hdr, 'content-type:')
        if not types:
            return

        ## drop any ';' parameters, then split into type / subtype
        mime = types[0].split(';')[0].split('/')

        ## when filtering extensions (--ext)
        ## if unset, everything is mirrored
        if wanted:
            if len(mime) < 2:
                return
            if mime[0] not in wanted and mime[1] not in wanted:
                return

        # XXX : mirror html files
        ## we actually don't save html files
        ## what about making automated save
        ## thru the wayback machine ?
        if 'html' in mime:
            return

        ## previous http object cannot be re-used
        http = open_connection()

        ## do nothing if we cannot connect
        if not http.connect():
            return

    extras = []
    is_card = url_components['filename'] == 'card.html'
    if is_card and 'twitter.com' in url_components['host']:
        extras.append("Referer: https://twitter.com/")

    hdr, res = http.get(url_components['uri'], extras=extras)
    if res == '' and hdr != "":
        # print http error code when things go wrong
        status_line = hdr.split('\n')[0]
        print("%s%s : %s" % (url_components['host'], url_components['uri'],
                             status_line))
        return

    payload = res.encode('utf-8') if isinstance(res, unicode) else res
    digest = _hash(payload)
    stored = 'data/%s.%s' % (digest, ext)
    if not os.path.exists(stored):
        retry_write(stored, payload)

    ## lexists also sees dangling links, which must go before re-linking
    if os.path.lexists(outname):
        os.unlink(outname)
    os.symlink('../../data/%s.%s' % (digest, ext), outname)
Exemplo n.º 7
0
def _download_video(args, url, destination):
    """Run the downloader named by ``args.ytdl`` to save ``url`` at ``destination``.

    The command is passed to subprocess as an argument list, so the
    scraped url is never interpreted by a shell.  Output is discarded.
    """
    import shlex
    import subprocess

    cmd = shlex.split(args.ytdl)
    if args.proxy:
        cmd += ['--proxy', '%s' % args.rawproxy]
    cmd += ['-o', destination, url]
    try:
        with open(os.devnull, 'w') as devnull:
            subprocess.call(cmd, stdout=devnull, stderr=devnull)
    except OSError:
        # Best effort, as before: a missing downloader must not abort
        # mirroring; the caller checks whether the file appeared.
        pass


def mirror_twat(twat, args=None):
    """Mirror the resources referenced by ``twat`` according to ``args.mirror``.

    ``args.mirror`` is tested for these flags:
      'c' -- the card at twat['curl'] (saved under the name card.html)
      'f' -- files behind links carrying a data-expanded-url attribute,
             filtered by content type
      'v' -- the video, downloaded with the ``args.ytdl`` command
      'i' -- the images listed in twat['images']
      'e' -- emoji images (img tags whose class contains 'Emoji')

    Files are linked into the directory of twat['owner'] when that key is
    present, otherwise of twat['user'] (lower-cased in both cases); the
    video link goes into the directory of twat['user'].
    """
    if 'owner' in twat:
        user = twat['owner'].lower()
    else:
        user = twat['user'].lower()

    if not os.path.isdir('data'): retry_makedirs('data')

    ## soupify user's text
    soup = soupify(twat["text"])

    ## try to automatically mirror links posted by the user,
    ## if it matches the extension list.

    if 'c' in args.mirror and 'curl' in twat:
        url = twat['curl']
        # XXX: unsupported nitter feature
        # this displays fine when loading from twitter in a regular browser,
        # which is probably converted using some js code
        # TODO: check if nitter handles card:// stuff..
        unsupported_schemes = ['card://']
        # Mirror once, and only if no unsupported scheme matches.  The old
        # per-scheme loop would have mirrored once per non-matching scheme.
        if not any(url.startswith(scheme) for scheme in unsupported_schemes):
            url_components = _split_url(url)
            url_components['filename'] = 'card.html'  #% twat['id']
            _mirror_file(url_components, user, twat['id'], args)

    if 'f' in args.mirror:
        for a in soup.body.find_all('a'):
            if 'data-expanded-url' in a.attrs:
                url_components = _split_url(a.attrs['data-expanded-url'])

                if 'filename' in url_components:
                    _mirror_file(url_components,
                                 user,
                                 twat['id'],
                                 args,
                                 content_type=True)

    ## mirror videos
    if 'v' in args.mirror and 'video' in twat:
        tid = str(twat['id'])
        url = 'https://twitter.com/%s/status/%s' % (twat['user'], tid)
        outname = paths.get_user(twat['user']) + '/%s.mp4' % tid
        video_path = 'data/%s.mp4' % tid
        if not os.path.exists(video_path):
            _download_video(args, url, video_path)
        if not os.path.exists(outname) and os.path.exists(video_path):
            # exists() is False for a dangling link, which would make
            # symlink() fail; remove it first, as _mirror_file does.
            if os.path.lexists(outname): os.unlink(outname)
            os.symlink('../../data/%s.mp4' % tid, outname)

    ## mirror posted pictures
    if 'images' in twat and 'i' in args.mirror:

        for image_url in twat['images']:

            ## rewrite "<path>?format=<fmt>&..." into "<path>.<fmt>"
            if '?format=' in image_url:
                image_url = image_url.split('&')[0]
                fmt = image_url.split('=')[1]
                image_url = '%s.%s' % (image_url.split('?')[0], fmt)

            url_components = _split_url(image_url)
            if 'filename' in url_components:
                _mirror_file(url_components, user, twat['id'], args)

    ## deal with emojis
    if 'e' in args.mirror:
        # Bounded so an unreachable host cannot spin this loop forever.
        max_connect_attempts = 5
        for img in soup.body.find_all('img'):
            if 'class' in img.attrs and 'Emoji' in img.attrs['class']:
                src = img.attrs['src']
                src = src.encode('utf-8') if isinstance(src, unicode) else src

                ## src is split as scheme://host/dir.../filename
                split = src.split('/')
                host = split[2]
                emodir = '/'.join(split[3:len(split) - 1])
                filename = split[-1]
                uri = '%s/%s' % (emodir, filename)

                if not os.path.isdir(emodir):
                    retry_makedirs(emodir)

                if not os.path.exists('%s/%s' % (emodir, filename)):
                    http = RsHttp(host=host,
                                  port=443,
                                  timeout=30,
                                  ssl=True,
                                  keep_alive=True,
                                  follow_redirects=True,
                                  auto_set_cookies=True,
                                  proxies=args.proxy,
                                  user_agent="curl/7.60.0")
                    connected = False
                    for _attempt in range(max_connect_attempts):
                        if http.connect():
                            connected = True
                            break
                    if not connected:
                        ## give up on this emoji rather than hang
                        continue
                    hdr, res = http.get('/%s' % uri)
                    res = res.encode('utf-8') if isinstance(res,
                                                            unicode) else res
                    retry_write('%s/%s' % (emodir, filename), res)