예제 #1
 def process_request(self, request):
     """Redirect requests made on a ``www`` or non-slug subdomain.

     The subdomain is obtained from ``get_subdomain(request)``:

     * ``'www'`` redirects to the bare site domain, keeping ``request.path``;
     * a non-empty subdomain that differs from its slugified, lower-cased
       form redirects to the root of the slugified subdomain;
     * anything else returns ``None``.

     NOTE(review): ``current_site`` is not bound inside this method; it is
     presumably defined at module level -- confirm.
     """
     sub = get_subdomain(request)

     # No subdomain at all: nothing to normalise.
     if sub == '':
         return None

     if sub == 'www':
         target = 'http://%s%s' % (current_site.domain, request.path)
         return HttpResponseRedirect(target)

     slug = slugify(ToUnicode(sub).lower())
     if slug == sub:
         # Already in slug form.
         return None

     target = 'http://%s.%s/' % (slug, current_site.domain)
     return HttpResponseRedirect(target)
예제 #2
def _write_submission_chunk(data_buffer):
	"""Write a list of ``(path, data)`` records to the per-path dump files.

	``data`` maps a file type ('submissions', 'comments', 'urls', 'xposts',
	'subdomains') to an already encoded line. Each type is written to
	``<type>.txt`` under ``path`` via ``open_file(path, name)``.

	Records are grouped by path first so that every file is opened exactly
	once per call and closed as soon as its lines are written, which keeps
	a single file handle open at any time.

	NOTE(review): assumes ``open_file`` opens for appending, since the same
	path can show up in several chunks -- confirm.
	"""
	by_path = {}
	for path, data in data_buffer:
		by_path.setdefault(path, []).append(data)

	for path, records in by_path.items():
		for fh_type in ('submissions', 'comments', 'urls', 'xposts', 'subdomains'):
			lines = [record[fh_type] for record in records if fh_type in record]
			if not lines:
				# Do not create a file that would stay empty.
				continue
			fh = open_file(path, fh_type + '.txt')
			try:
				for line in lines:
					fh.write(line)
			finally:
				fh.close()


def dump_submission_info(rdb):
	"""Dump every submission in ``rdb`` (with its comments) to text files.

	For each submission returned by ``rdb.submission_list()`` a directory
	path ``<first letter>/<subreddit>/<year>/<month>/<day>`` is derived and
	tab-separated, UTF-8 encoded lines are produced for the submission, its
	comments (grouped by "layer", layers separated by commas), its URL, any
	cross-post found in the title and the URL's subdomain. Lines are
	buffered and written every ``chunk_size`` submissions; whatever is left
	in the buffer is written once the iteration ends. Progress is printed
	every ``log_interval`` submissions.

	Submissions whose "created" value is neither a float timestamp nor a
	``datetime`` are reported and skipped (they are not counted in the
	progress figures), because no output path can be derived for them.

	Relies on the module-level names ``chunk_size``, ``log_interval``,
	``open_file`` and ``utils``.
	"""
	start = time.time()

	# Totals are only used for the progress report.
	num_submissions = rdb.num_submissions()
	num_comments = rdb.num_comments()
	counter = 0
	comment_counter = 0
	num_dumps = 0
	num_urls_found = 0
	num_xposts_found = 0
	num_subdomains_found = 0

	# Will store chunks of data to write and cleared at certain intervals
	# to avoid too much I/O.
	data_buffer = []

	# Collect submissions
	for submission in rdb.submission_list():
		# Get year, month, day
		tmstp = submission.get("created")
		if isinstance(tmstp, float):
			date = datetime.fromtimestamp(tmstp)
		elif isinstance(tmstp, datetime):
			date = tmstp
		else:
			# Without a usable date there is no directory to write to.
			print("WARNING::::different date format found %s %s" % (type(tmstp), tmstp))
			continue

		subreddit = rdb.submission_belongs_to(submission).get("subreddit_name")
		path = os.path.join(subreddit[0].lower(), subreddit,
							str(date.year), str(date.month), str(date.day))

		# Extract relevant pieces of information
		submission_id = str(submission.get("_id"))
		submission_title = submission.get("submission_title") or ''
		submission_text = submission.get("submission_text") or ''
		submission_prawid = submission.get("praw_id") or '-1'

		# Create information to write to file
		submission_info = submission_id + "\t" + submission_prawid + "\t" + submission_title + " " + submission_text
		url_info = submission.get("url") or ''
		xpost_info = utils.extract_subreddit_xpost(submission_title)
		subdomain_info = utils.get_subdomain(url_info)

		# Get submission's comments, concatenated per layer. Commas inside a
		# comment become spaces because the comma separates layers in the
		# output line.
		layers = {}
		for comment in rdb.comment_list(submission.get("_id")):
			layer = comment.get("layer")
			comment_str = ((comment.get("comment_text") or '') + " ").replace(',', ' ')
			layers[layer] = layers.get(layer, '') + comment_str
			comment_counter += 1
		comments_info = ','.join(layers.values())

		# Create data object and store in buffer
		data = {'submissions': (submission_info + '\n').encode("utf8")}
		if comments_info:
			data['comments'] = (submission_id + '\t' + comments_info + '\n').encode("utf8")
		if url_info:
			data['urls'] = (submission_id + '\t' + url_info + '\n').encode("utf8")
			num_urls_found += 1
		if xpost_info:
			data['xposts'] = (submission_id + '\t' + xpost_info + '\n').encode("utf8")
			num_xposts_found += 1
		if subdomain_info:
			data['subdomains'] = (submission_id + '\t' + subdomain_info + '\n').encode("utf8")
			num_subdomains_found += 1

		data_buffer.append((path, data))

		# Write data and clear buffer every <chunk_size> submissions
		if len(data_buffer) >= chunk_size:
			_write_submission_chunk(data_buffer)
			data_buffer = []
			num_dumps += 1

		if counter % log_interval == 0:
			# Guard the ratios: a database can hold submissions but no comments.
			submission_pct = (counter / float(num_submissions)) * 100 if num_submissions else 0.0
			comment_pct = (comment_counter / float(num_comments)) * 100 if num_comments else 0.0
			print("Progress: %s submissions dumped out of %s %s %%" % (counter, num_submissions, submission_pct))
			print("\t%s comments dumped out of %s %s %%" % (comment_counter, num_comments, comment_pct))
			print("\tNumber of URLs found so far: %s" % (num_urls_found,))
			print("\tNumber of xposts found so far: %s" % (num_xposts_found,))
			print("\tNumber of subdomains found so far: %s" % (num_subdomains_found,))
			print("\tNumber of dumps to file so far: %s" % (num_dumps,))
			print("\tTime spent: %s minutes" % ((time.time() - start) / 60.0,))
			print("")
		counter += 1

	# Flush the tail of the buffer that never reached <chunk_size>.
	if data_buffer:
		_write_submission_chunk(data_buffer)
예제 #3
def _write_wayback_chunk(data_buffer, comment_buffer, comment_file):
	"""Write buffered wayback records to disk.

	``data_buffer`` maps a directory path to a list of record dicts whose
	keys are file types ('submissions', 'urls', 'xposts', 'subdomains',
	'domains') and whose values are already encoded lines. For every path
	each ``<type>.txt`` file is opened through ``open_file(path, name)``,
	filled with the lines of that type and closed again, so only one dump
	file is open at a time.

	``comment_buffer`` is a list of ``(path, reddit_id)`` pairs; each is
	written to ``comment_file`` as ``path<TAB>reddit_id``.

	NOTE(review): assumes ``open_file`` opens for appending, since the same
	path can show up in several chunks -- confirm.
	"""
	for path, records in data_buffer.items():
		for fh_type in ('submissions', 'urls', 'xposts', 'subdomains', 'domains'):
			# Every file is opened even when no record carries this type,
			# so all five files exist for each path.
			fh = open_file(path, fh_type + '.txt')
			try:
				for record in records:
					if record.get(fh_type):
						fh.write(record[fh_type])
			finally:
				fh.close()

	# Update comment file
	for path, pid in comment_buffer:
		comment_file.write(path + '\t' + pid + '\n')


def dump_wayback_submission_info(rdb):
	"""Dump every wayback submission in ``rdb`` to text files.

	For each submission returned by ``rdb.wayback_submission_list()`` a
	directory path ``<first letter>/<subreddit>/<year>/<month>/<day>`` is
	derived, where the subreddit is what ``utils.get_subdomain`` returns for
	the submission's "comment_url". Tab-separated, UTF-8 encoded lines are
	produced for the submission, its URL, any cross-post found in the title,
	the URL's subdomain and the "domain" field. Records are buffered per
	path and written once the buffer holds ``chunk_size`` distinct paths;
	whatever is left is written when the iteration ends.

	``(path, reddit_id)`` pairs of submissions that have a "reddit_id" are
	written to ``../data/reddit/wayback_comments_to_scrape.txt`` so their
	comments can be scraped later.

	Submissions without a usable "created" value (counted in the "no date"
	figure) or without a subreddit are skipped and not counted in the
	progress figures.

	Relies on the module-level names ``chunk_size``, ``log_interval``,
	``open_file``, ``num_open_files`` and ``utils``.
	"""
	start = time.time()

	# Totals and counters are only used for the progress report.
	num_submissions = rdb.num_wayback_submissions()
	counter = 0
	num_dumps = 0
	num_urls_found = 0
	num_xposts_found = 0
	num_subdomains_found = 0
	num_domains_found = 0
	num_no_date = 0

	# Will store chunks of data to write, cleared at certain intervals
	# to avoid too much I/O. Maps path -> list of records.
	data_buffer = {}

	# Will store chunks of data specifically for comment file, used to
	# scrape later comments.
	comment_file = open(os.path.join('..', 'data', 'reddit', 'wayback_comments_to_scrape.txt'), 'w')
	comment_buffer = []

	try:
		# Collect submissions
		for submission in rdb.wayback_submission_list():
			# Get year, month, day
			tmstp = submission.get("created")
			if isinstance(tmstp, float):
				date = datetime.fromtimestamp(tmstp)
			elif isinstance(tmstp, datetime):
				date = tmstp
			else:
				# Without a usable date there is no directory to write to.
				num_no_date += 1
				continue

			# Ignore ones that don't have associated subreddit
			comment_url = submission.get("comment_url")
			if not comment_url:
				continue
			subreddit = utils.get_subdomain(comment_url)
			if not subreddit:
				continue

			path = os.path.join(subreddit[0].lower(), subreddit,
								str(date.year), str(date.month), str(date.day))

			# Extract relevant pieces of information
			submission_id = str(submission.get("_id"))
			submission_title = submission.get("submission_title") or ''
			submission_prawid = submission.get("reddit_id") or '-1'

			# Create information to write to file
			submission_info = submission_id + "\t" + submission_prawid + "\t" + submission_title
			url_info = submission.get("url") or ''
			domain_info = submission.get("domain")
			xpost_info = utils.extract_subreddit_xpost(submission_title)
			subdomain_info = utils.get_subdomain(url_info)

			# Create data object and store in buffer
			data = {'submissions': (submission_info + '\n').encode("utf8")}
			if url_info:
				data['urls'] = (submission_id + '\t' + url_info + '\n').encode("utf8")
				num_urls_found += 1
			if xpost_info:
				data['xposts'] = (submission_id + '\t' + xpost_info + '\n').encode("utf8")
				num_xposts_found += 1
			if subdomain_info:
				data['subdomains'] = (submission_id + '\t' + subdomain_info + '\n').encode("utf8")
				num_subdomains_found += 1
			if domain_info:
				data['domains'] = (submission_id + '\t' + domain_info + '\n').encode("utf8")
				num_domains_found += 1

			data_buffer.setdefault(path, []).append(data)

			# Store information about comments for later scraping
			if submission_prawid != '-1':
				comment_buffer.append((path, submission_prawid))

			# Write data and clear buffer every <chunk_size> distinct paths
			if len(data_buffer) >= chunk_size:
				print("Start dump::: %s" % (num_open_files(),))

				_write_wayback_chunk(data_buffer, comment_buffer, comment_file)

				# Clear buffer
				data_buffer = {}
				comment_buffer = []
				num_dumps += 1

				print("End dump::: %s" % (num_open_files(),))

			if counter % log_interval == 0:
				progress_pct = (counter / float(num_submissions)) * 100 if num_submissions else 0.0
				print("Wayback Progress: %s submissions dumped out of %s %s %%" % (counter, num_submissions, progress_pct))
				print("\tNumber of URLs found so far: %s" % (num_urls_found,))
				print("\tNumber of xposts found so far: %s" % (num_xposts_found,))
				print("\tNumber of subdomains found so far: %s" % (num_subdomains_found,))
				print("\tNumber of domains found so far: %s" % (num_domains_found,))
				print("\tNumber of dumps to file so far: %s" % (num_dumps,))
				print("\tTime spent: %s minutes" % ((time.time() - start) / 60.0,))
				print("\tNumber with no date: %s" % (num_no_date,))
				print("")
			counter += 1

		# Flush whatever never reached <chunk_size>.
		if data_buffer or comment_buffer:
			_write_wayback_chunk(data_buffer, comment_buffer, comment_file)
	finally:
		# The comment file stays open for the whole run; always release it.
		comment_file.close()
