예제 #1
0
def main(argv):
	server_ = 'localhost'
	port_ = 27017
	database_ = 'corpora'
	collection_ = 'reddit'
	outputfile = 'output.json'

	try:
		opts, args = getopt.getopt(argv,'hs:p:d:c:o:')
	except getopt.GetoptError:
		print('usage: ./reposts.py [, <opt> <name>]')
		print('Type ./reposts.py -h for help')
		sys.exit(2)
	if(opts == []):
		print('This program is running with default parameters')
		print('Type ./reposts.py -h for help')
	for opt, arg in opts:
		if opt == '-h':
			print('usage: ./reposts.py [, <opt> <name>]')
			print('-h, Help')
			print('-s, Server name')
			print('-p, Port number')
			print('-d, Database name')
			print('-c, Collection name')
			print('-o, Output file')
			sys.exit()
		elif opt in ('-s'):
			server_ = arg
		elif opt in ('-p'):
			port_ = int(arg)
		elif opt in ('-d'):
			database_ = arg
		elif opt in ('-c'):
			collection_ = arg
		elif opt in ('-o'):
			outputfile = arg

	client = pymongo.MongoClient(server_, port_)
	collection = client[database_][collection_]
	maybe_repost = client[database_]['maybe_repost']

	regex = {"$regex" : "([Rr]epost)", "$options" : "i"}

	cursor = collection.find({'body': regex}, { 'link_id': 1, 'link_author': 1, 'link_title': 1, 'body': 1, 'is_root': 1,
		'subreddit': 1, 'name': 1, 'selftext': 1, 'parent_id': 1, 'created_utc': 1, 'body_html': 1, '_id': 0 }, limit=1000000 )
	cursor.batch_size(1000)

	from_ = 0
	to_ = cursor.count()
	print(to_)

	for i in range(from_, to_):
		if(i % int(to_ / 10) == 0):
			print('checkpoint ' + str(int((i - from_) / int(to_ / 10))))
		try:
			doc = cursor[i]
			if('body' in doc.keys()):
				label.insert_into(maybe_repost, doc)
		except UnicodeDecodeError:
			continue
예제 #2
0
def main(argv):
	server_ = 'localhost'
	port_ = 27017
	database_ = 'corpora'
	collection_r = 'reddit'
	collection_ir = 'is_repost'
	collection_inr = 'is_not_repost'

	client = pymongo.MongoClient(host=server_,port=port_)
	reddit = client[database_][collection_r]
	is_repost = client[database_][collection_ir]
	is_not_repost = client[database_][collection_inr]

	quantity = 308

	if(len(argv) > 0):
		quantity = int(argv[1])

	not_noise = []

	cursor_ir = is_repost.find({}, {'name': 1, '_id': 0})
	for doc in cursor_ir:
		not_noise.append(doc['name'])

	cursor_inr = is_not_repost.find({}, {'name': 1, '_id': 0})
	for doc in cursor_inr:
		not_noise.append(doc['name'])

	cursor_r = reddit.find({}, { 'link_id': 1, 'link_author': 1, 'link_title': 1, 'body': 1, 'is_root': 1,
		'subreddit': 1, 'name': 1, 'selftext': 1, 'parent_id': 1, 'created_utc': 1, 'body_html': 1, '_id': 0 }, limit=5000 )

	cursor_r.batch_size(1000)

	inserted = 0
	count = cursor_r.count()

	for i in range(0, count):
		if(inserted >= quantity):
			break

		try:
			doc = cursor_r[i]
			if('body' in doc.keys()):
				if(doc['name'] not in not_noise):
					label.insert_into(is_not_repost, doc)
					inserted += 1
		except UnicodeDecodeError:
			continue