def main(argv):
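	"""Parse command-line options, train a Naive Bayes repost classifier on
	labelled data from nvb.get_reposts(), and run the selected action."""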
	server_ = 'localhost'
	port_ = 27017
	database_ = 'corpora'
	collection_ = 'reddit'
	outputfile = 'output.json'
	option = None

	try:
		opts, args = getopt.getopt(argv,'hitrc:o:')
	except getopt.GetoptError:
		print('usage: ./reposts_classifier.py [<option> [<argument>]]')
		print('Type ./reposts_classifier.py -h for help')
		sys.exit(2)
	if(opts == []):
		opts.append(('-h', ''))
	for opt, arg in opts:
		if opt == '-h':
			print('usage: ./reposts_classifier.py [<option> [<argument>]]')
			print('-h, Help')
			print('-i, Show informative features')
			print('-t, Show test set errors')
			print('-r, Run classifier on the reddit corpora')
			print('-c, Change collection name')
			print('-o, Output file')
			sys.exit()
		elif opt == '-i':
			option = 'i'
		elif opt == '-t':
			option = 't'
		elif opt == '-r':
			option = 'r'
		elif opt == '-c':
			collection_ = arg
		elif opt == '-o':
			outputfile = arg

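	# Connect to MongoDB and select the collection of reddit documents.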
	client = pymongo.MongoClient(server_, port_)
	collection = client[database_][collection_]

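	# Build labelled feature sets from known reposts, shuffle them, and
	# split them 50/50 into a training set and a test set.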
	# random.seed(0)
	docs = nvb.get_reposts()
	random.shuffle(docs)
	featuresets = [(nvb.apply_features(doc), label) for (doc, label) in docs]
	half = len(featuresets) // 2
	train_set, test_set = featuresets[:half], featuresets[half:]
	classifier = nltk.NaiveBayesClassifier.train(train_set)

	if(option == 'i'):
		show_infos(classifier, test_set)
	elif(option == 't'):
		show_test_errors(classifier, docs, len(train_set))
	elif(option == 'r'):
		result = run_classifier(classifier, collection, 0, 20000)

		with open(outputfile, 'w') as f:
			f.write(json.dumps(result))

def show_test_errors(classifier, docs, begin):
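	"""Print the correct tag and the classifier's guess for every
	misclassified document in docs[begin:], followed by the error count."""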
	errors = []
	for (doc, tag) in docs[begin:]:
		guess = classifier.classify(nvb.apply_features(doc))
		if guess != tag:
			errors.append((tag, guess, doc))
	for (tag, guess, doc) in errors:
		print('correct=%-8s guess=%-8s' % (tag, guess))

	print('%d errors' % len(errors))

def run_classifier(classifier, collection, skip_=0, limit_=5000):
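	"""Classify up to limit_ documents from the collection (starting at skip_)
	and count the detected reposts per link author, date and subreddit."""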
	cursor = collection.find(skip=skip_, limit=limit_)
	cursor.batch_size(1000)

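	# Nested mapping: result[link_author][date][subreddit] -> repost count.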
	result = defaultdict(lambda : defaultdict(dict))

	from_ = 0
	to_ = cursor.count(True)

	print('Classifying ' + str(to_) + ' documents...')
	print()

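	# Walk the cursor one document at a time; documents whose text cannot be
	# decoded raise UnicodeDecodeError and are skipped.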
	for i in range(from_, to_):
		# if(i % int(to_ / 10) == 0):
		# 	print('checkpoint ' + str(int((i - from_) / int(to_ / 10))))
		try:
			doc = cursor[i]
			if('body' in doc.keys()):
				print('Classifying document ' + str(i) + ' with name: ' + doc['name'] + '... ', end='')
				guess = classifier.classify(nvb.apply_features(doc))
				if(guess == 'is_repost'):
					t = dt.datetime.fromtimestamp(int(doc['created_utc']))
					date = t.strftime('%Y%m%d')  # YYYYMMDD key for grouping by day
					# Tally a detected repost for this author/date/subreddit.
					subreddit = doc['subreddit']
					counts = result[doc['link_author']][date]
					counts[subreddit] = counts.get(subreddit, 0) + 1

				print('done')
		except UnicodeDecodeError:
			continue

	print(str(to_) + ' documents classified!')

	return result