def batch_classify_gold(root_path, tfidfs_per_doc, idfs, lem_flag=False):
	'''
	Classify all articles under the given categorized folder.
	-Return: a dictionary mapping each article name to a tuple of
	 (suggested category, real category, dict of similarity scores per
	 category).
	'''
	class_results = {}

	for category_folder in corpus_os.get_items_in_folder(root_path):
		# Get articles in directory
		category_path = os.path.join(root_path, category_folder)
		category_files = corpus_os.get_items_in_folder(category_path)

		for article_file in category_files:
			# Classify article
			st_time = time.time()

			article_path = os.path.join(category_path, article_file)
			match, all_scores = classify_article_file(
				article_path, tfidfs_per_doc, idfs, lem_flag)

			#print "%s\t%s\t%s\t(%.3e)\ttime: %.3f sec" % \
			#	( article_path.split('/')[-1].replace('.txt',''), \
			#	match, category_folder, match[1], \
			#	time.time()-st_time )

			class_results[article_file] = (match, category_folder, all_scores)

	return class_results
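
# Usage sketch (hypothetical inputs -- 'corpus/wiki/' is a placeholder, and
# tfidfs_per_doc/idfs are assumed to come from whatever tf-idf pipeline
# feeds classify_article_file):
#
#	results = batch_classify_gold('corpus/wiki/', tfidfs_per_doc, idfs)
#	for name, (match, real_cat, scores) in results.iteritems():
#		print name, match, real_cat
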
def create_corpus_files(corpus_root, corpus_name=None, lem_flag=False):
	'''
	Create a file for each category (=sub-folder), containing clean tokens
	for all documents in that category.
	-Input: path of corpus root, name of sub-folder to create and place files
	 in, flag that indicates whether words should be lemmatized
	-Output: a file for each category, named <category_name>.txt, located in
	 a folder named corpus_name under corpus_root (or right under it if None)
	-Return: number of categories processed.
	'''
	st_time = time.time()
	
	categories = corpus_os.get_items_in_folder(corpus_root)
	
	if corpus_name is None:
		# No sub-folder requested: place the files right under the root
		clean_root = corpus_root
	else:
		clean_root = os.path.join(corpus_root, corpus_name)
		try:
			os.mkdir(clean_root)
		except OSError:
			# already exists: delete and recreate
			shutil.rmtree(clean_root)
			os.mkdir(clean_root)
	print "\n", clean_root
	
	for category in categories:
		create_category_file(corpus_root, category, clean_root, lem_flag)
	
	num_cats = len(categories)
	
	print "\nCreated %d files under %s" % (num_cats, clean_root)
	print time.time()-st_time, "seconds\n"
		
	return num_cats, clean_root
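
# Usage sketch (assumes a corpus laid out as <root>/<category>/<article>.txt;
# the path is hypothetical):
#
#	num_cats, clean_root = create_corpus_files('corpus/wiki/', 'clean',
#												lem_flag=True)
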
def tag_wiki_corpus(root_path):
	'''
	Create a POS-tagged file for each folder under root_path, which contains all
	words from all files in that folder. The files are stored in a directory
	named _pos_ under root_path, and have a .pos extension.
	'''
	
	# Create the output directory (root_path is assumed to end with a path
	# separator, since it is concatenated directly below) and get a list of
	# all folders to tag, skipping hidden files
	try:
		os.makedirs(root_path+"_pos_")
	except OSError:
		pass # directory already exists
	
	categories = corpus_os.get_items_in_folder(root_path)
	
	for category in categories:
		category_root = root_path+category
		print 
		print category_root
		category_corpus = ct.corpus_from_directory(category_root, r'.*\.txt')
		print "Created corpus %s, tagging..." % category_root
	
		# Tag POS, remove proper nouns and other irrelevant POS, and store in
		# a file for future use
		st_time = time.time()
		tagged_words = tag_pos_corpus(category_corpus) 
		print ">>> Time to tag:", time.time()-st_time
		
		pos_tagged_file = root_path+"_pos_/"+category+".pos"
		write_pos_to_file(tagged_words, pos_tagged_file)
		print ">>> Created file %s" % (pos_tagged_file)
def create_category_file(root_path, category_name, clean_root_path=None, lem_flag=True, decap_flag=True):
	'''
	Create a single file containing clean tokens for all documents in the 
	passed folder. Assume the folder does not contain any sub-folders.
	-Input: path of corpus root, name of category (=folder in corpus), and
	 path for storing the clean files (default is root_path)
	-Output: file named <category_name>.txt in the corpus root
	-Return: number of documents processed.
	'''
	if clean_root_path is None:
		clean_root_path = root_path
	
	category_path = os.path.join(root_path, category_name)
	document_list = corpus_os.get_items_in_folder(category_path)
	print document_list
	
	folder_terms = []
	
	# Get all terms in folder (repetitions must be preserved!)
	for doc in document_list:
		fin = codecs.open(os.path.join(category_path, doc), 'rU')
		doc_text = fin.read()
		fin.close()
		folder_terms.extend(get_clean_terms(doc_text, lemmatize=lem_flag, decap=decap_flag))
		# Add EOL to distinguish between text from different articles
		folder_terms.append('\n')
	
	num_docs = len(document_list)
		
	# Concatenate and write to file
	folder_text_clean = ' '.join(folder_terms)
	
	clean_file_path = os.path.join(clean_root_path, category_name+'.txt' )
	fout = codecs.open(clean_file_path, 'w')
	fout.write(folder_text_clean) # There's already an EOL at the end
	fout.close()
	
	return num_docs
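
# Usage sketch (hypothetical names; this would write
# corpus/wiki/clean/Sports.txt from the files in corpus/wiki/Sports/):
#
#	num_docs = create_category_file('corpus/wiki/', 'Sports',
#									clean_root_path='corpus/wiki/clean/')
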
def create_corpus_files_separate_grouped(corpus_root, corpus_name=None, lem_flag=True, decap_flag=True, 
								language='english', make_new_dir=False, stopwords_file=None, direction=1,
								article_list_file=None):
	'''
	Create a file for each article with only clean tokens. A large corpus is
	split into several groups, and only the group selected by direction is
	processed, so the work can be divided across separate runs.
	-Input: path of corpus root, name of sub-folder to create and place files
	 in, flag that indicates whether words should be lemmatized
	-Output: a file for each article with the original name, located in
	 a folder named corpus_name under corpus_root (or right under it if None)
	-Return: number of articles processed.
	'''
	st_time = time.time()
	
	if article_list_file is None:
		articles = corpus_os.get_items_in_folder(corpus_root)
	else:
		f = open(article_list_file, 'r')
		lines = f.read().split('\n')
		if language == 'english':
			articles = map(lambda line: line.split('\t')[0], lines)
		elif language == 'spanish':
			articles = map(lambda line: line.split('\t')[1], lines)
		f.close()
		print 'Number of articles in ' + article_list_file + ':', len(articles)

	if make_new_dir:
		clean_root = corpus_name
	else:
		clean_root = os.path.join(corpus_root, corpus_name)
	if not os.path.exists(clean_root):
		os.mkdir(clean_root)
		print "\nCreated folder: ", clean_root
	
	print '%d total articles' % len(articles)
	all_indices = range(len(articles))
	
	# Decide how many groups to split the corpus into, based on its size
	if len(articles) <= 15000:
		num_groups = 1
	elif len(articles) < 30000:
		num_groups = 2
	elif len(articles) < 70000:
		num_groups = 4
	else:
		num_groups = 6
	assert 1 <= direction <= num_groups
	print 'processing group %d of %d' % (direction, num_groups)
	
	# Grouper idiom: chunk all_indices into num_groups slices of equal size,
	# padding the last slice with None
	group_size = int(math.ceil(float(len(articles))/num_groups))
	indices = izip_longest(*[iter(all_indices)]*group_size)
	for i in range(direction):
		article_indices = indices.next()
	print 'double checking length of final article indices: ' + str(len(article_indices))
	
	num_articles = 0
	for i in article_indices:
		if i is None:
			continue # padding added by izip_longest
		doc = articles[i]
		print doc
		# Get all terms in article (repetitions must be preserved!)
		try:
			fin = codecs.open(os.path.join(corpus_root, doc), 'rU')
		except IOError:
			# If all is well, this is the output folder created by this
			# function itself; skip it
			print "Skipping non-file item %s%s" % (corpus_root, doc)
			continue
		doc_text = fin.read()
		fin.close()
		# Clean the terms
		article_text_clean = " ".join(get_clean_terms(doc_text, lemmatize=lem_flag, 
													decap=decap_flag, language=language, stopwords_file=stopwords_file))
		# Write the clean text to a new file
		clean_file_path = os.path.join(clean_root, doc)
		fout = codecs.open(clean_file_path, 'w')
		fout.write(article_text_clean.encode('utf-8'))
		fout.close()
		num_articles += 1
	
	print "\nCreated %d files under %s" % (num_articles, clean_root)
	print time.time()-st_time, "seconds\n"
		
	return num_articles, clean_root
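
# Usage sketch for the grouped variant (hypothetical path): with ~50,000
# articles the corpus is split into 4 groups, and direction picks which
# group this run processes:
#
#	create_corpus_files_separate_grouped('corpus/wiki/', 'clean', direction=2)
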
def create_corpus_files_separate(corpus_root, corpus_name=None, lem_flag=True, decap_flag=True, 
								language='english', make_new_dir=False, stopwords_file=None,
								article_list_file=None):
	'''
	Create a file for each article with only clean tokens.
	-Input: path of corpus root, name of sub-folder to create and place files
	 in, flag that indicates whether words should be lemmatized
	-Output: a file for each article with the original name, located in
	 a folder named corpus_name under corpus_root (or right under it if None)
	-Return: number of articles processed.
	'''
	st_time = time.time()
	
	if article_list_file is None:
		articles = corpus_os.get_items_in_folder(corpus_root)
	else:
		f = open(article_list_file, 'r')
		lines = f.read().split('\n')
		if language == 'english':
			articles = map(lambda line: line.split('\t')[0], lines)
		elif language == 'spanish':
			articles = map(lambda line: line.split('\t')[1], lines)
		f.close()
		print 'Number of articles in ' + article_list_file + ':', len(articles)

	if make_new_dir:
		clean_root = corpus_name
	elif corpus_name is None:
		# No sub-folder requested: place the clean files right under the root
		clean_root = corpus_root
	else:
		clean_root = os.path.join(corpus_root, corpus_name)
	if clean_root != corpus_root:
		try:
			os.mkdir(clean_root)
		except OSError:
			# already exists: delete and recreate
			shutil.rmtree(clean_root)
			os.mkdir(clean_root)
		print "\nCreated folder: ", clean_root
	
	
	num_articles = 0
	for doc in articles:
		print doc
		# Get all terms in article (repetitions must be preserved!)
		try:
			fin = codecs.open(os.path.join(corpus_root, doc), 'rU')
		except IOError:
			# If all is well, this is the folder created by this function
			# itself; skip it
			print "Skipping non-file item %s%s" % (corpus_root, doc)
			continue

		doc_text = fin.read()
		fin.close()
		# Clean the terms
		article_text_clean = " ".join(get_clean_terms(doc_text, lemmatize=lem_flag, 
													decap=decap_flag, language=language, stopwords_file=stopwords_file))
		# Write the clean text to a new file
		clean_file_path = os.path.join(clean_root, doc)
		fout = codecs.open(clean_file_path, 'w')
		fout.write(article_text_clean.encode('utf-8'))
		fout.close()
		num_articles += 1
	
	print "\nCreated %d files under %s" % (num_articles, clean_root)
	print time.time()-st_time, "seconds\n"
		
	return num_articles, clean_root
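
# Usage sketch (hypothetical paths; a Spanish run reading article names from
# the second tab-separated column of a pairing file):
#
#	create_corpus_files_separate('corpus/es_wiki/', 'clean',
#								language='spanish',
#								article_list_file='article_pairs.txt')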