# Standard-library imports needed by the functions below (Python 2).
import codecs
import math
import os
import shutil
import time
from itertools import izip_longest

import corpus_os  # project-local file-system helpers
# classify_article_file, get_clean_terms, tag_pos_corpus, write_pos_to_file
# and the corpus-tools alias `ct` are assumed to be defined or imported
# elsewhere in this module.


def batch_classify_gold(root_path, tfidfs_per_doc, idfs, lem_flag=False):
    '''
    Classify all articles under the given categorized folder.
    -Return: a dictionary whose keys=article names and values=tuple of
     (suggested category, real category, dict of similarity scores with
     categories) for each article.
    '''
    class_results = {}  # article name -> (match, real category, all scores)
    for category_folder in corpus_os.get_items_in_folder(root_path):
        # Get articles in directory
        category_path = os.path.join(root_path, category_folder)
        category_files = corpus_os.get_items_in_folder(category_path)
        for article_file in category_files:
            # Classify article
            st_time = time.time()
            article_path = os.path.join(category_path, article_file)
            match, all_scores = classify_article_file(
                article_path, tfidfs_per_doc, idfs, lem_flag)
            #print "%s\t%s\t%s\t(%.3e)\ttime: %.3f sec" % \
            #    (article_path.split('/')[-1].replace('.txt', ''),
            #     match, category_folder, match[1],
            #     time.time() - st_time)
            class_results[article_file] = (match, category_folder, all_scores)
    return class_results
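# Illustrative sketch (not part of the original pipeline): computing overall
# accuracy from batch_classify_gold's output. It assumes, as the commented-out
# print above suggests via match[1], that `match` is a
# (predicted category, score) tuple; the arguments are whatever would be
# passed to batch_classify_gold itself.
def _demo_gold_accuracy(root_path, tfidfs_per_doc, idfs):
    results = batch_classify_gold(root_path, tfidfs_per_doc, idfs)
    correct = sum(1 for match, real_cat, _ in results.itervalues()
                  if match[0] == real_cat)
    print "Accuracy: %d/%d" % (correct, len(results))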
def create_corpus_files(corpus_root, corpus_name=None, lem_flag=False):
    '''
    Create a file for each category (=sub-folder), containing clean tokens
    for all documents in that category.
    -Input: path of corpus root, name of sub-folder to create and place
     files in, flag that indicates whether words should be lemmatized
    -Output: a file for each category, named <category_name>.txt, located in
     a folder named corpus_name under corpus_root (or right under it if None)
    -Return: number of categories processed and the path of the clean folder.
    '''
    st_time = time.time()
    categories = corpus_os.get_items_in_folder(corpus_root)
    if corpus_name is None:
        # Place the category files right under the corpus root
        clean_root = corpus_root
    else:
        clean_root = os.path.join(corpus_root, corpus_name)
        try:
            os.mkdir(clean_root)
        except OSError:
            # already exists: delete and recreate
            shutil.rmtree(clean_root)
            os.mkdir(clean_root)
    print "\n", clean_root
    num_cats = 0
    for i, category in enumerate(categories):
        create_category_file(corpus_root, category, clean_root, lem_flag)
        num_cats = i + 1
    print "\nCreated %d files under %s" % (num_cats, clean_root)
    print time.time() - st_time, "seconds\n"
    return num_cats, clean_root
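# Illustrative usage sketch: build one clean token file per category. The
# corpus path and the 'clean' sub-folder name are hypothetical placeholders.
def _demo_create_corpus_files():
    num_cats, clean_root = create_corpus_files('/data/wiki_corpus', 'clean',
                                               lem_flag=True)
    print "%d category files in %s" % (num_cats, clean_root)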
def tag_wiki_corpus(root_path):
    '''
    Create a POS-tagged file for each folder under root_path, containing all
    words from all files in that folder. The files are stored in a directory
    named _pos_ next to the category folders (root_path is expected to end
    with a path separator) and have a .pos extension.
    '''
    # Create directory and get a list of all folders to tag, removing
    # hidden files
    try:
        os.makedirs(root_path + "_pos_")
    except OSError:
        pass  # directory already exists
    categories = corpus_os.get_items_in_folder(root_path)
    for category in categories:
        category_root = root_path + category
        print
        print category_root
        category_corpus = ct.corpus_from_directory(category_root, r'.*\.txt')
        print "Created corpus %s, tagging..." % category_root
        # Tag POS, remove proper nouns and other irrelevant POS, and store
        # in a file for future use
        st_time = time.time()
        tagged_words = tag_pos_corpus(category_corpus)
        print ">>> Time to tag:", time.time() - st_time
        pos_tagged_file = root_path + "_pos_/" + category + ".pos"
        write_pos_to_file(tagged_words, pos_tagged_file)
        print ">>> Created file %s" % pos_tagged_file
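# Illustrative usage sketch with a hypothetical corpus location. Because the
# output directory is built by string concatenation, root_path should end
# with a path separator so the .pos files land in /data/wiki/en/_pos_/.
def _demo_tag_wiki_corpus():
    tag_wiki_corpus('/data/wiki/en/')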
def create_category_file(root_path, category_name, clean_root_path=None,
                         lem_flag=True, decap_flag=True):
    '''
    Create a single file containing clean tokens for all documents in the
    given folder. Assume the folder does not contain any sub-folders.
    -Input: path of corpus root, name of category (=folder in corpus), and
     path for storing the clean files (default is root_path)
    -Output: file named <category_name>.txt in clean_root_path
    -Return: number of documents processed.
    '''
    if clean_root_path is None:
        clean_root_path = root_path
    category_path = os.path.join(root_path, category_name)
    document_list = corpus_os.get_items_in_folder(category_path)
    print document_list
    folder_terms = []
    num_docs = 0
    # Get all terms in the folder (repetitions must be preserved!)
    for i, doc in enumerate(document_list):
        fin = codecs.open(os.path.join(category_path, doc), 'rU')
        doc_text = fin.read()
        fin.close()
        folder_terms.extend(get_clean_terms(doc_text, lemmatize=lem_flag,
                                            decap=decap_flag))
        # Add EOL to distinguish between text from different articles
        folder_terms.append('\n')
        num_docs = i + 1
    # Concatenate and write to file
    folder_text_clean = ' '.join(folder_terms)
    clean_file_path = os.path.join(clean_root_path, category_name + '.txt')
    fout = codecs.open(clean_file_path, 'w')
    fout.write(folder_text_clean)  # there's already an EOL at the end
    fout.close()
    return num_docs
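# Illustrative usage sketch: merge one category into a single clean file and
# report how many documents went into it. The corpus path and the 'Physics'
# category name are hypothetical placeholders.
def _demo_create_category_file():
    num_docs = create_category_file('/data/wiki_corpus', 'Physics',
                                    lem_flag=True, decap_flag=True)
    print "Merged %d documents into Physics.txt" % num_docs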
def create_corpus_files_separate(corpus_root, corpus_name=None, lem_flag=True,
                                 decap_flag=True, language='english',
                                 make_new_dir=False, stopwords_file=None,
                                 direction=1, article_list_file=None):
    '''
    Create a file for each article with only clean tokens.
    -Input: path of corpus root, name of sub-folder to create and place
     files in, flag that indicates whether words should be lemmatized;
     large corpora are split into groups, and direction selects which
     group this run processes
    -Output: a file for each article with the original name, located in a
     folder named corpus_name under corpus_root (or right under it if None)
    -Return: number of articles processed.
    '''
    st_time = time.time()
    if article_list_file is None:
        articles = corpus_os.get_items_in_folder(corpus_root)
    else:
        f = open(article_list_file, 'r')
        lines = f.read().split('\n')
        if language == 'english':
            articles = map(lambda line: line.split('\t')[0], lines)
        elif language == 'spanish':
            articles = map(lambda line: line.split('\t')[1], lines)
        f.close()
        print 'Number of articles in ' + article_list_file + ':', len(articles)
    if make_new_dir:
        clean_root = corpus_name
    else:
        clean_root = os.path.join(corpus_root, corpus_name)
    if not os.path.exists(clean_root):
        os.mkdir(clean_root)
        print "\nCreated folder: ", clean_root
    print '%d total articles' % len(articles)
    all_indices = range(len(articles))
    # Split large corpora into fixed-size groups; the direction argument
    # selects which group this run processes
    num_groups = 1
    if 15000 < len(articles) < 30000:
        assert direction in (1, 2)
        num_groups = 2
    elif 30000 <= len(articles) < 70000:
        assert 1 <= direction <= 4
        num_groups = 4
    elif len(articles) >= 70000:
        assert 1 <= direction <= 6
        num_groups = 6
    print 'processing group %d of %d' % (direction, num_groups)
    group_size = int(math.ceil(float(len(articles)) / num_groups))
    indices = izip_longest(*[iter(all_indices)] * group_size)
    article_indices = all_indices
    for i in range(direction):
        article_indices = indices.next()
    print 'double checking length of final article indices: ' + \
        str(len(article_indices))
    num_articles = 0
    for i in article_indices:
        if i is None:
            continue  # izip_longest pads the last group with None
        doc = articles[i]
        print doc
        # Get all terms in article (repetitions must be preserved!)
        try:
            fin = codecs.open(os.path.join(corpus_root, doc), 'rU')
        except IOError:
            # If all is well, this is the folder created by this function
            # and can safely be skipped
            print "Skipping unreadable item (probably the clean folder):"
            print "%s%s" % (corpus_root, doc)
            continue
        doc_text = fin.read()
        fin.close()
        # Clean the terms
        article_text_clean = " ".join(get_clean_terms(
            doc_text, lemmatize=lem_flag, decap=decap_flag,
            language=language, stopwords_file=stopwords_file))
        # Write to a new file
        clean_file_path = os.path.join(clean_root, doc)
        fout = codecs.open(clean_file_path, 'w')
        fout.write(article_text_clean.encode('utf-8'))
        fout.close()
        num_articles = i + 1
    # num_articles is the last global index processed + 1, so dividing by
    # num_groups gives an approximate per-group count
    print "\nCreated %d files under %s" % (num_articles / num_groups,
                                           clean_root)
    print time.time() - st_time, "seconds\n"
    return num_articles, clean_root
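# The grouping idiom above, izip_longest(*[iter(seq)] * n), deals one shared
# iterator to n slots, so consecutive items fill fixed-size chunks and the
# last chunk is padded with None. A minimal standalone illustration:
def _demo_chunking():
    seq = range(10)
    for chunk in izip_longest(*[iter(seq)] * 4):
        print chunk
    # prints (0, 1, 2, 3), then (4, 5, 6, 7), then (8, 9, None, None)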
# NOTE: this second definition shadows the chunked version above; when the
# module is imported, calls to create_corpus_files_separate resolve to the
# simpler single-pass variant below.
def create_corpus_files_separate(corpus_root, corpus_name=None, lem_flag=True,
                                 decap_flag=True, language='english',
                                 make_new_dir=False, stopwords_file=None,
                                 article_list_file=None):
    '''
    Create a file for each article with only clean tokens.
    -Input: path of corpus root, name of sub-folder to create and place
     files in, flag that indicates whether words should be lemmatized
    -Output: a file for each article with the original name, located in a
     folder named corpus_name under corpus_root (or right under it if None)
    -Return: number of articles processed.
    '''
    st_time = time.time()
    if article_list_file is None:
        articles = corpus_os.get_items_in_folder(corpus_root)
    else:
        f = open(article_list_file, 'r')
        lines = f.read().split('\n')
        if language == 'english':
            articles = map(lambda line: line.split('\t')[0], lines)
        elif language == 'spanish':
            articles = map(lambda line: line.split('\t')[1], lines)
        f.close()
        print 'Number of articles in ' + article_list_file + ':', len(articles)
    if make_new_dir:
        clean_root = corpus_name
    else:
        clean_root = os.path.join(corpus_root, corpus_name)
    try:
        os.mkdir(clean_root)
    except OSError:
        # already exists: delete and recreate
        shutil.rmtree(clean_root)
        os.mkdir(clean_root)
    print "\nCreated folder: ", clean_root
    num_articles = 0
    for i, doc in enumerate(articles):
        print doc
        # Get all terms in article (repetitions must be preserved!)
        try:
            fin = codecs.open(os.path.join(corpus_root, doc), 'rU')
        except IOError:
            # If all is well, this is the folder created by this function
            # and can safely be skipped
            print "Skipping unreadable item (probably the clean folder):"
            print "%s%s" % (corpus_root, doc)
            continue
        doc_text = fin.read()
        fin.close()
        # Clean the terms
        article_text_clean = " ".join(get_clean_terms(
            doc_text, lemmatize=lem_flag, decap=decap_flag,
            language=language, stopwords_file=stopwords_file))
        # Write to a new file
        clean_file_path = os.path.join(clean_root, doc)
        fout = codecs.open(clean_file_path, 'w')
        fout.write(article_text_clean.encode('utf-8'))
        fout.close()
        num_articles = i + 1
    print "\nCreated %d files under %s" % (num_articles, clean_root)
    print time.time() - st_time, "seconds\n"
    return num_articles, clean_root