def compile_idf_dict(self):
    """Merge the per-author idf files of the subcorpus into one dictionary.

    The files listed for ``self.subcorpora`` in ``../ref_file.txt`` are read
    from the ``../TLG_idf_files/`` sub-directory selected by ``num_grams``,
    ``spread``, ``vary_defn`` and ``stopwords``.  The first line of each file
    is skipped; on every other line the integer after the last tab is added
    to the running total kept for the text before that tab.

    The merged dictionary is stored in ``self.idf_dict``.

    Returns:
        The number of files listed for the subcorpus (NOT the dictionary).

    Raises:
        IOError: if one of the per-author files cannot be opened.
        ValueError: if the text after the last tab is not an integer.
    """
    print("compiling idf dictionary from individual author files")
    idf_dict = {}
    path = ('../TLG_idf_files/' + str(self.num_grams) + 'grams/'
            + str(self.spread) + "_" + "v" + str(self.vary_defn)
            + "_" + "sw" + str(self.stopwords) + "/")
    quarterHours = 0
    filedict = file_dict('../ref_file.txt')
    files = filedict[self.subcorpora]
    for infile in files:
        # Same output as the Python 2 statement `print "now on: ", infile`.
        print("%s %s" % ("now on: ", infile))
        # `with` closes the handle; previously one handle leaked per file.
        with open(path + infile, 'r') as idf_file:
            idf_file.readline()  # first line is number of docs in corpus
            for line in idf_file:
                # NOTE(review): `rpartition` is a free function defined
                # outside this block; presumably it mirrors str.rpartition
                # and returns (head, sep, tail) -- confirm.
                tokens = rpartition(line, "\t")
                term = tokens[0].strip()
                frequency = int(tokens[2].strip())
                idf_dict[term] = idf_dict.get(term, 0) + frequency
        # NOTE(review): the original indentation was lost; running the time
        # check once per file is an assumption -- confirm.
        quarterHours = self.check_time(quarterHours)
    self.idf_dict = idf_dict
    print(time.clock())
    return len(files)
def __init__(self, file_path, subcorpus, num_grams, variant_word_order, spread, stopword_file=None):
    """Store the run settings and make sure the idf artefacts exist.

    Args:
        file_path: stored unchanged as ``self.file_path``.
        subcorpus: key into the dictionary loaded from ``../ref_file.txt``;
            also used verbatim in the idf dictionary file name.
        num_grams: ngram size, used in the directory and file names.
        variant_word_order: stored as ``self.variant``.
        spread: stored as ``self.spread``.
        stopword_file: optional stopword file; ``self.stopwords`` is True
            exactly when this value is truthy.

    Side effects:
        Creates ``self.idf_files_path`` if it does not exist and calls
        ``self.create_idf_dictionary()`` when ``self.idf_dict_file`` is
        missing on disk.
    """
    self.subcorpus = subcorpus
    self.num_grams = num_grams
    self.variant = variant_word_order
    self.spread = spread
    self.stopword_file = stopword_file
    self.stopwords = bool(stopword_file)
    self.ref_dict = file_dict('../ref_file.txt')
    self.num_docs = len(self.ref_dict[self.subcorpus])
    self.file_path = file_path

    self.idf_files_path = (
        "../idf_files/" + str(self.num_grams) + 'grams/'
        + str(self.spread) + "_v" + str(self.variant)
        + "_sw" + str(self.stopwords) + "/"
    )
    if not os.path.exists(self.idf_files_path):
        os.makedirs(self.idf_files_path)

    self.idf_dict_file = (
        "../" + str(self.num_grams) + "grams/"
        + self.subcorpus + "_v" + str(self.variant)
        + "_sw" + str(self.stopwords) + "_s" + str(self.spread) + ".txt"
    )
    # Fixed: the check used the undefined local `idf_dict_file`, which raised
    # NameError on every construction.
    if not os.path.exists(self.idf_dict_file):
        self.create_idf_dictionary()
def tf_idf(self, document):
    """Return the tf-idf keyword list for the ngrams of ``document``.

    If the idf corpus file for the current settings already exists, a
    ``tfidf.TfIdf`` object is built on top of it and the document's file
    name is looked up in ``../ref_file.txt``.  Otherwise ``self.add_docs``
    is called first (it returns the document's file name) and the
    ``TfIdf`` object is built without a corpus file.

    When ``document`` equals ``self.doc1`` or ``self.doc2`` the scores are
    also written to the matching ``*_tfidf_file`` via
    ``print_to_file_by_ngram`` and ``print_to_csv_file``.

    Returns:
        Whatever ``get_doc_keywords`` returns for the document.
    """
    path = "../stripped_text/"
    idf_path = "".join([
        "../", str(self.num_grams), "grams/", self.subcorpora,
        "_v_", str(self.vary_defn),
        "_sw_", str(self.stopwords),
        "_s_", str(self.spread), ".txt",
    ])

    if not os.path.exists(idf_path):
        # idf corpus not yet in existence
        print("creating idf corpus.")
        docFilename = self.add_docs(document)
        _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, None,
                             self.idf_dict, self.stopword_file)
    else:
        # idf corpus already exists: reuse it
        print("idf corpus exists.")
        _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, idf_path,
                             self.idf_dict, self.stopword_file)
        # determine document TLG#### filename
        docFilename = file_dict("../ref_file.txt")[document][0]

    # actually determine tf-idf score for ngrams in document
    tfidf_list = _tfidf.get_doc_keywords(path + docFilename, self.num_grams)

    # print tfidf scores to .txt and .csv
    print("print tfidf scores to .txt and .csv")
    if document == self.doc1:
        banner, out_file = "printing doc1 to ", self.doc1_tfidf_file
    elif document == self.doc2:
        banner, out_file = "printing doc2 to ", self.doc2_tfidf_file
    else:
        banner = None
    if banner is not None:
        # Same output as the Python 2 statement `print banner, out_file`.
        print("%s %s" % (banner, out_file))
        print_to_file_by_ngram(tfidf_list, out_file)
        print_to_csv_file(out_file, self.num_grams)

    # The idf corpus is not saved here; the original author left a
    # disabled `_tfidf.save_corpus_to_file(idf_path)` call with the remark
    # that it should only happen if the corpus is not already saved.
    return tfidf_list
def add_docs(self, document):
    """Build the idf corpus for the subcorpus plus ``document``.

    Steps, in order:
      1. Load ``../ref_file.txt`` and take the file list of
         ``self.subcorpora``; append the document's first file if it is not
         already in that list.
      2. Write the current settings to ``curr_test.txt``.
      3. Call ``setup`` with the file list and run ``./master.sh``.
      4. Call ``self.compile_idf_dict()`` to merge the results.

    Returns:
        The file name of ``document`` (first entry of its reference list),
        i.e. the file whose ngram tf-idf scores are to be determined.
    """
    filedict = file_dict("../ref_file.txt")
    TLG_files = filedict[self.subcorpora]
    doc_filename = filedict[document][0]
    if doc_filename not in TLG_files:
        TLG_files.append(doc_filename)

    # NOTE(review): `self.stopword_file` is concatenated without str(), so a
    # None value raises TypeError here -- confirm it is always set.
    settings = (
        "spread\t" + str(self.spread)
        + "\nvariant\t" + str(self.vary_defn)
        + "\nstopwords\t" + str(self.stopwords)
        + "\nstopword_file\t" + self.stopword_file
        + "\nnum_grams\t" + str(self.num_grams)
        + "\n$num_docs$\t" + str(len(TLG_files))
    )
    # Build the text first and write inside `with`, so the file is always
    # closed and is not truncated when building the text fails.
    with open("curr_test.txt", "w") as f:
        f.write(settings)

    setup(TLG_files)
    print("going to master")
    os.system("./master.sh")

    # Fixed: the compile_idf_dict versions in this file store the merged
    # dictionary on self.idf_dict and return the number of files, so
    # assigning the return value replaced the dictionary with an integer.
    # Only take the return value when it actually is a dictionary.
    compiled = self.compile_idf_dict()
    if isinstance(compiled, dict):
        self.idf_dict = compiled

    return doc_filename
def tf_idf(self, document):
    """Return the tf-idf keyword list for the ngrams of ``document``.

    If the idf corpus file for the current settings already exists, a
    ``tfidf.TfIdf`` object is built on top of it and the document's file
    name is looked up in ``../ref_file.txt``.  Otherwise ``self.add_docs``
    is called first (it returns the document's file name) and the
    ``TfIdf`` object is built without a corpus file.

    When ``document`` equals ``self.doc1`` or ``self.doc2`` the scores are
    also written to the matching ``*_tfidf_file`` via
    ``print_to_file_by_ngram`` and ``print_to_csv_file``.

    Returns:
        Whatever ``get_doc_keywords`` returns for the document.
    """
    path = '../stripped_text/'
    idf_path = "".join([
        "../", str(self.num_grams), "grams/", self.subcorpora,
        "_v_", str(self.vary_defn),
        "_sw_", str(self.stopwords),
        "_s_", str(self.spread), ".txt",
    ])

    if not os.path.exists(idf_path):
        # idf corpus not yet in existence
        print("creating idf corpus.")
        docFilename = self.add_docs(document)
        _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, None,
                             self.idf_dict, self.stopword_file)
    else:
        # idf corpus already exists: reuse it
        print("idf corpus exists.")
        _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, idf_path,
                             self.idf_dict, self.stopword_file)
        # determine document TLG#### filename
        docFilename = file_dict('../ref_file.txt')[document][0]

    # actually determine tf-idf score for ngrams in document
    tfidf_list = _tfidf.get_doc_keywords(path + docFilename, self.num_grams)

    # print tfidf scores to .txt and .csv
    print("print tfidf scores to .txt and .csv")
    if document == self.doc1:
        banner, out_file = "printing doc1 to ", self.doc1_tfidf_file
    elif document == self.doc2:
        banner, out_file = "printing doc2 to ", self.doc2_tfidf_file
    else:
        banner = None
    if banner is not None:
        # Same output as the Python 2 statement `print banner, out_file`.
        print("%s %s" % (banner, out_file))
        print_to_file_by_ngram(tfidf_list, out_file)
        print_to_csv_file(out_file, self.num_grams)

    # The idf corpus is not saved here; the original author left a
    # disabled `_tfidf.save_corpus_to_file(idf_path)` call with the remark
    # that it should only happen if the corpus is not already saved.
    return tfidf_list
def create_stopword_file(self, subcorpora, stopword_percentage_threshold):
    """Create a stopword file for the subcorpus and return its file name.

    Every file listed for ``self.subcorpora`` in ``../ref_file.txt`` is fed
    to a fresh ``tfidf.TfIdf`` object, which is then asked to save its
    corpus to ``../1grams/<subcorpora>.txt`` and the stopwords to
    ``../stopwords/<subcorpora>_<threshold>.txt``.

    NOTE(review): the ``subcorpora`` argument is never read; the method
    uses ``self.subcorpora`` throughout -- confirm this is intended.

    Returns:
        The path of the stopword file.
    """
    scorer = tfidf.TfIdf()
    reference = file_dict('../ref_file.txt')
    text_dir = '../stripped_text/'

    for filename in reference[self.subcorpora]:
        print(filename)
        scorer.add_input_document(text_dir + filename)

    # NOTE(review): the original indentation was lost; printing the
    # threshold once after the loop is an assumption -- confirm.
    print(str(stopword_percentage_threshold))

    corpus_path = "../1grams/" + self.subcorpora + ".txt"
    stopword_path = ("../stopwords/" + self.subcorpora + "_"
                     + str(stopword_percentage_threshold) + ".txt")
    scorer.save_corpus_to_file(corpus_path, stopword_path,
                               stopword_percentage_threshold)
    return stopword_path  # returns stopword filename
def add_docs(self, document):
    """Build the idf corpus for the subcorpus plus ``document``.

    Steps, in order:
      1. Load ``../ref_file.txt`` and take the file list of
         ``self.subcorpora``; append the document's first file if it is not
         already in that list.
      2. Write the current settings to ``curr_test.txt``.
      3. Call ``setup`` with the file list and run ``./master.sh``.
      4. Call ``self.compile_idf_dict()`` to merge the results.

    Returns:
        The file name of ``document`` (first entry of its reference list),
        i.e. the file whose ngram tf-idf scores are to be determined.
    """
    filedict = file_dict('../ref_file.txt')
    TLG_files = filedict[self.subcorpora]
    doc_filename = filedict[document][0]
    if doc_filename not in TLG_files:
        TLG_files.append(doc_filename)

    # NOTE(review): `self.stopword_file` is concatenated without str(), so a
    # None value raises TypeError here -- confirm it is always set.
    settings = (
        "spread\t" + str(self.spread)
        + "\nvariant\t" + str(self.vary_defn)
        + "\nstopwords\t" + str(self.stopwords)
        + "\nstopword_file\t" + self.stopword_file
        + "\nnum_grams\t" + str(self.num_grams)
        + "\n$num_docs$\t" + str(len(TLG_files))
    )
    # Build the text first and write inside `with`, so the file is always
    # closed and is not truncated when building the text fails.
    with open("curr_test.txt", 'w') as f:
        f.write(settings)

    setup(TLG_files)
    print("going to master")
    os.system('./master.sh')

    # Fixed: the compile_idf_dict versions in this file store the merged
    # dictionary on self.idf_dict and return the number of files, so
    # assigning the return value replaced the dictionary with an integer.
    # Only take the return value when it actually is a dictionary.
    compiled = self.compile_idf_dict()
    if isinstance(compiled, dict):
        self.idf_dict = compiled

    return doc_filename
def __init__(self, file_path, subcorpus, num_grams, variant_word_order, spread, stopword_file=None):
    """Store the run settings and make sure the idf artefacts exist.

    Args:
        file_path: stored unchanged as ``self.file_path``.
        subcorpus: key into the dictionary loaded from ``../ref_file.txt``;
            also used verbatim in the idf dictionary file name.
        num_grams: ngram size, used in the directory and file names.
        variant_word_order: stored as ``self.variant``.
        spread: stored as ``self.spread``.
        stopword_file: optional stopword file; ``self.stopwords`` is True
            exactly when this value is truthy.

    Side effects:
        Creates ``self.idf_files_path`` if it does not exist and calls
        ``self.create_idf_dictionary()`` when ``self.idf_dict_file`` is
        missing on disk.
    """
    self.subcorpus = subcorpus
    self.num_grams = num_grams
    self.variant = variant_word_order
    self.spread = spread
    self.stopword_file = stopword_file
    self.stopwords = bool(stopword_file)
    self.ref_dict = file_dict("../ref_file.txt")
    self.num_docs = len(self.ref_dict[self.subcorpus])
    self.file_path = file_path

    self.idf_files_path = (
        "../idf_files/"
        + str(self.num_grams)
        + "grams/"
        + str(self.spread)
        + "_v"
        + str(self.variant)
        + "_sw"
        + str(self.stopwords)
        + "/"
    )
    if not os.path.exists(self.idf_files_path):
        os.makedirs(self.idf_files_path)

    self.idf_dict_file = (
        "../"
        + str(self.num_grams)
        + "grams/"
        + self.subcorpus
        + "_v"
        + str(self.variant)
        + "_sw"
        + str(self.stopwords)
        + "_s"
        + str(self.spread)
        + ".txt"
    )
    # Fixed: the check used the undefined local `idf_dict_file`, which raised
    # NameError on every construction.
    if not os.path.exists(self.idf_dict_file):
        self.create_idf_dictionary()
def create_stopword_file(self, subcorpora, stopword_percentage_threshold):
    """Create a stopword file for the subcorpus and return its file name.

    Every file listed for ``self.subcorpora`` in ``../ref_file.txt`` is fed
    to a fresh ``tfidf.TfIdf`` object, which is then asked to save its
    corpus to ``../1grams/<subcorpora>.txt`` and the stopwords to
    ``../stopwords/<subcorpora>_<threshold>.txt``.

    NOTE(review): the ``subcorpora`` argument is never read; the method
    uses ``self.subcorpora`` throughout -- confirm this is intended.

    Returns:
        The path of the stopword file.
    """
    scorer = tfidf.TfIdf()
    reference = file_dict('../ref_file.txt')
    text_dir = '../stripped_text/'

    for filename in reference[self.subcorpora]:
        print(filename)
        scorer.add_input_document(text_dir + filename)

    # NOTE(review): the original indentation was lost; printing the
    # threshold once after the loop is an assumption -- confirm.
    print(str(stopword_percentage_threshold))

    corpus_path = "../1grams/" + self.subcorpora + ".txt"
    stopword_path = ("../stopwords/" + self.subcorpora + "_"
                     + str(stopword_percentage_threshold) + ".txt")
    scorer.save_corpus_to_file(corpus_path, stopword_path,
                               stopword_percentage_threshold)
    return stopword_path  # returns stopword filename
def create_stopword_file(subcorpus, stopword_percentage_threshold, path_to_files, stopword_filename="stopwords.txt"):
    """Create a stopword file and return the accumulated stopword mapping.

    The files listed for ``subcorpus`` in ``../test_ref.txt`` are passed one
    by one (prefixed with ``path_to_files``) to ``add_file_to_stopwords``,
    whose result is carried over to the next call.  The final mapping is
    handed to ``save_file`` together with the number of files and the
    threshold.

    Returns:
        The object produced by the last ``add_file_to_stopwords`` call (an
        empty dict if the subcorpus lists no files) -- presumably a
        {word: occurrence} mapping; verify against that helper.
    """
    # the reference file maps authors / subcorpora to their TLG file names
    reference = file_dict('../test_ref.txt')
    members = reference[subcorpus]

    # presumably {word --> occurrence}; filled in by add_file_to_stopwords
    stopwords = {}
    for member in members:
        stopwords = add_file_to_stopwords(path_to_files + member, stopwords)

    save_file(stopword_filename, stopwords, len(members),
              stopword_percentage_threshold)
    # Same output as the Python 2 statement
    # `print "created stopword file: ", stopword_filename`.
    print("%s %s" % ("created stopword file: ", stopword_filename))
    return stopwords
def compile_idf_dict(self):
    """Merge the per-author idf files of the subcorpus into one dictionary.

    The files listed for ``self.subcorpora`` in ``../ref_file.txt`` are read
    from the ``../TLG_idf_files/`` sub-directory selected by ``num_grams``,
    ``spread``, ``vary_defn`` and ``stopwords``.  The first line of each file
    is skipped; on every other line the integer after the last tab is added
    to the running total kept for the text before that tab.

    The merged dictionary is stored in ``self.idf_dict``.

    Returns:
        The number of files listed for the subcorpus (NOT the dictionary).

    Raises:
        IOError: if one of the per-author files cannot be opened.
        ValueError: if the text after the last tab is not an integer.
    """
    print("compiling idf dictionary from individual author files")
    idf_dict = {}
    path = ('../TLG_idf_files/' + str(self.num_grams) + 'grams/'
            + str(self.spread) + "_" + "v" + str(self.vary_defn)
            + "_" + "sw" + str(self.stopwords) + "/")
    filedict = file_dict('../ref_file.txt')
    files = filedict[self.subcorpora]
    for infile in files:
        # `with` closes the handle; previously one handle leaked per file.
        with open(path + infile, 'r') as idf_file:
            idf_file.readline()  # first line is number of docs in corpus
            for line in idf_file:
                # Split on the LAST tab so tabs inside the term survive.
                term, _, count = line.rpartition("\t")
                term = term.strip()
                idf_dict[term] = idf_dict.get(term, 0) + int(count.strip())
    self.idf_dict = idf_dict
    return len(files)
def compile_idf_dict(self):
    """Merge the per-author idf files of the subcorpus into one dictionary.

    The files listed for ``self.subcorpora`` in ``../ref_file.txt`` are read
    from the ``../TLG_idf_files/`` sub-directory selected by ``num_grams``,
    ``spread``, ``vary_defn`` and ``stopwords``.  The first line of each file
    is skipped; on every other line the integer after the last tab is added
    to the running total kept for the text before that tab.

    The merged dictionary is stored in ``self.idf_dict``.

    Returns:
        The number of files listed for the subcorpus (NOT the dictionary).

    Raises:
        IOError: if one of the per-author files cannot be opened.
        ValueError: if the text after the last tab is not an integer.
    """
    print("compiling idf dictionary from individual author files")
    idf_dict = {}
    path = (
        "../TLG_idf_files/"
        + str(self.num_grams)
        + "grams/"
        + str(self.spread)
        + "_"
        + "v"
        + str(self.vary_defn)
        + "_"
        + "sw"
        + str(self.stopwords)
        + "/"
    )
    quarterHours = 0
    filedict = file_dict("../ref_file.txt")
    files = filedict[self.subcorpora]
    for infile in files:
        # Same output as the Python 2 statement `print "now on: ", infile`.
        print("%s %s" % ("now on: ", infile))
        # `with` closes the handle; previously one handle leaked per file.
        with open(path + infile, "r") as idf_file:
            idf_file.readline()  # first line is number of docs in corpus
            for line in idf_file:
                # NOTE(review): `rpartition` is a free function defined
                # outside this block; presumably it mirrors str.rpartition
                # and returns (head, sep, tail) -- confirm.
                tokens = rpartition(line, "\t")
                term = tokens[0].strip()
                frequency = int(tokens[2].strip())
                idf_dict[term] = idf_dict.get(term, 0) + frequency
        # NOTE(review): the original indentation was lost; running the time
        # check once per file is an assumption -- confirm.
        quarterHours = self.check_time(quarterHours)
    self.idf_dict = idf_dict
    print(time.clock())
    return len(files)
import os
import sys

from helper_functions import file_dict

# Expect exactly six arguments after the program name: two authors, a
# subcorpora, an integer spread, num_grams and the variant-word-order flag.
if len(sys.argv) != 7:
    # stop the program and print an error message
    sys.exit("Must provide two authors, a subcorpora, an integer spread, num_grams, and variant word order as True or False")

# Record the six arguments, one per line (no trailing newline).  `with`
# guarantees the file is flushed and closed; it was left open before.
with open("curr_test.txt", "w") as f:
    f.write("\n".join(sys.argv[1:7]))

filedict = file_dict('../ref_file.txt')
TLG_files = filedict[sys.argv[3]]

# delete old contents of temp_in folder (best effort: report and continue)
folder = "../temp_in"
for entry in os.listdir(folder):
    file_path = os.path.join(folder, entry)
    try:
        os.unlink(file_path)
    except OSError as e:
        print(e)

# create symbolic links to TLG_files in temp_in, named file1, file2, ...
for i, tlg_file in enumerate(TLG_files, 1):
    # Fixed: the target was built as "../stripped_text" + name, without the
    # path separator, so every link pointed at a non-existent sibling path.
    # A relative target is resolved from the link's own directory
    # (../temp_in), so "../stripped_text/<name>" reaches the text folder.
    os.symlink("../stripped_text/" + tlg_file, folder + "/file" + str(i))
import os
import sys

from helper_functions import file_dict

# Expect exactly six arguments after the program name: two authors, a
# subcorpora, an integer spread, num_grams and the variant-word-order flag.
if len(sys.argv) != 7:
    # stop the program and print an error message
    sys.exit(
        "Must provide two authors, a subcorpora, an integer spread, num_grams, and variant word order as True or False"
    )

# Record the six arguments, one per line (no trailing newline).  `with`
# guarantees the file is flushed and closed; it was left open before.
with open("curr_test.txt", "w") as f:
    f.write("\n".join(sys.argv[1:7]))

filedict = file_dict('../ref_file.txt')
TLG_files = filedict[sys.argv[3]]

# delete old contents of temp_in folder (best effort: report and continue)
folder = "../temp_in"
for entry in os.listdir(folder):
    file_path = os.path.join(folder, entry)
    try:
        os.unlink(file_path)
    except OSError as e:
        print(e)

# create symbolic links to TLG_files in temp_in, named file1, file2, ...
for i, tlg_file in enumerate(TLG_files, 1):
    # Fixed: the target was built as "../stripped_text" + name, without the
    # path separator, so every link pointed at a non-existent sibling path.
    # A relative target is resolved from the link's own directory
    # (../temp_in), so "../stripped_text/<name>" reaches the text folder.
    os.symlink("../stripped_text/" + tlg_file, folder + "/file" + str(i))