Пример #1
0
    def compile_idf_dict(self):

        print "compiling idf dictionary from individual author files"
        idf_dict = {}

        path = '../TLG_idf_files/' + str(self.num_grams) + 'grams/' + str(
            self.spread) + "_" + "v" + str(self.vary_defn) + "_" + "sw" + str(
                self.stopwords) + "/"

        quarterHours = 0

        filedict = file_dict('../ref_file.txt')
        files = filedict[self.subcorpora]
        for infile in files:
            print "now on: ", infile
            file = open(path + infile, 'r')
            num_docs = file.readline()  #first line is number of docs in corpus
            for line in file:
                tokens = rpartition(line, "\t")
                term = tokens[0].strip()
                frequency = int(tokens[2].strip())
                if term in idf_dict:
                    idf_dict[term] += frequency
                else:
                    idf_dict[term] = frequency
            quarterHours = self.check_time(quarterHours)
        self.idf_dict = idf_dict
        print time.clock()
        return len(files)
Пример #2
0
    def __init__(self,
                 file_path,
                 subcorpus,
                 num_grams,
                 variant_word_order,
                 spread,
                 stopword_file=None):
        """Store the run settings and make sure the idf dictionary file exists.

        Args:
            file_path: stored unchanged on self.file_path.
            subcorpus: key into the reference dictionary loaded from
                '../ref_file.txt'; also part of the idf dictionary filename.
            num_grams: n-gram size; used in both generated paths.
            variant_word_order: stored as self.variant; used in both paths.
            spread: used in both generated paths.
            stopword_file: optional stopword file. self.stopwords is True
                exactly when a truthy value is given.

        Side effects:
            Creates the directory self.idf_files_path when it is missing and
            calls self.create_idf_dictionary() when self.idf_dict_file does
            not exist yet.
        """
        self.subcorpus = subcorpus
        self.num_grams = num_grams
        self.variant = variant_word_order
        self.spread = spread
        self.stopword_file = stopword_file
        self.stopwords = bool(stopword_file)

        self.ref_dict = file_dict('../ref_file.txt')
        self.num_docs = len(self.ref_dict[self.subcorpus])
        self.file_path = file_path

        self.idf_files_path = "../idf_files/" + str(self.num_grams) + 'grams/' + \
            str(self.spread) + "_v" + str(self.variant) + "_sw" + str(self.stopwords) + "/"

        if not os.path.exists(self.idf_files_path):
            os.makedirs(self.idf_files_path)

        self.idf_dict_file = "../" + str(self.num_grams) + "grams/" \
            + self.subcorpus + "_v" + str(self.variant) + "_sw"\
            + str(self.stopwords) + "_s" + str(self.spread) + ".txt"

        # Must test the attribute: a bare `idf_dict_file` is not defined in
        # this scope and raised NameError on every construction.
        if not os.path.exists(self.idf_dict_file):
            self.create_idf_dictionary()
Пример #3
0
    def tf_idf(self, document):
        """Returns the tf-idf score for ngrams for the document. The num_grams specifies the level of grams:
        words, bigrams, trigrams, etc. The function creates an idf corpus for the subcorpora. If the document
        is not in the subcorpora, it is also added to the idf corpus, so that each word appears in at least
        one document.
        Stopwords are optional, but if True, they are created based on the stopword_percentage_threshold
        parameter."""

        path = "../stripped_text/"

        idf_path = (
            "../"
            + str(self.num_grams)
            + "grams/"
            + self.subcorpora
            + "_v_"
            + str(self.vary_defn)
            + "_sw_"
            + str(self.stopwords)
            + "_s_"
            + str(self.spread)
            + ".txt"
        )

        if os.path.exists(idf_path):
            # idf corpus already exists
            print "idf corpus exists."

            # create tfidf object with existing corpus
            _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, idf_path, self.idf_dict, self.stopword_file)

            # determine document TLG#### filename:
            filedict = file_dict("../ref_file.txt")
            docFilename = filedict[document][0]

        else:  # idf corpus not yet in existence
            print "creating idf corpus."
            docFilename = self.add_docs(document)
            _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, None, self.idf_dict, self.stopword_file)

        # actually determine tf-idf score for ngrams in document:
        tfidf_list = _tfidf.get_doc_keywords(path + docFilename, self.num_grams)

        # print tfidf scores to .txt and .csv:
        print "print tfidf scores to .txt and .csv"
        if document == self.doc1:
            print "printing doc1 to ", self.doc1_tfidf_file
            print_to_file_by_ngram(tfidf_list, self.doc1_tfidf_file)
            print_to_csv_file(self.doc1_tfidf_file, self.num_grams)
        elif document == self.doc2:
            print "printing doc2 to ", self.doc2_tfidf_file
            print_to_file_by_ngram(tfidf_list, self.doc2_tfidf_file)
            print_to_csv_file(self.doc2_tfidf_file, self.num_grams)

        ##this should only happen if it's not already saved
        # save idf corpus for later use:
        #        _tfidf.save_corpus_to_file(idf_path)

        return tfidf_list
Пример #4
0
    def add_docs(self, document):
        """adds documents to idf corpus. returns the name of the file for document
        whose ngram tf-idf scores are to be determined."""

        filedict = file_dict("../ref_file.txt")
        path = "../stripped_text/"

        TLG_files = filedict[self.subcorpora]

        if filedict[document][0] not in TLG_files:
            TLG_files.append(filedict[document][0])

        f = open("curr_test.txt", "w")
        f.write(
            "spread\t"
            + str(self.spread)
            + "\nvariant\t"
            + str(self.vary_defn)
            + "\nstopwords\t"
            + str(self.stopwords)
            + "\nstopword_file\t"
            + self.stopword_file
            + "\nnum_grams\t"
            + str(self.num_grams)
            + "\n$num_docs$\t"
            + str(len(TLG_files))
        )
        f.close()
        setup(TLG_files)
        print "going to master"
        os.system("./master.sh")

        #        file_path = '../TLG_idf_files/' + str(self.spread) + "_" + "v" + str(self.vary_defn) + "_" + "sw" + str(self.stopwords) + "/"
        #        num_files = len(os.listdir(file_path))

        #        while len(TLG_files) != num_files:
        #            time.sleep(60)
        #            num_files = len(os.listdir(file_path))

        self.idf_dict = self.compile_idf_dict()
        # compile_TLG_idfs()
        # Change the tfidf() function that calls this one, so that it then uses the created dict for its idf dict.

        #        print "docs: ", tfidf_instance.get_num_docs()
        #        for filename in filedict[self.subcorpora]:
        #            print "adding ", filename
        #            tfidf_instance.add_input_document(path + filename, self.num_grams)
        #            print "docs: ", tfidf_instance.get_num_docs()

        # Check that document is also in the idf corpus
        #        if filedict[document][0] not in filedict[self.subcorpora]:
        #            print "document not in subcorpora."
        #            print "adding it"
        #            tfidf_instance.add_input_document(path + filedict[document][0], self.num_grams)
        #            print "docs: ", tfidf_instance.get_num_docs()

        return filedict[document][0]
Пример #5
0
    def tf_idf(self, document):
        """Returns the tf-idf score for ngrams for the document. The num_grams specifies the level of grams:
        words, bigrams, trigrams, etc. The function creates an idf corpus for the subcorpora. If the document
        is not in the subcorpora, it is also added to the idf corpus, so that each word appears in at least
        one document.
        Stopwords are optional, but if True, they are created based on the stopword_percentage_threshold
        parameter."""

        path = '../stripped_text/'

        idf_path = "../" + str(
            self.num_grams) + "grams/" + self.subcorpora + "_v_" + str(
                self.vary_defn) + "_sw_" + str(self.stopwords) + "_s_" + str(
                    self.spread) + ".txt"

        if os.path.exists(idf_path):
            #idf corpus already exists
            print "idf corpus exists."

            #create tfidf object with existing corpus
            _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, idf_path,
                                 self.idf_dict, self.stopword_file)

            #determine document TLG#### filename:
            filedict = file_dict('../ref_file.txt')
            docFilename = filedict[document][0]

        else:  #idf corpus not yet in existence
            print "creating idf corpus."
            docFilename = self.add_docs(document)
            _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, None,
                                 self.idf_dict, self.stopword_file)

        #actually determine tf-idf score for ngrams in document:
        tfidf_list = _tfidf.get_doc_keywords(path + docFilename,
                                             self.num_grams)

        #print tfidf scores to .txt and .csv:
        print "print tfidf scores to .txt and .csv"
        if document == self.doc1:
            print "printing doc1 to ", self.doc1_tfidf_file
            print_to_file_by_ngram(tfidf_list, self.doc1_tfidf_file)
            print_to_csv_file(self.doc1_tfidf_file, self.num_grams)
        elif document == self.doc2:
            print "printing doc2 to ", self.doc2_tfidf_file
            print_to_file_by_ngram(tfidf_list, self.doc2_tfidf_file)
            print_to_csv_file(self.doc2_tfidf_file, self.num_grams)

##this should only happen if it's not already saved
#save idf corpus for later use:
#        _tfidf.save_corpus_to_file(idf_path)

        return tfidf_list
Пример #6
0
    def create_stopword_file(self,subcorpora, stopword_percentage_threshold):
        """Creates a stopword file. Returns stopword filename."""

        _tfidf = tfidf.TfIdf()
        filedict = file_dict('../ref_file.txt')
        path = '../stripped_text/'
        for filename in filedict[self.subcorpora]:
            print filename
            _tfidf.add_input_document(path + filename)
        print str(stopword_percentage_threshold)
        _tfidf.save_corpus_to_file("../1grams/" + self.subcorpora + ".txt",
                               "../stopwords/" + self.subcorpora + "_" + str(stopword_percentage_threshold) +  ".txt",
                               stopword_percentage_threshold)

        return "../stopwords/" + self.subcorpora + "_" + str(stopword_percentage_threshold) +  ".txt" #returns stopword filename
Пример #7
0
    def add_docs(self, document):
        """adds documents to idf corpus. returns the name of the file for document
        whose ngram tf-idf scores are to be determined."""

        filedict = file_dict('../ref_file.txt')
        path = '../stripped_text/'

        TLG_files = filedict[self.subcorpora]

        if filedict[document][0] not in TLG_files:
            TLG_files.append(filedict[document][0])

        f = open("curr_test.txt", 'w')
        f.write("spread\t" + str(self.spread) + "\nvariant\t" +
                str(self.vary_defn) + "\nstopwords\t" + str(self.stopwords) +
                "\nstopword_file\t" + self.stopword_file + "\nnum_grams\t" +
                str(self.num_grams) + "\n$num_docs$\t" + str(len(TLG_files)))
        f.close()
        setup(TLG_files)
        print "going to master"
        os.system('./master.sh')

        #        file_path = '../TLG_idf_files/' + str(self.spread) + "_" + "v" + str(self.vary_defn) + "_" + "sw" + str(self.stopwords) + "/"
        #        num_files = len(os.listdir(file_path))

        #        while len(TLG_files) != num_files:
        #            time.sleep(60)
        #            num_files = len(os.listdir(file_path))

        self.idf_dict = self.compile_idf_dict()
        #compile_TLG_idfs()
        #Change the tfidf() function that calls this one, so that it then uses the created dict for its idf dict.

        #        print "docs: ", tfidf_instance.get_num_docs()
        #        for filename in filedict[self.subcorpora]:
        #            print "adding ", filename
        #            tfidf_instance.add_input_document(path + filename, self.num_grams)
        #            print "docs: ", tfidf_instance.get_num_docs()

        #Check that document is also in the idf corpus
        #        if filedict[document][0] not in filedict[self.subcorpora]:
        #            print "document not in subcorpora."
        #            print "adding it"
        #            tfidf_instance.add_input_document(path + filedict[document][0], self.num_grams)
        #            print "docs: ", tfidf_instance.get_num_docs()

        return filedict[document][0]
Пример #8
0
    def __init__(self, file_path, subcorpus, num_grams, variant_word_order, spread, stopword_file=None):
        """Store the run settings and make sure the idf dictionary file exists.

        Args:
            file_path: stored unchanged on self.file_path.
            subcorpus: key into the reference dictionary loaded from
                '../ref_file.txt'; also part of the idf dictionary filename.
            num_grams: n-gram size; used in both generated paths.
            variant_word_order: stored as self.variant; used in both paths.
            spread: used in both generated paths.
            stopword_file: optional stopword file. self.stopwords is True
                exactly when a truthy value is given.

        Side effects:
            Creates the directory self.idf_files_path when it is missing and
            calls self.create_idf_dictionary() when self.idf_dict_file does
            not exist yet.
        """
        self.subcorpus = subcorpus
        self.num_grams = num_grams
        self.variant = variant_word_order
        self.spread = spread
        self.stopword_file = stopword_file
        self.stopwords = bool(stopword_file)

        self.ref_dict = file_dict("../ref_file.txt")
        self.num_docs = len(self.ref_dict[self.subcorpus])
        self.file_path = file_path

        self.idf_files_path = (
            "../idf_files/"
            + str(self.num_grams)
            + "grams/"
            + str(self.spread)
            + "_v"
            + str(self.variant)
            + "_sw"
            + str(self.stopwords)
            + "/"
        )

        if not os.path.exists(self.idf_files_path):
            os.makedirs(self.idf_files_path)

        self.idf_dict_file = (
            "../"
            + str(self.num_grams)
            + "grams/"
            + self.subcorpus
            + "_v"
            + str(self.variant)
            + "_sw"
            + str(self.stopwords)
            + "_s"
            + str(self.spread)
            + ".txt"
        )

        # Must test the attribute: a bare `idf_dict_file` is not defined in
        # this scope and raised NameError on every construction.
        if not os.path.exists(self.idf_dict_file):
            self.create_idf_dictionary()
Пример #9
0
    def create_stopword_file(self, subcorpora, stopword_percentage_threshold):
        """Creates a stopword file. Returns stopword filename."""

        _tfidf = tfidf.TfIdf()
        filedict = file_dict('../ref_file.txt')
        path = '../stripped_text/'
        for filename in filedict[self.subcorpora]:
            print filename
            _tfidf.add_input_document(path + filename)
        print str(stopword_percentage_threshold)
        _tfidf.save_corpus_to_file(
            "../1grams/" + self.subcorpora + ".txt",
            "../stopwords/" + self.subcorpora + "_" +
            str(stopword_percentage_threshold) + ".txt",
            stopword_percentage_threshold)

        return "../stopwords/" + self.subcorpora + "_" + str(
            stopword_percentage_threshold) + ".txt"  #returns stopword filename
Пример #10
0
def create_stopword_file(subcorpus, stopword_percentage_threshold, path_to_files, stopword_filename="stopwords.txt"):
    """Creates a stopword file. Returns stopword list"""

    #ref_file.txt contains the names and corresponding TLG files for authors and subcorpora
    filedict = file_dict('../test_ref.txt')

    # stores {word --> occurence} 
    stopwords = {}
        
    for filename in filedict[subcorpus]:
        stopwords = add_file_to_stopwords(path_to_files + filename, stopwords)

    num_docs = len(filedict[subcorpus])

    save_file(stopword_filename, stopwords, num_docs, stopword_percentage_threshold)

    print "created stopword file: ", stopword_filename 

    return stopwords
Пример #11
0
    def compile_idf_dict(self):
        print "compiling idf dictionary from individual author files"
        idf_dict = {}
        
        path = '../TLG_idf_files/' + str(self.num_grams) + 'grams/' + str(self.spread) + "_" + "v" + str(self.vary_defn) + "_" + "sw" + str(self.stopwords) + "/"

        filedict = file_dict('../ref_file.txt')        
        files = filedict[self.subcorpora]
        for infile in files:
            file = open(path + infile, 'r')
            num_docs = file.readline() #first line is number of docs in corpus
            for line in file:
                tokens = line.rpartition("\t")
                term = tokens[0].strip()
                frequency = int(tokens[2].strip())
                if term in idf_dict:
                    idf_dict[term] += frequency
                else:
                    idf_dict[term] = frequency
        self.idf_dict = idf_dict
        return len(files)
Пример #12
0
    def compile_idf_dict(self):

        print "compiling idf dictionary from individual author files"
        idf_dict = {}

        path = (
            "../TLG_idf_files/"
            + str(self.num_grams)
            + "grams/"
            + str(self.spread)
            + "_"
            + "v"
            + str(self.vary_defn)
            + "_"
            + "sw"
            + str(self.stopwords)
            + "/"
        )

        quarterHours = 0

        filedict = file_dict("../ref_file.txt")
        files = filedict[self.subcorpora]
        for infile in files:
            print "now on: ", infile
            file = open(path + infile, "r")
            num_docs = file.readline()  # first line is number of docs in corpus
            for line in file:
                tokens = rpartition(line, "\t")
                term = tokens[0].strip()
                frequency = int(tokens[2].strip())
                if term in idf_dict:
                    idf_dict[term] += frequency
                else:
                    idf_dict[term] = frequency
            quarterHours = self.check_time(quarterHours)
        self.idf_dict = idf_dict
        print time.clock()
        return len(files)
Пример #13
0
def create_stopword_file(subcorpus,
                         stopword_percentage_threshold,
                         path_to_files,
                         stopword_filename="stopwords.txt"):
    """Creates a stopword file. Returns stopword list"""

    #ref_file.txt contains the names and corresponding TLG files for authors and subcorpora
    filedict = file_dict('../test_ref.txt')

    # stores {word --> occurence}
    stopwords = {}

    for filename in filedict[subcorpus]:
        stopwords = add_file_to_stopwords(path_to_files + filename, stopwords)

    num_docs = len(filedict[subcorpus])

    save_file(stopword_filename, stopwords, num_docs,
              stopword_percentage_threshold)

    print "created stopword file: ", stopword_filename

    return stopwords
Пример #14
0
import os, sys
from helper_functions import file_dict

if len(sys.argv) != 7:  # the program name and the two arguments
    # stop the program and print an error message
    sys.exit("Must provide two authors, a subcorpora, an integer spread, num_grams, and variant word order as True or False")

f = open("curr_test.txt", "w")
f.write(sys.argv[1] + "\n" + sys.argv[2] + "\n" + sys.argv[3] + "\n" + str(sys.argv[4]) + "\n" + str(sys.argv[5]) + "\n" + sys.argv[6])

filedict = file_dict('../ref_file.txt')
TLG_files = filedict[sys.argv[3]]

#delete old contents of temp_in folder
folder = "../temp_in"
for file in os.listdir(folder):
    file_path = os.path.join(folder, file)
    try:
        os.unlink(file_path)
    except Exception, e:
        print e

#create symbolic links to TLG_files in temp_in
for i in range(len(TLG_files)):
    os.symlink("../stripped_text" + TLG_files[i], folder + "/file" + str(i+1))
Пример #15
0
import os, sys
from helper_functions import file_dict

if len(sys.argv) != 7:  # the program name and the two arguments
    # stop the program and print an error message
    sys.exit(
        "Must provide two authors, a subcorpora, an integer spread, num_grams, and variant word order as True or False"
    )

f = open("curr_test.txt", "w")
f.write(sys.argv[1] + "\n" + sys.argv[2] + "\n" + sys.argv[3] + "\n" +
        str(sys.argv[4]) + "\n" + str(sys.argv[5]) + "\n" + sys.argv[6])

filedict = file_dict('../ref_file.txt')
TLG_files = filedict[sys.argv[3]]

#delete old contents of temp_in folder
folder = "../temp_in"
for file in os.listdir(folder):
    file_path = os.path.join(folder, file)
    try:
        os.unlink(file_path)
    except Exception, e:
        print e

#create symbolic links to TLG_files in temp_in
for i in range(len(TLG_files)):
    os.symlink("../stripped_text" + TLG_files[i],
               folder + "/file" + str(i + 1))