Python file_dict示例，helper_functions.file_dict Python示例

示例#1

0

显示文件

    def compile_idf_dict(self):

        print "compiling idf dictionary from individual author files"
        idf_dict = {}

        path = '../TLG_idf_files/' + str(self.num_grams) + 'grams/' + str(
            self.spread) + "_" + "v" + str(self.vary_defn) + "_" + "sw" + str(
                self.stopwords) + "/"

        quarterHours = 0

        filedict = file_dict('../ref_file.txt')
        files = filedict[self.subcorpora]
        for infile in files:
            print "now on: ", infile
            file = open(path + infile, 'r')
            num_docs = file.readline()  #first line is number of docs in corpus
            for line in file:
                tokens = rpartition(line, "\t")
                term = tokens[0].strip()
                frequency = int(tokens[2].strip())
                if term in idf_dict:
                    idf_dict[term] += frequency
                else:
                    idf_dict[term] = frequency
            quarterHours = self.check_time(quarterHours)
        self.idf_dict = idf_dict
        print time.clock()
        return len(files)

示例#2

0

显示文件

文件： idf.py 项目： hoguer/SACTAG

    def __init__(self,
                 file_path,
                 subcorpus,
                 num_grams,
                 variant_word_order,
                 spread,
                 stopword_file=None):
        """Store the idf settings and build the idf dictionary if it is missing.

        Args:
            file_path: stored unchanged on ``self.file_path``.
            subcorpus: key into the dictionary loaded from
                ``../ref_file.txt``; also part of the dictionary file name.
            num_grams: ngram size; used in the directory and file names.
            variant_word_order: stored as ``self.variant`` and used in the
                directory and file names.
            spread: used in the directory and file names.
            stopword_file: optional stopword file; ``self.stopwords`` is True
                exactly when a truthy value is given.

        Side effects:
            Creates the ``../idf_files/...`` directory when it does not exist
            and calls ``self.create_idf_dictionary()`` when the idf dictionary
            file is not present on disk.
        """
        self.subcorpus = subcorpus
        self.num_grams = num_grams
        self.variant = variant_word_order
        self.spread = spread
        self.stopword_file = stopword_file
        self.stopwords = bool(stopword_file)

        self.ref_dict = file_dict('../ref_file.txt')
        self.num_docs = len(self.ref_dict[self.subcorpus])
        self.file_path = file_path

        self.idf_files_path = "../idf_files/" + str(self.num_grams) + 'grams/' + \
            str(self.spread) + "_v" + str(self.variant) + "_sw" + str(self.stopwords) + "/"

        if not os.path.exists(self.idf_files_path):
            os.makedirs(self.idf_files_path)

        self.idf_dict_file = "../" + str(self.num_grams) + "grams/" \
            + self.subcorpus + "_v" + str(self.variant) + "_sw"\
            + str(self.stopwords) + "_s" + str(self.spread) + ".txt"

        # The attribute must be used here: the bare name ``idf_dict_file`` is
        # not defined in this scope and raised NameError.
        if not os.path.exists(self.idf_dict_file):
            self.create_idf_dictionary()

示例#3

0

显示文件

文件： primary.py 项目： hoguer/SACTAG

    def tf_idf(self, document):
        """Returns the tf-idf score for ngrams for the document. The num_grams specifies the level of grams:
        words, bigrams, trigrams, etc. The function creates an idf corpus for the subcorpora. If the document
        is not in the subcorpora, it is also added to the idf corpus, so that each word appears in at least
        one document.
        Stopwords are optional, but if True, they are created based on the stopword_percentage_threshold
        parameter."""

        path = "../stripped_text/"

        idf_path = (
            "../"
            + str(self.num_grams)
            + "grams/"
            + self.subcorpora
            + "_v_"
            + str(self.vary_defn)
            + "_sw_"
            + str(self.stopwords)
            + "_s_"
            + str(self.spread)
            + ".txt"
        )

        if os.path.exists(idf_path):
            # idf corpus already exists
            print "idf corpus exists."

            # create tfidf object with existing corpus
            _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, idf_path, self.idf_dict, self.stopword_file)

            # determine document TLG#### filename:
            filedict = file_dict("../ref_file.txt")
            docFilename = filedict[document][0]

        else:  # idf corpus not yet in existence
            print "creating idf corpus."
            docFilename = self.add_docs(document)
            _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, None, self.idf_dict, self.stopword_file)

        # actually determine tf-idf score for ngrams in document:
        tfidf_list = _tfidf.get_doc_keywords(path + docFilename, self.num_grams)

        # print tfidf scores to .txt and .csv:
        print "print tfidf scores to .txt and .csv"
        if document == self.doc1:
            print "printing doc1 to ", self.doc1_tfidf_file
            print_to_file_by_ngram(tfidf_list, self.doc1_tfidf_file)
            print_to_csv_file(self.doc1_tfidf_file, self.num_grams)
        elif document == self.doc2:
            print "printing doc2 to ", self.doc2_tfidf_file
            print_to_file_by_ngram(tfidf_list, self.doc2_tfidf_file)
            print_to_csv_file(self.doc2_tfidf_file, self.num_grams)

        ##this should only happen if it's not already saved
        # save idf corpus for later use:
        #        _tfidf.save_corpus_to_file(idf_path)

        return tfidf_list

示例#4

0

显示文件

文件： primary.py 项目： hoguer/SACTAG

    def add_docs(self, document):
        """adds documents to idf corpus. returns the name of the file for document
        whose ngram tf-idf scores are to be determined."""

        filedict = file_dict("../ref_file.txt")
        path = "../stripped_text/"

        TLG_files = filedict[self.subcorpora]

        if filedict[document][0] not in TLG_files:
            TLG_files.append(filedict[document][0])

        f = open("curr_test.txt", "w")
        f.write(
            "spread\t"
            + str(self.spread)
            + "\nvariant\t"
            + str(self.vary_defn)
            + "\nstopwords\t"
            + str(self.stopwords)
            + "\nstopword_file\t"
            + self.stopword_file
            + "\nnum_grams\t"
            + str(self.num_grams)
            + "\n$num_docs$\t"
            + str(len(TLG_files))
        )
        f.close()
        setup(TLG_files)
        print "going to master"
        os.system("./master.sh")

        #        file_path = '../TLG_idf_files/' + str(self.spread) + "_" + "v" + str(self.vary_defn) + "_" + "sw" + str(self.stopwords) + "/"
        #        num_files = len(os.listdir(file_path))

        #        while len(TLG_files) != num_files:
        #            time.sleep(60)
        #            num_files = len(os.listdir(file_path))

        self.idf_dict = self.compile_idf_dict()
        # compile_TLG_idfs()
        # Change the tfidf() function that calls this one, so that it then uses the created dict for its idf dict.

        #        print "docs: ", tfidf_instance.get_num_docs()
        #        for filename in filedict[self.subcorpora]:
        #            print "adding ", filename
        #            tfidf_instance.add_input_document(path + filename, self.num_grams)
        #            print "docs: ", tfidf_instance.get_num_docs()

        # Check that document is also in the idf corpus
        #        if filedict[document][0] not in filedict[self.subcorpora]:
        #            print "document not in subcorpora."
        #            print "adding it"
        #            tfidf_instance.add_input_document(path + filedict[document][0], self.num_grams)
        #            print "docs: ", tfidf_instance.get_num_docs()

        return filedict[document][0]

示例#5

0

显示文件

    def tf_idf(self, document):
        """Returns the tf-idf score for ngrams for the document. The num_grams specifies the level of grams:
        words, bigrams, trigrams, etc. The function creates an idf corpus for the subcorpora. If the document
        is not in the subcorpora, it is also added to the idf corpus, so that each word appears in at least
        one document.
        Stopwords are optional, but if True, they are created based on the stopword_percentage_threshold
        parameter."""

        path = '../stripped_text/'

        idf_path = "../" + str(
            self.num_grams) + "grams/" + self.subcorpora + "_v_" + str(
                self.vary_defn) + "_sw_" + str(self.stopwords) + "_s_" + str(
                    self.spread) + ".txt"

        if os.path.exists(idf_path):
            #idf corpus already exists
            print "idf corpus exists."

            #create tfidf object with existing corpus
            _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, idf_path,
                                 self.idf_dict, self.stopword_file)

            #determine document TLG#### filename:
            filedict = file_dict('../ref_file.txt')
            docFilename = filedict[document][0]

        else:  #idf corpus not yet in existence
            print "creating idf corpus."
            docFilename = self.add_docs(document)
            _tfidf = tfidf.TfIdf(self.spread, self.vary_defn, None,
                                 self.idf_dict, self.stopword_file)

        #actually determine tf-idf score for ngrams in document:
        tfidf_list = _tfidf.get_doc_keywords(path + docFilename,
                                             self.num_grams)

        #print tfidf scores to .txt and .csv:
        print "print tfidf scores to .txt and .csv"
        if document == self.doc1:
            print "printing doc1 to ", self.doc1_tfidf_file
            print_to_file_by_ngram(tfidf_list, self.doc1_tfidf_file)
            print_to_csv_file(self.doc1_tfidf_file, self.num_grams)
        elif document == self.doc2:
            print "printing doc2 to ", self.doc2_tfidf_file
            print_to_file_by_ngram(tfidf_list, self.doc2_tfidf_file)
            print_to_csv_file(self.doc2_tfidf_file, self.num_grams)

##this should only happen if it's not already saved
#save idf corpus for later use:
#        _tfidf.save_corpus_to_file(idf_path)

        return tfidf_list

示例#6

0

显示文件

文件： primary_back.py 项目： hoguer/SACTAG

    def create_stopword_file(self,subcorpora, stopword_percentage_threshold):
        """Creates a stopword file. Returns stopword filename."""

        _tfidf = tfidf.TfIdf()
        filedict = file_dict('../ref_file.txt')
        path = '../stripped_text/'
        for filename in filedict[self.subcorpora]:
            print filename
            _tfidf.add_input_document(path + filename)
        print str(stopword_percentage_threshold)
        _tfidf.save_corpus_to_file("../1grams/" + self.subcorpora + ".txt",
                               "../stopwords/" + self.subcorpora + "_" + str(stopword_percentage_threshold) +  ".txt",
                               stopword_percentage_threshold)

        return "../stopwords/" + self.subcorpora + "_" + str(stopword_percentage_threshold) +  ".txt" #returns stopword filename

示例#7

0

显示文件

文件： primary_back.py 项目： hoguer/SACTAG

    def add_docs(self, document):
        """adds documents to idf corpus. returns the name of the file for document
        whose ngram tf-idf scores are to be determined."""

        filedict = file_dict('../ref_file.txt')
        path = '../stripped_text/'

        TLG_files = filedict[self.subcorpora]

        if filedict[document][0] not in TLG_files:
            TLG_files.append(filedict[document][0])

        f = open("curr_test.txt", 'w')
        f.write("spread\t" + str(self.spread) + "\nvariant\t" +
                str(self.vary_defn) + "\nstopwords\t" + str(self.stopwords) +
                "\nstopword_file\t" + self.stopword_file + "\nnum_grams\t" +
                str(self.num_grams) + "\n$num_docs$\t" + str(len(TLG_files)))
        f.close()
        setup(TLG_files)
        print "going to master"
        os.system('./master.sh')

        #        file_path = '../TLG_idf_files/' + str(self.spread) + "_" + "v" + str(self.vary_defn) + "_" + "sw" + str(self.stopwords) + "/"
        #        num_files = len(os.listdir(file_path))

        #        while len(TLG_files) != num_files:
        #            time.sleep(60)
        #            num_files = len(os.listdir(file_path))

        self.idf_dict = self.compile_idf_dict()
        #compile_TLG_idfs()
        #Change the tfidf() function that calls this one, so that it then uses the created dict for its idf dict.

        #        print "docs: ", tfidf_instance.get_num_docs()
        #        for filename in filedict[self.subcorpora]:
        #            print "adding ", filename
        #            tfidf_instance.add_input_document(path + filename, self.num_grams)
        #            print "docs: ", tfidf_instance.get_num_docs()

        #Check that document is also in the idf corpus
        #        if filedict[document][0] not in filedict[self.subcorpora]:
        #            print "document not in subcorpora."
        #            print "adding it"
        #            tfidf_instance.add_input_document(path + filedict[document][0], self.num_grams)
        #            print "docs: ", tfidf_instance.get_num_docs()

        return filedict[document][0]

示例#8

0

显示文件

文件： idf.py 项目： hoguer/SACTAG

    def __init__(self, file_path, subcorpus, num_grams, variant_word_order, spread, stopword_file=None):
        """Store the idf settings and build the idf dictionary if it is missing.

        Args:
            file_path: stored unchanged on ``self.file_path``.
            subcorpus: key into the dictionary loaded from
                ``../ref_file.txt``; also part of the dictionary file name.
            num_grams: ngram size; used in the directory and file names.
            variant_word_order: stored as ``self.variant`` and used in the
                directory and file names.
            spread: used in the directory and file names.
            stopword_file: optional stopword file; ``self.stopwords`` is True
                exactly when a truthy value is given.

        Side effects:
            Creates the ``../idf_files/...`` directory when it does not exist
            and calls ``self.create_idf_dictionary()`` when the idf dictionary
            file is not present on disk.
        """
        self.subcorpus = subcorpus
        self.num_grams = num_grams
        self.variant = variant_word_order
        self.spread = spread
        self.stopword_file = stopword_file
        self.stopwords = bool(stopword_file)

        self.ref_dict = file_dict("../ref_file.txt")
        self.num_docs = len(self.ref_dict[self.subcorpus])
        self.file_path = file_path

        self.idf_files_path = (
            "../idf_files/"
            + str(self.num_grams)
            + "grams/"
            + str(self.spread)
            + "_v"
            + str(self.variant)
            + "_sw"
            + str(self.stopwords)
            + "/"
        )

        if not os.path.exists(self.idf_files_path):
            os.makedirs(self.idf_files_path)

        self.idf_dict_file = (
            "../"
            + str(self.num_grams)
            + "grams/"
            + self.subcorpus
            + "_v"
            + str(self.variant)
            + "_sw"
            + str(self.stopwords)
            + "_s"
            + str(self.spread)
            + ".txt"
        )

        # The attribute must be used here: the bare name ``idf_dict_file`` is
        # not defined in this scope and raised NameError.
        if not os.path.exists(self.idf_dict_file):
            self.create_idf_dictionary()

示例#9

0

显示文件

文件： primary_back.py 项目： hoguer/SACTAG

    def create_stopword_file(self, subcorpora, stopword_percentage_threshold):
        """Creates a stopword file. Returns stopword filename."""

        _tfidf = tfidf.TfIdf()
        filedict = file_dict('../ref_file.txt')
        path = '../stripped_text/'
        for filename in filedict[self.subcorpora]:
            print filename
            _tfidf.add_input_document(path + filename)
        print str(stopword_percentage_threshold)
        _tfidf.save_corpus_to_file(
            "../1grams/" + self.subcorpora + ".txt",
            "../stopwords/" + self.subcorpora + "_" +
            str(stopword_percentage_threshold) + ".txt",
            stopword_percentage_threshold)

        return "../stopwords/" + self.subcorpora + "_" + str(
            stopword_percentage_threshold) + ".txt"  #returns stopword filename

示例#10

0

显示文件

文件： stopwords.py 项目： hoguer/SACTAG

def create_stopword_file(subcorpus, stopword_percentage_threshold, path_to_files, stopword_filename="stopwords.txt"):
    """Creates a stopword file. Returns stopword list"""

    #ref_file.txt contains the names and corresponding TLG files for authors and subcorpora
    filedict = file_dict('../test_ref.txt')

    # stores {word --> occurence} 
    stopwords = {}
        
    for filename in filedict[subcorpus]:
        stopwords = add_file_to_stopwords(path_to_files + filename, stopwords)

    num_docs = len(filedict[subcorpus])

    save_file(stopword_filename, stopwords, num_docs, stopword_percentage_threshold)

    print "created stopword file: ", stopword_filename 

    return stopwords

示例#11

0

显示文件

文件： primary.py 项目： hoguer/SACTAG

    def compile_idf_dict(self):
        print "compiling idf dictionary from individual author files"
        idf_dict = {}
        
        path = '../TLG_idf_files/' + str(self.num_grams) + 'grams/' + str(self.spread) + "_" + "v" + str(self.vary_defn) + "_" + "sw" + str(self.stopwords) + "/"

        filedict = file_dict('../ref_file.txt')        
        files = filedict[self.subcorpora]
        for infile in files:
            file = open(path + infile, 'r')
            num_docs = file.readline() #first line is number of docs in corpus
            for line in file:
                tokens = line.rpartition("\t")
                term = tokens[0].strip()
                frequency = int(tokens[2].strip())
                if term in idf_dict:
                    idf_dict[term] += frequency
                else:
                    idf_dict[term] = frequency
        self.idf_dict = idf_dict
        return len(files)

示例#12

0

显示文件

文件： primary.py 项目： hoguer/SACTAG

    def compile_idf_dict(self):

        print "compiling idf dictionary from individual author files"
        idf_dict = {}

        path = (
            "../TLG_idf_files/"
            + str(self.num_grams)
            + "grams/"
            + str(self.spread)
            + "_"
            + "v"
            + str(self.vary_defn)
            + "_"
            + "sw"
            + str(self.stopwords)
            + "/"
        )

        quarterHours = 0

        filedict = file_dict("../ref_file.txt")
        files = filedict[self.subcorpora]
        for infile in files:
            print "now on: ", infile
            file = open(path + infile, "r")
            num_docs = file.readline()  # first line is number of docs in corpus
            for line in file:
                tokens = rpartition(line, "\t")
                term = tokens[0].strip()
                frequency = int(tokens[2].strip())
                if term in idf_dict:
                    idf_dict[term] += frequency
                else:
                    idf_dict[term] = frequency
            quarterHours = self.check_time(quarterHours)
        self.idf_dict = idf_dict
        print time.clock()
        return len(files)

示例#13

0

显示文件

文件： stopwords.py 项目： hoguer/SACTAG

def create_stopword_file(subcorpus,
                         stopword_percentage_threshold,
                         path_to_files,
                         stopword_filename="stopwords.txt"):
    """Creates a stopword file. Returns stopword list"""

    #ref_file.txt contains the names and corresponding TLG files for authors and subcorpora
    filedict = file_dict('../test_ref.txt')

    # stores {word --> occurence}
    stopwords = {}

    for filename in filedict[subcorpus]:
        stopwords = add_file_to_stopwords(path_to_files + filename, stopwords)

    num_docs = len(filedict[subcorpus])

    save_file(stopword_filename, stopwords, num_docs,
              stopword_percentage_threshold)

    print "created stopword file: ", stopword_filename

    return stopwords

示例#14

0

显示文件

文件： _prelim.py 项目： hoguer/SACTAG

import os, sys
from helper_functions import file_dict

if len(sys.argv) != 7:  # the program name and the two arguments
    # stop the program and print an error message
    sys.exit("Must provide two authors, a subcorpora, an integer spread, num_grams, and variant word order as True or False")

f = open("curr_test.txt", "w")
f.write(sys.argv[1] + "\n" + sys.argv[2] + "\n" + sys.argv[3] + "\n" + str(sys.argv[4]) + "\n" + str(sys.argv[5]) + "\n" + sys.argv[6])

filedict = file_dict('../ref_file.txt')
TLG_files = filedict[sys.argv[3]]

#delete old contents of temp_in folder
folder = "../temp_in"
for file in os.listdir(folder):
    file_path = os.path.join(folder, file)
    try:
        os.unlink(file_path)
    except Exception, e:
        print e

#create symbolic links to TLG_files in temp_in
for i in range(len(TLG_files)):
    os.symlink("../stripped_text" + TLG_files[i], folder + "/file" + str(i+1))

示例#15

0

显示文件

文件： _prelim.py 项目： hoguer/SACTAG

import os, sys
from helper_functions import file_dict

if len(sys.argv) != 7:  # the program name and the two arguments
    # stop the program and print an error message
    sys.exit(
        "Must provide two authors, a subcorpora, an integer spread, num_grams, and variant word order as True or False"
    )

f = open("curr_test.txt", "w")
f.write(sys.argv[1] + "\n" + sys.argv[2] + "\n" + sys.argv[3] + "\n" +
        str(sys.argv[4]) + "\n" + str(sys.argv[5]) + "\n" + sys.argv[6])

filedict = file_dict('../ref_file.txt')
TLG_files = filedict[sys.argv[3]]

#delete old contents of temp_in folder
folder = "../temp_in"
for file in os.listdir(folder):
    file_path = os.path.join(folder, file)
    try:
        os.unlink(file_path)
    except Exception, e:
        print e

#create symbolic links to TLG_files in temp_in
for i in range(len(TLG_files)):
    os.symlink("../stripped_text" + TLG_files[i],
               folder + "/file" + str(i + 1))