Exemplo n.º 1
0
    def compile_idf_dict(self):
        """Merge the individual author idf files into a single dictionary.

        The input directory is built from ``self.num_grams``, ``self.spread``,
        ``self.vary_defn`` and ``self.stopwords``.  The list of file names is
        the ``self.subcorpora`` entry of ``file_dict('../ref_file.txt')``.

        In every file the first line (the number of docs in the corpus) is
        skipped.  Each remaining line is split with ``rpartition(line, "\t")``:
        element 0 is the term and element 2 is its integer count.  Counts of
        a term that shows up in several files are added together.

        The merged result is stored in ``self.idf_dict``.

        Returns:
            The number of files listed for the subcorpus.

        Raises:
            ValueError: if a count field cannot be parsed as an integer.
        """
        # Single-argument, parenthesised print produces the same output under
        # the Python 2 print statement and the Python 3 print function.
        print("compiling idf dictionary from individual author files")
        idf_dict = {}

        path = (
            "../TLG_idf_files/" + str(self.num_grams) + "grams/"
            + str(self.spread) + "_v" + str(self.vary_defn)
            + "_sw" + str(self.stopwords) + "/"
        )

        # Counter handed to and returned by self.check_time after each file
        # (presumably progress/timing bookkeeping -- defined outside this
        # block).
        quarter_hours = 0

        files = file_dict("../ref_file.txt")[self.subcorpora]
        for infile in files:
            # Two spaces: the old ``print "now on: ", infile`` statement
            # inserted a separator space after the literal's trailing space.
            print("now on:  %s" % (infile,))
            # ``with`` closes every author file as soon as it has been read,
            # instead of leaving one open handle per file.
            with open(path + infile, "r") as author_file:
                author_file.readline()  # first line is number of docs in corpus
                for line in author_file:
                    tokens = rpartition(line, "\t")
                    term = tokens[0].strip()
                    frequency = int(tokens[2].strip())
                    idf_dict[term] = idf_dict.get(term, 0) + frequency
            quarter_hours = self.check_time(quarter_hours)
        self.idf_dict = idf_dict
        # NOTE(review): time.clock() was removed in Python 3.8 -- replace it
        # if this code is ever ported.
        print(time.clock())
        return len(files)
Exemplo n.º 2
0
  def __init__(self, spread, variant_word_order, corpus_filename=None,
               corpus=None, stopword_filename=None, DEFAULT_IDF=1.5):
    """Initialize the idf dictionary.

       Args:
         spread: stored unchanged as self.spread.
         variant_word_order: stored unchanged as self.vwo.
         corpus_filename: optional path of an idf corpus file.  It is only
           read when no (non-empty) corpus is passed.  Expected format:
             # of total documents
             term<TAB># of documents containing the term
         corpus: optional ready-made term -> document-count mapping, used
           directly as self.term_num_docs.  self.num_docs stays 0 in this
           case.
         stopword_filename: optional path of a stopword file, in the format
           of one stopword per line.  Without it self.stopwords is an empty
           list.
         DEFAULT_IDF: stored as self.idf_default; the value returned when a
           query term is not found in the idf corpus.

       Raises:
         IOError: if a supplied file cannot be opened.
         ValueError: if a document count in the corpus file is not an integer.
    """
    self.spread = spread
    self.vwo = variant_word_order
    self.num_docs = 0
    self.term_num_docs = {}     # term : num_docs_containing_term
    self.idf_default = DEFAULT_IDF
    # Define the attribute up front so it exists even when no stopword file
    # is given.
    self.stopwords = []

    if stopword_filename:
      # "with" closes the file once the list has been built.
      with open(stopword_filename, "r") as stopword_file:
        self.stopwords = [line.strip() for line in stopword_file]

    # TODO: we should have it so that you can call
    # create_idf_corpus(subcorpus, spread, etc), and it could make an idf
    # object with attributes such as idf_dict, spread, etc. We should also
    # have a create_stopword_file(). Finally, we should have a compare()
    # method and a graph() method and a generate_tfidf_scores() method for
    # one author.

    if corpus:
      self.term_num_docs = corpus
    elif corpus_filename:
      with open(corpus_filename, "r") as corpus_file:
        # Load number of documents (first line of the file).
        self.num_docs = int(corpus_file.readline().strip())

        # Each subsequent line is split at a tab: element 0 of the result is
        # the term, element 2 the number of documents containing it.
        for line in corpus_file:
          tokens = rpartition(line, "\t")
          term = tokens[0].strip()
          frequency = int(tokens[2].strip())
          self.term_num_docs[term] = frequency
Exemplo n.º 3
0
    def compile_idf_dict(self):
        """Merge the individual author idf files into a single dictionary.

        The input directory is built from ``self.num_grams``, ``self.spread``,
        ``self.vary_defn`` and ``self.stopwords``.  The list of file names is
        the ``self.subcorpora`` entry of ``file_dict("../ref_file.txt")``.

        In every file the first line (the number of docs in the corpus) is
        skipped.  Each remaining line is split with ``rpartition(line, "\t")``:
        element 0 is the term and element 2 is its integer count.  Counts of
        a term that shows up in several files are added together.

        The merged result is stored in ``self.idf_dict``.

        Returns:
            The number of files listed for the subcorpus.

        Raises:
            ValueError: if a count field cannot be parsed as an integer.
        """
        # Single-argument, parenthesised print produces the same output under
        # the Python 2 print statement and the Python 3 print function.
        print("compiling idf dictionary from individual author files")
        idf_dict = {}

        path = (
            "../TLG_idf_files/"
            + str(self.num_grams)
            + "grams/"
            + str(self.spread)
            + "_v"
            + str(self.vary_defn)
            + "_sw"
            + str(self.stopwords)
            + "/"
        )

        # Counter handed to and returned by self.check_time after each file
        # (presumably progress/timing bookkeeping -- defined outside this
        # block).
        quarter_hours = 0

        files = file_dict("../ref_file.txt")[self.subcorpora]
        for infile in files:
            # Two spaces: the old ``print "now on: ", infile`` statement
            # inserted a separator space after the literal's trailing space.
            print("now on:  %s" % (infile,))
            # ``with`` closes every author file as soon as it has been read,
            # instead of leaving one open handle per file.
            with open(path + infile, "r") as author_file:
                author_file.readline()  # first line is number of docs in corpus
                for line in author_file:
                    tokens = rpartition(line, "\t")
                    term = tokens[0].strip()
                    frequency = int(tokens[2].strip())
                    idf_dict[term] = idf_dict.get(term, 0) + frequency
            quarter_hours = self.check_time(quarter_hours)
        self.idf_dict = idf_dict
        # NOTE(review): time.clock() was removed in Python 3.8 -- replace it
        # if this code is ever ported.
        print(time.clock())
        return len(files)